/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.59 - (hide annotations) (download) (as text)
Sat Sep 8 01:31:44 2007 UTC (17 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.58: +17 -2 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/CSS/ChangeLog	8 Sep 2007 01:31:14 -0000
2007-09-08  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm: First working version.

1 wakaba 1.2 package Whatpm::HTML;
2 wakaba 1.1 use strict;
3 wakaba 1.59 our $VERSION=do{my @r=(q$Revision: 1.58 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5 wakaba 1.18 ## ISSUE:
6     ## var doc = implementation.createDocument (null, null, null);
7     ## doc.write ('');
8     ## alert (doc.compatMode);
9 wakaba 1.1
10 wakaba 1.31 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
11     ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
12     ## is not yet clear.
13     ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
14     ## "{U+FEFF}..." in GB18030?
15    
## Names of the start tags on which a "/" immediately before ">" is
## tolerated; the tokenizer consults this set before reporting a
## 'nestc' parse error.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img
    embed param area col input
  )
};
29    
## Replacement code points for the C1 range U+0080..U+009F, keyed by the
## original code point.  Entries with no replacement map to U+FFFD.
## NOTE(review): presumably applied to numeric character references;
## the consumer is not visible in this part of the file.
my $c1_entity_char = do {
  ## One entry per code point, in order, starting at 0x80.
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80..87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88..8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90..97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98..9F
  );
  +{ map { ((0x80 + $_) => $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
64 wakaba 1.1
## Element names belonging to the "special" category.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Element names belonging to the "formatting" category.
## The key was previously misspelled |strile|, so |strike| elements
## were never recognized as formatting elements.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
85     # $phrasing_category: all other elements
86    
## Parse a string into a document.
##
## Arguments (after the invocant):
##   1. the string to parse (accessed through a reference, not copied),
##   2. the document object, stored as |$self->{document}|,
##   3. optionally, an error callback.  It is invoked with the parse
##      error's own name/value pairs followed by |line => ...| and
##      |column => ...|.  When omitted, errors are reported by |warn|.
##
## Returns |$self->{document}|.
sub parse_string ($$$;$) {
  my $class = shift;
  my $self = $class->new;
  my $input = \$_[0];
  $self->{document} = $_[1];

  ## NOTE: |set_inner_html| copies most of this method's code

  my $pos = 0;     # index of the next unread character in |$$input|
  my $line = 1;
  my $column = 0;
  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Keep the three most recent characters, newest first.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    if ($pos >= length $$input) {
      $self->{next_input_character} = -1; # end of input
      return;
    }

    my $code = ord substr $$input, $pos++, 1;
    $self->{next_input_character} = $code;
    $column++;

    if ($code == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($code == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $pos++ if substr ($$input, $pos, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  my $onerror = $_[2];
  $onerror ||= sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Append the current position to every reported error.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
140    
## Create a parser object with default callbacks:
##   - |set_next_input_character| always reports end of input (-1);
##   - |parse_error| ignores the error.
## |parse_string| replaces both callbacks with real implementations.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The callback is stored inside |$self|, so it must not hold a strong
  ## reference back to |$self|; otherwise the object would form a
  ## reference cycle and never be freed.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_input_character} = sub {
    $weak_self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
152    
## Content model flag bits.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the flag bits above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
161    
## Tokenizer states, compared against |$self->{state}|.

## Data and tags
sub DATA_STATE                                    () {  0 }
sub ENTITY_DATA_STATE                             () {  1 }
sub TAG_OPEN_STATE                                () {  2 }
sub CLOSE_TAG_OPEN_STATE                          () {  3 }
sub TAG_NAME_STATE                                () {  4 }

## Attributes
sub BEFORE_ATTRIBUTE_NAME_STATE                   () {  5 }
sub ATTRIBUTE_NAME_STATE                          () {  6 }
sub AFTER_ATTRIBUTE_NAME_STATE                    () {  7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE                  () {  8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           () {  9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE                () { 11 }
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE               () { 12 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE                 () { 13 }
sub COMMENT_START_STATE                           () { 14 }
sub COMMENT_START_DASH_STATE                      () { 15 }
sub COMMENT_STATE                                 () { 16 }
sub COMMENT_END_STATE                             () { 17 }
sub COMMENT_END_DASH_STATE                        () { 18 }
sub BOGUS_COMMENT_STATE                           () { 19 }

## DOCTYPE
sub DOCTYPE_STATE                                 () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE                     () { 21 }
sub DOCTYPE_NAME_STATE                            () { 22 }
sub AFTER_DOCTYPE_NAME_STATE                      () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         () { 31 }
sub BOGUS_DOCTYPE_STATE                           () { 32 }
195    
## Token types, stored in a token's |type| member.
sub DOCTYPE_TOKEN     () { 1 }
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }
202    
## Insertion mode group flags (the |_IMS| constants).  Each occupies its
## own bit, above the two low bits used to tell apart modes of one group.
sub AFTER_HTML_IMS () { 1 << 2 }
sub HEAD_IMS       () { 1 << 3 }
sub BODY_IMS       () { 1 << 4 }
sub BODY_TABLE_IMS () { 1 << 5 }
sub TABLE_IMS      () { 1 << 6 }
sub ROW_IMS        () { 1 << 7 }
sub BODY_AFTER_IMS () { 1 << 8 }
sub FRAME_IMS      () { 1 << 9 }

## Insertion modes (the |_IM| constants): group flags, optionally
## combined with a value in the two low bits.
sub AFTER_HTML_BODY_IM     () { BODY_AFTER_IMS | AFTER_HTML_IMS }
sub AFTER_HTML_FRAMESET_IM () { FRAME_IMS | AFTER_HTML_IMS }
sub IN_HEAD_IM             () { 0b00 | HEAD_IMS }
sub IN_HEAD_NOSCRIPT_IM    () { 0b01 | HEAD_IMS }
sub AFTER_HEAD_IM          () { 0b10 | HEAD_IMS }
sub BEFORE_HEAD_IM         () { 0b11 | HEAD_IMS }
sub IN_BODY_IM             () { BODY_IMS }
sub IN_CELL_IM             () { 0b01 | BODY_IMS | BODY_TABLE_IMS }
sub IN_CAPTION_IM          () { 0b10 | BODY_IMS | BODY_TABLE_IMS }
sub IN_ROW_IM              () { 0b01 | TABLE_IMS | ROW_IMS }
sub IN_TABLE_BODY_IM       () { 0b10 | TABLE_IMS | ROW_IMS }
sub IN_TABLE_IM            () { TABLE_IMS }
sub AFTER_BODY_IM          () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM         () { 0b01 | FRAME_IMS }
sub AFTER_FRAMESET_IM      () { 0b10 | FRAME_IMS }
sub IN_SELECT_IM           () { 0b01 }
sub IN_COLUMN_GROUP_IM     () { 0b10 }
229    
230 wakaba 1.1 ## Implementations MUST act as if they used the state machine described in the spec.
231    
## Put the tokenizer into its starting condition: data state, PCDATA
## content model, no token or attribute in progress, empty queues.
## The first input character is read as part of the reset.
sub _initialize_tokenizer ($) {
  my ($self) = @_;
  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  ## |current_token| is a start tag, end tag, comment, or DOCTYPE once
  ## set; all four members below start out undefined.
  $self->{$_} = undef for qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  );
  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
246    
247     ## A token has:
248 wakaba 1.55 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
249     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
250     ## ->{name} (DOCTYPE_TOKEN)
251     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
252     ## ->{public_identifier} (DOCTYPE_TOKEN)
253     ## ->{system_identifier} (DOCTYPE_TOKEN)
254     ## ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
255     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
256     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
257 wakaba 1.1
258     ## Emitted token MUST immediately be handled by the tree construction state.
259    
260     ## Before each step, UA MAY check to see if either one of the scripts in
261     ## "list of scripts that will execute as soon as possible" or the first
262     ## script in the "list of scripts that will execute asynchronously",
263     ## has completed loading. If one has, then it MUST be executed
264     ## and removed from the list.
265    
266 wakaba 1.59 ## NOTE: HTML5 "Writing HTML documents" section, applied to
267     ## documents and not to user agents and conformance checkers,
268     ## contains some requirements that are not detected by the
269     ## parsing algorithm:
270     ## - Some requirements on character encoding declarations. ## TODO
271     ## - "Elements MUST NOT contain content that their content model disallows."
272     ## ... Some are parse error, some are not (will be reported by c.c.).
273     ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
274     ## - Text (in elements, attributes, and comments) SHOULD NOT contain
275     ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
276    
277     ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
278     ## be detected by the HTML5 parsing algorithm:
279     ## - Text,
280    
281 wakaba 1.1 sub _get_next_token ($) {
282     my $self = shift;
283     if (@{$self->{token}}) {
284     return shift @{$self->{token}};
285     }
286    
287     A: {
288 wakaba 1.57 if ($self->{state} == DATA_STATE) {
289 wakaba 1.1 if ($self->{next_input_character} == 0x0026) { # &
290 wakaba 1.40 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
291 wakaba 1.57 $self->{state} = ENTITY_DATA_STATE;
292 wakaba 1.1 !!!next-input-character;
293     redo A;
294     } else {
295     #
296     }
297 wakaba 1.13 } elsif ($self->{next_input_character} == 0x002D) { # -
298 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
299 wakaba 1.13 unless ($self->{escape}) {
300     if ($self->{prev_input_character}->[0] == 0x002D and # -
301     $self->{prev_input_character}->[1] == 0x0021 and # !
302     $self->{prev_input_character}->[2] == 0x003C) { # <
303     $self->{escape} = 1;
304     }
305     }
306     }
307    
308     #
309 wakaba 1.1 } elsif ($self->{next_input_character} == 0x003C) { # <
310 wakaba 1.40 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
311     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
312 wakaba 1.13 not $self->{escape})) {
313 wakaba 1.57 $self->{state} = TAG_OPEN_STATE;
314 wakaba 1.1 !!!next-input-character;
315     redo A;
316     } else {
317     #
318     }
319 wakaba 1.13 } elsif ($self->{next_input_character} == 0x003E) { # >
320     if ($self->{escape} and
321 wakaba 1.40 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
322 wakaba 1.13 if ($self->{prev_input_character}->[0] == 0x002D and # -
323     $self->{prev_input_character}->[1] == 0x002D) { # -
324     delete $self->{escape};
325     }
326     }
327    
328     #
329 wakaba 1.1 } elsif ($self->{next_input_character} == -1) {
330 wakaba 1.55 !!!emit ({type => END_OF_FILE_TOKEN});
331 wakaba 1.1 last A; ## TODO: ok?
332     }
333     # Anything else
334 wakaba 1.55 my $token = {type => CHARACTER_TOKEN,
335 wakaba 1.1 data => chr $self->{next_input_character}};
336     ## Stay in the data state
337     !!!next-input-character;
338    
339     !!!emit ($token);
340    
341     redo A;
342 wakaba 1.57 } elsif ($self->{state} == ENTITY_DATA_STATE) {
343 wakaba 1.1 ## (cannot happen in CDATA state)
344    
345 wakaba 1.26 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
346 wakaba 1.1
347 wakaba 1.57 $self->{state} = DATA_STATE;
348 wakaba 1.1 # next-input-character is already done
349    
350     unless (defined $token) {
351 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
352 wakaba 1.1 } else {
353     !!!emit ($token);
354     }
355    
356     redo A;
357 wakaba 1.57 } elsif ($self->{state} == TAG_OPEN_STATE) {
358 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
359 wakaba 1.1 if ($self->{next_input_character} == 0x002F) { # /
360     !!!next-input-character;
361 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
362 wakaba 1.1 redo A;
363     } else {
364     ## reconsume
365 wakaba 1.57 $self->{state} = DATA_STATE;
366 wakaba 1.1
367 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
368 wakaba 1.1
369     redo A;
370     }
371 wakaba 1.40 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
372 wakaba 1.1 if ($self->{next_input_character} == 0x0021) { # !
373 wakaba 1.57 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
374 wakaba 1.1 !!!next-input-character;
375     redo A;
376     } elsif ($self->{next_input_character} == 0x002F) { # /
377 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
378 wakaba 1.1 !!!next-input-character;
379     redo A;
380     } elsif (0x0041 <= $self->{next_input_character} and
381     $self->{next_input_character} <= 0x005A) { # A..Z
382     $self->{current_token}
383 wakaba 1.55 = {type => START_TAG_TOKEN,
384 wakaba 1.1 tag_name => chr ($self->{next_input_character} + 0x0020)};
385 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
386 wakaba 1.1 !!!next-input-character;
387     redo A;
388     } elsif (0x0061 <= $self->{next_input_character} and
389     $self->{next_input_character} <= 0x007A) { # a..z
390 wakaba 1.55 $self->{current_token} = {type => START_TAG_TOKEN,
391 wakaba 1.1 tag_name => chr ($self->{next_input_character})};
392 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
393 wakaba 1.1 !!!next-input-character;
394     redo A;
395     } elsif ($self->{next_input_character} == 0x003E) { # >
396 wakaba 1.3 !!!parse-error (type => 'empty start tag');
397 wakaba 1.57 $self->{state} = DATA_STATE;
398 wakaba 1.1 !!!next-input-character;
399    
400 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
401 wakaba 1.1
402     redo A;
403     } elsif ($self->{next_input_character} == 0x003F) { # ?
404 wakaba 1.3 !!!parse-error (type => 'pio');
405 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
406 wakaba 1.1 ## $self->{next_input_character} is intentionally left as is
407     redo A;
408     } else {
409 wakaba 1.3 !!!parse-error (type => 'bare stago');
410 wakaba 1.57 $self->{state} = DATA_STATE;
411 wakaba 1.1 ## reconsume
412    
413 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
414 wakaba 1.1
415     redo A;
416     }
417     } else {
418 wakaba 1.40 die "$0: $self->{content_model} in tag open";
419 wakaba 1.1 }
420 wakaba 1.57 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
421 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
422 wakaba 1.23 if (defined $self->{last_emitted_start_tag_name}) {
423 wakaba 1.30 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
424 wakaba 1.23 my @next_char;
425     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
426     push @next_char, $self->{next_input_character};
427     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
428     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
429     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
430     !!!next-input-character;
431     next TAGNAME;
432     } else {
433     $self->{next_input_character} = shift @next_char; # reconsume
434     !!!back-next-input-character (@next_char);
435 wakaba 1.57 $self->{state} = DATA_STATE;
436 wakaba 1.23
437 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
438 wakaba 1.23
439     redo A;
440     }
441     }
442 wakaba 1.1 push @next_char, $self->{next_input_character};
443 wakaba 1.23
444     unless ($self->{next_input_character} == 0x0009 or # HT
445     $self->{next_input_character} == 0x000A or # LF
446     $self->{next_input_character} == 0x000B or # VT
447     $self->{next_input_character} == 0x000C or # FF
448     $self->{next_input_character} == 0x0020 or # SP
449     $self->{next_input_character} == 0x003E or # >
450     $self->{next_input_character} == 0x002F or # /
451     $self->{next_input_character} == -1) {
452 wakaba 1.1 $self->{next_input_character} = shift @next_char; # reconsume
453     !!!back-next-input-character (@next_char);
454 wakaba 1.57 $self->{state} = DATA_STATE;
455 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
456 wakaba 1.1 redo A;
457 wakaba 1.23 } else {
458     $self->{next_input_character} = shift @next_char;
459     !!!back-next-input-character (@next_char);
460     # and consume...
461 wakaba 1.1 }
462 wakaba 1.23 } else {
463     ## No start tag token has ever been emitted
464     # next-input-character is already done
465 wakaba 1.57 $self->{state} = DATA_STATE;
466 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
467 wakaba 1.1 redo A;
468     }
469     }
470    
471     if (0x0041 <= $self->{next_input_character} and
472     $self->{next_input_character} <= 0x005A) { # A..Z
473 wakaba 1.55 $self->{current_token} = {type => END_TAG_TOKEN,
474 wakaba 1.1 tag_name => chr ($self->{next_input_character} + 0x0020)};
475 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
476 wakaba 1.1 !!!next-input-character;
477     redo A;
478     } elsif (0x0061 <= $self->{next_input_character} and
479     $self->{next_input_character} <= 0x007A) { # a..z
480 wakaba 1.55 $self->{current_token} = {type => END_TAG_TOKEN,
481 wakaba 1.1 tag_name => chr ($self->{next_input_character})};
482 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
483 wakaba 1.1 !!!next-input-character;
484     redo A;
485     } elsif ($self->{next_input_character} == 0x003E) { # >
486 wakaba 1.3 !!!parse-error (type => 'empty end tag');
487 wakaba 1.57 $self->{state} = DATA_STATE;
488 wakaba 1.1 !!!next-input-character;
489     redo A;
490     } elsif ($self->{next_input_character} == -1) {
491 wakaba 1.3 !!!parse-error (type => 'bare etago');
492 wakaba 1.57 $self->{state} = DATA_STATE;
493 wakaba 1.1 # reconsume
494    
495 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
496 wakaba 1.1
497     redo A;
498     } else {
499 wakaba 1.3 !!!parse-error (type => 'bogus end tag');
500 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
501 wakaba 1.1 ## $self->{next_input_character} is intentionally left as is
502     redo A;
503     }
504 wakaba 1.57 } elsif ($self->{state} == TAG_NAME_STATE) {
505 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
506     $self->{next_input_character} == 0x000A or # LF
507     $self->{next_input_character} == 0x000B or # VT
508     $self->{next_input_character} == 0x000C or # FF
509     $self->{next_input_character} == 0x0020) { # SP
510 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
511 wakaba 1.1 !!!next-input-character;
512     redo A;
513     } elsif ($self->{next_input_character} == 0x003E) { # >
514 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
515 wakaba 1.28 $self->{current_token}->{first_start_tag}
516     = not defined $self->{last_emitted_start_tag_name};
517 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
518 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
519 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
520 wakaba 1.1 if ($self->{current_token}->{attributes}) {
521 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
522 wakaba 1.1 }
523     } else {
524     die "$0: $self->{current_token}->{type}: Unknown token type";
525     }
526 wakaba 1.57 $self->{state} = DATA_STATE;
527 wakaba 1.1 !!!next-input-character;
528    
529     !!!emit ($self->{current_token}); # start tag or end tag
530    
531     redo A;
532     } elsif (0x0041 <= $self->{next_input_character} and
533     $self->{next_input_character} <= 0x005A) { # A..Z
534     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
535     # start tag or end tag
536     ## Stay in this state
537     !!!next-input-character;
538     redo A;
539 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
540 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
541 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
542 wakaba 1.28 $self->{current_token}->{first_start_tag}
543     = not defined $self->{last_emitted_start_tag_name};
544 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
545 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
546 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
547 wakaba 1.1 if ($self->{current_token}->{attributes}) {
548 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
549 wakaba 1.1 }
550     } else {
551     die "$0: $self->{current_token}->{type}: Unknown token type";
552     }
553 wakaba 1.57 $self->{state} = DATA_STATE;
554 wakaba 1.1 # reconsume
555    
556     !!!emit ($self->{current_token}); # start tag or end tag
557    
558     redo A;
559     } elsif ($self->{next_input_character} == 0x002F) { # /
560     !!!next-input-character;
561     if ($self->{next_input_character} == 0x003E and # >
562 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
563 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
564     # permitted slash
565     #
566     } else {
567 wakaba 1.3 !!!parse-error (type => 'nestc');
568 wakaba 1.1 }
569 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
570 wakaba 1.1 # next-input-character is already done
571     redo A;
572     } else {
573     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
574     # start tag or end tag
575     ## Stay in the state
576     !!!next-input-character;
577     redo A;
578     }
579 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
580 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
581     $self->{next_input_character} == 0x000A or # LF
582     $self->{next_input_character} == 0x000B or # VT
583     $self->{next_input_character} == 0x000C or # FF
584     $self->{next_input_character} == 0x0020) { # SP
585     ## Stay in the state
586     !!!next-input-character;
587     redo A;
588     } elsif ($self->{next_input_character} == 0x003E) { # >
589 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
590 wakaba 1.28 $self->{current_token}->{first_start_tag}
591     = not defined $self->{last_emitted_start_tag_name};
592 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
593 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
594 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
595 wakaba 1.1 if ($self->{current_token}->{attributes}) {
596 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
597 wakaba 1.1 }
598     } else {
599     die "$0: $self->{current_token}->{type}: Unknown token type";
600     }
601 wakaba 1.57 $self->{state} = DATA_STATE;
602 wakaba 1.1 !!!next-input-character;
603    
604     !!!emit ($self->{current_token}); # start tag or end tag
605    
606     redo A;
607     } elsif (0x0041 <= $self->{next_input_character} and
608     $self->{next_input_character} <= 0x005A) { # A..Z
609     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
610     value => ''};
611 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
612 wakaba 1.1 !!!next-input-character;
613     redo A;
614     } elsif ($self->{next_input_character} == 0x002F) { # /
615     !!!next-input-character;
616     if ($self->{next_input_character} == 0x003E and # >
617 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
618 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
619     # permitted slash
620     #
621     } else {
622 wakaba 1.3 !!!parse-error (type => 'nestc');
623 wakaba 1.1 }
624     ## Stay in the state
625     # next-input-character is already done
626     redo A;
627 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
628 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
629 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
630 wakaba 1.28 $self->{current_token}->{first_start_tag}
631     = not defined $self->{last_emitted_start_tag_name};
632 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
633 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
634 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
635 wakaba 1.1 if ($self->{current_token}->{attributes}) {
636 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
637 wakaba 1.1 }
638     } else {
639     die "$0: $self->{current_token}->{type}: Unknown token type";
640     }
641 wakaba 1.57 $self->{state} = DATA_STATE;
642 wakaba 1.1 # reconsume
643    
644     !!!emit ($self->{current_token}); # start tag or end tag
645    
646     redo A;
647     } else {
648     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
649     value => ''};
650 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
651 wakaba 1.1 !!!next-input-character;
652     redo A;
653     }
654 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
655 wakaba 1.1 my $before_leave = sub {
656     if (exists $self->{current_token}->{attributes} # start tag or end tag
657     ->{$self->{current_attribute}->{name}}) { # MUST
658 wakaba 1.39 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
659 wakaba 1.1 ## Discard $self->{current_attribute} # MUST
660     } else {
661     $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
662     = $self->{current_attribute};
663     }
664     }; # $before_leave
665    
666     if ($self->{next_input_character} == 0x0009 or # HT
667     $self->{next_input_character} == 0x000A or # LF
668     $self->{next_input_character} == 0x000B or # VT
669     $self->{next_input_character} == 0x000C or # FF
670     $self->{next_input_character} == 0x0020) { # SP
671     $before_leave->();
672 wakaba 1.57 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
673 wakaba 1.1 !!!next-input-character;
674     redo A;
675     } elsif ($self->{next_input_character} == 0x003D) { # =
676     $before_leave->();
677 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
678 wakaba 1.1 !!!next-input-character;
679     redo A;
680     } elsif ($self->{next_input_character} == 0x003E) { # >
681     $before_leave->();
682 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
683 wakaba 1.28 $self->{current_token}->{first_start_tag}
684     = not defined $self->{last_emitted_start_tag_name};
685 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
686 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
687 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
688 wakaba 1.1 if ($self->{current_token}->{attributes}) {
689 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
690 wakaba 1.1 }
691     } else {
692     die "$0: $self->{current_token}->{type}: Unknown token type";
693     }
694 wakaba 1.57 $self->{state} = DATA_STATE;
695 wakaba 1.1 !!!next-input-character;
696    
697     !!!emit ($self->{current_token}); # start tag or end tag
698    
699     redo A;
700     } elsif (0x0041 <= $self->{next_input_character} and
701     $self->{next_input_character} <= 0x005A) { # A..Z
702     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
703     ## Stay in the state
704     !!!next-input-character;
705     redo A;
706     } elsif ($self->{next_input_character} == 0x002F) { # /
707     $before_leave->();
708     !!!next-input-character;
709     if ($self->{next_input_character} == 0x003E and # >
710 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
711 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
712     # permitted slash
713     #
714     } else {
715 wakaba 1.3 !!!parse-error (type => 'nestc');
716 wakaba 1.1 }
717 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
718 wakaba 1.1 # next-input-character is already done
719     redo A;
720 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
721 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
722 wakaba 1.1 $before_leave->();
723 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
724 wakaba 1.28 $self->{current_token}->{first_start_tag}
725     = not defined $self->{last_emitted_start_tag_name};
726 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
727 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
728 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
729 wakaba 1.1 if ($self->{current_token}->{attributes}) {
730 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
731 wakaba 1.1 }
732     } else {
733     die "$0: $self->{current_token}->{type}: Unknown token type";
734     }
735 wakaba 1.57 $self->{state} = DATA_STATE;
736 wakaba 1.1 # reconsume
737    
738     !!!emit ($self->{current_token}); # start tag or end tag
739    
740     redo A;
741     } else {
742     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
743     ## Stay in the state
744     !!!next-input-character;
745     redo A;
746     }
747 wakaba 1.57 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
748 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
749     $self->{next_input_character} == 0x000A or # LF
750     $self->{next_input_character} == 0x000B or # VT
751     $self->{next_input_character} == 0x000C or # FF
752     $self->{next_input_character} == 0x0020) { # SP
753     ## Stay in the state
754     !!!next-input-character;
755     redo A;
756     } elsif ($self->{next_input_character} == 0x003D) { # =
757 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
758 wakaba 1.1 !!!next-input-character;
759     redo A;
760     } elsif ($self->{next_input_character} == 0x003E) { # >
761 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
762 wakaba 1.28 $self->{current_token}->{first_start_tag}
763     = not defined $self->{last_emitted_start_tag_name};
764 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
765 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
766 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
767 wakaba 1.1 if ($self->{current_token}->{attributes}) {
768 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
769 wakaba 1.1 }
770     } else {
771     die "$0: $self->{current_token}->{type}: Unknown token type";
772     }
773 wakaba 1.57 $self->{state} = DATA_STATE;
774 wakaba 1.1 !!!next-input-character;
775    
776     !!!emit ($self->{current_token}); # start tag or end tag
777    
778     redo A;
779     } elsif (0x0041 <= $self->{next_input_character} and
780     $self->{next_input_character} <= 0x005A) { # A..Z
781     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
782     value => ''};
783 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
784 wakaba 1.1 !!!next-input-character;
785     redo A;
786     } elsif ($self->{next_input_character} == 0x002F) { # /
787     !!!next-input-character;
788     if ($self->{next_input_character} == 0x003E and # >
789 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
790 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
791     # permitted slash
792     #
793     } else {
794 wakaba 1.3 !!!parse-error (type => 'nestc');
795 wakaba 1.33 ## TODO: Different error type for <aa / bb> than <aa/>
796 wakaba 1.1 }
797 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
798 wakaba 1.1 # next-input-character is already done
799     redo A;
800 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
801 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
802 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
803 wakaba 1.28 $self->{current_token}->{first_start_tag}
804     = not defined $self->{last_emitted_start_tag_name};
805 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
806 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
807 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
808 wakaba 1.1 if ($self->{current_token}->{attributes}) {
809 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
810 wakaba 1.1 }
811     } else {
812     die "$0: $self->{current_token}->{type}: Unknown token type";
813     }
814 wakaba 1.57 $self->{state} = DATA_STATE;
815 wakaba 1.1 # reconsume
816    
817     !!!emit ($self->{current_token}); # start tag or end tag
818    
819     redo A;
820     } else {
821     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
822     value => ''};
823 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
824 wakaba 1.1 !!!next-input-character;
825     redo A;
826     }
827 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
828 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
829     $self->{next_input_character} == 0x000A or # LF
830     $self->{next_input_character} == 0x000B or # VT
831     $self->{next_input_character} == 0x000C or # FF
832     $self->{next_input_character} == 0x0020) { # SP
833     ## Stay in the state
834     !!!next-input-character;
835     redo A;
836     } elsif ($self->{next_input_character} == 0x0022) { # "
837 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
838 wakaba 1.1 !!!next-input-character;
839     redo A;
840     } elsif ($self->{next_input_character} == 0x0026) { # &
841 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
842 wakaba 1.1 ## reconsume
843     redo A;
844     } elsif ($self->{next_input_character} == 0x0027) { # '
845 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
846 wakaba 1.1 !!!next-input-character;
847     redo A;
848     } elsif ($self->{next_input_character} == 0x003E) { # >
849 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
850 wakaba 1.28 $self->{current_token}->{first_start_tag}
851     = not defined $self->{last_emitted_start_tag_name};
852 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
853 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
854 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
855 wakaba 1.1 if ($self->{current_token}->{attributes}) {
856 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
857 wakaba 1.1 }
858     } else {
859     die "$0: $self->{current_token}->{type}: Unknown token type";
860     }
861 wakaba 1.57 $self->{state} = DATA_STATE;
862 wakaba 1.1 !!!next-input-character;
863    
864     !!!emit ($self->{current_token}); # start tag or end tag
865    
866     redo A;
867 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
868 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
869 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
870 wakaba 1.28 $self->{current_token}->{first_start_tag}
871     = not defined $self->{last_emitted_start_tag_name};
872 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
873 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
874 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
875 wakaba 1.1 if ($self->{current_token}->{attributes}) {
876 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
877 wakaba 1.1 }
878     } else {
879     die "$0: $self->{current_token}->{type}: Unknown token type";
880     }
881 wakaba 1.57 $self->{state} = DATA_STATE;
882 wakaba 1.1 ## reconsume
883    
884     !!!emit ($self->{current_token}); # start tag or end tag
885    
886     redo A;
887     } else {
888     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
889 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
890 wakaba 1.1 !!!next-input-character;
891     redo A;
892     }
893 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
894 wakaba 1.1 if ($self->{next_input_character} == 0x0022) { # "
895 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
896 wakaba 1.1 !!!next-input-character;
897     redo A;
898     } elsif ($self->{next_input_character} == 0x0026) { # &
899 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
900     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
901 wakaba 1.1 !!!next-input-character;
902     redo A;
903     } elsif ($self->{next_input_character} == -1) {
904 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
905 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
906 wakaba 1.28 $self->{current_token}->{first_start_tag}
907     = not defined $self->{last_emitted_start_tag_name};
908 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
909 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
910 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
911 wakaba 1.1 if ($self->{current_token}->{attributes}) {
912 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
913 wakaba 1.1 }
914     } else {
915     die "$0: $self->{current_token}->{type}: Unknown token type";
916     }
917 wakaba 1.57 $self->{state} = DATA_STATE;
918 wakaba 1.1 ## reconsume
919    
920     !!!emit ($self->{current_token}); # start tag or end tag
921    
922     redo A;
923     } else {
924     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
925     ## Stay in the state
926     !!!next-input-character;
927     redo A;
928     }
929 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
930 wakaba 1.1 if ($self->{next_input_character} == 0x0027) { # '
931 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
932 wakaba 1.1 !!!next-input-character;
933     redo A;
934     } elsif ($self->{next_input_character} == 0x0026) { # &
935 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
936     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
937 wakaba 1.1 !!!next-input-character;
938     redo A;
939     } elsif ($self->{next_input_character} == -1) {
940 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
941 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
942 wakaba 1.28 $self->{current_token}->{first_start_tag}
943     = not defined $self->{last_emitted_start_tag_name};
944 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
945 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
946 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
947 wakaba 1.1 if ($self->{current_token}->{attributes}) {
948 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
949 wakaba 1.1 }
950     } else {
951     die "$0: $self->{current_token}->{type}: Unknown token type";
952     }
953 wakaba 1.57 $self->{state} = DATA_STATE;
954 wakaba 1.1 ## reconsume
955    
956     !!!emit ($self->{current_token}); # start tag or end tag
957    
958     redo A;
959     } else {
960     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
961     ## Stay in the state
962     !!!next-input-character;
963     redo A;
964     }
965 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
966 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
967     $self->{next_input_character} == 0x000A or # LF
968     $self->{next_input_character} == 0x000B or # VT
969     $self->{next_input_character} == 0x000C or # FF
970     $self->{next_input_character} == 0x0020) { # SP
971 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
972 wakaba 1.1 !!!next-input-character;
973     redo A;
974     } elsif ($self->{next_input_character} == 0x0026) { # &
975 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
976     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
977 wakaba 1.1 !!!next-input-character;
978     redo A;
979     } elsif ($self->{next_input_character} == 0x003E) { # >
980 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
981 wakaba 1.28 $self->{current_token}->{first_start_tag}
982     = not defined $self->{last_emitted_start_tag_name};
983 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
984 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
985 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
986 wakaba 1.1 if ($self->{current_token}->{attributes}) {
987 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
988 wakaba 1.1 }
989     } else {
990     die "$0: $self->{current_token}->{type}: Unknown token type";
991     }
992 wakaba 1.57 $self->{state} = DATA_STATE;
993 wakaba 1.1 !!!next-input-character;
994    
995     !!!emit ($self->{current_token}); # start tag or end tag
996    
997     redo A;
998 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
999 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1000 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1001 wakaba 1.28 $self->{current_token}->{first_start_tag}
1002     = not defined $self->{last_emitted_start_tag_name};
1003 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1004 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1005 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1006 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1007 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1008 wakaba 1.1 }
1009     } else {
1010     die "$0: $self->{current_token}->{type}: Unknown token type";
1011     }
1012 wakaba 1.57 $self->{state} = DATA_STATE;
1013 wakaba 1.1 ## reconsume
1014    
1015     !!!emit ($self->{current_token}); # start tag or end tag
1016    
1017     redo A;
1018     } else {
1019     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1020     ## Stay in the state
1021     !!!next-input-character;
1022     redo A;
1023     }
1024 wakaba 1.57 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1025 wakaba 1.26 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
1026 wakaba 1.1
1027     unless (defined $token) {
1028     $self->{current_attribute}->{value} .= '&';
1029     } else {
1030     $self->{current_attribute}->{value} .= $token->{data};
1031     ## ISSUE: spec says "append the returned character token to the current attribute's value"
1032     }
1033    
1034     $self->{state} = $self->{last_attribute_value_state};
1035     # next-input-character is already done
1036     redo A;
1037 wakaba 1.57 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1038 wakaba 1.1 ## (only happen if PCDATA state)
1039    
1040 wakaba 1.55 my $token = {type => COMMENT_TOKEN, data => ''};
1041 wakaba 1.1
1042     BC: {
1043     if ($self->{next_input_character} == 0x003E) { # >
1044 wakaba 1.57 $self->{state} = DATA_STATE;
1045 wakaba 1.1 !!!next-input-character;
1046    
1047     !!!emit ($token);
1048    
1049     redo A;
1050     } elsif ($self->{next_input_character} == -1) {
1051 wakaba 1.57 $self->{state} = DATA_STATE;
1052 wakaba 1.1 ## reconsume
1053    
1054     !!!emit ($token);
1055    
1056     redo A;
1057     } else {
1058     $token->{data} .= chr ($self->{next_input_character});
1059     !!!next-input-character;
1060     redo BC;
1061     }
1062     } # BC
1063 wakaba 1.57 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1064 wakaba 1.1 ## (only happen if PCDATA state)
1065    
1066     my @next_char;
1067     push @next_char, $self->{next_input_character};
1068    
1069     if ($self->{next_input_character} == 0x002D) { # -
1070     !!!next-input-character;
1071     push @next_char, $self->{next_input_character};
1072     if ($self->{next_input_character} == 0x002D) { # -
1073 wakaba 1.55 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1074 wakaba 1.57 $self->{state} = COMMENT_START_STATE;
1075 wakaba 1.1 !!!next-input-character;
1076     redo A;
1077     }
1078     } elsif ($self->{next_input_character} == 0x0044 or # D
1079     $self->{next_input_character} == 0x0064) { # d
1080     !!!next-input-character;
1081     push @next_char, $self->{next_input_character};
1082     if ($self->{next_input_character} == 0x004F or # O
1083     $self->{next_input_character} == 0x006F) { # o
1084     !!!next-input-character;
1085     push @next_char, $self->{next_input_character};
1086     if ($self->{next_input_character} == 0x0043 or # C
1087     $self->{next_input_character} == 0x0063) { # c
1088     !!!next-input-character;
1089     push @next_char, $self->{next_input_character};
1090     if ($self->{next_input_character} == 0x0054 or # T
1091     $self->{next_input_character} == 0x0074) { # t
1092     !!!next-input-character;
1093     push @next_char, $self->{next_input_character};
1094     if ($self->{next_input_character} == 0x0059 or # Y
1095     $self->{next_input_character} == 0x0079) { # y
1096     !!!next-input-character;
1097     push @next_char, $self->{next_input_character};
1098     if ($self->{next_input_character} == 0x0050 or # P
1099     $self->{next_input_character} == 0x0070) { # p
1100     !!!next-input-character;
1101     push @next_char, $self->{next_input_character};
1102     if ($self->{next_input_character} == 0x0045 or # E
1103     $self->{next_input_character} == 0x0065) { # e
1104     ## ISSUE: What a stupid code this is!
1105 wakaba 1.57 $self->{state} = DOCTYPE_STATE;
1106 wakaba 1.1 !!!next-input-character;
1107     redo A;
1108     }
1109     }
1110     }
1111     }
1112     }
1113     }
1114     }
1115    
1116 wakaba 1.30 !!!parse-error (type => 'bogus comment');
1117 wakaba 1.1 $self->{next_input_character} = shift @next_char;
1118     !!!back-next-input-character (@next_char);
1119 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1120 wakaba 1.1 redo A;
1121    
1122     ## ISSUE: typos in spec: chacacters, is is a parse error
1123     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1124 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_STATE) {
1125 wakaba 1.23 if ($self->{next_input_character} == 0x002D) { # -
1126 wakaba 1.57 $self->{state} = COMMENT_START_DASH_STATE;
1127 wakaba 1.23 !!!next-input-character;
1128     redo A;
1129     } elsif ($self->{next_input_character} == 0x003E) { # >
1130     !!!parse-error (type => 'bogus comment');
1131 wakaba 1.57 $self->{state} = DATA_STATE;
1132 wakaba 1.23 !!!next-input-character;
1133    
1134     !!!emit ($self->{current_token}); # comment
1135    
1136     redo A;
1137     } elsif ($self->{next_input_character} == -1) {
1138     !!!parse-error (type => 'unclosed comment');
1139 wakaba 1.57 $self->{state} = DATA_STATE;
1140 wakaba 1.23 ## reconsume
1141    
1142     !!!emit ($self->{current_token}); # comment
1143    
1144     redo A;
1145     } else {
1146     $self->{current_token}->{data} # comment
1147     .= chr ($self->{next_input_character});
1148 wakaba 1.57 $self->{state} = COMMENT_STATE;
1149 wakaba 1.23 !!!next-input-character;
1150     redo A;
1151     }
1152 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1153 wakaba 1.23 if ($self->{next_input_character} == 0x002D) { # -
1154 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
1155 wakaba 1.23 !!!next-input-character;
1156     redo A;
1157     } elsif ($self->{next_input_character} == 0x003E) { # >
1158     !!!parse-error (type => 'bogus comment');
1159 wakaba 1.57 $self->{state} = DATA_STATE;
1160 wakaba 1.23 !!!next-input-character;
1161    
1162     !!!emit ($self->{current_token}); # comment
1163    
1164     redo A;
1165     } elsif ($self->{next_input_character} == -1) {
1166     !!!parse-error (type => 'unclosed comment');
1167 wakaba 1.57 $self->{state} = DATA_STATE;
1168 wakaba 1.23 ## reconsume
1169    
1170     !!!emit ($self->{current_token}); # comment
1171    
1172     redo A;
1173     } else {
1174     $self->{current_token}->{data} # comment
1175 wakaba 1.33 .= '-' . chr ($self->{next_input_character});
1176 wakaba 1.57 $self->{state} = COMMENT_STATE;
1177 wakaba 1.23 !!!next-input-character;
1178     redo A;
1179     }
1180 wakaba 1.57 } elsif ($self->{state} == COMMENT_STATE) {
1181 wakaba 1.1 if ($self->{next_input_character} == 0x002D) { # -
1182 wakaba 1.57 $self->{state} = COMMENT_END_DASH_STATE;
1183 wakaba 1.1 !!!next-input-character;
1184     redo A;
1185     } elsif ($self->{next_input_character} == -1) {
1186 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1187 wakaba 1.57 $self->{state} = DATA_STATE;
1188 wakaba 1.1 ## reconsume
1189    
1190     !!!emit ($self->{current_token}); # comment
1191    
1192     redo A;
1193     } else {
1194     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1195     ## Stay in the state
1196     !!!next-input-character;
1197     redo A;
1198     }
1199 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1200 wakaba 1.1 if ($self->{next_input_character} == 0x002D) { # -
1201 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
1202 wakaba 1.1 !!!next-input-character;
1203     redo A;
1204     } elsif ($self->{next_input_character} == -1) {
1205 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1206 wakaba 1.57 $self->{state} = DATA_STATE;
1207 wakaba 1.1 ## reconsume
1208    
1209     !!!emit ($self->{current_token}); # comment
1210    
1211     redo A;
1212     } else {
1213     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1214 wakaba 1.57 $self->{state} = COMMENT_STATE;
1215 wakaba 1.1 !!!next-input-character;
1216     redo A;
1217     }
1218 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_STATE) {
1219 wakaba 1.1 if ($self->{next_input_character} == 0x003E) { # >
1220 wakaba 1.57 $self->{state} = DATA_STATE;
1221 wakaba 1.1 !!!next-input-character;
1222    
1223     !!!emit ($self->{current_token}); # comment
1224    
1225     redo A;
1226     } elsif ($self->{next_input_character} == 0x002D) { # -
1227 wakaba 1.3 !!!parse-error (type => 'dash in comment');
1228 wakaba 1.1 $self->{current_token}->{data} .= '-'; # comment
1229     ## Stay in the state
1230     !!!next-input-character;
1231     redo A;
1232     } elsif ($self->{next_input_character} == -1) {
1233 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1234 wakaba 1.57 $self->{state} = DATA_STATE;
1235 wakaba 1.1 ## reconsume
1236    
1237     !!!emit ($self->{current_token}); # comment
1238    
1239     redo A;
1240     } else {
1241 wakaba 1.3 !!!parse-error (type => 'dash in comment');
1242 wakaba 1.1 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1243 wakaba 1.57 $self->{state} = COMMENT_STATE;
1244 wakaba 1.1 !!!next-input-character;
1245     redo A;
1246     }
1247 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_STATE) {
1248 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1249     $self->{next_input_character} == 0x000A or # LF
1250     $self->{next_input_character} == 0x000B or # VT
1251     $self->{next_input_character} == 0x000C or # FF
1252     $self->{next_input_character} == 0x0020) { # SP
1253 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1254 wakaba 1.1 !!!next-input-character;
1255     redo A;
1256     } else {
1257 wakaba 1.3 !!!parse-error (type => 'no space before DOCTYPE name');
1258 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1259 wakaba 1.1 ## reconsume
1260     redo A;
1261     }
1262 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1263 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1264     $self->{next_input_character} == 0x000A or # LF
1265     $self->{next_input_character} == 0x000B or # VT
1266     $self->{next_input_character} == 0x000C or # FF
1267     $self->{next_input_character} == 0x0020) { # SP
1268     ## Stay in the state
1269     !!!next-input-character;
1270     redo A;
1271     } elsif ($self->{next_input_character} == 0x003E) { # >
1272 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
1273 wakaba 1.57 $self->{state} = DATA_STATE;
1274 wakaba 1.1 !!!next-input-character;
1275    
1276 wakaba 1.55 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1277 wakaba 1.1
1278     redo A;
1279     } elsif ($self->{next_input_character} == -1) {
1280 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
1281 wakaba 1.57 $self->{state} = DATA_STATE;
1282 wakaba 1.1 ## reconsume
1283    
1284 wakaba 1.55 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1285 wakaba 1.1
1286     redo A;
1287     } else {
1288 wakaba 1.18 $self->{current_token}
1289 wakaba 1.55 = {type => DOCTYPE_TOKEN,
1290 wakaba 1.18 name => chr ($self->{next_input_character}),
1291     correct => 1};
1292 wakaba 1.4 ## ISSUE: "Set the token's name name to the" in the spec
1293 wakaba 1.57 $self->{state} = DOCTYPE_NAME_STATE;
1294 wakaba 1.1 !!!next-input-character;
1295     redo A;
1296     }
1297 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1298 wakaba 1.18 ## ISSUE: Redundant "First," in the spec.
1299 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1300     $self->{next_input_character} == 0x000A or # LF
1301     $self->{next_input_character} == 0x000B or # VT
1302     $self->{next_input_character} == 0x000C or # FF
1303     $self->{next_input_character} == 0x0020) { # SP
1304 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1305 wakaba 1.1 !!!next-input-character;
1306     redo A;
1307     } elsif ($self->{next_input_character} == 0x003E) { # >
1308 wakaba 1.57 $self->{state} = DATA_STATE;
1309 wakaba 1.1 !!!next-input-character;
1310    
1311     !!!emit ($self->{current_token}); # DOCTYPE
1312    
1313     redo A;
1314     } elsif ($self->{next_input_character} == -1) {
1315 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1316 wakaba 1.57 $self->{state} = DATA_STATE;
1317 wakaba 1.1 ## reconsume
1318    
1319 wakaba 1.18 delete $self->{current_token}->{correct};
1320     !!!emit ($self->{current_token}); # DOCTYPE
1321 wakaba 1.1
1322     redo A;
1323     } else {
1324     $self->{current_token}->{name}
1325     .= chr ($self->{next_input_character}); # DOCTYPE
1326     ## Stay in the state
1327     !!!next-input-character;
1328     redo A;
1329     }
1330 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1331 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1332     $self->{next_input_character} == 0x000A or # LF
1333     $self->{next_input_character} == 0x000B or # VT
1334     $self->{next_input_character} == 0x000C or # FF
1335     $self->{next_input_character} == 0x0020) { # SP
1336     ## Stay in the state
1337     !!!next-input-character;
1338     redo A;
1339     } elsif ($self->{next_input_character} == 0x003E) { # >
1340 wakaba 1.57 $self->{state} = DATA_STATE;
1341 wakaba 1.1 !!!next-input-character;
1342    
1343     !!!emit ($self->{current_token}); # DOCTYPE
1344    
1345     redo A;
1346     } elsif ($self->{next_input_character} == -1) {
1347 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1348 wakaba 1.57 $self->{state} = DATA_STATE;
1349 wakaba 1.1 ## reconsume
1350    
1351 wakaba 1.18 delete $self->{current_token}->{correct};
1352     !!!emit ($self->{current_token}); # DOCTYPE
1353    
1354     redo A;
1355     } elsif ($self->{next_input_character} == 0x0050 or # P
1356     $self->{next_input_character} == 0x0070) { # p
1357     !!!next-input-character;
1358     if ($self->{next_input_character} == 0x0055 or # U
1359     $self->{next_input_character} == 0x0075) { # u
1360     !!!next-input-character;
1361     if ($self->{next_input_character} == 0x0042 or # B
1362     $self->{next_input_character} == 0x0062) { # b
1363     !!!next-input-character;
1364     if ($self->{next_input_character} == 0x004C or # L
1365     $self->{next_input_character} == 0x006C) { # l
1366     !!!next-input-character;
1367     if ($self->{next_input_character} == 0x0049 or # I
1368     $self->{next_input_character} == 0x0069) { # i
1369     !!!next-input-character;
1370     if ($self->{next_input_character} == 0x0043 or # C
1371     $self->{next_input_character} == 0x0063) { # c
1372 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1373 wakaba 1.18 !!!next-input-character;
1374     redo A;
1375     }
1376     }
1377     }
1378     }
1379     }
1380    
1381     #
1382     } elsif ($self->{next_input_character} == 0x0053 or # S
1383     $self->{next_input_character} == 0x0073) { # s
1384     !!!next-input-character;
1385     if ($self->{next_input_character} == 0x0059 or # Y
1386     $self->{next_input_character} == 0x0079) { # y
1387     !!!next-input-character;
1388     if ($self->{next_input_character} == 0x0053 or # S
1389     $self->{next_input_character} == 0x0073) { # s
1390     !!!next-input-character;
1391     if ($self->{next_input_character} == 0x0054 or # T
1392     $self->{next_input_character} == 0x0074) { # t
1393     !!!next-input-character;
1394     if ($self->{next_input_character} == 0x0045 or # E
1395     $self->{next_input_character} == 0x0065) { # e
1396     !!!next-input-character;
1397     if ($self->{next_input_character} == 0x004D or # M
1398     $self->{next_input_character} == 0x006D) { # m
1399 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1400 wakaba 1.18 !!!next-input-character;
1401     redo A;
1402     }
1403     }
1404     }
1405     }
1406     }
1407    
1408     #
1409     } else {
1410     !!!next-input-character;
1411     #
1412     }
1413    
1414     !!!parse-error (type => 'string after DOCTYPE name');
1415 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1416 wakaba 1.18 # next-input-character is already done
1417     redo A;
1418 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1419 wakaba 1.18 if ({
1420     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1421     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1422     }->{$self->{next_input_character}}) {
1423     ## Stay in the state
1424     !!!next-input-character;
1425     redo A;
1426     } elsif ($self->{next_input_character} eq 0x0022) { # "
1427     $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1428 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1429 wakaba 1.18 !!!next-input-character;
1430     redo A;
1431     } elsif ($self->{next_input_character} eq 0x0027) { # '
1432     $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1433 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1434 wakaba 1.18 !!!next-input-character;
1435     redo A;
1436     } elsif ($self->{next_input_character} eq 0x003E) { # >
1437     !!!parse-error (type => 'no PUBLIC literal');
1438    
1439 wakaba 1.57 $self->{state} = DATA_STATE;
1440 wakaba 1.18 !!!next-input-character;
1441    
1442     delete $self->{current_token}->{correct};
1443     !!!emit ($self->{current_token}); # DOCTYPE
1444    
1445     redo A;
1446     } elsif ($self->{next_input_character} == -1) {
1447     !!!parse-error (type => 'unclosed DOCTYPE');
1448    
1449 wakaba 1.57 $self->{state} = DATA_STATE;
1450 wakaba 1.18 ## reconsume
1451    
1452     delete $self->{current_token}->{correct};
1453     !!!emit ($self->{current_token}); # DOCTYPE
1454    
1455     redo A;
1456     } else {
1457     !!!parse-error (type => 'string after PUBLIC');
1458 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1459 wakaba 1.18 !!!next-input-character;
1460     redo A;
1461     }
1462 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1463 wakaba 1.18 if ($self->{next_input_character} == 0x0022) { # "
1464 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1465 wakaba 1.18 !!!next-input-character;
1466     redo A;
1467     } elsif ($self->{next_input_character} == -1) {
1468     !!!parse-error (type => 'unclosed PUBLIC literal');
1469    
1470 wakaba 1.57 $self->{state} = DATA_STATE;
1471 wakaba 1.18 ## reconsume
1472    
1473     delete $self->{current_token}->{correct};
1474     !!!emit ($self->{current_token}); # DOCTYPE
1475    
1476     redo A;
1477     } else {
1478     $self->{current_token}->{public_identifier} # DOCTYPE
1479     .= chr $self->{next_input_character};
1480     ## Stay in the state
1481     !!!next-input-character;
1482     redo A;
1483     }
1484 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1485 wakaba 1.18 if ($self->{next_input_character} == 0x0027) { # '
1486 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1487 wakaba 1.18 !!!next-input-character;
1488     redo A;
1489     } elsif ($self->{next_input_character} == -1) {
1490     !!!parse-error (type => 'unclosed PUBLIC literal');
1491    
1492 wakaba 1.57 $self->{state} = DATA_STATE;
1493 wakaba 1.18 ## reconsume
1494    
1495     delete $self->{current_token}->{correct};
1496     !!!emit ($self->{current_token}); # DOCTYPE
1497    
1498     redo A;
1499     } else {
1500     $self->{current_token}->{public_identifier} # DOCTYPE
1501     .= chr $self->{next_input_character};
1502     ## Stay in the state
1503     !!!next-input-character;
1504     redo A;
1505     }
1506 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1507 wakaba 1.18 if ({
1508     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1509     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1510     }->{$self->{next_input_character}}) {
1511     ## Stay in the state
1512     !!!next-input-character;
1513     redo A;
1514     } elsif ($self->{next_input_character} == 0x0022) { # "
1515     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1516 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1517 wakaba 1.18 !!!next-input-character;
1518     redo A;
1519     } elsif ($self->{next_input_character} == 0x0027) { # '
1520     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1521 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1522 wakaba 1.18 !!!next-input-character;
1523     redo A;
1524     } elsif ($self->{next_input_character} == 0x003E) { # >
1525 wakaba 1.57 $self->{state} = DATA_STATE;
1526 wakaba 1.18 !!!next-input-character;
1527    
1528     !!!emit ($self->{current_token}); # DOCTYPE
1529    
1530     redo A;
1531     } elsif ($self->{next_input_character} == -1) {
1532     !!!parse-error (type => 'unclosed DOCTYPE');
1533    
1534 wakaba 1.57 $self->{state} = DATA_STATE;
1535 wakaba 1.26 ## reconsume
1536 wakaba 1.18
1537     delete $self->{current_token}->{correct};
1538     !!!emit ($self->{current_token}); # DOCTYPE
1539    
1540     redo A;
1541     } else {
1542     !!!parse-error (type => 'string after PUBLIC literal');
1543 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1544 wakaba 1.18 !!!next-input-character;
1545     redo A;
1546     }
1547 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1548 wakaba 1.18 if ({
1549     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1550     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1551     }->{$self->{next_input_character}}) {
1552     ## Stay in the state
1553     !!!next-input-character;
1554     redo A;
1555     } elsif ($self->{next_input_character} == 0x0022) { # "
1556     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1557 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1558 wakaba 1.18 !!!next-input-character;
1559     redo A;
1560     } elsif ($self->{next_input_character} == 0x0027) { # '
1561     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1562 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1563 wakaba 1.18 !!!next-input-character;
1564     redo A;
1565     } elsif ($self->{next_input_character} == 0x003E) { # >
1566     !!!parse-error (type => 'no SYSTEM literal');
1567 wakaba 1.57 $self->{state} = DATA_STATE;
1568 wakaba 1.18 !!!next-input-character;
1569    
1570     delete $self->{current_token}->{correct};
1571     !!!emit ($self->{current_token}); # DOCTYPE
1572    
1573     redo A;
1574     } elsif ($self->{next_input_character} == -1) {
1575     !!!parse-error (type => 'unclosed DOCTYPE');
1576    
1577 wakaba 1.57 $self->{state} = DATA_STATE;
1578 wakaba 1.26 ## reconsume
1579 wakaba 1.18
1580     delete $self->{current_token}->{correct};
1581     !!!emit ($self->{current_token}); # DOCTYPE
1582    
1583     redo A;
1584     } else {
1585 wakaba 1.30 !!!parse-error (type => 'string after SYSTEM');
1586 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1587 wakaba 1.18 !!!next-input-character;
1588     redo A;
1589     }
1590 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1591 wakaba 1.18 if ($self->{next_input_character} == 0x0022) { # "
1592 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1593 wakaba 1.18 !!!next-input-character;
1594     redo A;
1595     } elsif ($self->{next_input_character} == -1) {
1596     !!!parse-error (type => 'unclosed SYSTEM literal');
1597    
1598 wakaba 1.57 $self->{state} = DATA_STATE;
1599 wakaba 1.18 ## reconsume
1600    
1601     delete $self->{current_token}->{correct};
1602     !!!emit ($self->{current_token}); # DOCTYPE
1603    
1604     redo A;
1605     } else {
1606     $self->{current_token}->{system_identifier} # DOCTYPE
1607     .= chr $self->{next_input_character};
1608     ## Stay in the state
1609     !!!next-input-character;
1610     redo A;
1611     }
1612 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
1613 wakaba 1.18 if ($self->{next_input_character} == 0x0027) { # '
1614 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1615 wakaba 1.18 !!!next-input-character;
1616     redo A;
1617     } elsif ($self->{next_input_character} == -1) {
1618     !!!parse-error (type => 'unclosed SYSTEM literal');
1619    
1620 wakaba 1.57 $self->{state} = DATA_STATE;
1621 wakaba 1.18 ## reconsume
1622    
1623     delete $self->{current_token}->{correct};
1624 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1625    
1626     redo A;
1627     } else {
1628 wakaba 1.18 $self->{current_token}->{system_identifier} # DOCTYPE
1629     .= chr $self->{next_input_character};
1630     ## Stay in the state
1631     !!!next-input-character;
1632     redo A;
1633     }
1634 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1635 wakaba 1.18 if ({
1636     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1637     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1638     }->{$self->{next_input_character}}) {
1639     ## Stay in the state
1640     !!!next-input-character;
1641     redo A;
1642     } elsif ($self->{next_input_character} == 0x003E) { # >
1643 wakaba 1.57 $self->{state} = DATA_STATE;
1644 wakaba 1.18 !!!next-input-character;
1645    
1646     !!!emit ($self->{current_token}); # DOCTYPE
1647    
1648     redo A;
1649     } elsif ($self->{next_input_character} == -1) {
1650     !!!parse-error (type => 'unclosed DOCTYPE');
1651    
1652 wakaba 1.57 $self->{state} = DATA_STATE;
1653 wakaba 1.26 ## reconsume
1654 wakaba 1.18
1655     delete $self->{current_token}->{correct};
1656     !!!emit ($self->{current_token}); # DOCTYPE
1657    
1658     redo A;
1659     } else {
1660     !!!parse-error (type => 'string after SYSTEM literal');
1661 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1662 wakaba 1.1 !!!next-input-character;
1663     redo A;
1664     }
1665 wakaba 1.57 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
1666 wakaba 1.1 if ($self->{next_input_character} == 0x003E) { # >
1667 wakaba 1.57 $self->{state} = DATA_STATE;
1668 wakaba 1.1 !!!next-input-character;
1669    
1670 wakaba 1.18 delete $self->{current_token}->{correct};
1671 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1672    
1673     redo A;
1674     } elsif ($self->{next_input_character} == -1) {
1675 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1676 wakaba 1.57 $self->{state} = DATA_STATE;
1677 wakaba 1.1 ## reconsume
1678    
1679 wakaba 1.18 delete $self->{current_token}->{correct};
1680 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1681    
1682     redo A;
1683     } else {
1684     ## Stay in the state
1685     !!!next-input-character;
1686     redo A;
1687     }
1688     } else {
1689     die "$0: $self->{state}: Unknown state";
1690     }
1691     } # A
1692    
1693     die "$0: _get_next_token: unexpected case";
1694     } # _get_next_token
1695    
## Try to consume a character reference (numeric or named) starting at
## $self->{next_input_character}.  The named-reference fallbacks below
## put "&" back in front of the returned text, so the caller has
## presumably already consumed the "&" -- TODO confirm against callers.
##
## Arguments:
##   $self    - the parser object.
##   $in_attr - only consulted for a named reference that matched
##              without ";" and was followed by further name characters;
##              when true the literal text "&name..." is returned
##              instead of the replacement text.
## Returns a hash reference {type => CHARACTER_TOKEN, data => ...}, or
## undef when nothing is returned as a character token.
1696 wakaba 1.26 sub _tokenize_attempt_to_consume_an_entity ($$) {
1697     my ($self, $in_attr) = @_;
1698 wakaba 1.20
1699     if ({
1700     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1701     0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
1702     }->{$self->{next_input_character}}) {
1703     ## Don't consume
1704     ## No error
1705     return undef;
1706     } elsif ($self->{next_input_character} == 0x0023) { # #
1707 wakaba 1.1 !!!next-input-character;
1708     if ($self->{next_input_character} == 0x0078 or # x
1709     $self->{next_input_character} == 0x0058) { # X
## Hexadecimal reference.  $code stays undef until the first hex
## digit has been seen, which is how "no digit at all" is detected.
1710 wakaba 1.26 my $code;
1711 wakaba 1.1 X: {
## $x_char is only used by the "no hexadecimal digit" branch, which
## can only be reached on the first pass, when it still holds "x"/"X".
1712     my $x_char = $self->{next_input_character};
1713     !!!next-input-character;
1714     if (0x0030 <= $self->{next_input_character} and
1715     $self->{next_input_character} <= 0x0039) { # 0..9
1716 wakaba 1.26 $code ||= 0;
1717     $code *= 0x10;
1718     $code += $self->{next_input_character} - 0x0030;
1719 wakaba 1.1 redo X;
1720     } elsif (0x0061 <= $self->{next_input_character} and
1721     $self->{next_input_character} <= 0x0066) { # a..f
1722 wakaba 1.26 $code ||= 0;
1723     $code *= 0x10;
## 0x61 ("a") - 0x60 + 9 == 10
1724     $code += $self->{next_input_character} - 0x0060 + 9;
1725 wakaba 1.1 redo X;
1726     } elsif (0x0041 <= $self->{next_input_character} and
1727     $self->{next_input_character} <= 0x0046) { # A..F
1728 wakaba 1.26 $code ||= 0;
1729     $code *= 0x10;
## 0x41 ("A") - 0x40 + 9 == 10
1730     $code += $self->{next_input_character} - 0x0040 + 9;
1731 wakaba 1.1 redo X;
1732 wakaba 1.26 } elsif (not defined $code) { # no hexadecimal digit
1733 wakaba 1.3 !!!parse-error (type => 'bare hcro');
## Push back "x"/"X" and the character after it, and make "#" the
## next input character again.
1734 wakaba 1.37 !!!back-next-input-character ($x_char, $self->{next_input_character});
1735 wakaba 1.1 $self->{next_input_character} = 0x0023; # #
1736     return undef;
1737     } elsif ($self->{next_input_character} == 0x003B) { # ;
1738     !!!next-input-character;
1739     } else {
1740 wakaba 1.3 !!!parse-error (type => 'no refc');
1741 wakaba 1.1 }
1742    
## Replace code points that must not be produced: 0, surrogates and
## values above U+10FFFF become U+FFFD; CR becomes LF; 0x80..0x9F are
## mapped through $c1_entity_char.
1743 wakaba 1.26 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1744     !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1745     $code = 0xFFFD;
1746     } elsif ($code > 0x10FFFF) {
1747     !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1748     $code = 0xFFFD;
1749     } elsif ($code == 0x000D) {
1750     !!!parse-error (type => 'CR character reference');
1751     $code = 0x000A;
1752     } elsif (0x80 <= $code and $code <= 0x9F) {
1753 wakaba 1.30 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1754 wakaba 1.26 $code = $c1_entity_char->{$code};
1755 wakaba 1.1 }
1756    
1757 wakaba 1.55 return {type => CHARACTER_TOKEN, data => chr $code};
1758 wakaba 1.1 } # X
1759     } elsif (0x0030 <= $self->{next_input_character} and
1760     $self->{next_input_character} <= 0x0039) { # 0..9
## Decimal reference: accumulate digits until a non-digit is seen.
1761     my $code = $self->{next_input_character} - 0x0030;
1762     !!!next-input-character;
1763    
1764     while (0x0030 <= $self->{next_input_character} and
1765     $self->{next_input_character} <= 0x0039) { # 0..9
1766     $code *= 10;
1767     $code += $self->{next_input_character} - 0x0030;
1768    
1769     !!!next-input-character;
1770     }
1771    
1772     if ($self->{next_input_character} == 0x003B) { # ;
1773     !!!next-input-character;
1774     } else {
1775 wakaba 1.3 !!!parse-error (type => 'no refc');
1776 wakaba 1.1 }
1777    
## Same code point replacement rules as in the hexadecimal branch.
1778 wakaba 1.26 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1779     !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1780     $code = 0xFFFD;
1781     } elsif ($code > 0x10FFFF) {
1782     !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1783     $code = 0xFFFD;
1784     } elsif ($code == 0x000D) {
1785     !!!parse-error (type => 'CR character reference');
1786     $code = 0x000A;
1787 wakaba 1.4 } elsif (0x80 <= $code and $code <= 0x9F) {
1788 wakaba 1.30 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1789 wakaba 1.4 $code = $c1_entity_char->{$code};
1790 wakaba 1.1 }
1791    
1792 wakaba 1.55 return {type => CHARACTER_TOKEN, data => chr $code};
1793 wakaba 1.1 } else {
## "&#" not followed by a digit or "x"/"X": push the character back
## and make "#" the next input character again.
1794 wakaba 1.3 !!!parse-error (type => 'bare nero');
1795 wakaba 1.1 !!!back-next-input-character ($self->{next_input_character});
1796     $self->{next_input_character} = 0x0023; # #
1797     return undef;
1798     }
1799     } elsif ((0x0041 <= $self->{next_input_character} and
1800     $self->{next_input_character} <= 0x005A) or
1801     (0x0061 <= $self->{next_input_character} and
1802     $self->{next_input_character} <= 0x007A)) {
## Named reference.  $entity_name collects every consumed character;
## $value holds the replacement text of the latest match followed by
## any characters consumed after it (or the raw name if nothing has
## matched yet).
1803     my $entity_name = chr $self->{next_input_character};
1804     !!!next-input-character;
1805    
1806     my $value = $entity_name;
## $match:  0 = no name matched so far,
##          1 = matched a name ending in ";",
##         -1 = matched a name without ";",
##       < -1 = matched without ";" and more characters followed
##              (doubled for every later non-matching character).
1807 wakaba 1.37 my $match = 0;
1808 wakaba 1.16 require Whatpm::_NamedEntityList;
1809     our $EntityChar;
1810 wakaba 1.1
1811     while (length $entity_name < 10 and
1812     ## NOTE: Some number greater than the maximum length of entity name
1813 wakaba 1.16 ((0x0041 <= $self->{next_input_character} and # A
1814     $self->{next_input_character} <= 0x005A) or # Z
1815     (0x0061 <= $self->{next_input_character} and # a
1816     $self->{next_input_character} <= 0x007A) or # z
1817     (0x0030 <= $self->{next_input_character} and # 0
1818     $self->{next_input_character} <= 0x0039) or # 9
1819     $self->{next_input_character} == 0x003B)) { # ;
1820 wakaba 1.1 $entity_name .= chr $self->{next_input_character};
1821 wakaba 1.16 if (defined $EntityChar->{$entity_name}) {
1822     if ($self->{next_input_character} == 0x003B) { # ;
1823 wakaba 1.26 $value = $EntityChar->{$entity_name};
1824 wakaba 1.16 $match = 1;
1825     !!!next-input-character;
1826     last;
1827 wakaba 1.37 } else {
1828 wakaba 1.26 $value = $EntityChar->{$entity_name};
1829     $match = -1;
1830 wakaba 1.37 !!!next-input-character;
1831 wakaba 1.16 }
1832 wakaba 1.1 } else {
1833     $value .= chr $self->{next_input_character};
1834 wakaba 1.37 $match *= 2;
1835     !!!next-input-character;
1836 wakaba 1.1 }
1837     }
1838    
1839 wakaba 1.16 if ($match > 0) {
1840 wakaba 1.55 return {type => CHARACTER_TOKEN, data => $value};
1841 wakaba 1.16 } elsif ($match < 0) {
1842 wakaba 1.30 !!!parse-error (type => 'no refc');
1843 wakaba 1.37 if ($in_attr and $match < -1) {
1844 wakaba 1.55 return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
1845 wakaba 1.37 } else {
1846 wakaba 1.55 return {type => CHARACTER_TOKEN, data => $value};
1847 wakaba 1.37 }
1848 wakaba 1.1 } else {
1849 wakaba 1.3 !!!parse-error (type => 'bare ero');
1850 wakaba 1.1 ## NOTE: No characters are consumed in the spec.
## Here the characters have been consumed, so they are returned
## as literal text with the "&" restored.
1851 wakaba 1.55 return {type => CHARACTER_TOKEN, data => '&'.$value};
1852 wakaba 1.1 }
1853     } else {
1854     ## no characters are consumed
1855 wakaba 1.3 !!!parse-error (type => 'bare ero');
1856 wakaba 1.1 return undef;
1857     }
1858     } # _tokenize_attempt_to_consume_an_entity
1859    
## Prepare $self->{document} for tree construction: switch strict error
## checking off and flag the document as an HTML document.
1860     sub _initialize_tree_constructor ($) {
1861     my $doc = $_[0]->{document};
1862     ## NOTE: $self->{document} MUST be set before this method is called.
1863     $doc->strict_error_checking (0);
1864     ## TODO: Turn mutation events off # MUST
1865     ## TODO: Turn loose Document option (manakai extension) on
1866 wakaba 1.18 $doc->manakai_is_html (1); # MUST
1867 wakaba 1.1 } # _initialize_tree_constructor
1868    
## Counterpart of _initialize_tree_constructor: switch strict error
## checking on $self->{document} back on.
1869     sub _terminate_tree_constructor ($) {
1870     my ($parser) = @_;
1871     $parser->{document}->strict_error_checking (1);
1872     ## TODO: Turn mutation events on
1873     } # _terminate_tree_constructor
1874    
1875     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1876    
1877 wakaba 1.3 { # tree construction stage
1878     my $token;
1879    
## Entry point of the tree construction stage.  Fetches the first
## token, resets the per-parse state kept on $self (insertion mode,
## form/head element pointers, stack of open elements, inner-HTML
## context node) and then runs the three phases in order: initial,
## root element, main.
1880 wakaba 1.1 sub _construct_tree ($) {
1881     my ($self) = @_;
1882    
1883     ## When an interactive UA renders the $self->{document} available
1884     ## to the user, or when it begins accepting user input, is
1885     ## not defined.
1886    
1887     ## Append a character: collect it and all subsequent consecutive
1888     ## characters and insert one Text node whose data is the
1889     ## concatenation of all those characters. # MUST
1890    
1891     !!!next-token;
1892    
1893 wakaba 1.54 $self->{insertion_mode} = BEFORE_HEAD_IM;
1894 wakaba 1.3 undef $self->{form_element};
1895     undef $self->{head_element};
1896     $self->{open_elements} = [];
1897     undef $self->{inner_html_node};
1898    
1899     $self->_tree_construction_initial; # MUST
1900     $self->_tree_construction_root_element;
1901     $self->_tree_construction_main;
1902     } # _construct_tree
1903    
## Initial phase of tree construction: processes the tokens that
## precede the root element.
##
##   * DOCTYPE token: reports 'not HTML5' unless it is a plain
##     <!DOCTYPE HTML> without identifiers, appends a DocumentType node
##     to $self->{document}, selects the compat mode from the public and
##     system identifiers, fetches the next token and returns.
##   * Start tag, end tag, end of file, or character data that is not
##     only white space: reports 'no DOCTYPE', selects quirks mode and
##     returns WITHOUT consuming the token (it is reprocessed by the
##     next phase).
##   * Leading white space in character tokens is dropped; comments are
##     appended to the document; in both cases the phase continues.
##
## The public identifier is upper-cased before comparison, so every
## literal it is compared with has to be written in upper case.
1904     sub _tree_construction_initial ($) {
1905     my $self = shift;
1906 wakaba 1.18 INITIAL: {
1907 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
1908 wakaba 1.18 ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
1909     ## error, switch to a conformance checking mode for another
1910     ## language.
1911     my $doctype_name = $token->{name};
1912     $doctype_name = '' unless defined $doctype_name;
1913     $doctype_name =~ tr/a-z/A-Z/;
1914     if (not defined $token->{name} or # <!DOCTYPE>
1915     defined $token->{public_identifier} or
1916     defined $token->{system_identifier}) {
1917     !!!parse-error (type => 'not HTML5');
1918     } elsif ($doctype_name ne 'HTML') {
1919     ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
1920     !!!parse-error (type => 'not HTML5');
1921     }
1922    
1923     my $doctype = $self->{document}->create_document_type_definition
1924     ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
1925     $doctype->public_id ($token->{public_identifier})
1926     if defined $token->{public_identifier};
1927     $doctype->system_id ($token->{system_identifier})
1928     if defined $token->{system_identifier};
1929     ## NOTE: Other DocumentType attributes are null or empty lists.
1930     ## ISSUE: internalSubset = null??
1931     $self->{document}->append_child ($doctype);
1932    
1933     if (not $token->{correct} or $doctype_name ne 'HTML') {
1934     $self->{document}->manakai_compat_mode ('quirks');
1935     } elsif (defined $token->{public_identifier}) {
1936     my $pubid = $token->{public_identifier};
## Upper-case the identifier (the replacement range was "A-z", which
## only worked because surplus replacement characters are ignored).
1937     $pubid =~ tr/a-z/A-Z/;
1938     if ({
1939     "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
1940     "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
1941     "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
1942     "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
1943     "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
1944     "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
1945     "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
1946     "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
1947     "-//IETF//DTD HTML 2.0//EN" => 1,
1948     "-//IETF//DTD HTML 2.1E//EN" => 1,
1949     "-//IETF//DTD HTML 3.0//EN" => 1,
1950     "-//IETF//DTD HTML 3.0//EN//" => 1,
1951     "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
1952     "-//IETF//DTD HTML 3.2//EN" => 1,
1953     "-//IETF//DTD HTML 3//EN" => 1,
1954     "-//IETF//DTD HTML LEVEL 0//EN" => 1,
1955     "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
1956     "-//IETF//DTD HTML LEVEL 1//EN" => 1,
1957     "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
1958     "-//IETF//DTD HTML LEVEL 2//EN" => 1,
1959     "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
1960     "-//IETF//DTD HTML LEVEL 3//EN" => 1,
1961     "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
1962     "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
1963     "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
1964     "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
1965     "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
1966     "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
1967     "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
1968     "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
1969     "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
1970     "-//IETF//DTD HTML STRICT//EN" => 1,
1971     "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
1972     "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
1973     "-//IETF//DTD HTML//EN" => 1,
1974     "-//IETF//DTD HTML//EN//2.0" => 1,
1975     "-//IETF//DTD HTML//EN//3.0" => 1,
1976     "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
1977     "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
1978     "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
1979     "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
1980     "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
1981     "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
1982     "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
1983     "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
1984     "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
1985     "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
1986     "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
1987     "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
1988     "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
1989     "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
1990     "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
1991     "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
1992     "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
1993     "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
1994     "-//W3C//DTD HTML 3.2//EN" => 1,
1995     "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
1996     "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
1997     "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
1998     "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
1999     "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
2000     "-//W3C//DTD W3 HTML//EN" => 1,
2001     "-//W3O//DTD W3 HTML 3.0//EN" => 1,
2002     "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
2003     "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
2004     "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
2005     "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
2006     "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
2007     "HTML" => 1,
2008     }->{$pubid}) {
2009     $self->{document}->manakai_compat_mode ('quirks');
2010     } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
2011     $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
## HTML 4.01 Frameset/Transitional: a system identifier selects
## limited quirks mode; a missing one selects full quirks mode.
2012     if (defined $token->{system_identifier}) {
2013     $self->{document}->manakai_compat_mode ('limited quirks');
2014     } else {
2015     $self->{document}->manakai_compat_mode ('quirks');
2016 wakaba 1.3 }
## $pubid has been upper-cased above, so these literals must be
## upper case as well or the branch can never be taken.
2017 wakaba 1.18 } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
2018     $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
2019     $self->{document}->manakai_compat_mode ('limited quirks');
2020     }
2021     }
## The system identifier is compared in lower case; this check can
## override the mode chosen from the public identifier.
2022     if (defined $token->{system_identifier}) {
2023     my $sysid = $token->{system_identifier};
2024     $sysid =~ tr/A-Z/a-z/;
2025     if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
2026     $self->{document}->manakai_compat_mode ('quirks');
2027     }
2028     }
2029    
2030     ## Go to the root element phase.
2031     !!!next-token;
2032     return;
2033     } elsif ({
2034 wakaba 1.55 START_TAG_TOKEN, 1,
2035     END_TAG_TOKEN, 1,
2036     END_OF_FILE_TOKEN, 1,
2037 wakaba 1.18 }->{$token->{type}}) {
2038     !!!parse-error (type => 'no DOCTYPE');
2039     $self->{document}->manakai_compat_mode ('quirks');
2040     ## Go to the root element phase
2041     ## reprocess
2042     return;
2043 wakaba 1.55 } elsif ($token->{type} == CHARACTER_TOKEN) {
2044 wakaba 1.18 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
2045     ## Ignore the token
2046 wakaba 1.26
2047 wakaba 1.18 unless (length $token->{data}) {
2048     ## Stay in the phase
2049     !!!next-token;
2050     redo INITIAL;
2051 wakaba 1.3 }
2052     }
2053 wakaba 1.18
2054     !!!parse-error (type => 'no DOCTYPE');
2055     $self->{document}->manakai_compat_mode ('quirks');
2056     ## Go to the root element phase
2057     ## reprocess
2058     return;
2059 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
2060 wakaba 1.18 my $comment = $self->{document}->create_comment ($token->{data});
2061     $self->{document}->append_child ($comment);
2062    
2063     ## Stay in the phase.
2064     !!!next-token;
2065     redo INITIAL;
2066     } else {
2067 wakaba 1.55 die "$0: $token->{type}: Unknown token type";
2068 wakaba 1.18 }
2069     } # INITIAL
2070 wakaba 1.3 } # _tree_construction_initial
2071    
## Root element phase of tree construction.  DOCTYPE tokens are
## reported and ignored, comments are appended to the document, and
## leading white space in character tokens is dropped.  Any other
## token (or remaining character data) causes an "html" element to be
## created, appended to $self->{document} and pushed onto
## $self->{open_elements}; the sub then returns without consuming that
## token, so the main phase reprocesses it.
2072     sub _tree_construction_root_element ($) {
2073     my $self = shift;
2074    
2075     B: {
2076 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
2077 wakaba 1.3 !!!parse-error (type => 'in html:#DOCTYPE');
2078     ## Ignore the token
2079     ## Stay in the phase
2080     !!!next-token;
2081     redo B;
2082 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
2083 wakaba 1.3 my $comment = $self->{document}->create_comment ($token->{data});
2084     $self->{document}->append_child ($comment);
2085     ## Stay in the phase
2086     !!!next-token;
2087     redo B;
2088 wakaba 1.55 } elsif ($token->{type} == CHARACTER_TOKEN) {
2089 wakaba 1.26 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
2090     ## Ignore the token.
2091    
2092 wakaba 1.3 unless (length $token->{data}) {
2093     ## Stay in the phase
2094     !!!next-token;
2095     redo B;
2096     }
2097     }
## Non-white-space data remains: fall through and create the root.
2098     #
2099     } elsif ({
2100 wakaba 1.55 START_TAG_TOKEN, 1,
2101     END_TAG_TOKEN, 1,
2102     END_OF_FILE_TOKEN, 1,
2103 wakaba 1.3 }->{$token->{type}}) {
2104     ## ISSUE: There is an issue in the spec
2105     #
2106     } else {
2107 wakaba 1.55 die "$0: $token->{type}: Unknown token type";
2108 wakaba 1.3 }
## Entries on the stack of open elements are [node, element name] pairs.
2109     my $root_element; !!!create-element ($root_element, 'html');
2110     $self->{document}->append_child ($root_element);
2111     push @{$self->{open_elements}}, [$root_element, 'html'];
2112     ## reprocess
2113     #redo B;
2114 wakaba 1.35 return; ## Go to the main phase.
2115 wakaba 1.3 } # B
2116     } # _tree_construction_root_element
2117    
## Recompute $self->{insertion_mode} from the stack of open elements.
## The stack is walked from its last entry towards its first; the
## first entry whose element name appears in the table below decides
## the mode.  An "html" entry selects BEFORE_HEAD_IM or AFTER_HEAD_IM
## depending on whether $self->{head_element} is defined, and reaching
## the first stack entry without a decision selects IN_BODY_IM.  When
## the first stack entry is reached and $self->{inner_html_node} is
## defined and is neither "td" nor "th", that context node is examined
## in place of the stack entry.
##
## The mode is assigned and the sub returns in separate statements, so
## the result does not depend on the numeric value of a mode constant
## being true.
2118     sub _reset_insertion_mode ($) {
2119     my $self = shift;
2120    
2121     ## Step 1
2122     my $last;
2123    
2124     ## Step 2
2125     my $i = -1;
2126     my $node = $self->{open_elements}->[$i];
2127    
2128     ## Step 3
2129     S3: {
2130 wakaba 1.29 ## ISSUE: Oops! "If node is the first node in the stack of open
2131     ## elements, then set last to true. If the context element of the
2132     ## HTML fragment parsing algorithm is neither a td element nor a
2133     ## th element, then set node to the context element. (fragment case)":
2134     ## The second "if" is in the scope of the first "if"!?
2135     if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
2136     $last = 1;
2137     if (defined $self->{inner_html_node}) {
2138     if ($self->{inner_html_node}->[1] eq 'td' or
2139     $self->{inner_html_node}->[1] eq 'th') {
2140     #
2141     } else {
2142     $node = $self->{inner_html_node};
2143     }
2144 wakaba 1.3 }
2145     }
2146    
2147     ## Step 4..13
2148     my $new_mode = {
2149 wakaba 1.54 select => IN_SELECT_IM,
2150     td => IN_CELL_IM,
2151     th => IN_CELL_IM,
2152     tr => IN_ROW_IM,
2153     tbody => IN_TABLE_BODY_IM,
2154     thead => IN_TABLE_BODY_IM,
2155     tfoot => IN_TABLE_BODY_IM,
2156     caption => IN_CAPTION_IM,
2157     colgroup => IN_COLUMN_GROUP_IM,
2158     table => IN_TABLE_IM,
2159     head => IN_BODY_IM, # not in head!
2160     body => IN_BODY_IM,
2161     frameset => IN_FRAMESET_IM,
2162 wakaba 1.3 }->{$node->[1]};
2163     if (defined $new_mode) { $self->{insertion_mode} = $new_mode; return; }
2164    
2165     ## Step 14
2166     if ($node->[1] eq 'html') {
2167     unless (defined $self->{head_element}) {
2168 wakaba 1.54 $self->{insertion_mode} = BEFORE_HEAD_IM;
2169 wakaba 1.3 } else {
2170 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
2171 wakaba 1.3 }
2172     return;
2173     }
2174    
2175     ## Step 15
2176 wakaba 1.54 if ($last) { $self->{insertion_mode} = IN_BODY_IM; return; }
2177 wakaba 1.3
2178     ## Step 16
2179     $i--;
2180     $node = $self->{open_elements}->[$i];
2181    
2182     ## Step 17
2183     redo S3;
2184     } # S3
2185     } # _reset_insertion_mode
2186    
2187     sub _tree_construction_main ($) {
2188     my $self = shift;
2189    
2190 wakaba 1.1 my $active_formatting_elements = [];
2191    
## Closure: reconstruct the active formatting elements.
## Argument: $insert - code reference called with each cloned node.
## Does nothing when the list is empty, or when its last entry is a
## '#marker' or is already on the stack of open elements.  Otherwise it
## walks backwards to the entry after the nearest marker / open element
## (or to the first entry), then for each entry from there to the end
## of the list: shallow-clones the node, passes the clone to $insert,
## pushes it onto $self->{open_elements} and replaces the list entry
## with the clone.
2192     my $reconstruct_active_formatting_elements = sub { # MUST
2193     my $insert = shift;
2194    
2195     ## Step 1
2196     return unless @$active_formatting_elements;
2197    
2198     ## Step 3
2199     my $i = -1;
2200     my $entry = $active_formatting_elements->[$i];
2201    
2202     ## Step 2
2203     return if $entry->[0] eq '#marker';
2204 wakaba 1.3 for (@{$self->{open_elements}}) {
2205 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
2206     return;
2207     }
2208     }
2209    
2210     S4: {
2211     ## Step 4
## Reached the first list entry: leave the block without the Step 7
## advance, so reconstruction starts at this entry.
2212     last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2213    
2214     ## Step 5
2215     $i--;
2216     $entry = $active_formatting_elements->[$i];
2217    
2218     ## Step 6
2219     if ($entry->[0] eq '#marker') {
2220     #
2221     } else {
2222     my $in_open_elements;
2223 wakaba 1.3 OE: for (@{$self->{open_elements}}) {
2224 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
2225     $in_open_elements = 1;
2226     last OE;
2227     }
2228     }
2229     if ($in_open_elements) {
2230     #
2231     } else {
2232     redo S4;
2233     }
2234     }
2235    
2236     ## Step 7
2237     $i++;
2238     $entry = $active_formatting_elements->[$i];
2239     } # S4
2240    
2241     S7: {
2242     ## Step 8
2243     my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2244    
2245     ## Step 9
2246     $insert->($clone->[0]);
2247 wakaba 1.3 push @{$self->{open_elements}}, $clone;
2248 wakaba 1.1
2249     ## Step 10
2250 wakaba 1.3 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2251 wakaba 1.1
2252     ## Step 11
## Repeat until the entry just replaced is the last one in the list.
2253     unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2254     ## Step 7'
2255     $i++;
2256     $entry = $active_formatting_elements->[$i];
2257    
2258     redo S7;
2259     }
2260     } # S7
2261     }; # $reconstruct_active_formatting_elements
2262    
## Closure: remove the last '#marker' entry of the list of active
## formatting elements together with every entry after it.
## NOTE(review): when the list contains no marker it is left
## unchanged -- confirm that this is the intended behaviour.
2263     my $clear_up_to_marker = sub {
2264     for (reverse 0..$#$active_formatting_elements) {
2265     if ($active_formatting_elements->[$_]->[0] eq '#marker') {
## Two-argument splice: drops everything from index $_ to the end.
2266     splice @$active_formatting_elements, $_;
2267     return;
2268     }
2269     }
2270     }; # $clear_up_to_marker
2271    
## Closure: handle an element whose content is CDATA or RCDATA, using
## the current start tag token.
## Arguments:
##   $content_model_flag - CDATA_CONTENT_MODEL or RCDATA_CONTENT_MODEL
##                         (any other value dies in Step 7 unless the
##                         matching end tag was found).
##   $insert             - code reference called with the new element.
## Creates the element, inserts it, switches $self->{content_model},
## concatenates all following character tokens into one Text node
## child, restores PCDATA_CONTENT_MODEL, reports a parse error unless
## the next token is the matching end tag, and then fetches the next
## token in either case.
2272 wakaba 1.25 my $parse_rcdata = sub ($$) {
2273     my ($content_model_flag, $insert) = @_;
2274    
2275     ## Step 1
2276     my $start_tag_name = $token->{tag_name};
2277     my $el;
2278     !!!create-element ($el, $start_tag_name, $token->{attributes});
2279    
2280     ## Step 2
2281     $insert->($el); # /context node/->append_child ($el)
2282    
2283     ## Step 3
2284 wakaba 1.40 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2285 wakaba 1.13 delete $self->{escape}; # MUST
2286 wakaba 1.25
2287     ## Step 4
2288 wakaba 1.1 my $text = '';
2289     !!!next-token;
2290 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2291 wakaba 1.1 $text .= $token->{data};
2292     !!!next-token;
2293 wakaba 1.25 }
2294    
2295     ## Step 5
2296 wakaba 1.1 if (length $text) {
## The inner "my $text" (the Text node) shadows the string; the
## right-hand side still refers to the outer string.
2297 wakaba 1.25 my $text = $self->{document}->create_text_node ($text);
2298     $el->append_child ($text);
2299 wakaba 1.1 }
2300 wakaba 1.25
2301     ## Step 6
2302 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
2303 wakaba 1.25
2304     ## Step 7
2305 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and $token->{tag_name} eq $start_tag_name) {
2306 wakaba 1.1 ## Ignore the token
2307 wakaba 1.40 } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
2308     !!!parse-error (type => 'in CDATA:#'.$token->{type});
2309     } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2310     !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2311 wakaba 1.1 } else {
2312 wakaba 1.40 die "$0: $content_model_flag in parse_rcdata";
2313 wakaba 1.1 }
2314     !!!next-token;
2315 wakaba 1.25 }; # $parse_rcdata
2316 wakaba 1.1
2317 wakaba 1.25 my $script_start_tag = sub ($) {
     ## Processes a |script| start tag held in $token: creates the element,
     ## collects its CDATA text, checks for the |</script>| end tag and then
     ## inserts the element through the callback given as the only argument.
     ## Unlike $parse_rcdata, the element is inserted after its text has been
     ## collected, and not at all when $self->{inner_html_node} is defined.
     ## Script execution is not implemented; see the TODO comments below.
     ## NOTE(review): |!!!next-token| is a preprocessor macro; it is assumed
     ## to replace $token with the next token from the tokenizer.
2318     my $insert = $_[0]; # code reference called with the |script| element
2319 wakaba 1.1 my $script_el;
2320     !!!create-element ($script_el, 'script', $token->{attributes});
2321     ## TODO: mark as "parser-inserted"
2322    
2323 wakaba 1.40 $self->{content_model} = CDATA_CONTENT_MODEL;
2324 wakaba 1.13 delete $self->{escape}; # MUST
2325 wakaba 1.1
     ## Concatenate the data of all consecutive character tokens.
2326     my $text = '';
2327     !!!next-token;
2328 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
2329 wakaba 1.1 $text .= $token->{data};
2330     !!!next-token;
2331     } # stop if non-character token or tokenizer stops tokenising
2332     if (length $text) {
2333     $script_el->manakai_append_text ($text);
2334     }
2335    
2336 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
2337 wakaba 1.1
     ## Anything other than the |</script>| end tag here is a parse error.
2338 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
2339 wakaba 1.1 $token->{tag_name} eq 'script') {
2340     ## Ignore the token
2341     } else {
2342 wakaba 1.3 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2343 wakaba 1.1 ## ISSUE: And ignore?
2344     ## TODO: mark as "already executed"
2345     }
2346    
     ## The element is attached only when $self->{inner_html_node} is not
     ## defined; otherwise it is created but never inserted.
2347 wakaba 1.3 if (defined $self->{inner_html_node}) {
2348     ## TODO: mark as "already executed"
2349     } else {
2350 wakaba 1.1 ## TODO: $old_insertion_point = current insertion point
2351     ## TODO: insertion point = just before the next input character
2352 wakaba 1.25
2353     $insert->($script_el);
2354 wakaba 1.1
2355     ## TODO: insertion point = $old_insertion_point (might be "undefined")
2356    
2357     ## TODO: if there is a script that will execute as soon as the parser resumes, then...
2358     }
2359    
     ## The terminating token is passed over whether or not it was |</script>|.
2360     !!!next-token;
2361     }; # $script_start_tag
2362    
2363     my $formatting_end_tag = sub {
     ## Handles an end tag for a formatting element. The step numbers in
     ## the comments below appear to follow the HTML5 "in body" end tag
     ## algorithm for formatting elements (the "adoption agency" steps).
     ##
     ## Argument: the tag name of the end tag.
     ##
     ## Entries of @$active_formatting_elements and of
     ## @{$self->{open_elements}} are array references of the form
     ## [node, tag name]; a marker entry in the former has '#marker' as
     ## its first item.
     ##
     ## The FET block is repeated (Step 14) until one of the |return|
     ## statements is reached; each of those first advances to the next
     ## token through |!!!next-token|.
2364     my $tag_name = shift;
2365    
2366     FET: {
2367     ## Step 1: Find the formatting element: the last entry with this tag
     ## name after the last marker in the list of active formatting elements.
2368     my $formatting_element;
2369     my $formatting_element_i_in_active;
2370     AFE: for (reverse 0..$#$active_formatting_elements) {
2371     if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
2372     $formatting_element = $active_formatting_elements->[$_];
2373     $formatting_element_i_in_active = $_;
2374     last AFE;
2375     } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
2376     last AFE;
2377     }
2378     } # AFE
2379     unless (defined $formatting_element) {
2380 wakaba 1.3 !!!parse-error (type => 'unmatched end tag:'.$tag_name);
2381 wakaba 1.1 ## Ignore the token
2382     !!!next-token;
2383     return;
2384     }
2385     ## has an element in scope
     ## Walk the stack of open elements from the current node upward; once
     ## a scoping element has been passed, a match no longer counts as
     ## "in scope".
2386     my $in_scope = 1;
2387     my $formatting_element_i_in_open;
2388 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2389     my $node = $self->{open_elements}->[$_];
2390 wakaba 1.1 if ($node->[0] eq $formatting_element->[0]) {
2391     if ($in_scope) {
2392     $formatting_element_i_in_open = $_;
2393     last INSCOPE;
2394     } else { # in open elements but not in scope
2395 wakaba 1.4 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2396 wakaba 1.1 ## Ignore the token
2397     !!!next-token;
2398     return;
2399     }
2400     } elsif ({
2401     table => 1, caption => 1, td => 1, th => 1,
2402     button => 1, marquee => 1, object => 1, html => 1,
2403     }->{$node->[1]}) {
2404     $in_scope = 0;
2405     }
2406     } # INSCOPE
2407     unless (defined $formatting_element_i_in_open) {
     ## The formatting element is listed as active but is not in the
     ## stack of open elements: report the error and remove that entry
     ## from the list. The entry is removed by its index, since it is
     ## not necessarily the last entry of the list.
2408 wakaba 1.4 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2409 wakaba 1.1 splice @$active_formatting_elements, $formatting_element_i_in_active, 1; # $formatting_element
2410     !!!next-token; ## TODO: ok?
2411     return;
2412     }
     ## The formatting element not being the current node is a parse error,
     ## but processing continues.
2413 wakaba 1.3 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
2414 wakaba 1.4 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2415 wakaba 1.1 }
2416    
2417     ## Step 2: Find the furthest block: of the stack entries below the
     ## formatting element that are special or scoping but not formatting,
     ## the one nearest to the formatting element (the loop runs upward
     ## from the current node, so the last assignment wins).
2418     my $furthest_block;
2419     my $furthest_block_i_in_open;
2420 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
2421     my $node = $self->{open_elements}->[$_];
2422 wakaba 1.1 if (not $formatting_category->{$node->[1]} and
2423     #not $phrasing_category->{$node->[1]} and
2424     ($special_category->{$node->[1]} or
2425     $scoping_category->{$node->[1]})) {
2426     $furthest_block = $node;
2427     $furthest_block_i_in_open = $_;
2428     } elsif ($node->[0] eq $formatting_element->[0]) {
2429     last OE;
2430     }
2431     } # OE
2432    
2433     ## Step 3: Without a furthest block, pop the stack up to and including
     ## the formatting element, drop its list entry and finish.
2434     unless (defined $furthest_block) { # MUST
2435 wakaba 1.3 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
2436 wakaba 1.1 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
2437     !!!next-token;
2438     return;
2439     }
2440    
2441     ## Step 4: The common ancestor is the stack entry just above the
     ## formatting element.
2442 wakaba 1.3 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
2443 wakaba 1.1
2444     ## Step 5: Detach the furthest block from its parent, if it has one.
2445     my $furthest_block_parent = $furthest_block->[0]->parent_node;
2446     if (defined $furthest_block_parent) {
2447     $furthest_block_parent->remove_child ($furthest_block->[0]);
2448     }
2449    
2450     ## Step 6: The bookmark is kept as "the node of the list entry that
     ## precedes the bookmarked position".
     ## NOTE(review): When the formatting element is the first entry of the
     ## list, the index below is -1 and selects the last entry instead of
     ## "no preceding entry" -- confirm whether this needs handling.
2451     my $bookmark_prev_el
2452     = $active_formatting_elements->[$formatting_element_i_in_active - 1]
2453     ->[0];
2454    
2455     ## Step 7: Walk up the stack from the furthest block to the formatting
     ## element, reparenting each visited node that is also in the list of
     ## active formatting elements.
2456     my $node = $furthest_block;
2457     my $node_i_in_open = $furthest_block_i_in_open;
2458     my $last_node = $furthest_block;
2459     S7: {
2460     ## Step 1: Move to the previous (shallower) stack entry.
2461     $node_i_in_open--;
2462 wakaba 1.3 $node = $self->{open_elements}->[$node_i_in_open];
2463 wakaba 1.1
2464     ## Step 2: A node that is not in the list of active formatting
     ## elements is removed from the stack and the walk restarts.
2465     my $node_i_in_active;
2466     S7S2: {
2467     for (reverse 0..$#$active_formatting_elements) {
2468     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2469     $node_i_in_active = $_;
2470     last S7S2;
2471     }
2472     }
2473 wakaba 1.3 splice @{$self->{open_elements}}, $node_i_in_open, 1;
2474 wakaba 1.1 redo S7;
2475     } # S7S2
2476    
2477     ## Step 3: Stop once the formatting element itself is reached.
2478     last S7 if $node->[0] eq $formatting_element->[0];
2479    
2480     ## Step 4: On the first reparented node, move the bookmark after it.
2481     if ($last_node->[0] eq $furthest_block->[0]) {
2482     $bookmark_prev_el = $node->[0];
2483     }
2484    
2485     ## Step 5: A node that already has children is replaced, in both the
     ## list and the stack, by a shallow clone.
     ## NOTE(review): $bookmark_prev_el may still hold the replaced node
     ## after this step, in which case Step 12 cannot find it -- confirm.
2486     if ($node->[0]->has_child_nodes ()) {
2487     my $clone = [$node->[0]->clone_node (0), $node->[1]];
2488     $active_formatting_elements->[$node_i_in_active] = $clone;
2489 wakaba 1.3 $self->{open_elements}->[$node_i_in_open] = $clone;
2490 wakaba 1.1 $node = $clone;
2491     }
2492    
2493     ## Step 6: Reparent the last node under the current node.
2494     $node->[0]->append_child ($last_node->[0]);
2495    
2496     ## Step 7
2497     $last_node = $node;
2498    
2499     ## Step 8
2500     redo S7;
2501     } # S7
2502    
2503     ## Step 8: Attach the resulting chain to the common ancestor.
2504     $common_ancestor_node->[0]->append_child ($last_node->[0]);
2505    
2506     ## Step 9: Make a shallow clone of the formatting element.
2507     my $clone = [$formatting_element->[0]->clone_node (0),
2508     $formatting_element->[1]];
2509    
2510     ## Step 10: Move all children of the furthest block into the clone.
     ## The child list is copied first because it changes while moving.
2511     my @cn = @{$furthest_block->[0]->child_nodes};
2512     $clone->[0]->append_child ($_) for @cn;
2513    
2514     ## Step 11: The clone becomes the only child of the furthest block.
2515     $furthest_block->[0]->append_child ($clone->[0]);
2516    
2517     ## Step 12: In the list of active formatting elements, remove the
     ## formatting element and insert the clone at the bookmark. $i tracks
     ## the index of the bookmark entry and is decremented when the
     ## removal happens at a lower index.
2518     my $i;
2519     AFE: for (reverse 0..$#$active_formatting_elements) {
2520     if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
2521     splice @$active_formatting_elements, $_, 1;
2522     $i-- and last AFE if defined $i;
2523     } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
2524     $i = $_;
2525     }
2526     } # AFE
2527     splice @$active_formatting_elements, $i + 1, 0, $clone;
2528    
2529     ## Step 13: In the stack of open elements, remove the formatting
     ## element and insert the clone immediately after the furthest block.
     ## The clone is inserted (splice length 0) so that any entry deeper
     ## than the furthest block stays on the stack.
2530     undef $i;
2531 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
2532     if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
2533     splice @{$self->{open_elements}}, $_, 1;
2534 wakaba 1.1 $i-- and last OE if defined $i;
2535 wakaba 1.3 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
2536 wakaba 1.1 $i = $_;
2537     }
2538     } # OE
2539 wakaba 1.3 splice @{$self->{open_elements}}, $i + 1, 0, $clone;
2540 wakaba 1.1
2541     ## Step 14: Start over for any remaining matching formatting element.
2542     redo FET;
2543     } # FET
2544     }; # $formatting_end_tag
2545    
2546     my $insert_to_current = sub {
     ## Insertion callback: appends the node given as the first argument to
     ## the node of the last entry of $self->{open_elements}.
2547 wakaba 1.25 $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
2548 wakaba 1.1 }; # $insert_to_current
2549    
2550     my $insert_to_foster = sub {
     ## Insertion callback with foster parenting: when the last entry of
     ## $self->{open_elements} is a table-structure element (|table|,
     ## |tbody|, |tfoot|, |thead| or |tr|), the node given as the first
     ## argument is placed outside the innermost open |table| instead of
     ## inside the current node. Otherwise it behaves like
     ## $insert_to_current.
2551     my $child = shift;
2552     if ({
2553     table => 1, tbody => 1, tfoot => 1,
2554     thead => 1, tr => 1,
2555 wakaba 1.3 }->{$self->{open_elements}->[-1]->[1]}) {
2556 wakaba 1.1 # MUST
2557     my $foster_parent_element;
2558     my $next_sibling; # stays undef unless the |table| has an element parent
     ## Look for the innermost |table| in the stack of open elements.
2559 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
2560     if ($self->{open_elements}->[$_]->[1] eq 'table') {
2561     my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
2562 wakaba 1.1 if (defined $parent and $parent->node_type == 1) { # 1 is the DOM ELEMENT_NODE type
     ## The |table| has an element parent: insert just before the |table|.
2563     $foster_parent_element = $parent;
2564 wakaba 1.3 $next_sibling = $self->{open_elements}->[$_]->[0];
2565 wakaba 1.1 } else {
     ## Otherwise use the stack entry just above the |table|.
2566     $foster_parent_element
2567 wakaba 1.3 = $self->{open_elements}->[$_ - 1]->[0];
2568 wakaba 1.1 }
2569     last OE;
2570     }
2571     } # OE
     ## No |table| in the stack: fall back to the first entry of the stack.
2572 wakaba 1.3 $foster_parent_element = $self->{open_elements}->[0]->[0]
2573 wakaba 1.1 unless defined $foster_parent_element;
     ## NOTE(review): With an undefined $next_sibling, |insert_before| is
     ## presumably equivalent to appending, as in the DOM -- confirm.
2574     $foster_parent_element->insert_before
2575     ($child, $next_sibling);
2576     } else {
2577 wakaba 1.3 $self->{open_elements}->[-1]->[0]->append_child ($child);
2578 wakaba 1.1 }
2579     }; # $insert_to_foster
2580    
2581 wakaba 1.52 my $insert; ## NOTE(review): Not assigned or read in the part of block B shown here; presumably holds the insertion callback currently in effect -- confirm.
2582 wakaba 1.34
2583 wakaba 1.52 B: {
2584 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
2585 wakaba 1.52 !!!parse-error (type => 'DOCTYPE in the middle');
2586     ## Ignore the token
2587     ## Stay in the phase
2588     !!!next-token;
2589     redo B;
2590 wakaba 1.55 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
2591 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2592 wakaba 1.52 #
2593     } else {
2594     ## Generate implied end tags
2595     if ({
2596     dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2597     tbody => 1, tfoot=> 1, thead => 1,
2598     }->{$self->{open_elements}->[-1]->[1]}) {
2599     !!!back-token;
2600 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $self->{open_elements}->[-1]->[1]};
2601 wakaba 1.52 redo B;
2602     }
2603    
2604     if (@{$self->{open_elements}} > 2 or
2605     (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
2606     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2607     } elsif (defined $self->{inner_html_node} and
2608     @{$self->{open_elements}} > 1 and
2609     $self->{open_elements}->[1]->[1] ne 'body') {
2610     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2611 wakaba 1.34 }
2612    
2613 wakaba 1.52 ## ISSUE: There is an issue in the spec.
2614     }
2615    
2616     ## Stop parsing
2617     last B;
2618 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN and
2619 wakaba 1.52 $token->{tag_name} eq 'html') {
2620 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
2621 wakaba 1.52 ## Turn into the main phase
2622     !!!parse-error (type => 'after html:html');
2623 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
2624     } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
2625 wakaba 1.52 ## Turn into the main phase
2626     !!!parse-error (type => 'after html:html');
2627 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
2628 wakaba 1.52 }
2629    
2630     ## ISSUE: "aa<html>" is not a parse error.
2631     ## ISSUE: "<html>" in fragment is not a parse error.
2632     unless ($token->{first_start_tag}) {
2633     !!!parse-error (type => 'not first start tag');
2634     }
2635     my $top_el = $self->{open_elements}->[0]->[0];
2636     for my $attr_name (keys %{$token->{attributes}}) {
2637     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2638     $top_el->set_attribute_ns
2639     (undef, [undef, $attr_name],
2640     $token->{attributes}->{$attr_name}->{value});
2641     }
2642     }
2643     !!!next-token;
2644     redo B;
2645 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
2646 wakaba 1.52 my $comment = $self->{document}->create_comment ($token->{data});
2647 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2648 wakaba 1.52 $self->{document}->append_child ($comment);
2649 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
2650 wakaba 1.52 $self->{open_elements}->[0]->[0]->append_child ($comment);
2651     } else {
2652     $self->{open_elements}->[-1]->[0]->append_child ($comment);
2653     }
2654     !!!next-token;
2655     redo B;
2656 wakaba 1.56 } elsif ($self->{insertion_mode} & HEAD_IMS) {
2657 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
2658 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2659     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
2660     unless (length $token->{data}) {
2661     !!!next-token;
2662     redo B;
2663 wakaba 1.1 }
2664     }
2665 wakaba 1.52
2666 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2667 wakaba 1.52 ## As if <head>
2668     !!!create-element ($self->{head_element}, 'head');
2669     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2670     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2671    
2672     ## Reprocess in the "in head" insertion mode...
2673     pop @{$self->{open_elements}};
2674    
2675     ## Reprocess in the "after head" insertion mode...
2676 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2677 wakaba 1.52 ## As if </noscript>
2678     pop @{$self->{open_elements}};
2679     !!!parse-error (type => 'in noscript:#character');
2680 wakaba 1.1
2681 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
2682     ## As if </head>
2683     pop @{$self->{open_elements}};
2684    
2685     ## Reprocess in the "after head" insertion mode...
2686 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2687 wakaba 1.52 pop @{$self->{open_elements}};
2688    
2689     ## Reprocess in the "after head" insertion mode...
2690 wakaba 1.1 }
2691 wakaba 1.52
2692     ## "after head" insertion mode
2693     ## As if <body>
2694     !!!insert-element ('body');
2695 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
2696 wakaba 1.52 ## reprocess
2697     redo B;
2698 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
2699 wakaba 1.52 if ($token->{tag_name} eq 'head') {
2700 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2701 wakaba 1.52 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
2702     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2703     push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
2704 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2705 wakaba 1.52 !!!next-token;
2706     redo B;
2707 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2708     #
2709     } else {
2710 wakaba 1.52 !!!parse-error (type => 'in head:head'); # or in head noscript
2711     ## Ignore the token
2712     !!!next-token;
2713     redo B;
2714     }
2715 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2716 wakaba 1.52 ## As if <head>
2717     !!!create-element ($self->{head_element}, 'head');
2718     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2719     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2720    
2721 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2722 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
2723 wakaba 1.1 }
2724 wakaba 1.52
2725 wakaba 1.49 if ($token->{tag_name} eq 'base') {
2726 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2727 wakaba 1.49 ## As if </noscript>
2728     pop @{$self->{open_elements}};
2729     !!!parse-error (type => 'in noscript:base');
2730    
2731 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2732 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2733     }
2734    
2735     ## NOTE: There is a "as if in head" code clone.
2736 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2737 wakaba 1.49 !!!parse-error (type => 'after head:'.$token->{tag_name});
2738     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2739     }
2740     !!!insert-element ($token->{tag_name}, $token->{attributes});
2741     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2742     pop @{$self->{open_elements}}
2743 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2744 wakaba 1.49 !!!next-token;
2745     redo B;
2746     } elsif ($token->{tag_name} eq 'link') {
2747 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2748 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2749 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2750     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2751     }
2752     !!!insert-element ($token->{tag_name}, $token->{attributes});
2753     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2754     pop @{$self->{open_elements}}
2755 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2756 wakaba 1.1 !!!next-token;
2757 wakaba 1.25 redo B;
2758 wakaba 1.34 } elsif ($token->{tag_name} eq 'meta') {
2759     ## NOTE: There is a "as if in head" code clone.
2760 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2761 wakaba 1.34 !!!parse-error (type => 'after head:'.$token->{tag_name});
2762     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2763     }
2764     !!!insert-element ($token->{tag_name}, $token->{attributes});
2765     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2766    
2767     unless ($self->{confident}) {
2768     my $charset;
2769     if ($token->{attributes}->{charset}) { ## TODO: And if supported
2770     $charset = $token->{attributes}->{charset}->{value};
2771     }
2772     if ($token->{attributes}->{'http-equiv'}) {
2773 wakaba 1.35 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2774 wakaba 1.34 if ($token->{attributes}->{'http-equiv'}->{value}
2775     =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2776     [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2777     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2778     $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
2779     } ## TODO: And if supported
2780     }
2781     ## TODO: Change the encoding
2782     }
2783    
2784     ## TODO: Extracting |charset| from |meta|.
2785     pop @{$self->{open_elements}}
2786 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2787 wakaba 1.34 !!!next-token;
2788     redo B;
2789 wakaba 1.49 } elsif ($token->{tag_name} eq 'title') {
2790 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2791 wakaba 1.49 ## As if </noscript>
2792     pop @{$self->{open_elements}};
2793     !!!parse-error (type => 'in noscript:title');
2794    
2795 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2796 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2797 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2798 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2799     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2800     }
2801 wakaba 1.49
2802     ## NOTE: There is a "as if in head" code clone.
2803 wakaba 1.31 my $parent = defined $self->{head_element} ? $self->{head_element}
2804     : $self->{open_elements}->[-1]->[0];
2805 wakaba 1.40 $parse_rcdata->(RCDATA_CONTENT_MODEL,
2806     sub { $parent->append_child ($_[0]) });
2807 wakaba 1.25 pop @{$self->{open_elements}}
2808 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2809 wakaba 1.25 redo B;
2810     } elsif ($token->{tag_name} eq 'style') {
2811     ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
2812 wakaba 1.54 ## insertion mode IN_HEAD_IM)
2813 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2814 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2815 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2816     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2817     }
2818 wakaba 1.40 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
2819 wakaba 1.25 pop @{$self->{open_elements}}
2820 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2821 wakaba 1.25 redo B;
2822     } elsif ($token->{tag_name} eq 'noscript') {
2823 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_IM) {
2824 wakaba 1.25 ## NOTE: and scripting is disabled
2825     !!!insert-element ($token->{tag_name}, $token->{attributes});
2826 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
2827 wakaba 1.1 !!!next-token;
2828 wakaba 1.25 redo B;
2829 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2830 wakaba 1.30 !!!parse-error (type => 'in noscript:noscript');
2831 wakaba 1.1 ## Ignore the token
2832 wakaba 1.41 !!!next-token;
2833 wakaba 1.25 redo B;
2834 wakaba 1.1 } else {
2835 wakaba 1.25 #
2836 wakaba 1.1 }
2837 wakaba 1.49 } elsif ($token->{tag_name} eq 'script') {
2838 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2839 wakaba 1.49 ## As if </noscript>
2840     pop @{$self->{open_elements}};
2841     !!!parse-error (type => 'in noscript:script');
2842    
2843 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2844 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2845 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2846 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2847     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2848     }
2849 wakaba 1.49
2850 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2851     $script_start_tag->($insert_to_current);
2852     pop @{$self->{open_elements}}
2853 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2854 wakaba 1.1 redo B;
2855 wakaba 1.49 } elsif ($token->{tag_name} eq 'body' or
2856 wakaba 1.25 $token->{tag_name} eq 'frameset') {
2857 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2858 wakaba 1.49 ## As if </noscript>
2859     pop @{$self->{open_elements}};
2860     !!!parse-error (type => 'in noscript:'.$token->{tag_name});
2861    
2862     ## Reprocess in the "in head" insertion mode...
2863     ## As if </head>
2864     pop @{$self->{open_elements}};
2865    
2866     ## Reprocess in the "after head" insertion mode...
2867 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2868 wakaba 1.49 pop @{$self->{open_elements}};
2869    
2870     ## Reprocess in the "after head" insertion mode...
2871     }
2872    
2873     ## "after head" insertion mode
2874     !!!insert-element ($token->{tag_name}, $token->{attributes});
2875 wakaba 1.54 if ($token->{tag_name} eq 'body') {
2876     $self->{insertion_mode} = IN_BODY_IM;
2877     } elsif ($token->{tag_name} eq 'frameset') {
2878     $self->{insertion_mode} = IN_FRAMESET_IM;
2879     } else {
2880     die "$0: tag name: $self->{tag_name}";
2881     }
2882 wakaba 1.1 !!!next-token;
2883     redo B;
2884     } else {
2885     #
2886     }
2887 wakaba 1.49
2888 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2889 wakaba 1.49 ## As if </noscript>
2890     pop @{$self->{open_elements}};
2891     !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
2892    
2893     ## Reprocess in the "in head" insertion mode...
2894     ## As if </head>
2895 wakaba 1.25 pop @{$self->{open_elements}};
2896 wakaba 1.49
2897     ## Reprocess in the "after head" insertion mode...
2898 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2899 wakaba 1.49 ## As if </head>
2900 wakaba 1.25 pop @{$self->{open_elements}};
2901 wakaba 1.49
2902     ## Reprocess in the "after head" insertion mode...
2903     }
2904    
2905     ## "after head" insertion mode
2906     ## As if <body>
2907     !!!insert-element ('body');
2908 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
2909 wakaba 1.49 ## reprocess
2910     redo B;
2911 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
2912 wakaba 1.49 if ($token->{tag_name} eq 'head') {
2913 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2914 wakaba 1.50 ## As if <head>
2915     !!!create-element ($self->{head_element}, 'head');
2916     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2917     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2918    
2919     ## Reprocess in the "in head" insertion mode...
2920     pop @{$self->{open_elements}};
2921 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
2922 wakaba 1.50 !!!next-token;
2923     redo B;
2924 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2925 wakaba 1.49 ## As if </noscript>
2926     pop @{$self->{open_elements}};
2927     !!!parse-error (type => 'in noscript:script');
2928    
2929     ## Reprocess in the "in head" insertion mode...
2930 wakaba 1.50 pop @{$self->{open_elements}};
2931 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
2932 wakaba 1.50 !!!next-token;
2933     redo B;
2934 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2935 wakaba 1.49 pop @{$self->{open_elements}};
2936 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
2937 wakaba 1.49 !!!next-token;
2938     redo B;
2939     } else {
2940     #
2941     }
2942     } elsif ($token->{tag_name} eq 'noscript') {
2943 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2944 wakaba 1.49 pop @{$self->{open_elements}};
2945 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2946 wakaba 1.49 !!!next-token;
2947     redo B;
2948 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2949 wakaba 1.50 !!!parse-error (type => 'unmatched end tag:noscript');
2950     ## Ignore the token ## ISSUE: An issue in the spec.
2951     !!!next-token;
2952     redo B;
2953 wakaba 1.49 } else {
2954     #
2955     }
2956     } elsif ({
2957 wakaba 1.31 body => 1, html => 1,
2958     }->{$token->{tag_name}}) {
2959 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2960 wakaba 1.50 ## As if <head>
2961     !!!create-element ($self->{head_element}, 'head');
2962     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2963     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2964    
2965 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2966 wakaba 1.50 ## Reprocess in the "in head" insertion mode...
2967 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2968 wakaba 1.49 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2969     ## Ignore the token
2970     !!!next-token;
2971     redo B;
2972     }
2973 wakaba 1.50
2974     #
2975 wakaba 1.49 } elsif ({
2976 wakaba 1.31 p => 1, br => 1,
2977     }->{$token->{tag_name}}) {
2978 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2979 wakaba 1.50 ## As if <head>
2980     !!!create-element ($self->{head_element}, 'head');
2981     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2982     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2983    
2984 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2985 wakaba 1.50 ## Reprocess in the "in head" insertion mode...
2986     }
2987    
2988 wakaba 1.1 #
2989 wakaba 1.25 } else {
2990 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2991     #
2992     } else {
2993 wakaba 1.49 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2994     ## Ignore the token
2995     !!!next-token;
2996     redo B;
2997     }
2998     }
2999    
3000 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3001 wakaba 1.49 ## As if </noscript>
3002     pop @{$self->{open_elements}};
3003     !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3004    
3005     ## Reprocess in the "in head" insertion mode...
3006     ## As if </head>
3007     pop @{$self->{open_elements}};
3008    
3009     ## Reprocess in the "after head" insertion mode...
3010 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3011 wakaba 1.49 ## As if </head>
3012     pop @{$self->{open_elements}};
3013    
3014     ## Reprocess in the "after head" insertion mode...
3015 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3016 wakaba 1.50 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3017     ## Ignore the token ## ISSUE: An issue in the spec.
3018     !!!next-token;
3019     redo B;
3020 wakaba 1.1 }
3021    
3022 wakaba 1.49 ## "after head" insertion mode
3023     ## As if <body>
3024 wakaba 1.52 !!!insert-element ('body');
3025 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
3026 wakaba 1.52 ## reprocess
3027     redo B;
3028     } else {
3029     die "$0: $token->{type}: Unknown token type";
3030     }
3031    
3032     ## ISSUE: An issue in the spec.
3033 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_IMS) {
3034 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
3035 wakaba 1.52 ## NOTE: There is a code clone of "character in body".
3036     $reconstruct_active_formatting_elements->($insert_to_current);
3037    
3038     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3039    
3040     !!!next-token;
3041     redo B;
3042 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
3043 wakaba 1.52 if ({
3044     caption => 1, col => 1, colgroup => 1, tbody => 1,
3045     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3046     }->{$token->{tag_name}}) {
3047 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
3048 wakaba 1.52 ## have an element in table scope
3049     my $tn;
3050     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3051     my $node = $self->{open_elements}->[$_];
3052     if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3053     $tn = $node->[1];
3054     last INSCOPE;
3055     } elsif ({
3056     table => 1, html => 1,
3057     }->{$node->[1]}) {
3058     last INSCOPE;
3059     }
3060     } # INSCOPE
3061     unless (defined $tn) {
3062     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3063     ## Ignore the token
3064     !!!next-token;
3065     redo B;
3066     }
3067    
3068     ## Close the cell
3069     !!!back-token; # <?>
3070 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3071 wakaba 1.52 redo B;
3072 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3073 wakaba 1.52 !!!parse-error (type => 'not closed:caption');
3074    
3075     ## As if </caption>
3076     ## have a table element in table scope
3077     my $i;
3078     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3079     my $node = $self->{open_elements}->[$_];
3080     if ($node->[1] eq 'caption') {
3081     $i = $_;
3082     last INSCOPE;
3083     } elsif ({
3084     table => 1, html => 1,
3085     }->{$node->[1]}) {
3086     last INSCOPE;
3087     }
3088     } # INSCOPE
3089     unless (defined $i) {
3090     !!!parse-error (type => 'unmatched end tag:caption');
3091     ## Ignore the token
3092     !!!next-token;
3093     redo B;
3094     }
3095    
3096     ## generate implied end tags
3097     if ({
3098     dd => 1, dt => 1, li => 1, p => 1,
3099     td => 1, th => 1, tr => 1,
3100     tbody => 1, tfoot=> 1, thead => 1,
3101     }->{$self->{open_elements}->[-1]->[1]}) {
3102     !!!back-token; # <?>
3103 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3104 wakaba 1.52 !!!back-token;
3105 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3106 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3107     redo B;
3108     }
3109    
3110     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3111     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3112     }
3113    
3114     splice @{$self->{open_elements}}, $i;
3115    
3116     $clear_up_to_marker->();
3117    
3118 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3119 wakaba 1.52
3120     ## reprocess
3121     redo B;
3122     } else {
3123     #
3124     }
3125     } else {
3126     #
3127     }
3128 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
3129 wakaba 1.52 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3130 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
3131 wakaba 1.43 ## have an element in table scope
3132 wakaba 1.52 my $i;
3133 wakaba 1.43 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3134     my $node = $self->{open_elements}->[$_];
3135 wakaba 1.52 if ($node->[1] eq $token->{tag_name}) {
3136     $i = $_;
3137 wakaba 1.43 last INSCOPE;
3138     } elsif ({
3139     table => 1, html => 1,
3140     }->{$node->[1]}) {
3141     last INSCOPE;
3142     }
3143     } # INSCOPE
3144 wakaba 1.52 unless (defined $i) {
3145 wakaba 1.43 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3146     ## Ignore the token
3147     !!!next-token;
3148     redo B;
3149     }
3150    
3151 wakaba 1.52 ## generate implied end tags
3152     if ({
3153     dd => 1, dt => 1, li => 1, p => 1,
3154     td => ($token->{tag_name} eq 'th'),
3155     th => ($token->{tag_name} eq 'td'),
3156     tr => 1,
3157     tbody => 1, tfoot=> 1, thead => 1,
3158     }->{$self->{open_elements}->[-1]->[1]}) {
3159     !!!back-token;
3160 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3161 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3162     redo B;
3163     }
3164    
3165     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3166     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3167     }
3168    
3169     splice @{$self->{open_elements}}, $i;
3170    
3171     $clear_up_to_marker->();
3172    
3173 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
3174 wakaba 1.52
3175     !!!next-token;
3176 wakaba 1.43 redo B;
3177 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3178 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3179     ## Ignore the token
3180     !!!next-token;
3181     redo B;
3182     } else {
3183     #
3184     }
3185     } elsif ($token->{tag_name} eq 'caption') {
3186 wakaba 1.54 if ($self->{insertion_mode} == IN_CAPTION_IM) {
3187 wakaba 1.43 ## have a table element in table scope
3188     my $i;
3189     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3190     my $node = $self->{open_elements}->[$_];
3191 wakaba 1.52 if ($node->[1] eq $token->{tag_name}) {
3192 wakaba 1.43 $i = $_;
3193     last INSCOPE;
3194     } elsif ({
3195     table => 1, html => 1,
3196     }->{$node->[1]}) {
3197     last INSCOPE;
3198     }
3199     } # INSCOPE
3200     unless (defined $i) {
3201 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3202 wakaba 1.43 ## Ignore the token
3203     !!!next-token;
3204     redo B;
3205     }
3206    
3207     ## generate implied end tags
3208     if ({
3209     dd => 1, dt => 1, li => 1, p => 1,
3210     td => 1, th => 1, tr => 1,
3211     tbody => 1, tfoot=> 1, thead => 1,
3212     }->{$self->{open_elements}->[-1]->[1]}) {
3213     !!!back-token;
3214 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3215 wakaba 1.43 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3216     redo B;
3217     }
3218 wakaba 1.52
3219     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3220     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3221     }
3222    
3223     splice @{$self->{open_elements}}, $i;
3224    
3225     $clear_up_to_marker->();
3226    
3227 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3228 wakaba 1.52
3229     !!!next-token;
3230     redo B;
3231 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
3232 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3233     ## Ignore the token
3234     !!!next-token;
3235     redo B;
3236     } else {
3237     #
3238     }
3239     } elsif ({
3240     table => 1, tbody => 1, tfoot => 1,
3241     thead => 1, tr => 1,
3242     }->{$token->{tag_name}} and
3243 wakaba 1.54 $self->{insertion_mode} == IN_CELL_IM) {
3244 wakaba 1.52 ## have an element in table scope
3245     my $i;
3246     my $tn;
3247     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3248     my $node = $self->{open_elements}->[$_];
3249     if ($node->[1] eq $token->{tag_name}) {
3250     $i = $_;
3251     last INSCOPE;
3252     } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3253     $tn = $node->[1];
3254     ## NOTE: There is exactly one |td| or |th| element
3255     ## in scope in the stack of open elements by definition.
3256     } elsif ({
3257     table => 1, html => 1,
3258     }->{$node->[1]}) {
3259     last INSCOPE;
3260     }
3261     } # INSCOPE
3262     unless (defined $i) {
3263     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3264     ## Ignore the token
3265     !!!next-token;
3266     redo B;
3267     }
3268    
3269     ## Close the cell
3270     !!!back-token; # </?>
3271 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3272 wakaba 1.52 redo B;
3273     } elsif ($token->{tag_name} eq 'table' and
3274 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
3275 wakaba 1.52 !!!parse-error (type => 'not closed:caption');
3276    
3277     ## As if </caption>
3278     ## have a table element in table scope
3279     my $i;
3280     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3281     my $node = $self->{open_elements}->[$_];
3282     if ($node->[1] eq 'caption') {
3283     $i = $_;
3284     last INSCOPE;
3285     } elsif ({
3286     table => 1, html => 1,
3287     }->{$node->[1]}) {
3288     last INSCOPE;
3289     }
3290     } # INSCOPE
3291     unless (defined $i) {
3292     !!!parse-error (type => 'unmatched end tag:caption');
3293     ## Ignore the token
3294     !!!next-token;
3295     redo B;
3296     }
3297    
3298     ## generate implied end tags
3299     if ({
3300     dd => 1, dt => 1, li => 1, p => 1,
3301     td => 1, th => 1, tr => 1,
3302     tbody => 1, tfoot=> 1, thead => 1,
3303     }->{$self->{open_elements}->[-1]->[1]}) {
3304     !!!back-token; # </table>
3305 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3306 wakaba 1.52 !!!back-token;
3307 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3308 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3309     redo B;
3310     }
3311    
3312     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3313     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3314     }
3315    
3316     splice @{$self->{open_elements}}, $i;
3317    
3318     $clear_up_to_marker->();
3319    
3320 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3321 wakaba 1.52
3322     ## reprocess
3323     redo B;
3324     } elsif ({
3325     body => 1, col => 1, colgroup => 1, html => 1,
3326     }->{$token->{tag_name}}) {
3327 wakaba 1.56 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
3328 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3329     ## Ignore the token
3330     !!!next-token;
3331     redo B;
3332     } else {
3333     #
3334     }
3335     } elsif ({
3336     tbody => 1, tfoot => 1,
3337     thead => 1, tr => 1,
3338     }->{$token->{tag_name}} and
3339 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
3340 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3341     ## Ignore the token
3342     !!!next-token;
3343     redo B;
3344     } else {
3345     #
3346     }
3347     } else {
3348     die "$0: $token->{type}: Unknown token type";
3349     }
3350    
3351     $insert = $insert_to_current;
3352     #
3353 wakaba 1.56 } elsif ($self->{insertion_mode} & TABLE_IMS) {
3354 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
3355 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3356     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3357    
3358     unless (length $token->{data}) {
3359     !!!next-token;
3360     redo B;
3361     }
3362     }
3363    
3364     !!!parse-error (type => 'in table:#character');
3365    
3366     ## As if in body, but insert into foster parent element
3367     ## ISSUE: Spec says that "whenever a node would be inserted
3368     ## into the current node" while characters might not
3369     ## result in a new Text node.
3370     $reconstruct_active_formatting_elements->($insert_to_foster);
3371    
3372     if ({
3373     table => 1, tbody => 1, tfoot => 1,
3374     thead => 1, tr => 1,
3375     }->{$self->{open_elements}->[-1]->[1]}) {
3376     # MUST
3377     my $foster_parent_element;
3378     my $next_sibling;
3379     my $prev_sibling;
3380     OE: for (reverse 0..$#{$self->{open_elements}}) {
3381     if ($self->{open_elements}->[$_]->[1] eq 'table') {
3382     my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3383     if (defined $parent and $parent->node_type == 1) {
3384     $foster_parent_element = $parent;
3385     $next_sibling = $self->{open_elements}->[$_]->[0];
3386     $prev_sibling = $next_sibling->previous_sibling;
3387     } else {
3388     $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3389     $prev_sibling = $foster_parent_element->last_child;
3390     }
3391     last OE;
3392     }
3393     } # OE
3394     $foster_parent_element = $self->{open_elements}->[0]->[0] and
3395     $prev_sibling = $foster_parent_element->last_child
3396     unless defined $foster_parent_element;
3397     if (defined $prev_sibling and
3398     $prev_sibling->node_type == 3) {
3399     $prev_sibling->manakai_append_text ($token->{data});
3400     } else {
3401     $foster_parent_element->insert_before
3402     ($self->{document}->create_text_node ($token->{data}),
3403     $next_sibling);
3404     }
3405     } else {
3406     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3407     }
3408    
3409     !!!next-token;
3410     redo B;
3411 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
3412 wakaba 1.52 if ({
3413 wakaba 1.54 tr => ($self->{insertion_mode} != IN_ROW_IM),
3414 wakaba 1.52 th => 1, td => 1,
3415     }->{$token->{tag_name}}) {
3416 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_IM) {
3417 wakaba 1.52 ## Clear back to table context
3418     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3419     $self->{open_elements}->[-1]->[1] ne 'html') {
3420 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3421 wakaba 1.52 pop @{$self->{open_elements}};
3422 wakaba 1.43 }
3423    
3424 wakaba 1.52 !!!insert-element ('tbody');
3425 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3426 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
3427     }
3428    
3429 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3430 wakaba 1.52 unless ($token->{tag_name} eq 'tr') {
3431     !!!parse-error (type => 'missing start tag:tr');
3432     }
3433 wakaba 1.43
3434 wakaba 1.52 ## Clear back to table body context
3435     while (not {
3436     tbody => 1, tfoot => 1, thead => 1, html => 1,
3437     }->{$self->{open_elements}->[-1]->[1]}) {
3438     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3439     pop @{$self->{open_elements}};
3440     }
3441 wakaba 1.43
3442 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
3443 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
3444     !!!insert-element ($token->{tag_name}, $token->{attributes});
3445     !!!next-token;
3446     redo B;
3447     } else {
3448     !!!insert-element ('tr');
3449     ## reprocess in the "in row" insertion mode
3450     }
3451     }
3452    
3453     ## Clear back to table row context
3454     while (not {
3455     tr => 1, html => 1,
3456     }->{$self->{open_elements}->[-1]->[1]}) {
3457     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3458     pop @{$self->{open_elements}};
3459 wakaba 1.43 }
3460 wakaba 1.52
3461     !!!insert-element ($token->{tag_name}, $token->{attributes});
3462 wakaba 1.54 $self->{insertion_mode} = IN_CELL_IM;
3463 wakaba 1.52
3464     push @$active_formatting_elements, ['#marker', ''];
3465    
3466     !!!next-token;
3467     redo B;
3468     } elsif ({
3469     caption => 1, col => 1, colgroup => 1,
3470     tbody => 1, tfoot => 1, thead => 1,
3471 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3472 wakaba 1.52 }->{$token->{tag_name}}) {
3473 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3474 wakaba 1.52 ## As if </tr>
3475 wakaba 1.43 ## have an element in table scope
3476     my $i;
3477     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3478     my $node = $self->{open_elements}->[$_];
3479 wakaba 1.52 if ($node->[1] eq 'tr') {
3480 wakaba 1.43 $i = $_;
3481     last INSCOPE;
3482     } elsif ({
3483     table => 1, html => 1,
3484     }->{$node->[1]}) {
3485     last INSCOPE;
3486     }
3487     } # INSCOPE
3488 wakaba 1.52 unless (defined $i) {
3489     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3490     ## Ignore the token: no |tr| element is in table scope, so the implied </tr> cannot be processed
3491     !!!next-token;
3492 wakaba 1.43 redo B;
3493     }
3494    
3495 wakaba 1.52 ## Clear back to table row context
3496     while (not {
3497     tr => 1, html => 1,
3498     }->{$self->{open_elements}->[-1]->[1]}) {
3499 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3500 wakaba 1.52 pop @{$self->{open_elements}};
3501 wakaba 1.1 }
3502 wakaba 1.43
3503 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3504 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3505 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
3506     ## reprocess
3507     redo B;
3508     } else {
3509     ## reprocess in the "in table body" insertion mode...
3510     }
3511 wakaba 1.1 }
3512 wakaba 1.52
3513 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3514 wakaba 1.52 ## have an element in table scope
3515 wakaba 1.43 my $i;
3516     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3517     my $node = $self->{open_elements}->[$_];
3518 wakaba 1.52 if ({
3519     tbody => 1, thead => 1, tfoot => 1,
3520     }->{$node->[1]}) {
3521 wakaba 1.43 $i = $_;
3522     last INSCOPE;
3523     } elsif ({
3524     table => 1, html => 1,
3525     }->{$node->[1]}) {
3526     last INSCOPE;
3527     }
3528     } # INSCOPE
3529 wakaba 1.52 unless (defined $i) {
3530     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3531     ## Ignore the token
3532     !!!next-token;
3533 wakaba 1.43 redo B;
3534     }
3535 wakaba 1.52
3536     ## Clear back to table body context
3537     while (not {
3538     tbody => 1, tfoot => 1, thead => 1, html => 1,
3539     }->{$self->{open_elements}->[-1]->[1]}) {
3540 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3541 wakaba 1.52 pop @{$self->{open_elements}};
3542 wakaba 1.43 }
3543    
3544 wakaba 1.52 ## As if <{current node}>
3545     ## have an element in table scope
3546     ## true by definition
3547 wakaba 1.43
3548 wakaba 1.52 ## Clear back to table body context
3549     ## nop by definition
3550 wakaba 1.43
3551 wakaba 1.52 pop @{$self->{open_elements}};
3552 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3553 wakaba 1.52 ## reprocess in "in table" insertion mode...
3554     }
3555    
3556     if ($token->{tag_name} eq 'col') {
3557     ## Clear back to table context
3558     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3559     $self->{open_elements}->[-1]->[1] ne 'html') {
3560     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3561     pop @{$self->{open_elements}};
3562     }
3563 wakaba 1.43
3564 wakaba 1.52 !!!insert-element ('colgroup');
3565 wakaba 1.54 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
3566 wakaba 1.52 ## reprocess
3567 wakaba 1.43 redo B;
3568 wakaba 1.52 } elsif ({
3569     caption => 1,
3570     colgroup => 1,
3571     tbody => 1, tfoot => 1, thead => 1,
3572     }->{$token->{tag_name}}) {
3573     ## Clear back to table context
3574     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3575     $self->{open_elements}->[-1]->[1] ne 'html') {
3576     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3577     pop @{$self->{open_elements}};
3578 wakaba 1.1 }
3579 wakaba 1.52
3580     push @$active_formatting_elements, ['#marker', '']
3581     if $token->{tag_name} eq 'caption';
3582    
3583     !!!insert-element ($token->{tag_name}, $token->{attributes});
3584     $self->{insertion_mode} = {
3585 wakaba 1.54 caption => IN_CAPTION_IM,
3586     colgroup => IN_COLUMN_GROUP_IM,
3587     tbody => IN_TABLE_BODY_IM,
3588     tfoot => IN_TABLE_BODY_IM,
3589     thead => IN_TABLE_BODY_IM,
3590 wakaba 1.52 }->{$token->{tag_name}};
3591 wakaba 1.1 !!!next-token;
3592     redo B;
3593 wakaba 1.52 } else {
3594     die "$0: in table: <>: $token->{tag_name}";
3595 wakaba 1.1 }
3596 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
3597     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3598 wakaba 1.1
3599 wakaba 1.52 ## As if </table>
3600 wakaba 1.1 ## have a table element in table scope
3601     my $i;
3602 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3603     my $node = $self->{open_elements}->[$_];
3604 wakaba 1.52 if ($node->[1] eq 'table') {
3605 wakaba 1.1 $i = $_;
3606     last INSCOPE;
3607     } elsif ({
3608     table => 1, html => 1,
3609     }->{$node->[1]}) {
3610     last INSCOPE;
3611     }
3612     } # INSCOPE
3613     unless (defined $i) {
3614 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:table');
3615     ## Ignore tokens </table><table>
3616 wakaba 1.1 !!!next-token;
3617     redo B;
3618     }
3619    
3620     ## generate implied end tags
3621     if ({
3622     dd => 1, dt => 1, li => 1, p => 1,
3623     td => 1, th => 1, tr => 1,
3624 wakaba 1.31 tbody => 1, tfoot=> 1, thead => 1,
3625 wakaba 1.3 }->{$self->{open_elements}->[-1]->[1]}) {
3626 wakaba 1.52 !!!back-token; # <table>
3627 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'table'};
3628 wakaba 1.1 !!!back-token;
3629 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3630 wakaba 1.3 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3631 wakaba 1.1 redo B;
3632     }
3633    
3634 wakaba 1.52 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3635 wakaba 1.3 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3636 wakaba 1.1 }
3637    
3638 wakaba 1.3 splice @{$self->{open_elements}}, $i;
3639 wakaba 1.1
3640 wakaba 1.52 $self->_reset_insertion_mode;
3641 wakaba 1.1
3642     ## reprocess
3643     redo B;
3644 wakaba 1.58 } else {
3645     !!!parse-error (type => 'in table:'.$token->{tag_name});
3646    
3647     $insert = $insert_to_foster;
3648     #
3649     }
3650     } elsif ($token->{type} == END_TAG_TOKEN) {
3651 wakaba 1.52 if ($token->{tag_name} eq 'tr' and
3652 wakaba 1.54 $self->{insertion_mode} == IN_ROW_IM) {
3653 wakaba 1.52 ## have an element in table scope
3654     my $i;
3655     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3656     my $node = $self->{open_elements}->[$_];
3657     if ($node->[1] eq $token->{tag_name}) {
3658     $i = $_;
3659     last INSCOPE;
3660     } elsif ({
3661     table => 1, html => 1,
3662     }->{$node->[1]}) {
3663     last INSCOPE;
3664     }
3665     } # INSCOPE
3666     unless (defined $i) {
3667     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3668     ## Ignore the token
3669 wakaba 1.42 !!!next-token;
3670     redo B;
3671     }
3672    
3673 wakaba 1.52 ## Clear back to table row context
3674     while (not {
3675     tr => 1, html => 1,
3676     }->{$self->{open_elements}->[-1]->[1]}) {
3677     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3678     pop @{$self->{open_elements}};
3679     }
3680 wakaba 1.42
3681 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3682 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3683 wakaba 1.52 !!!next-token;
3684     redo B;
3685     } elsif ($token->{tag_name} eq 'table') {
3686 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3687 wakaba 1.52 ## As if </tr>
3688     ## have an element in table scope
3689     my $i;
3690     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3691     my $node = $self->{open_elements}->[$_];
3692     if ($node->[1] eq 'tr') {
3693     $i = $_;
3694     last INSCOPE;
3695     } elsif ({
3696     table => 1, html => 1,
3697     }->{$node->[1]}) {
3698     last INSCOPE;
3699 wakaba 1.42 }
3700 wakaba 1.52 } # INSCOPE
3701     unless (defined $i) {
3702     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3703     ## Ignore the token: no |tr| element is in table scope, so the implied </tr> cannot be processed
3704     !!!next-token;
3705     redo B;
3706 wakaba 1.42 }
3707 wakaba 1.52
3708     ## Clear back to table row context
3709     while (not {
3710     tr => 1, html => 1,
3711     }->{$self->{open_elements}->[-1]->[1]}) {
3712 wakaba 1.46 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3713     pop @{$self->{open_elements}};
3714 wakaba 1.1 }
3715 wakaba 1.46
3716 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3717 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3718 wakaba 1.46 ## reprocess in the "in table body" insertion mode...
3719 wakaba 1.1 }
3720    
3721 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3722 wakaba 1.52 ## have an element in table scope
3723     my $i;
3724     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3725     my $node = $self->{open_elements}->[$_];
3726     if ({
3727     tbody => 1, thead => 1, tfoot => 1,
3728     }->{$node->[1]}) {
3729     $i = $_;
3730     last INSCOPE;
3731     } elsif ({
3732     table => 1, html => 1,
3733     }->{$node->[1]}) {
3734     last INSCOPE;
3735     }
3736     } # INSCOPE
3737     unless (defined $i) {
3738     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3739     ## Ignore the token
3740     !!!next-token;
3741     redo B;
3742 wakaba 1.47 }
3743    
3744     ## Clear back to table body context
3745     while (not {
3746     tbody => 1, tfoot => 1, thead => 1, html => 1,
3747     }->{$self->{open_elements}->[-1]->[1]}) {
3748     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3749     pop @{$self->{open_elements}};
3750     }
3751    
3752 wakaba 1.52 ## As if <{current node}>
3753     ## have an element in table scope
3754     ## true by definition
3755    
3756     ## Clear back to table body context
3757     ## nop by definition
3758    
3759     pop @{$self->{open_elements}};
3760 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3761 wakaba 1.52 ## reprocess in the "in table" insertion mode...
3762     }
3763    
3764     ## have a table element in table scope
3765     my $i;
3766     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3767     my $node = $self->{open_elements}->[$_];
3768     if ($node->[1] eq $token->{tag_name}) {
3769     $i = $_;
3770     last INSCOPE;
3771     } elsif ({
3772     table => 1, html => 1,
3773     }->{$node->[1]}) {
3774     last INSCOPE;
3775 wakaba 1.47 }
3776 wakaba 1.52 } # INSCOPE
3777     unless (defined $i) {
3778     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3779     ## Ignore the token
3780     !!!next-token;
3781     redo B;
3782 wakaba 1.3 }
3783    
3784 wakaba 1.52 ## generate implied end tags
3785     if ({
3786     dd => 1, dt => 1, li => 1, p => 1,
3787     td => 1, th => 1, tr => 1,
3788     tbody => 1, tfoot=> 1, thead => 1,
3789     }->{$self->{open_elements}->[-1]->[1]}) {
3790     !!!back-token;
3791 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3792 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3793     redo B;
3794     }
3795    
3796     if ($self->{open_elements}->[-1]->[1] ne 'table') {
3797 wakaba 1.3 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3798 wakaba 1.1 }
3799 wakaba 1.52
3800     splice @{$self->{open_elements}}, $i;
3801 wakaba 1.1
3802 wakaba 1.52 $self->_reset_insertion_mode;
3803 wakaba 1.47
3804     !!!next-token;
3805     redo B;
3806     } elsif ({
3807 wakaba 1.48 tbody => 1, tfoot => 1, thead => 1,
3808 wakaba 1.52 }->{$token->{tag_name}} and
3809 wakaba 1.56 $self->{insertion_mode} & ROW_IMS) {
3810 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3811 wakaba 1.52 ## have an element in table scope
3812     my $i;
3813     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3814     my $node = $self->{open_elements}->[$_];
3815     if ($node->[1] eq $token->{tag_name}) {
3816     $i = $_;
3817     last INSCOPE;
3818     } elsif ({
3819     table => 1, html => 1,
3820     }->{$node->[1]}) {
3821     last INSCOPE;
3822     }
3823     } # INSCOPE
3824     unless (defined $i) {
3825     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3826     ## Ignore the token
3827     !!!next-token;
3828     redo B;
3829     }
3830    
3831 wakaba 1.48 ## As if </tr>
3832     ## have an element in table scope
3833     my $i;
3834     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3835     my $node = $self->{open_elements}->[$_];
3836     if ($node->[1] eq 'tr') {
3837     $i = $_;
3838     last INSCOPE;
3839     } elsif ({
3840     table => 1, html => 1,
3841     }->{$node->[1]}) {
3842     last INSCOPE;
3843     }
3844     } # INSCOPE
3845 wakaba 1.52 unless (defined $i) {
3846     !!!parse-error (type => 'unmatched end tag:tr');
3847     ## Ignore the token
3848     !!!next-token;
3849     redo B;
3850     }
3851 wakaba 1.48
3852     ## Clear back to table row context
3853     while (not {
3854     tr => 1, html => 1,
3855     }->{$self->{open_elements}->[-1]->[1]}) {
3856     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3857     pop @{$self->{open_elements}};
3858     }
3859    
3860     pop @{$self->{open_elements}}; # tr
3861 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3862 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
3863     }
3864    
3865     ## have an element in table scope
3866     my $i;
3867     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3868     my $node = $self->{open_elements}->[$_];
3869     if ($node->[1] eq $token->{tag_name}) {
3870     $i = $_;
3871     last INSCOPE;
3872     } elsif ({
3873     table => 1, html => 1,
3874     }->{$node->[1]}) {
3875     last INSCOPE;
3876     }
3877     } # INSCOPE
3878     unless (defined $i) {
3879     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3880     ## Ignore the token
3881     !!!next-token;
3882     redo B;
3883     }
3884    
3885     ## Clear back to table body context
3886     while (not {
3887     tbody => 1, tfoot => 1, thead => 1, html => 1,
3888     }->{$self->{open_elements}->[-1]->[1]}) {
3889     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3890     pop @{$self->{open_elements}};
3891     }
3892    
3893     pop @{$self->{open_elements}};
3894 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3895 wakaba 1.52 !!!next-token;
3896     redo B;
3897     } elsif ({
3898     body => 1, caption => 1, col => 1, colgroup => 1,
3899     html => 1, td => 1, th => 1,
3900 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3901     tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
3902 wakaba 1.52 }->{$token->{tag_name}}) {
3903     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3904     ## Ignore the token
3905     !!!next-token;
3906     redo B;
3907 wakaba 1.58 } else {
3908     !!!parse-error (type => 'in table:/'.$token->{tag_name});
3909 wakaba 1.52
3910 wakaba 1.58 $insert = $insert_to_foster;
3911     #
3912     }
3913     } else {
3914     die "$0: $token->{type}: Unknown token type";
3915     }
3916 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
3917 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
3918 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3919     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3920     unless (length $token->{data}) {
3921     !!!next-token;
3922     redo B;
3923     }
3924     }
3925    
3926     #
3927 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
3928 wakaba 1.52 if ($token->{tag_name} eq 'col') {
3929     !!!insert-element ($token->{tag_name}, $token->{attributes});
3930     pop @{$self->{open_elements}};
3931     !!!next-token;
3932     redo B;
3933     } else {
3934     #
3935     }
3936 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
3937 wakaba 1.52 if ($token->{tag_name} eq 'colgroup') {
3938     if ($self->{open_elements}->[-1]->[1] eq 'html') {
3939     !!!parse-error (type => 'unmatched end tag:colgroup');
3940     ## Ignore the token
3941     !!!next-token;
3942     redo B;
3943     } else {
3944     pop @{$self->{open_elements}}; # colgroup
3945 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3946 wakaba 1.52 !!!next-token;
3947     redo B;
3948     }
3949     } elsif ($token->{tag_name} eq 'col') {
3950     !!!parse-error (type => 'unmatched end tag:col');
3951     ## Ignore the token
3952     !!!next-token;
3953     redo B;
3954     } else {
3955     #
3956     }
3957     } else {
3958     #
3959     }
3960    
3961     ## As if </colgroup>
3962     if ($self->{open_elements}->[-1]->[1] eq 'html') {
3963     !!!parse-error (type => 'unmatched end tag:colgroup');
3964     ## Ignore the token
3965     !!!next-token;
3966     redo B;
3967     } else {
3968     pop @{$self->{open_elements}}; # colgroup
3969 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3970 wakaba 1.52 ## reprocess
3971     redo B;
3972     }
3973 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
3974 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
3975     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3976     !!!next-token;
3977     redo B;
3978     } elsif ($token->{type} == START_TAG_TOKEN) {
3979 wakaba 1.52 if ($token->{tag_name} eq 'option') {
3980     if ($self->{open_elements}->[-1]->[1] eq 'option') {
3981     ## As if </option>
3982     pop @{$self->{open_elements}};
3983     }
3984    
3985     !!!insert-element ($token->{tag_name}, $token->{attributes});
3986     !!!next-token;
3987     redo B;
3988     } elsif ($token->{tag_name} eq 'optgroup') {
3989     if ($self->{open_elements}->[-1]->[1] eq 'option') {
3990     ## As if </option>
3991     pop @{$self->{open_elements}};
3992     }
3993    
3994     if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
3995     ## As if </optgroup>
3996     pop @{$self->{open_elements}};
3997     }
3998    
3999     !!!insert-element ($token->{tag_name}, $token->{attributes});
4000     !!!next-token;
4001     redo B;
4002     } elsif ($token->{tag_name} eq 'select') {
4003     !!!parse-error (type => 'not closed:select');
4004     ## As if </select> instead
4005     ## have an element in table scope
4006     my $i;
4007     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4008     my $node = $self->{open_elements}->[$_];
4009     if ($node->[1] eq $token->{tag_name}) {
4010     $i = $_;
4011     last INSCOPE;
4012     } elsif ({
4013     table => 1, html => 1,
4014     }->{$node->[1]}) {
4015     last INSCOPE;
4016 wakaba 1.47 }
4017 wakaba 1.52 } # INSCOPE
4018     unless (defined $i) {
4019     !!!parse-error (type => 'unmatched end tag:select');
4020     ## Ignore the token
4021     !!!next-token;
4022     redo B;
4023 wakaba 1.47 }
4024 wakaba 1.52
4025     splice @{$self->{open_elements}}, $i;
4026    
4027     $self->_reset_insertion_mode;
4028 wakaba 1.47
4029 wakaba 1.52 !!!next-token;
4030     redo B;
4031 wakaba 1.58 } else {
4032     !!!parse-error (type => 'in select:'.$token->{tag_name});
4033     ## Ignore the token
4034     !!!next-token;
4035     redo B;
4036     }
4037     } elsif ($token->{type} == END_TAG_TOKEN) {
4038 wakaba 1.52 if ($token->{tag_name} eq 'optgroup') {
4039     if ($self->{open_elements}->[-1]->[1] eq 'option' and
4040     $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4041     ## As if </option>
4042     splice @{$self->{open_elements}}, -2;
4043     } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4044     pop @{$self->{open_elements}};
4045     } else {
4046     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4047     ## Ignore the token
4048     }
4049     !!!next-token;
4050     redo B;
4051     } elsif ($token->{tag_name} eq 'option') {
4052     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4053 wakaba 1.47 pop @{$self->{open_elements}};
4054 wakaba 1.52 } else {
4055     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4056     ## Ignore the token
4057 wakaba 1.1 }
4058 wakaba 1.52 !!!next-token;
4059     redo B;
4060     } elsif ($token->{tag_name} eq 'select') {
4061     ## have an element in table scope
4062     my $i;
4063     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4064     my $node = $self->{open_elements}->[$_];
4065     if ($node->[1] eq $token->{tag_name}) {
4066     $i = $_;
4067     last INSCOPE;
4068     } elsif ({
4069     table => 1, html => 1,
4070     }->{$node->[1]}) {
4071     last INSCOPE;
4072 wakaba 1.48 }
4073 wakaba 1.52 } # INSCOPE
4074     unless (defined $i) {
4075     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4076     ## Ignore the token
4077     !!!next-token;
4078 wakaba 1.48 redo B;
4079 wakaba 1.52 }
4080    
4081     splice @{$self->{open_elements}}, $i;
4082    
4083     $self->_reset_insertion_mode;
4084    
4085     !!!next-token;
4086     redo B;
4087     } elsif ({
4088     caption => 1, table => 1, tbody => 1,
4089     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4090     }->{$token->{tag_name}}) {
4091     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4092    
4093     ## have an element in table scope
4094     my $i;
4095     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4096     my $node = $self->{open_elements}->[$_];
4097     if ($node->[1] eq $token->{tag_name}) {
4098     $i = $_;
4099     last INSCOPE;
4100     } elsif ({
4101     table => 1, html => 1,
4102     }->{$node->[1]}) {
4103     last INSCOPE;
4104 wakaba 1.1 }
4105 wakaba 1.52 } # INSCOPE
4106     unless (defined $i) {
4107     ## Ignore the token
4108 wakaba 1.1 !!!next-token;
4109     redo B;
4110     }
4111 wakaba 1.52
4112     ## As if </select>
4113     ## have an element in table scope
4114     undef $i;
4115 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4116     my $node = $self->{open_elements}->[$_];
4117 wakaba 1.52 if ($node->[1] eq 'select') {
4118 wakaba 1.1 $i = $_;
4119     last INSCOPE;
4120     } elsif ({
4121     table => 1, html => 1,
4122 wakaba 1.52 }->{$node->[1]}) {
4123     last INSCOPE;
4124     }
4125     } # INSCOPE
4126     unless (defined $i) {
4127     !!!parse-error (type => 'unmatched end tag:select');
4128     ## Ignore the </select> token
4129     !!!next-token; ## TODO: ok?
4130     redo B;
4131     }
4132    
4133     splice @{$self->{open_elements}}, $i;
4134    
4135     $self->_reset_insertion_mode;
4136    
4137     ## reprocess
4138     redo B;
4139 wakaba 1.58 } else {
4140     !!!parse-error (type => 'in select:/'.$token->{tag_name});
4141 wakaba 1.52 ## Ignore the token
4142     !!!next-token;
4143     redo B;
4144 wakaba 1.58 }
4145     } else {
4146     die "$0: $token->{type}: Unknown token type";
4147     }
4148 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4149 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4150 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4151     my $data = $1; ## keep a copy: the closure called below may perform its own matches
4152     ## As if in body
4153     $reconstruct_active_formatting_elements->($insert_to_current);
4154    
4155     $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
4156    
4157     unless (length $token->{data}) {
4158     !!!next-token;
4159     redo B;
4160     }
4161     }
4162    
4163 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4164 wakaba 1.52 !!!parse-error (type => 'after html:#character');
4165    
4166     ## Reprocess in the "main" phase, "after body" insertion mode...
4167     }
4168    
4169     ## "after body" insertion mode
4170     !!!parse-error (type => 'after body:#character');
4171    
4172 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4173 wakaba 1.52 ## reprocess
4174     redo B;
4175 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4176 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4177 wakaba 1.52 !!!parse-error (type => 'after html:'.$token->{tag_name});
4178    
4179     ## Reprocess in the "main" phase, "after body" insertion mode...
4180     }
4181    
4182     ## "after body" insertion mode
4183     !!!parse-error (type => 'after body:'.$token->{tag_name});
4184    
4185 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4186 wakaba 1.52 ## reprocess
4187     redo B;
4188 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4189 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4190 wakaba 1.52 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4191    
4192 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
4193 wakaba 1.52 ## Reprocess in the "main" phase, "after body" insertion mode...
4194     }
4195    
4196     ## "after body" insertion mode
4197     if ($token->{tag_name} eq 'html') {
4198     if (defined $self->{inner_html_node}) {
4199     !!!parse-error (type => 'unmatched end tag:html');
4200     ## Ignore the token
4201     !!!next-token;
4202     redo B;
4203     } else {
4204 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
4205 wakaba 1.52 !!!next-token;
4206     redo B;
4207     }
4208     } else {
4209     !!!parse-error (type => 'after body:/'.$token->{tag_name});
4210    
4211 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4212 wakaba 1.52 ## reprocess
4213     redo B;
4214     }
4215     } else {
4216     die "$0: $token->{type}: Unknown token type";
4217     }
4218 wakaba 1.56 } elsif ($self->{insertion_mode} & FRAME_IMS) {
4219 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4220 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4221     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4222    
4223     unless (length $token->{data}) {
4224     !!!next-token;
4225     redo B;
4226     }
4227     }
4228    
4229     if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4230 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4231 wakaba 1.52 !!!parse-error (type => 'in frameset:#character');
4232 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
4233 wakaba 1.52 !!!parse-error (type => 'after frameset:#character');
4234     } else { # "after html frameset"
4235     !!!parse-error (type => 'after html:#character');
4236    
4237 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4238 wakaba 1.52 ## Reprocess in the "main" phase, "after frameset"...
4239     !!!parse-error (type => 'after frameset:#character');
4240     }
4241    
4242     ## Ignore the token.
4243     if (length $token->{data}) {
4244     ## reprocess the rest of characters
4245     } else {
4246     !!!next-token;
4247     }
4248     redo B;
4249     }
4250    
4251     die qq[$0: Character "$token->{data}"];
4252 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4253 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4254 wakaba 1.52 !!!parse-error (type => 'after html:'.$token->{tag_name});
4255 wakaba 1.1
4256 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4257 wakaba 1.52 ## Process in the "main" phase, "after frameset" insertion mode...
4258     }
4259 wakaba 1.1
4260 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
4261 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4262 wakaba 1.52 !!!insert-element ($token->{tag_name}, $token->{attributes});
4263     !!!next-token;
4264     redo B;
4265     } elsif ($token->{tag_name} eq 'frame' and
4266 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4267 wakaba 1.52 !!!insert-element ($token->{tag_name}, $token->{attributes});
4268     pop @{$self->{open_elements}};
4269     !!!next-token;
4270     redo B;
4271     } elsif ($token->{tag_name} eq 'noframes') {
4272     ## NOTE: As if in body.
4273     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4274     redo B;
4275     } else {
4276 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4277 wakaba 1.52 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4278     } else {
4279     !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4280     }
4281     ## Ignore the token
4282     !!!next-token;
4283     redo B;
4284     }
4285 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4286 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4287 wakaba 1.52 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4288 wakaba 1.1
4289 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4290 wakaba 1.52 ## Process in the "main" phase, "after frameset" insertion mode...
4291     }
4292 wakaba 1.1
4293 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
4294 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4295 wakaba 1.52 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4296     @{$self->{open_elements}} == 1) {
4297     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4298     ## Ignore the token
4299     !!!next-token;
4300     } else {
4301     pop @{$self->{open_elements}};
4302     !!!next-token;
4303     }
4304 wakaba 1.47
4305 wakaba 1.52 if (not defined $self->{inner_html_node} and
4306     $self->{open_elements}->[-1]->[1] ne 'frameset') {
4307 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4308 wakaba 1.52 }
4309     redo B;
4310     } elsif ($token->{tag_name} eq 'html' and
4311 wakaba 1.54 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
4312     $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
4313 wakaba 1.52 !!!next-token;
4314     redo B;
4315     } else {
4316 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4317 wakaba 1.52 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4318     } else {
4319     !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4320     }
4321     ## Ignore the token
4322     !!!next-token;
4323     redo B;
4324     }
4325     } else {
4326     die "$0: $token->{type}: Unknown token type";
4327     }
4328 wakaba 1.47
4329 wakaba 1.52 ## ISSUE: An issue in spec here
4330     } else {
4331     die "$0: $self->{insertion_mode}: Unknown insertion mode";
4332     }
4333 wakaba 1.47
4334 wakaba 1.52 ## "in body" insertion mode
4335 wakaba 1.55 if ($token->{type} == START_TAG_TOKEN) {
4336 wakaba 1.52 if ($token->{tag_name} eq 'script') {
4337     ## NOTE: This is an "as if in head" code clone
4338     $script_start_tag->($insert);
4339 wakaba 1.53 redo B;
4340 wakaba 1.52 } elsif ($token->{tag_name} eq 'style') {
4341     ## NOTE: This is an "as if in head" code clone
4342     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4343 wakaba 1.53 redo B;
4344 wakaba 1.52 } elsif ({
4345     base => 1, link => 1,
4346     }->{$token->{tag_name}}) {
4347     ## NOTE: This is an "as if in head" code clone, only "-t" differs
4348     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4349     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4350     !!!next-token;
4351 wakaba 1.53 redo B;
4352 wakaba 1.52 } elsif ($token->{tag_name} eq 'meta') {
4353     ## NOTE: This is an "as if in head" code clone, only "-t" differs
4354     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4355     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4356 wakaba 1.46
4357 wakaba 1.52 unless ($self->{confident}) {
4358     my $charset;
4359     if ($token->{attributes}->{charset}) { ## TODO: And if supported
4360     $charset = $token->{attributes}->{charset}->{value};
4361     }
4362     if ($token->{attributes}->{'http-equiv'}) {
4363     ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
4364     if ($token->{attributes}->{'http-equiv'}->{value}
4365     =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
4366     [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4367     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4368     $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
4369     } ## TODO: And if supported
4370     }
4371     ## TODO: Change the encoding
4372     }
4373 wakaba 1.1
4374 wakaba 1.52 !!!next-token;
4375 wakaba 1.53 redo B;
4376 wakaba 1.52 } elsif ($token->{tag_name} eq 'title') {
4377     !!!parse-error (type => 'in body:title');
4378     ## NOTE: This is an "as if in head" code clone
4379     $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4380     if (defined $self->{head_element}) {
4381     $self->{head_element}->append_child ($_[0]);
4382     } else {
4383     $insert->($_[0]);
4384     }
4385     });
4386 wakaba 1.53 redo B;
4387 wakaba 1.52 } elsif ($token->{tag_name} eq 'body') {
4388     !!!parse-error (type => 'in body:body');
4389 wakaba 1.46
4390 wakaba 1.52 if (@{$self->{open_elements}} == 1 or
4391     $self->{open_elements}->[1]->[1] ne 'body') {
4392     ## Ignore the token
4393     } else {
4394     my $body_el = $self->{open_elements}->[1]->[0];
4395     for my $attr_name (keys %{$token->{attributes}}) {
4396     unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4397     $body_el->set_attribute_ns
4398     (undef, [undef, $attr_name],
4399     $token->{attributes}->{$attr_name}->{value});
4400     }
4401     }
4402     }
4403     !!!next-token;
4404 wakaba 1.53 redo B;
4405 wakaba 1.52 } elsif ({
4406     address => 1, blockquote => 1, center => 1, dir => 1,
4407     div => 1, dl => 1, fieldset => 1, listing => 1,
4408     menu => 1, ol => 1, p => 1, ul => 1,
4409     pre => 1,
4410     }->{$token->{tag_name}}) {
4411     ## has a p element in scope
4412     INSCOPE: for (reverse @{$self->{open_elements}}) {
4413     if ($_->[1] eq 'p') {
4414     !!!back-token;
4415 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4416 wakaba 1.53 redo B;
4417 wakaba 1.52 } elsif ({
4418     table => 1, caption => 1, td => 1, th => 1,
4419     button => 1, marquee => 1, object => 1, html => 1,
4420     }->{$_->[1]}) {
4421     last INSCOPE;
4422     }
4423     } # INSCOPE
4424    
4425     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4426     if ($token->{tag_name} eq 'pre') {
4427     !!!next-token;
4428 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4429 wakaba 1.52 $token->{data} =~ s/^\x0A//;
4430     unless (length $token->{data}) {
4431 wakaba 1.1 !!!next-token;
4432 wakaba 1.52 }
4433     }
4434     } else {
4435     !!!next-token;
4436     }
4437 wakaba 1.53 redo B;
4438 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
4439     if (defined $self->{form_element}) {
4440     !!!parse-error (type => 'in form:form');
4441     ## Ignore the token
4442     !!!next-token;
4443 wakaba 1.53 redo B;
4444 wakaba 1.52 } else {
4445     ## has a p element in scope
4446     INSCOPE: for (reverse @{$self->{open_elements}}) {
4447     if ($_->[1] eq 'p') {
4448     !!!back-token;
4449 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4450 wakaba 1.53 redo B;
4451 wakaba 1.46 } elsif ({
4452 wakaba 1.52 table => 1, caption => 1, td => 1, th => 1,
4453     button => 1, marquee => 1, object => 1, html => 1,
4454     }->{$_->[1]}) {
4455     last INSCOPE;
4456     }
4457     } # INSCOPE
4458    
4459     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4460     $self->{form_element} = $self->{open_elements}->[-1]->[0];
4461     !!!next-token;
4462 wakaba 1.53 redo B;
4463 wakaba 1.52 }
4464     } elsif ($token->{tag_name} eq 'li') {
4465     ## has a p element in scope
4466     INSCOPE: for (reverse @{$self->{open_elements}}) {
4467     if ($_->[1] eq 'p') {
4468     !!!back-token;
4469 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4470 wakaba 1.53 redo B;
4471 wakaba 1.52 } elsif ({
4472     table => 1, caption => 1, td => 1, th => 1,
4473     button => 1, marquee => 1, object => 1, html => 1,
4474     }->{$_->[1]}) {
4475     last INSCOPE;
4476     }
4477     } # INSCOPE
4478    
4479     ## Step 1
4480     my $i = -1;
4481     my $node = $self->{open_elements}->[$i];
4482     LI: {
4483     ## Step 2
4484     if ($node->[1] eq 'li') {
4485     if ($i != -1) {
4486     !!!parse-error (type => 'end tag missing:'.
4487     $self->{open_elements}->[-1]->[1]);
4488     }
4489     splice @{$self->{open_elements}}, $i;
4490     last LI;
4491     }
4492    
4493     ## Step 3
4494     if (not $formatting_category->{$node->[1]} and
4495     #not $phrasing_category->{$node->[1]} and
4496     ($special_category->{$node->[1]} or
4497     $scoping_category->{$node->[1]}) and
4498     $node->[1] ne 'address' and $node->[1] ne 'div') {
4499     last LI;
4500     }
4501    
4502     ## Step 4
4503     $i--;
4504     $node = $self->{open_elements}->[$i];
4505     redo LI;
4506     } # LI
4507    
4508     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4509     !!!next-token;
4510 wakaba 1.53 redo B;
4511 wakaba 1.52 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
4512     ## has a p element in scope
4513     INSCOPE: for (reverse @{$self->{open_elements}}) {
4514     if ($_->[1] eq 'p') {
4515     !!!back-token;
4516 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4517 wakaba 1.53 redo B;
4518 wakaba 1.52 } elsif ({
4519     table => 1, caption => 1, td => 1, th => 1,
4520     button => 1, marquee => 1, object => 1, html => 1,
4521     }->{$_->[1]}) {
4522     last INSCOPE;
4523     }
4524     } # INSCOPE
4525    
4526     ## Step 1
4527     my $i = -1;
4528     my $node = $self->{open_elements}->[$i];
4529     LI: {
4530     ## Step 2
4531     if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
4532     if ($i != -1) {
4533     !!!parse-error (type => 'end tag missing:'.
4534     $self->{open_elements}->[-1]->[1]);
4535 wakaba 1.1 }
4536 wakaba 1.52 splice @{$self->{open_elements}}, $i;
4537     last LI;
4538     }
4539    
4540     ## Step 3
4541     if (not $formatting_category->{$node->[1]} and
4542     #not $phrasing_category->{$node->[1]} and
4543     ($special_category->{$node->[1]} or
4544     $scoping_category->{$node->[1]}) and
4545     $node->[1] ne 'address' and $node->[1] ne 'div') {
4546     last LI;
4547 wakaba 1.1 }
4548 wakaba 1.52
4549     ## Step 4
4550     $i--;
4551     $node = $self->{open_elements}->[$i];
4552     redo LI;
4553     } # LI
4554    
4555     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4556     !!!next-token;
4557 wakaba 1.53 redo B;
4558 wakaba 1.52 } elsif ($token->{tag_name} eq 'plaintext') {
4559     ## has a p element in scope
4560     INSCOPE: for (reverse @{$self->{open_elements}}) {
4561     if ($_->[1] eq 'p') {
4562     !!!back-token;
4563 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4564 wakaba 1.53 redo B;
4565 wakaba 1.52 } elsif ({
4566     table => 1, caption => 1, td => 1, th => 1,
4567     button => 1, marquee => 1, object => 1, html => 1,
4568     }->{$_->[1]}) {
4569     last INSCOPE;
4570 wakaba 1.46 }
4571 wakaba 1.52 } # INSCOPE
4572    
4573     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4574    
4575     $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
4576    
4577     !!!next-token;
4578 wakaba 1.53 redo B;
4579 wakaba 1.52 } elsif ({
4580     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4581     }->{$token->{tag_name}}) {
4582     ## has a p element in scope
4583     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4584     my $node = $self->{open_elements}->[$_];
4585     if ($node->[1] eq 'p') {
4586     !!!back-token;
4587 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4588 wakaba 1.53 redo B;
4589 wakaba 1.52 } elsif ({
4590     table => 1, caption => 1, td => 1, th => 1,
4591     button => 1, marquee => 1, object => 1, html => 1,
4592     }->{$node->[1]}) {
4593     last INSCOPE;
4594 wakaba 1.46 }
4595 wakaba 1.52 } # INSCOPE
4596    
4597     ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
4598     ## has an element in scope
4599     #my $i;
4600     #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4601     # my $node = $self->{open_elements}->[$_];
4602     # if ({
4603     # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4604     # }->{$node->[1]}) {
4605     # $i = $_;
4606     # last INSCOPE;
4607     # } elsif ({
4608     # table => 1, caption => 1, td => 1, th => 1,
4609     # button => 1, marquee => 1, object => 1, html => 1,
4610     # }->{$node->[1]}) {
4611     # last INSCOPE;
4612     # }
4613     #} # INSCOPE
4614     #
4615     #if (defined $i) {
4616     # !!! parse-error (type => 'in hn:hn');
4617     # splice @{$self->{open_elements}}, $i;
4618     #}
4619    
4620     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4621    
4622     !!!next-token;
4623 wakaba 1.53 redo B;
4624 wakaba 1.52 } elsif ($token->{tag_name} eq 'a') {
4625     AFE: for my $i (reverse 0..$#$active_formatting_elements) {
4626     my $node = $active_formatting_elements->[$i];
4627     if ($node->[1] eq 'a') {
4628     !!!parse-error (type => 'in a:a');
4629    
4630     !!!back-token;
4631 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
4632 wakaba 1.52 $formatting_end_tag->($token->{tag_name});
4633    
4634     AFE2: for (reverse 0..$#$active_formatting_elements) {
4635     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4636     splice @$active_formatting_elements, $_, 1;
4637     last AFE2;
4638 wakaba 1.1 }
4639 wakaba 1.52 } # AFE2
4640     OE: for (reverse 0..$#{$self->{open_elements}}) {
4641     if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
4642     splice @{$self->{open_elements}}, $_, 1;
4643     last OE;
4644 wakaba 1.1 }
4645 wakaba 1.52 } # OE
4646     last AFE;
4647     } elsif ($node->[0] eq '#marker') {
4648     last AFE;
4649     }
4650     } # AFE
4651    
4652     $reconstruct_active_formatting_elements->($insert_to_current);
4653 wakaba 1.1
4654 wakaba 1.52 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4655     push @$active_formatting_elements, $self->{open_elements}->[-1];
4656 wakaba 1.1
4657 wakaba 1.52 !!!next-token;
4658 wakaba 1.53 redo B;
4659 wakaba 1.52 } elsif ({ ## start tags whose element is pushed onto @$active_formatting_elements
4660     b => 1, big => 1, em => 1, font => 1, i => 1,
4661     s => 1, small => 1, strike => 1,
4662     strong => 1, tt => 1, u => 1,
4663     }->{$token->{tag_name}}) {
4664     $reconstruct_active_formatting_elements->($insert_to_current);
4665    
4666     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4667     push @$active_formatting_elements, $self->{open_elements}->[-1];
4668    
4669     !!!next-token;
4670 wakaba 1.53 redo B;
4671 wakaba 1.52 } elsif ($token->{tag_name} eq 'nobr') {
4672     $reconstruct_active_formatting_elements->($insert_to_current);
4673 wakaba 1.1
4674 wakaba 1.52 ## has a |nobr| element in scope
4675     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4676     my $node = $self->{open_elements}->[$_];
4677     if ($node->[1] eq 'nobr') {
4678 wakaba 1.58 !!!parse-error (type => 'in nobr:nobr');
4679 wakaba 1.52 !!!back-token;
4680 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
4681 wakaba 1.53 redo B;
4682 wakaba 1.52 } elsif ({
4683     table => 1, caption => 1, td => 1, th => 1,
4684     button => 1, marquee => 1, object => 1, html => 1,
4685     }->{$node->[1]}) {
4686     last INSCOPE;
4687     }
4688     } # INSCOPE
4689    
4690     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4691     push @$active_formatting_elements, $self->{open_elements}->[-1];
4692    
4693     !!!next-token;
4694 wakaba 1.53 redo B;
4695 wakaba 1.52 } elsif ($token->{tag_name} eq 'button') {
4696     ## has a button element in scope
4697     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4698     my $node = $self->{open_elements}->[$_];
4699     if ($node->[1] eq 'button') {
4700     !!!parse-error (type => 'in button:button');
4701     !!!back-token;
4702 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
4703 wakaba 1.53 redo B;
4704 wakaba 1.52 } elsif ({
4705     table => 1, caption => 1, td => 1, th => 1,
4706     button => 1, marquee => 1, object => 1, html => 1,
4707     }->{$node->[1]}) {
4708     last INSCOPE;
4709     }
4710     } # INSCOPE
4711    
4712     $reconstruct_active_formatting_elements->($insert_to_current);
4713    
4714     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4715     push @$active_formatting_elements, ['#marker', ''];
4716 wakaba 1.1
4717 wakaba 1.52 !!!next-token;
4718 wakaba 1.53 redo B;
4719 wakaba 1.52 } elsif ($token->{tag_name} eq 'marquee' or
4720     $token->{tag_name} eq 'object') {
4721     $reconstruct_active_formatting_elements->($insert_to_current);
4722    
4723     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4724     push @$active_formatting_elements, ['#marker', ''];
4725    
4726     !!!next-token;
4727 wakaba 1.53 redo B;
4728 wakaba 1.52 } elsif ($token->{tag_name} eq 'xmp') {
4729     $reconstruct_active_formatting_elements->($insert_to_current);
4730     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4731 wakaba 1.53 redo B;
4732 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
4733     ## has a p element in scope
4734     INSCOPE: for (reverse @{$self->{open_elements}}) {
4735     if ($_->[1] eq 'p') {
4736     !!!back-token;
4737 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4738 wakaba 1.53 redo B;
4739 wakaba 1.52 } elsif ({
4740     table => 1, caption => 1, td => 1, th => 1,
4741     button => 1, marquee => 1, object => 1, html => 1,
4742     }->{$_->[1]}) {
4743     last INSCOPE;
4744     }
4745     } # INSCOPE
4746    
4747     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4748    
4749 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4750 wakaba 1.52
4751     !!!next-token;
4752 wakaba 1.53 redo B;
4753 wakaba 1.52 } elsif ({
4754     area => 1, basefont => 1, bgsound => 1, br => 1,
4755     embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
4756     image => 1,
4757     }->{$token->{tag_name}}) {
4758     if ($token->{tag_name} eq 'image') {
4759     !!!parse-error (type => 'image');
4760     $token->{tag_name} = 'img';
4761     }
4762 wakaba 1.1
4763 wakaba 1.52 ## NOTE: There is an "as if <br>" code clone.
4764     $reconstruct_active_formatting_elements->($insert_to_current);
4765    
4766     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4767     pop @{$self->{open_elements}};
4768    
4769     !!!next-token;
4770 wakaba 1.53 redo B;
4771 wakaba 1.52 } elsif ($token->{tag_name} eq 'hr') {
4772     ## has a p element in scope
4773     INSCOPE: for (reverse @{$self->{open_elements}}) {
4774     if ($_->[1] eq 'p') {
4775     !!!back-token;
4776 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4777 wakaba 1.53 redo B;
4778 wakaba 1.52 } elsif ({
4779     table => 1, caption => 1, td => 1, th => 1,
4780     button => 1, marquee => 1, object => 1, html => 1,
4781     }->{$_->[1]}) {
4782     last INSCOPE;
4783     }
4784     } # INSCOPE
4785    
4786     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4787     pop @{$self->{open_elements}};
4788    
4789     !!!next-token;
4790 wakaba 1.53 redo B;
4791 wakaba 1.52 } elsif ($token->{tag_name} eq 'input') {
4792     $reconstruct_active_formatting_elements->($insert_to_current);
4793    
4794     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4795     ## TODO: associate with $self->{form_element} if defined
4796     pop @{$self->{open_elements}};
4797    
4798     !!!next-token;
4799 wakaba 1.53 redo B;
4800 wakaba 1.52 } elsif ($token->{tag_name} eq 'isindex') {
4801     !!!parse-error (type => 'isindex');
4802    
4803     if (defined $self->{form_element}) {
4804     ## Ignore the token
4805     !!!next-token;
4806 wakaba 1.53 redo B;
4807 wakaba 1.52 } else {
4808     my $at = $token->{attributes};
4809     my $form_attrs;
4810     $form_attrs->{action} = $at->{action} if $at->{action};
4811     my $prompt_attr = $at->{prompt};
4812     $at->{name} = {name => 'name', value => 'isindex'};
4813     delete $at->{action};
4814     delete $at->{prompt};
4815     my @tokens = (
4816 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'form',
4817 wakaba 1.52 attributes => $form_attrs},
4818 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'hr'},
4819     {type => START_TAG_TOKEN, tag_name => 'p'},
4820     {type => START_TAG_TOKEN, tag_name => 'label'},
4821 wakaba 1.52 );
4822     if ($prompt_attr) {
4823 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
4824 wakaba 1.1 } else {
4825 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN,
4826 wakaba 1.52 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
4827     ## TODO: make this configurable
4828 wakaba 1.1 }
4829 wakaba 1.52 push @tokens,
4830 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
4831     #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
4832     {type => END_TAG_TOKEN, tag_name => 'label'},
4833     {type => END_TAG_TOKEN, tag_name => 'p'},
4834     {type => START_TAG_TOKEN, tag_name => 'hr'},
4835     {type => END_TAG_TOKEN, tag_name => 'form'};
4836 wakaba 1.52 $token = shift @tokens;
4837     !!!back-token (@tokens);
4838 wakaba 1.53 redo B;
4839 wakaba 1.52 }
4840     } elsif ($token->{tag_name} eq 'textarea') {
4841     my $tag_name = $token->{tag_name};
4842     my $el;
4843     !!!create-element ($el, $token->{tag_name}, $token->{attributes});
4844    
4845     ## TODO: $self->{form_element} if defined
4846     $self->{content_model} = RCDATA_CONTENT_MODEL;
4847     delete $self->{escape}; # MUST
4848    
4849     $insert->($el);
4850    
4851     my $text = '';
4852     !!!next-token;
4853 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4854 wakaba 1.52 $token->{data} =~ s/^\x0A//;
4855 wakaba 1.51 unless (length $token->{data}) {
4856     !!!next-token;
4857     }
4858     }
4859 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
4860 wakaba 1.52 $text .= $token->{data};
4861     !!!next-token;
4862     }
4863     if (length $text) {
4864     $el->manakai_append_text ($text);
4865     }
4866    
4867     $self->{content_model} = PCDATA_CONTENT_MODEL;
4868 wakaba 1.51
4869 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
4870 wakaba 1.52 $token->{tag_name} eq $tag_name) {
4871     ## Ignore the token
4872     } else {
4873     !!!parse-error (type => 'in RCDATA:#'.$token->{type});
4874 wakaba 1.51 }
4875 wakaba 1.52 !!!next-token;
4876 wakaba 1.53 redo B;
4877 wakaba 1.52 } elsif ({
4878     iframe => 1,
4879     noembed => 1,
4880     noframes => 1,
4881     noscript => 0, ## TODO: 1 if scripting is enabled
4882     }->{$token->{tag_name}}) {
4883 wakaba 1.58 ## NOTE: There is an "as if in body" code clone.
4884 wakaba 1.52 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4885 wakaba 1.53 redo B;
4886 wakaba 1.52 } elsif ($token->{tag_name} eq 'select') {
4887     $reconstruct_active_formatting_elements->($insert_to_current);
4888    
4889     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4890    
4891 wakaba 1.54 $self->{insertion_mode} = IN_SELECT_IM;
4892 wakaba 1.52 !!!next-token;
4893 wakaba 1.53 redo B;
4894 wakaba 1.52 } elsif ({
4895     caption => 1, col => 1, colgroup => 1, frame => 1,
4896     frameset => 1, head => 1, option => 1, optgroup => 1,
4897     tbody => 1, td => 1, tfoot => 1, th => 1,
4898     thead => 1, tr => 1,
4899     }->{$token->{tag_name}}) {
4900     !!!parse-error (type => 'in body:'.$token->{tag_name});
4901     ## Ignore the token
4902     !!!next-token;
4903 wakaba 1.53 redo B;
4904 wakaba 1.52
4905     ## ISSUE: An issue on HTML5 new elements in the spec.
4906     } else {
4907     $reconstruct_active_formatting_elements->($insert_to_current);
4908    
4909     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4910 wakaba 1.51
4911 wakaba 1.52 !!!next-token;
4912 wakaba 1.53 redo B;
4913 wakaba 1.52 }
4914 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4915 wakaba 1.52 if ($token->{tag_name} eq 'body') {
4916     if (@{$self->{open_elements}} > 1 and
4917     $self->{open_elements}->[1]->[1] eq 'body') {
4918     for (@{$self->{open_elements}}) {
4919     unless ({
4920     dd => 1, dt => 1, li => 1, p => 1, td => 1,
4921     th => 1, tr => 1, body => 1, html => 1,
4922     tbody => 1, tfoot => 1, thead => 1,
4923     }->{$_->[1]}) {
4924     !!!parse-error (type => 'not closed:'.$_->[1]);
4925     }
4926     }
4927 wakaba 1.51
4928 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
4929 wakaba 1.52 !!!next-token;
4930 wakaba 1.53 redo B;
4931 wakaba 1.52 } else {
4932     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4933     ## Ignore the token
4934     !!!next-token;
4935 wakaba 1.53 redo B;
4936 wakaba 1.51 }
4937 wakaba 1.52 } elsif ($token->{tag_name} eq 'html') {
4938     if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
4939     ## ISSUE: There is an issue in the spec.
4940     if ($self->{open_elements}->[-1]->[1] ne 'body') {
4941     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
4942 wakaba 1.1 }
4943 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
4944 wakaba 1.52 ## reprocess
4945 wakaba 1.53 redo B;
4946 wakaba 1.51 } else {
4947 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4948     ## Ignore the token
4949     !!!next-token;
4950 wakaba 1.53 redo B;
4951 wakaba 1.51 }
4952 wakaba 1.52 } elsif ({
4953     address => 1, blockquote => 1, center => 1, dir => 1,
4954     div => 1, dl => 1, fieldset => 1, listing => 1,
4955     menu => 1, ol => 1, pre => 1, ul => 1,
4956     p => 1,
4957     dd => 1, dt => 1, li => 1,
4958     button => 1, marquee => 1, object => 1,
4959     }->{$token->{tag_name}}) {
4960     ## has an element in scope
4961     my $i;
4962     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4963     my $node = $self->{open_elements}->[$_];
4964     if ($node->[1] eq $token->{tag_name}) {
4965     ## generate implied end tags
4966     if ({
4967     dd => ($token->{tag_name} ne 'dd'),
4968     dt => ($token->{tag_name} ne 'dt'),
4969     li => ($token->{tag_name} ne 'li'),
4970     p => ($token->{tag_name} ne 'p'),
4971     td => 1, th => 1, tr => 1,
4972     tbody => 1, tfoot=> 1, thead => 1,
4973     }->{$self->{open_elements}->[-1]->[1]}) {
4974     !!!back-token;
4975 wakaba 1.55 $token = {type => END_TAG_TOKEN,
4976 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4977 wakaba 1.53 redo B;
4978 wakaba 1.52 }
4979     $i = $_;
4980     last INSCOPE unless $token->{tag_name} eq 'p';
4981     } elsif ({
4982     table => 1, caption => 1, td => 1, th => 1,
4983     button => 1, marquee => 1, object => 1, html => 1,
4984     }->{$node->[1]}) {
4985     last INSCOPE;
4986 wakaba 1.51 }
4987 wakaba 1.52 } # INSCOPE
4988    
4989     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4990     if (defined $i) {
4991     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4992 wakaba 1.51 } else {
4993 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4994 wakaba 1.51 }
4995     }
4996    
4997 wakaba 1.52 if (defined $i) {
4998     splice @{$self->{open_elements}}, $i;
4999     } elsif ($token->{tag_name} eq 'p') {
5000     ## As if <p>, then reprocess the current token
5001     my $el;
5002     !!!create-element ($el, 'p');
5003     $insert->($el);
5004 wakaba 1.51 }
5005 wakaba 1.52 $clear_up_to_marker->()
5006     if {
5007     button => 1, marquee => 1, object => 1,
5008     }->{$token->{tag_name}};
5009     !!!next-token;
5010 wakaba 1.53 redo B;
5011 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
5012     ## has an element in scope
5013     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5014     my $node = $self->{open_elements}->[$_];
5015     if ($node->[1] eq $token->{tag_name}) {
5016     ## generate implied end tags
5017     if ({
5018     dd => 1, dt => 1, li => 1, p => 1,
5019     td => 1, th => 1, tr => 1,
5020     tbody => 1, tfoot=> 1, thead => 1,
5021     }->{$self->{open_elements}->[-1]->[1]}) {
5022     !!!back-token;
5023 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5024 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5025 wakaba 1.53 redo B;
5026 wakaba 1.52 }
5027     last INSCOPE;
5028     } elsif ({
5029     table => 1, caption => 1, td => 1, th => 1,
5030     button => 1, marquee => 1, object => 1, html => 1,
5031     }->{$node->[1]}) {
5032     last INSCOPE;
5033     }
5034     } # INSCOPE
5035    
5036     if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5037 wakaba 1.36 pop @{$self->{open_elements}};
5038     } else {
5039 wakaba 1.58 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5040 wakaba 1.52 }
5041    
5042     undef $self->{form_element};
5043     !!!next-token;
5044 wakaba 1.53 redo B;
5045 wakaba 1.52 } elsif ({
5046     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5047     }->{$token->{tag_name}}) {
5048     ## has an element in scope
5049     my $i;
5050     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5051     my $node = $self->{open_elements}->[$_];
5052     if ({
5053     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5054     }->{$node->[1]}) {
5055     ## generate implied end tags
5056     if ({
5057     dd => 1, dt => 1, li => 1, p => 1,
5058     td => 1, th => 1, tr => 1,
5059     tbody => 1, tfoot=> 1, thead => 1,
5060     }->{$self->{open_elements}->[-1]->[1]}) {
5061     !!!back-token;
5062 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5063 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5064 wakaba 1.53 redo B;
5065 wakaba 1.52 }
5066     $i = $_;
5067     last INSCOPE;
5068     } elsif ({
5069     table => 1, caption => 1, td => 1, th => 1,
5070     button => 1, marquee => 1, object => 1, html => 1,
5071     }->{$node->[1]}) {
5072     last INSCOPE;
5073 wakaba 1.51 }
5074 wakaba 1.52 } # INSCOPE
5075    
5076     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5077 wakaba 1.58 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5078 wakaba 1.36 }
5079 wakaba 1.52
5080     splice @{$self->{open_elements}}, $i if defined $i;
5081     !!!next-token;
5082 wakaba 1.53 redo B;
5083 wakaba 1.52 } elsif ({
5084     a => 1,
5085     b => 1, big => 1, em => 1, font => 1, i => 1,
5086     nobr => 1, s => 1, small => 1, strile => 1,
5087     strong => 1, tt => 1, u => 1,
5088     }->{$token->{tag_name}}) {
5089     $formatting_end_tag->($token->{tag_name});
5090 wakaba 1.53 redo B;
5091 wakaba 1.52 } elsif ($token->{tag_name} eq 'br') {
5092     !!!parse-error (type => 'unmatched end tag:br');
5093    
5094     ## As if <br>
5095     $reconstruct_active_formatting_elements->($insert_to_current);
5096    
5097     my $el;
5098     !!!create-element ($el, 'br');
5099     $insert->($el);
5100    
5101     ## Ignore the token.
5102     !!!next-token;
5103 wakaba 1.53 redo B;
5104 wakaba 1.52 } elsif ({
5105     caption => 1, col => 1, colgroup => 1, frame => 1,
5106     frameset => 1, head => 1, option => 1, optgroup => 1,
5107     tbody => 1, td => 1, tfoot => 1, th => 1,
5108     thead => 1, tr => 1,
5109     area => 1, basefont => 1, bgsound => 1,
5110     embed => 1, hr => 1, iframe => 1, image => 1,
5111     img => 1, input => 1, isindex => 1, noembed => 1,
5112     noframes => 1, param => 1, select => 1, spacer => 1,
5113     table => 1, textarea => 1, wbr => 1,
5114     noscript => 0, ## TODO: if scripting is enabled
5115     }->{$token->{tag_name}}) {
5116     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5117     ## Ignore the token
5118     !!!next-token;
5119 wakaba 1.53 redo B;
5120 wakaba 1.52
5121     ## ISSUE: Issue on HTML5 new elements in spec
5122    
5123     } else {
5124     ## Step 1
5125     my $node_i = -1;
5126     my $node = $self->{open_elements}->[$node_i];
5127 wakaba 1.51
5128 wakaba 1.52 ## Step 2
5129     S2: {
5130     if ($node->[1] eq $token->{tag_name}) {
5131     ## Step 1
5132     ## generate implied end tags
5133     if ({
5134     dd => 1, dt => 1, li => 1, p => 1,
5135     td => 1, th => 1, tr => 1,
5136 wakaba 1.55 tbody => 1, tfoot => 1, thead => 1,
5137 wakaba 1.52 }->{$self->{open_elements}->[-1]->[1]}) {
5138     !!!back-token;
5139 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5140 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5141 wakaba 1.53 redo B;
5142 wakaba 1.52 }
5143    
5144     ## Step 2
5145     if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5146 wakaba 1.58 ## NOTE: <x><y></x>
5147 wakaba 1.52 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5148     }
5149    
5150     ## Step 3
5151     splice @{$self->{open_elements}}, $node_i;
5152 wakaba 1.51
5153 wakaba 1.1 !!!next-token;
5154 wakaba 1.52 last S2;
5155 wakaba 1.1 } else {
5156 wakaba 1.52 ## Step 3
5157     if (not $formatting_category->{$node->[1]} and
5158     #not $phrasing_category->{$node->[1]} and
5159     ($special_category->{$node->[1]} or
5160     $scoping_category->{$node->[1]})) {
5161     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5162     ## Ignore the token
5163     !!!next-token;
5164     last S2;
5165     }
5166 wakaba 1.1 }
5167 wakaba 1.52
5168     ## Step 4
5169     $node_i--;
5170     $node = $self->{open_elements}->[$node_i];
5171    
5172     ## Step 5;
5173     redo S2;
5174     } # S2
5175 wakaba 1.53 redo B;
5176 wakaba 1.1 }
5177     }
5178 wakaba 1.52 redo B;
5179 wakaba 1.1 } # B
5180    
5181 wakaba 1.51 ## NOTE: The "trailing end" phase in HTML5 is split into
5182     ## two insertion modes: "after html body" and "after html frameset".
5183     ## NOTE: States in the main stage is preserved while
5184     ## the parser stays in the trailing end phase. # MUST
5185    
5186 wakaba 1.1 ## Stop parsing # MUST
5187    
5188     ## TODO: script stuffs
5189 wakaba 1.3 } # _tree_construct_main
5190    
## Replace the children of |$node| with the result of parsing a string
## as HTML (the |innerHTML| setter).
##
##   $class  - Invocant; used to call |parse_string| and to create the
##             fragment parser via |new|.
##   $node   - Target node: a Document (node type 9) or an Element (1).
##   $_[2]   - The markup string; it is accessed through a reference.
##   $_[3]   - Optional error handler.  For a Document it is handed to
##             |parse_string|; for an Element it receives the error
##             options plus |line| and |column|, and a |warn|ing
##             handler is used when it is false.
##
## Dies if |$node| is neither a Document nor an Element.
sub set_inner_html ($$$) {
  my ($class, $node) = @_;
  my $s = \$_[2];
  my $onerror = $_[3];

  my $nt = $node->node_type;

  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Work on a snapshot of the child list while removing.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$s => $node, $onerror);
  }

  die "$0: |set_inner_html| is not defined for node of type $nt"
      unless $nt == 1;

  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  my $this_doc = $node->owner_document;
  my $doc = $this_doc->implementation->create_document;
  $doc->manakai_is_html (1);
  my $p = $class->new;
  $p->{document} = $doc;

  ## Step 9 # MUST
  my $pos = 0;      # offset of the next unread character in |$$s|
  my $line = 1;
  my $column = 0;
  $p->{set_next_input_character} = sub {
    my $self = shift;

    ## Shift the current character into the history of previous ones.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## End of input is reported as -1.
    if ($pos >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    my $c = $self->{next_input_character} = ord substr $$s, $pos++, 1;
    $column++;

    if ($c == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($c == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $pos++ if substr ($$s, $pos, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($c > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($c == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $p->{prev_input_character} = [-1, -1, -1];
  $p->{next_input_character} = -1;

  ## Errors go to the caller's handler, or to |warn| when none is given;
  ## the current line and column are appended to the error options.
  my $report_error = $onerror || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $p->{parse_error} = sub {
    $report_error->(@_, line => $line, column => $column);
  };

  $p->_initialize_tokenizer;
  $p->_initialize_tree_constructor;

  ## Step 2
  ## The initial content model is chosen by the context element's
  ## local name; names not listed here get PCDATA.
  my $node_ln = $node->local_name;
  my $initial_model = {
    title => RCDATA_CONTENT_MODEL,
    textarea => RCDATA_CONTENT_MODEL,
    style => CDATA_CONTENT_MODEL,
    script => CDATA_CONTENT_MODEL,
    xmp => CDATA_CONTENT_MODEL,
    iframe => CDATA_CONTENT_MODEL,
    noembed => CDATA_CONTENT_MODEL,
    noframes => CDATA_CONTENT_MODEL,
    noscript => CDATA_CONTENT_MODEL,
    plaintext => PLAINTEXT_CONTENT_MODEL,
  }->{$node_ln};
  $initial_model = PCDATA_CONTENT_MODEL unless defined $initial_model;
  $p->{content_model} = $initial_model;
  ## ISSUE: What is "the name of the element"? local name?

  $p->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $doc->append_child ($root);

  ## Step 6 # MUST
  push @{$p->{open_elements}}, [$root, 'html'];

  undef $p->{head_element};

  ## Step 7 # MUST
  $p->_reset_insertion_mode;

  ## Step 8 # MUST
  ## Starting at the context node itself, find the nearest XHTML |form|.
  for (my $anode = $node; defined $anode; $anode = $anode->parent_node) {
    next unless $anode->node_type == 1;
    my $nsuri = $anode->namespace_uri;
    next unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    next unless $anode->local_name eq 'form'; ## TODO: case?
    $p->{form_element} = $anode;
    last;
  }

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    ## The macro below is written in terms of |$self|.
    my $self = $p;
    !!!next-token;
  }
  $p->_tree_construction_main;

  ## Step 11 # MUST
  my @old_children = @{$node->child_nodes};
  $node->remove_child ($_) for @old_children;
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## Move the parsed children from the temporary root into |$node|.
  my @parsed = @{$root->child_nodes};
  for my $new_child (@parsed) {
    $this_doc->adopt_node ($new_child);
    $node->append_child ($new_child);
  }
  ## ISSUE: mutation events?

  $p->_terminate_tree_constructor;
} # set_inner_html
5346    
5347     } # tree construction stage
5348 wakaba 1.1
## Serialize the children of |$node| as an HTML fragment (the
## |innerHTML| getter).
##
##   (invocant) - Ignored.
##   $node      - The DOM node whose children are serialized.
##   $on_error  - Optional code reference, called with each descendant
##                node whose node type is not handled here.
##
## Returns a reference to the serialized string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is emitted literally, without
  ## escaping.  One table serves both the ancestor check and the
  ## per-child check so that the two cannot drift apart.
  my $cdata_element = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
    plaintext => 1,
  };

  ## Step 1
  my $s = '';

  ## Is |$node| itself, or one of its ancestors, a literal-text element?
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      ## The namespace URI may be |undef|; guard before comparing.
      my $nsuri = $parent->namespace_uri;
      if (defined $nsuri and
          $nsuri eq 'http://www.w3.org/1999/xhtml' and
          $cdata_element->{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work queue in document order.  Besides node objects
  ## it holds plain strings: end tags to emit, and the |cdata-out|
  ## marker that ends a literal-text region.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
      $s .= '<' . $tag_name;
      ## NOTE: Non-HTML case:
      ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = $attr->name; ## TODO: manakai_name
        $s .= ' ' . $attr_name . '="';
        my $attr_value = $attr->value;
        ## escape (|&| first, so later entities are not re-escaped)
        $attr_value =~ s/&/&amp;/g;
        $attr_value =~ s/</&lt;/g;
        $attr_value =~ s/>/&gt;/g;
        $attr_value =~ s/"/&quot;/g;
        $s .= $attr_value . '"';
      }
      $s .= '>';

      ## These elements get neither content nor an end tag.
      next C if {
        area => 1, base => 1, basefont => 1, bgsound => 1,
        br => 1, col => 1, embed => 1, frame => 1, hr => 1,
        img => 1, input => 1, link => 1, meta => 1, param => 1,
        spacer => 1, wbr => 1,
      }->{$tag_name};

      $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';

      if (not $in_cdata and $cdata_element->{$tag_name}) {
        ## Leave literal-text mode once this element's end tag is out.
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) {
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        my $value = $child->data;
        $value =~ s/&/&amp;/g;
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        $s .= $value;
      }
    } elsif ($nt == 8) {
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) {
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## Put the children at the FRONT of the queue so they are
      ## serialized where the reference occurs.  Appending them to the
      ## end would emit them after every following sibling and end tag.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5448    
5449     1;
5450 wakaba 1.59 # $Date: 2007/09/04 11:19:07 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24