/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.65 - (hide annotations) (download) (as text)
Mon Nov 19 12:18:26 2007 UTC (16 years, 11 months ago) by wakaba
Branch: MAIN
Changes since 1.64: +7 -3 lines
File MIME type: application/x-wais-source
++ ChangeLog	19 Nov 2007 12:16:52 -0000
2007-11-19  Wakaba  <wakaba@suika.fam.cx>

	* readme.en.html: Link to |Whatpm::Charset::UniversalCharDet|.

++ whatpm/Whatpm/ChangeLog	19 Nov 2007 12:17:47 -0000
2007-11-19  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src (parse_byte_string): Detect charset
	by universalchardet if charset parameter is not specified.

	* Makefile (Charset-all, Charset-clean): New rules.

++ whatpm/Whatpm/Charset/ChangeLog	19 Nov 2007 11:54:20 -0000
2007-11-19  Wakaba  <wakaba@suika.fam.cx>

	* Makefile: New file.

	* UniversalCharDet.pm, UniversalCharDet.pod: New files.

1 wakaba 1.2 package Whatpm::HTML;
2 wakaba 1.1 use strict;
3 wakaba 1.65 our $VERSION=do{my @r=(q$Revision: 1.64 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.63 use Error qw(:try);
5 wakaba 1.1
6 wakaba 1.18 ## ISSUE:
7     ## var doc = implementation.createDocument (null, null, null);
8     ## doc.write ('');
9     ## alert (doc.compatMode);
10 wakaba 1.1
11 wakaba 1.31 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
12     ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
13     ## is not yet clear.
14     ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
15     ## "{U+FEFF}..." in GB18030?
16    
## Tag names of start tags that may end with "/>" without triggering
## the |nestc| parse error in the tokenizer.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
30    
## Maps each code point in 0x80..0x9F to a replacement code point.
## The table is written in key order starting at 0x80, eight entries
## per row; 0xFFFD (REPLACEMENT CHARACTER) is used where no other
## replacement is assigned.
## NOTE(review): the values coincide with the windows-1252 assignments;
## presumably used when resolving numeric character references -- the
## consumer is outside this part of the file.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  +{ map { (0x80 + $_ => $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
65 wakaba 1.1
## Tag names that belong to the "special" element category.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  )
};
## Tag names that belong to the "scoping" element category.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Tag names that belong to the "formatting" element category.
## The |strike| entry used to be misspelled "strile", so <strike> was
## never treated as a formatting element.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
86     # $phrasing_category: all other elements
87    
## parse_byte_string ($charset, $bytes, ...)
##
## Decodes a byte string and hands the resulting character string to
## |parse_char_string|.  Can be called on an existing parser object or
## as a class method, in which case a parser is created with |new|.
##
##   $charset - Encoding label used to decode the input.  If it is not
##              defined, the first 1024 bytes are given to
##              Whatpm::Charset::UniversalCharDet; when that yields
##              nothing, windows-1252 is used.  A sniffed encoding is
##              recorded as tentative ({confident} = 0).
##   $bytes   - The input bytes, either as a string or as a reference
##              to a string.
##   ...      - Any further arguments are passed unchanged to
##              |parse_char_string| after the decoded string.
##
## While parsing, $self->{change_encoding} may throw a
## Whatpm::HTML::RestartParser exception; the input is then decoded
## again with the requested charset and parsed a second time.
##
## Returns the value returned by |parse_char_string|.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## Both branches below and the restart handler call |Encode::decode|,
  ## so the module has to be loaded regardless of how the charset is
  ## determined.
  require Encode; ## TODO: decode(utf8) does not delete the BOM

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 1;
  } else {
    ## TODO: Implement HTML5 detection algorithm
    require Whatpm::Charset::UniversalCharDet;
    $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
        (substr ($$bytes_s, 0, 1024));
    $charset ||= 'windows-1252';
    $s = \ (Encode::decode ($charset, $$bytes_s));
    ## Stored in lowercase, like an explicitly specified charset,
    ## because |change_encoding| compares it with a lowercased label.
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 0;
  }

  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    if (defined $self->{input_encoding} and
        $self->{input_encoding} eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
        ':'.$charset, level => 'w');

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    Whatpm::HTML::RestartParser->throw (charset => $charset);
  }; # $self->{change_encoding}

  my @args = @_; shift @args; # $s
  my $return;
  ## NOTE: The result is passed out through |$return|, as in the
  ## original code, rather than by |return| inside the |try| block.
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Restart: decode the same bytes again with the requested charset.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: normalize
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
157    
## |parse_char_string| is a second name for |parse_string|.
*parse_char_string = \&parse_string;

## parse_string ($s, $document [, $onerror])
##
## Parses a character string into $document.  Can be called on an
## existing parser object or as a class method, in which case a parser
## is created with |new|.
##
##   $s        - The input, either as a string or as a reference to a
##               string.
##   $document - The document object that receives the result; its
##               existing child nodes are removed first.
##   $onerror  - Optional code reference invoked for each parse error
##               with name/value pairs, to which |line| and |column|
##               are appended.  By default errors are reported with
##               |warn|.
##
## Returns $document.
sub parse_string ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  ## A caller such as |parse_byte_string| may already have set these.
  unless (exists $self->{confident}) {
    $self->{confident} = 1;
  }
  if (defined $self->{input_encoding}) {
    $self->{document}->input_encoding ($self->{input_encoding});
  }

  ## Read position in $$s and the position reported with parse errors.
  my $i = 0;
  my $line = 1;
  my $column = 0;

  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Slide the look-behind window: drop its last entry and put the
    ## character that was current until now in front.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## The end of the input is represented by -1.
    if ($i >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    my $code = ord substr $$s, $i++, 1;
    $self->{next_input_character} = $code;
    $column++;

    if ($code == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($code == 0x000D) { # CR
      ## CR and CR LF are both delivered as a single LF.
      $i++ if substr ($$s, $i, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Every reported error carries the current input position.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
218    
## new ()
##
## Creates a parser object whose callbacks are all harmless defaults:
##
##   set_next_input_character    - reports end of input (-1)
##   parse_error                 - ignores the error
##   change_encoding             - does nothing
##   application_cache_selection - does nothing
##
## |parse_string| and |parse_byte_string| replace some of these.
##
## Returns the new object, blessed into the invoking class.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_input_character} = sub {
    ## The parser is taken from the argument list, as the closure
    ## installed by |parse_string| does.  Capturing the outer |$self|
    ## here would make the object refer to itself through this code
    ## reference, so that it could never be freed.
    my $self = shift;
    $self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
239    
## Feature bits describing which markup the tokenizer recognizes in
## character data.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, expressed as combinations of the bits above.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
248    
## Tokenizer states, stored in $self->{state}.
use constant {
  DATA_STATE                                    => 0,
  ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
};
282    
## Token types, stored in the |type| member of a token.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
289    
## Group bits shared by related insertion modes (|*_IMS|).
use constant {
  AFTER_HTML_IMS => 0b100,
  HEAD_IMS       => 0b1000,
  BODY_IMS       => 0b10000,
  BODY_TABLE_IMS => 0b100000,
  TABLE_IMS      => 0b1000000,
  ROW_IMS        => 0b10000000,
  BODY_AFTER_IMS => 0b100000000,
  FRAME_IMS      => 0b1000000000,
};

## Insertion modes (|*_IM|): group bits, where applicable, combined
## with a number in the two lowest bits.
use constant {
  AFTER_HTML_BODY_IM     => AFTER_HTML_IMS | BODY_AFTER_IMS,
  AFTER_HTML_FRAMESET_IM => AFTER_HTML_IMS | FRAME_IMS,
  IN_HEAD_IM             => HEAD_IMS | 0b00,
  IN_HEAD_NOSCRIPT_IM    => HEAD_IMS | 0b01,
  AFTER_HEAD_IM          => HEAD_IMS | 0b10,
  BEFORE_HEAD_IM         => HEAD_IMS | 0b11,
  IN_BODY_IM             => BODY_IMS,
  IN_CELL_IM             => BODY_IMS | BODY_TABLE_IMS | 0b01,
  IN_CAPTION_IM          => BODY_IMS | BODY_TABLE_IMS | 0b10,
  IN_ROW_IM              => TABLE_IMS | ROW_IMS | 0b01,
  IN_TABLE_BODY_IM       => TABLE_IMS | ROW_IMS | 0b10,
  IN_TABLE_IM            => TABLE_IMS,
  AFTER_BODY_IM          => BODY_AFTER_IMS,
  IN_FRAMESET_IM         => FRAME_IMS | 0b01,
  AFTER_FRAMESET_IM      => FRAME_IMS | 0b10,
  IN_SELECT_IM           => 0b01,
  IN_COLUMN_GROUP_IM     => 0b10,
};
316    
317 wakaba 1.1 ## Implementations MUST act as if they used the state machine in the spec
318    
## _initialize_tokenizer ()
##
## Puts the tokenizer into its initial condition: data state, PCDATA
## content model, no token or attribute under construction, empty
## |char| and |token| lists.  The first input character is read in the
## course of this.  $self->{escape} is not reset here.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## The tokenizer MUST start in the data state, with the content
  ## model set to PCDATA.
  $self->{state} = DATA_STATE;
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## |current_token| is a start tag, end tag, comment, or DOCTYPE
  ## token while one is being built.
  for my $member (qw(current_token current_attribute
                     last_emitted_start_tag_name
                     last_attribute_value_state)) {
    $self->{$member} = undef;
  }

  $self->{char} = [];
  ## Sets $self->{next_input_character}.
  !!!next-input-character;
  $self->{token} = [];
} # _initialize_tokenizer
333    
334     ## A token has:
335 wakaba 1.55 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
336     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
337     ## ->{name} (DOCTYPE_TOKEN)
338     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
339     ## ->{public_identifier} (DOCTYPE_TOKEN)
340     ## ->{system_identifier} (DOCTYPE_TOKEN)
341     ## ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
342     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
343     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
344 wakaba 1.1
345     ## Emitted token MUST immediately be handled by the tree construction state.
346    
347     ## Before each step, UA MAY check to see if either one of the scripts in
348     ## "list of scripts that will execute as soon as possible" or the first
349     ## script in the "list of scripts that will execute asynchronously",
350     ## has completed loading. If one has, then it MUST be executed
351     ## and removed from the list.
352    
353 wakaba 1.59 ## NOTE: HTML5 "Writing HTML documents" section, applied to
354     ## documents and not to user agents and conformance checkers,
355     ## contains some requirements that are not detected by the
356     ## parsing algorithm:
357     ## - Some requirements on character encoding declarations. ## TODO
358     ## - "Elements MUST NOT contain content that their content model disallows."
359     ## ... Some are parse error, some are not (will be reported by c.c.).
360     ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
361     ## - Text (in elements, attributes, and comments) SHOULD NOT contain
362     ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
363    
364     ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
365     ## be detected by the HTML5 parsing algorithm:
366     ## - Text,
367    
368 wakaba 1.1 sub _get_next_token ($) {
369     my $self = shift;
370     if (@{$self->{token}}) {
371     return shift @{$self->{token}};
372     }
373    
374     A: {
375 wakaba 1.57 if ($self->{state} == DATA_STATE) {
376 wakaba 1.1 if ($self->{next_input_character} == 0x0026) { # &
377 wakaba 1.40 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
378 wakaba 1.57 $self->{state} = ENTITY_DATA_STATE;
379 wakaba 1.1 !!!next-input-character;
380     redo A;
381     } else {
382     #
383     }
384 wakaba 1.13 } elsif ($self->{next_input_character} == 0x002D) { # -
385 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
386 wakaba 1.13 unless ($self->{escape}) {
387     if ($self->{prev_input_character}->[0] == 0x002D and # -
388     $self->{prev_input_character}->[1] == 0x0021 and # !
389     $self->{prev_input_character}->[2] == 0x003C) { # <
390     $self->{escape} = 1;
391     }
392     }
393     }
394    
395     #
396 wakaba 1.1 } elsif ($self->{next_input_character} == 0x003C) { # <
397 wakaba 1.40 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
398     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
399 wakaba 1.13 not $self->{escape})) {
400 wakaba 1.57 $self->{state} = TAG_OPEN_STATE;
401 wakaba 1.1 !!!next-input-character;
402     redo A;
403     } else {
404     #
405     }
406 wakaba 1.13 } elsif ($self->{next_input_character} == 0x003E) { # >
407     if ($self->{escape} and
408 wakaba 1.40 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
409 wakaba 1.13 if ($self->{prev_input_character}->[0] == 0x002D and # -
410     $self->{prev_input_character}->[1] == 0x002D) { # -
411     delete $self->{escape};
412     }
413     }
414    
415     #
416 wakaba 1.1 } elsif ($self->{next_input_character} == -1) {
417 wakaba 1.55 !!!emit ({type => END_OF_FILE_TOKEN});
418 wakaba 1.1 last A; ## TODO: ok?
419     }
420     # Anything else
421 wakaba 1.55 my $token = {type => CHARACTER_TOKEN,
422 wakaba 1.1 data => chr $self->{next_input_character}};
423     ## Stay in the data state
424     !!!next-input-character;
425    
426     !!!emit ($token);
427    
428     redo A;
429 wakaba 1.57 } elsif ($self->{state} == ENTITY_DATA_STATE) {
430 wakaba 1.1 ## (cannot happen in CDATA state)
431    
432 wakaba 1.26 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
433 wakaba 1.1
434 wakaba 1.57 $self->{state} = DATA_STATE;
435 wakaba 1.1 # next-input-character is already done
436    
437     unless (defined $token) {
438 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
439 wakaba 1.1 } else {
440     !!!emit ($token);
441     }
442    
443     redo A;
444 wakaba 1.57 } elsif ($self->{state} == TAG_OPEN_STATE) {
445 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
446 wakaba 1.1 if ($self->{next_input_character} == 0x002F) { # /
447     !!!next-input-character;
448 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
449 wakaba 1.1 redo A;
450     } else {
451     ## reconsume
452 wakaba 1.57 $self->{state} = DATA_STATE;
453 wakaba 1.1
454 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
455 wakaba 1.1
456     redo A;
457     }
458 wakaba 1.40 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
459 wakaba 1.1 if ($self->{next_input_character} == 0x0021) { # !
460 wakaba 1.57 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
461 wakaba 1.1 !!!next-input-character;
462     redo A;
463     } elsif ($self->{next_input_character} == 0x002F) { # /
464 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
465 wakaba 1.1 !!!next-input-character;
466     redo A;
467     } elsif (0x0041 <= $self->{next_input_character} and
468     $self->{next_input_character} <= 0x005A) { # A..Z
469     $self->{current_token}
470 wakaba 1.55 = {type => START_TAG_TOKEN,
471 wakaba 1.1 tag_name => chr ($self->{next_input_character} + 0x0020)};
472 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
473 wakaba 1.1 !!!next-input-character;
474     redo A;
475     } elsif (0x0061 <= $self->{next_input_character} and
476     $self->{next_input_character} <= 0x007A) { # a..z
477 wakaba 1.55 $self->{current_token} = {type => START_TAG_TOKEN,
478 wakaba 1.1 tag_name => chr ($self->{next_input_character})};
479 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
480 wakaba 1.1 !!!next-input-character;
481     redo A;
482     } elsif ($self->{next_input_character} == 0x003E) { # >
483 wakaba 1.3 !!!parse-error (type => 'empty start tag');
484 wakaba 1.57 $self->{state} = DATA_STATE;
485 wakaba 1.1 !!!next-input-character;
486    
487 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
488 wakaba 1.1
489     redo A;
490     } elsif ($self->{next_input_character} == 0x003F) { # ?
491 wakaba 1.3 !!!parse-error (type => 'pio');
492 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
493 wakaba 1.1 ## $self->{next_input_character} is intentionally left as is
494     redo A;
495     } else {
496 wakaba 1.3 !!!parse-error (type => 'bare stago');
497 wakaba 1.57 $self->{state} = DATA_STATE;
498 wakaba 1.1 ## reconsume
499    
500 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
501 wakaba 1.1
502     redo A;
503     }
504     } else {
505 wakaba 1.40 die "$0: $self->{content_model} in tag open";
506 wakaba 1.1 }
507 wakaba 1.57 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
508 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
509 wakaba 1.23 if (defined $self->{last_emitted_start_tag_name}) {
510 wakaba 1.30 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
511 wakaba 1.23 my @next_char;
512     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
513     push @next_char, $self->{next_input_character};
514     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
515     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
516     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
517     !!!next-input-character;
518     next TAGNAME;
519     } else {
520     $self->{next_input_character} = shift @next_char; # reconsume
521     !!!back-next-input-character (@next_char);
522 wakaba 1.57 $self->{state} = DATA_STATE;
523 wakaba 1.23
524 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
525 wakaba 1.23
526     redo A;
527     }
528     }
529 wakaba 1.1 push @next_char, $self->{next_input_character};
530 wakaba 1.23
531     unless ($self->{next_input_character} == 0x0009 or # HT
532     $self->{next_input_character} == 0x000A or # LF
533     $self->{next_input_character} == 0x000B or # VT
534     $self->{next_input_character} == 0x000C or # FF
535     $self->{next_input_character} == 0x0020 or # SP
536     $self->{next_input_character} == 0x003E or # >
537     $self->{next_input_character} == 0x002F or # /
538     $self->{next_input_character} == -1) {
539 wakaba 1.1 $self->{next_input_character} = shift @next_char; # reconsume
540     !!!back-next-input-character (@next_char);
541 wakaba 1.57 $self->{state} = DATA_STATE;
542 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
543 wakaba 1.1 redo A;
544 wakaba 1.23 } else {
545     $self->{next_input_character} = shift @next_char;
546     !!!back-next-input-character (@next_char);
547     # and consume...
548 wakaba 1.1 }
549 wakaba 1.23 } else {
550     ## No start tag token has ever been emitted
551     # next-input-character is already done
552 wakaba 1.57 $self->{state} = DATA_STATE;
553 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
554 wakaba 1.1 redo A;
555     }
556     }
557    
558     if (0x0041 <= $self->{next_input_character} and
559     $self->{next_input_character} <= 0x005A) { # A..Z
560 wakaba 1.55 $self->{current_token} = {type => END_TAG_TOKEN,
561 wakaba 1.1 tag_name => chr ($self->{next_input_character} + 0x0020)};
562 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
563 wakaba 1.1 !!!next-input-character;
564     redo A;
565     } elsif (0x0061 <= $self->{next_input_character} and
566     $self->{next_input_character} <= 0x007A) { # a..z
567 wakaba 1.55 $self->{current_token} = {type => END_TAG_TOKEN,
568 wakaba 1.1 tag_name => chr ($self->{next_input_character})};
569 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
570 wakaba 1.1 !!!next-input-character;
571     redo A;
572     } elsif ($self->{next_input_character} == 0x003E) { # >
573 wakaba 1.3 !!!parse-error (type => 'empty end tag');
574 wakaba 1.57 $self->{state} = DATA_STATE;
575 wakaba 1.1 !!!next-input-character;
576     redo A;
577     } elsif ($self->{next_input_character} == -1) {
578 wakaba 1.3 !!!parse-error (type => 'bare etago');
579 wakaba 1.57 $self->{state} = DATA_STATE;
580 wakaba 1.1 # reconsume
581    
582 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
583 wakaba 1.1
584     redo A;
585     } else {
586 wakaba 1.3 !!!parse-error (type => 'bogus end tag');
587 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
588 wakaba 1.1 ## $self->{next_input_character} is intentionally left as is
589     redo A;
590     }
591 wakaba 1.57 } elsif ($self->{state} == TAG_NAME_STATE) {
592 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
593     $self->{next_input_character} == 0x000A or # LF
594     $self->{next_input_character} == 0x000B or # VT
595     $self->{next_input_character} == 0x000C or # FF
596     $self->{next_input_character} == 0x0020) { # SP
597 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
598 wakaba 1.1 !!!next-input-character;
599     redo A;
600     } elsif ($self->{next_input_character} == 0x003E) { # >
601 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
602 wakaba 1.28 $self->{current_token}->{first_start_tag}
603     = not defined $self->{last_emitted_start_tag_name};
604 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
605 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
606 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
607 wakaba 1.1 if ($self->{current_token}->{attributes}) {
608 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
609 wakaba 1.1 }
610     } else {
611     die "$0: $self->{current_token}->{type}: Unknown token type";
612     }
613 wakaba 1.57 $self->{state} = DATA_STATE;
614 wakaba 1.1 !!!next-input-character;
615    
616     !!!emit ($self->{current_token}); # start tag or end tag
617    
618     redo A;
619     } elsif (0x0041 <= $self->{next_input_character} and
620     $self->{next_input_character} <= 0x005A) { # A..Z
621     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
622     # start tag or end tag
623     ## Stay in this state
624     !!!next-input-character;
625     redo A;
626 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
627 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
628 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
629 wakaba 1.28 $self->{current_token}->{first_start_tag}
630     = not defined $self->{last_emitted_start_tag_name};
631 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
632 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
633 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
634 wakaba 1.1 if ($self->{current_token}->{attributes}) {
635 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
636 wakaba 1.1 }
637     } else {
638     die "$0: $self->{current_token}->{type}: Unknown token type";
639     }
640 wakaba 1.57 $self->{state} = DATA_STATE;
641 wakaba 1.1 # reconsume
642    
643     !!!emit ($self->{current_token}); # start tag or end tag
644    
645     redo A;
646     } elsif ($self->{next_input_character} == 0x002F) { # /
647     !!!next-input-character;
648     if ($self->{next_input_character} == 0x003E and # >
649 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
650 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
651     # permitted slash
652     #
653     } else {
654 wakaba 1.3 !!!parse-error (type => 'nestc');
655 wakaba 1.1 }
656 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
657 wakaba 1.1 # next-input-character is already done
658     redo A;
659     } else {
660     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
661     # start tag or end tag
662     ## Stay in the state
663     !!!next-input-character;
664     redo A;
665     }
666 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
667 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
668     $self->{next_input_character} == 0x000A or # LF
669     $self->{next_input_character} == 0x000B or # VT
670     $self->{next_input_character} == 0x000C or # FF
671     $self->{next_input_character} == 0x0020) { # SP
672     ## Stay in the state
673     !!!next-input-character;
674     redo A;
675     } elsif ($self->{next_input_character} == 0x003E) { # >
676 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
677 wakaba 1.28 $self->{current_token}->{first_start_tag}
678     = not defined $self->{last_emitted_start_tag_name};
679 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
680 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
681 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
682 wakaba 1.1 if ($self->{current_token}->{attributes}) {
683 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
684 wakaba 1.1 }
685     } else {
686     die "$0: $self->{current_token}->{type}: Unknown token type";
687     }
688 wakaba 1.57 $self->{state} = DATA_STATE;
689 wakaba 1.1 !!!next-input-character;
690    
691     !!!emit ($self->{current_token}); # start tag or end tag
692    
693     redo A;
694     } elsif (0x0041 <= $self->{next_input_character} and
695     $self->{next_input_character} <= 0x005A) { # A..Z
696     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
697     value => ''};
698 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
699 wakaba 1.1 !!!next-input-character;
700     redo A;
701     } elsif ($self->{next_input_character} == 0x002F) { # /
702     !!!next-input-character;
703     if ($self->{next_input_character} == 0x003E and # >
704 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
705 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
706     # permitted slash
707     #
708     } else {
709 wakaba 1.3 !!!parse-error (type => 'nestc');
710 wakaba 1.1 }
711     ## Stay in the state
712     # next-input-character is already done
713     redo A;
714 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
715 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
716 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
717 wakaba 1.28 $self->{current_token}->{first_start_tag}
718     = not defined $self->{last_emitted_start_tag_name};
719 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
720 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
721 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
722 wakaba 1.1 if ($self->{current_token}->{attributes}) {
723 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
724 wakaba 1.1 }
725     } else {
726     die "$0: $self->{current_token}->{type}: Unknown token type";
727     }
728 wakaba 1.57 $self->{state} = DATA_STATE;
729 wakaba 1.1 # reconsume
730    
731     !!!emit ($self->{current_token}); # start tag or end tag
732    
733     redo A;
734     } else {
735     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
736     value => ''};
737 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
738 wakaba 1.1 !!!next-input-character;
739     redo A;
740     }
741 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
## $before_leave: helper closure for the attribute name state.  The
## branches below call it just before leaving the state (on white
## space, "=", ">", "/" and EOF).  It takes no arguments and works on
## the $self captured from the enclosing scope.
##
## If the current token (start tag or end tag) has no attribute with
## the same name yet, the attribute being built
## ($self->{current_attribute}) is stored in the token's |attributes|
## hash, keyed by its name.  Otherwise a
## 'duplicate attribute:<name>' parse error is signalled and the new
## attribute is not stored, so the first occurrence wins.
742 wakaba 1.1 my $before_leave = sub {
743     if (exists $self->{current_token}->{attributes} # start tag or end tag
744     ->{$self->{current_attribute}->{name}}) { # MUST
745 wakaba 1.39 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
746 wakaba 1.1 ## Discard $self->{current_attribute} # MUST
747     } else {
## No attribute with this name so far: attach it to the token.
748     $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
749     = $self->{current_attribute};
750     }
751     }; # $before_leave
752    
753     if ($self->{next_input_character} == 0x0009 or # HT
754     $self->{next_input_character} == 0x000A or # LF
755     $self->{next_input_character} == 0x000B or # VT
756     $self->{next_input_character} == 0x000C or # FF
757     $self->{next_input_character} == 0x0020) { # SP
758     $before_leave->();
759 wakaba 1.57 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
760 wakaba 1.1 !!!next-input-character;
761     redo A;
762     } elsif ($self->{next_input_character} == 0x003D) { # =
763     $before_leave->();
764 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
765 wakaba 1.1 !!!next-input-character;
766     redo A;
767     } elsif ($self->{next_input_character} == 0x003E) { # >
768     $before_leave->();
769 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
770 wakaba 1.28 $self->{current_token}->{first_start_tag}
771     = not defined $self->{last_emitted_start_tag_name};
772 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
773 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
774 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
775 wakaba 1.1 if ($self->{current_token}->{attributes}) {
776 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
777 wakaba 1.1 }
778     } else {
779     die "$0: $self->{current_token}->{type}: Unknown token type";
780     }
781 wakaba 1.57 $self->{state} = DATA_STATE;
782 wakaba 1.1 !!!next-input-character;
783    
784     !!!emit ($self->{current_token}); # start tag or end tag
785    
786     redo A;
787     } elsif (0x0041 <= $self->{next_input_character} and
788     $self->{next_input_character} <= 0x005A) { # A..Z
789     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
790     ## Stay in the state
791     !!!next-input-character;
792     redo A;
793     } elsif ($self->{next_input_character} == 0x002F) { # /
794     $before_leave->();
795     !!!next-input-character;
796     if ($self->{next_input_character} == 0x003E and # >
797 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
798 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
799     # permitted slash
800     #
801     } else {
802 wakaba 1.3 !!!parse-error (type => 'nestc');
803 wakaba 1.1 }
804 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
805 wakaba 1.1 # next-input-character is already done
806     redo A;
807 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
808 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
809 wakaba 1.1 $before_leave->();
810 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
811 wakaba 1.28 $self->{current_token}->{first_start_tag}
812     = not defined $self->{last_emitted_start_tag_name};
813 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
814 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
815 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816 wakaba 1.1 if ($self->{current_token}->{attributes}) {
817 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
818 wakaba 1.1 }
819     } else {
820     die "$0: $self->{current_token}->{type}: Unknown token type";
821     }
822 wakaba 1.57 $self->{state} = DATA_STATE;
823 wakaba 1.1 # reconsume
824    
825     !!!emit ($self->{current_token}); # start tag or end tag
826    
827     redo A;
828     } else {
829     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
830     ## Stay in the state
831     !!!next-input-character;
832     redo A;
833     }
834 wakaba 1.57 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
835 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
836     $self->{next_input_character} == 0x000A or # LF
837     $self->{next_input_character} == 0x000B or # VT
838     $self->{next_input_character} == 0x000C or # FF
839     $self->{next_input_character} == 0x0020) { # SP
840     ## Stay in the state
841     !!!next-input-character;
842     redo A;
843     } elsif ($self->{next_input_character} == 0x003D) { # =
844 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
845 wakaba 1.1 !!!next-input-character;
846     redo A;
847     } elsif ($self->{next_input_character} == 0x003E) { # >
848 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
849 wakaba 1.28 $self->{current_token}->{first_start_tag}
850     = not defined $self->{last_emitted_start_tag_name};
851 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
852 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
853 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
854 wakaba 1.1 if ($self->{current_token}->{attributes}) {
855 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
856 wakaba 1.1 }
857     } else {
858     die "$0: $self->{current_token}->{type}: Unknown token type";
859     }
860 wakaba 1.57 $self->{state} = DATA_STATE;
861 wakaba 1.1 !!!next-input-character;
862    
863     !!!emit ($self->{current_token}); # start tag or end tag
864    
865     redo A;
866     } elsif (0x0041 <= $self->{next_input_character} and
867     $self->{next_input_character} <= 0x005A) { # A..Z
868     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
869     value => ''};
870 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
871 wakaba 1.1 !!!next-input-character;
872     redo A;
873     } elsif ($self->{next_input_character} == 0x002F) { # /
874     !!!next-input-character;
875     if ($self->{next_input_character} == 0x003E and # >
876 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
877 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
878     # permitted slash
879     #
880     } else {
881 wakaba 1.3 !!!parse-error (type => 'nestc');
882 wakaba 1.33 ## TODO: Different error type for <aa / bb> than <aa/>
883 wakaba 1.1 }
884 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
885 wakaba 1.1 # next-input-character is already done
886     redo A;
887 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
888 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
889 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
890 wakaba 1.28 $self->{current_token}->{first_start_tag}
891     = not defined $self->{last_emitted_start_tag_name};
892 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
893 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
894 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
895 wakaba 1.1 if ($self->{current_token}->{attributes}) {
896 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
897 wakaba 1.1 }
898     } else {
899     die "$0: $self->{current_token}->{type}: Unknown token type";
900     }
901 wakaba 1.57 $self->{state} = DATA_STATE;
902 wakaba 1.1 # reconsume
903    
904     !!!emit ($self->{current_token}); # start tag or end tag
905    
906     redo A;
907     } else {
908     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
909     value => ''};
910 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
911 wakaba 1.1 !!!next-input-character;
912     redo A;
913     }
914 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
915 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
916     $self->{next_input_character} == 0x000A or # LF
917     $self->{next_input_character} == 0x000B or # VT
918     $self->{next_input_character} == 0x000C or # FF
919     $self->{next_input_character} == 0x0020) { # SP
920     ## Stay in the state
921     !!!next-input-character;
922     redo A;
923     } elsif ($self->{next_input_character} == 0x0022) { # "
924 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
925 wakaba 1.1 !!!next-input-character;
926     redo A;
927     } elsif ($self->{next_input_character} == 0x0026) { # &
928 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
929 wakaba 1.1 ## reconsume
930     redo A;
931     } elsif ($self->{next_input_character} == 0x0027) { # '
932 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
933 wakaba 1.1 !!!next-input-character;
934     redo A;
935     } elsif ($self->{next_input_character} == 0x003E) { # >
936 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
937 wakaba 1.28 $self->{current_token}->{first_start_tag}
938     = not defined $self->{last_emitted_start_tag_name};
939 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
940 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
941 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
942 wakaba 1.1 if ($self->{current_token}->{attributes}) {
943 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
944 wakaba 1.1 }
945     } else {
946     die "$0: $self->{current_token}->{type}: Unknown token type";
947     }
948 wakaba 1.57 $self->{state} = DATA_STATE;
949 wakaba 1.1 !!!next-input-character;
950    
951     !!!emit ($self->{current_token}); # start tag or end tag
952    
953     redo A;
954 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
955 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
956 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
957 wakaba 1.28 $self->{current_token}->{first_start_tag}
958     = not defined $self->{last_emitted_start_tag_name};
959 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
960 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
961 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
962 wakaba 1.1 if ($self->{current_token}->{attributes}) {
963 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
964 wakaba 1.1 }
965     } else {
966     die "$0: $self->{current_token}->{type}: Unknown token type";
967     }
968 wakaba 1.57 $self->{state} = DATA_STATE;
969 wakaba 1.1 ## reconsume
970    
971     !!!emit ($self->{current_token}); # start tag or end tag
972    
973     redo A;
974     } else {
975     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
976 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
977 wakaba 1.1 !!!next-input-character;
978     redo A;
979     }
980 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
981 wakaba 1.1 if ($self->{next_input_character} == 0x0022) { # "
982 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
983 wakaba 1.1 !!!next-input-character;
984     redo A;
985     } elsif ($self->{next_input_character} == 0x0026) { # &
986 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
987     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
988 wakaba 1.1 !!!next-input-character;
989     redo A;
990     } elsif ($self->{next_input_character} == -1) {
991 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
992 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
993 wakaba 1.28 $self->{current_token}->{first_start_tag}
994     = not defined $self->{last_emitted_start_tag_name};
995 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
996 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
997 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
998 wakaba 1.1 if ($self->{current_token}->{attributes}) {
999 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1000 wakaba 1.1 }
1001     } else {
1002     die "$0: $self->{current_token}->{type}: Unknown token type";
1003     }
1004 wakaba 1.57 $self->{state} = DATA_STATE;
1005 wakaba 1.1 ## reconsume
1006    
1007     !!!emit ($self->{current_token}); # start tag or end tag
1008    
1009     redo A;
1010     } else {
1011     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1012     ## Stay in the state
1013     !!!next-input-character;
1014     redo A;
1015     }
1016 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1017 wakaba 1.1 if ($self->{next_input_character} == 0x0027) { # '
1018 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1019 wakaba 1.1 !!!next-input-character;
1020     redo A;
1021     } elsif ($self->{next_input_character} == 0x0026) { # &
1022 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
1023     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1024 wakaba 1.1 !!!next-input-character;
1025     redo A;
1026     } elsif ($self->{next_input_character} == -1) {
1027 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
1028 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1029 wakaba 1.28 $self->{current_token}->{first_start_tag}
1030     = not defined $self->{last_emitted_start_tag_name};
1031 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1032 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1033 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1034 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1035 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1036 wakaba 1.1 }
1037     } else {
1038     die "$0: $self->{current_token}->{type}: Unknown token type";
1039     }
1040 wakaba 1.57 $self->{state} = DATA_STATE;
1041 wakaba 1.1 ## reconsume
1042    
1043     !!!emit ($self->{current_token}); # start tag or end tag
1044    
1045     redo A;
1046     } else {
1047     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1048     ## Stay in the state
1049     !!!next-input-character;
1050     redo A;
1051     }
1052 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1053 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1054     $self->{next_input_character} == 0x000A or # LF
1055     $self->{next_input_character} == 0x000B or # VT
1056     $self->{next_input_character} == 0x000C or # FF
1057     $self->{next_input_character} == 0x0020) { # SP
1058 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1059 wakaba 1.1 !!!next-input-character;
1060     redo A;
1061     } elsif ($self->{next_input_character} == 0x0026) { # &
1062 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
1063     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1064 wakaba 1.1 !!!next-input-character;
1065     redo A;
1066     } elsif ($self->{next_input_character} == 0x003E) { # >
1067 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1068 wakaba 1.28 $self->{current_token}->{first_start_tag}
1069     = not defined $self->{last_emitted_start_tag_name};
1070 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1071 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1072 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1073 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1074 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1075 wakaba 1.1 }
1076     } else {
1077     die "$0: $self->{current_token}->{type}: Unknown token type";
1078     }
1079 wakaba 1.57 $self->{state} = DATA_STATE;
1080 wakaba 1.1 !!!next-input-character;
1081    
1082     !!!emit ($self->{current_token}); # start tag or end tag
1083    
1084     redo A;
1085 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
1086 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1087 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1088 wakaba 1.28 $self->{current_token}->{first_start_tag}
1089     = not defined $self->{last_emitted_start_tag_name};
1090 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1091 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1092 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1093 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1094 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1095 wakaba 1.1 }
1096     } else {
1097     die "$0: $self->{current_token}->{type}: Unknown token type";
1098     }
1099 wakaba 1.57 $self->{state} = DATA_STATE;
1100 wakaba 1.1 ## reconsume
1101    
1102     !!!emit ($self->{current_token}); # start tag or end tag
1103    
1104     redo A;
1105     } else {
1106     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1107     ## Stay in the state
1108     !!!next-input-character;
1109     redo A;
1110     }
1111 wakaba 1.57 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1112 wakaba 1.26 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
1113 wakaba 1.1
1114     unless (defined $token) {
1115     $self->{current_attribute}->{value} .= '&';
1116     } else {
1117     $self->{current_attribute}->{value} .= $token->{data};
1118     ## ISSUE: spec says "append the returned character token to the current attribute's value"
1119     }
1120    
1121     $self->{state} = $self->{last_attribute_value_state};
1122     # next-input-character is already done
1123     redo A;
1124 wakaba 1.57 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1125 wakaba 1.1 ## (only happen if PCDATA state)
1126    
1127 wakaba 1.55 my $token = {type => COMMENT_TOKEN, data => ''};
1128 wakaba 1.1
1129     BC: {
1130     if ($self->{next_input_character} == 0x003E) { # >
1131 wakaba 1.57 $self->{state} = DATA_STATE;
1132 wakaba 1.1 !!!next-input-character;
1133    
1134     !!!emit ($token);
1135    
1136     redo A;
1137     } elsif ($self->{next_input_character} == -1) {
1138 wakaba 1.57 $self->{state} = DATA_STATE;
1139 wakaba 1.1 ## reconsume
1140    
1141     !!!emit ($token);
1142    
1143     redo A;
1144     } else {
1145     $token->{data} .= chr ($self->{next_input_character});
1146     !!!next-input-character;
1147     redo BC;
1148     }
1149     } # BC
1150 wakaba 1.57 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1151 wakaba 1.1 ## (only happen if PCDATA state)
1152    
1153     my @next_char;
1154     push @next_char, $self->{next_input_character};
1155    
1156     if ($self->{next_input_character} == 0x002D) { # -
1157     !!!next-input-character;
1158     push @next_char, $self->{next_input_character};
1159     if ($self->{next_input_character} == 0x002D) { # -
1160 wakaba 1.55 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1161 wakaba 1.57 $self->{state} = COMMENT_START_STATE;
1162 wakaba 1.1 !!!next-input-character;
1163     redo A;
1164     }
1165     } elsif ($self->{next_input_character} == 0x0044 or # D
1166     $self->{next_input_character} == 0x0064) { # d
1167     !!!next-input-character;
1168     push @next_char, $self->{next_input_character};
1169     if ($self->{next_input_character} == 0x004F or # O
1170     $self->{next_input_character} == 0x006F) { # o
1171     !!!next-input-character;
1172     push @next_char, $self->{next_input_character};
1173     if ($self->{next_input_character} == 0x0043 or # C
1174     $self->{next_input_character} == 0x0063) { # c
1175     !!!next-input-character;
1176     push @next_char, $self->{next_input_character};
1177     if ($self->{next_input_character} == 0x0054 or # T
1178     $self->{next_input_character} == 0x0074) { # t
1179     !!!next-input-character;
1180     push @next_char, $self->{next_input_character};
1181     if ($self->{next_input_character} == 0x0059 or # Y
1182     $self->{next_input_character} == 0x0079) { # y
1183     !!!next-input-character;
1184     push @next_char, $self->{next_input_character};
1185     if ($self->{next_input_character} == 0x0050 or # P
1186     $self->{next_input_character} == 0x0070) { # p
1187     !!!next-input-character;
1188     push @next_char, $self->{next_input_character};
1189     if ($self->{next_input_character} == 0x0045 or # E
1190     $self->{next_input_character} == 0x0065) { # e
1191     ## ISSUE: What a stupid code this is!
1192 wakaba 1.57 $self->{state} = DOCTYPE_STATE;
1193 wakaba 1.1 !!!next-input-character;
1194     redo A;
1195     }
1196     }
1197     }
1198     }
1199     }
1200     }
1201     }
1202    
1203 wakaba 1.30 !!!parse-error (type => 'bogus comment');
1204 wakaba 1.1 $self->{next_input_character} = shift @next_char;
1205     !!!back-next-input-character (@next_char);
1206 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1207 wakaba 1.1 redo A;
1208    
1209     ## ISSUE: typos in spec: chacacters, is is a parse error
1210     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1211 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_STATE) {
1212 wakaba 1.23 if ($self->{next_input_character} == 0x002D) { # -
1213 wakaba 1.57 $self->{state} = COMMENT_START_DASH_STATE;
1214 wakaba 1.23 !!!next-input-character;
1215     redo A;
1216     } elsif ($self->{next_input_character} == 0x003E) { # >
1217     !!!parse-error (type => 'bogus comment');
1218 wakaba 1.57 $self->{state} = DATA_STATE;
1219 wakaba 1.23 !!!next-input-character;
1220    
1221     !!!emit ($self->{current_token}); # comment
1222    
1223     redo A;
1224     } elsif ($self->{next_input_character} == -1) {
1225     !!!parse-error (type => 'unclosed comment');
1226 wakaba 1.57 $self->{state} = DATA_STATE;
1227 wakaba 1.23 ## reconsume
1228    
1229     !!!emit ($self->{current_token}); # comment
1230    
1231     redo A;
1232     } else {
1233     $self->{current_token}->{data} # comment
1234     .= chr ($self->{next_input_character});
1235 wakaba 1.57 $self->{state} = COMMENT_STATE;
1236 wakaba 1.23 !!!next-input-character;
1237     redo A;
1238     }
1239 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1240 wakaba 1.23 if ($self->{next_input_character} == 0x002D) { # -
1241 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
1242 wakaba 1.23 !!!next-input-character;
1243     redo A;
1244     } elsif ($self->{next_input_character} == 0x003E) { # >
1245     !!!parse-error (type => 'bogus comment');
1246 wakaba 1.57 $self->{state} = DATA_STATE;
1247 wakaba 1.23 !!!next-input-character;
1248    
1249     !!!emit ($self->{current_token}); # comment
1250    
1251     redo A;
1252     } elsif ($self->{next_input_character} == -1) {
1253     !!!parse-error (type => 'unclosed comment');
1254 wakaba 1.57 $self->{state} = DATA_STATE;
1255 wakaba 1.23 ## reconsume
1256    
1257     !!!emit ($self->{current_token}); # comment
1258    
1259     redo A;
1260     } else {
1261     $self->{current_token}->{data} # comment
1262 wakaba 1.33 .= '-' . chr ($self->{next_input_character});
1263 wakaba 1.57 $self->{state} = COMMENT_STATE;
1264 wakaba 1.23 !!!next-input-character;
1265     redo A;
1266     }
1267 wakaba 1.57 } elsif ($self->{state} == COMMENT_STATE) {
1268 wakaba 1.1 if ($self->{next_input_character} == 0x002D) { # -
1269 wakaba 1.57 $self->{state} = COMMENT_END_DASH_STATE;
1270 wakaba 1.1 !!!next-input-character;
1271     redo A;
1272     } elsif ($self->{next_input_character} == -1) {
1273 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1274 wakaba 1.57 $self->{state} = DATA_STATE;
1275 wakaba 1.1 ## reconsume
1276    
1277     !!!emit ($self->{current_token}); # comment
1278    
1279     redo A;
1280     } else {
1281     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1282     ## Stay in the state
1283     !!!next-input-character;
1284     redo A;
1285     }
1286 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1287 wakaba 1.1 if ($self->{next_input_character} == 0x002D) { # -
1288 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
1289 wakaba 1.1 !!!next-input-character;
1290     redo A;
1291     } elsif ($self->{next_input_character} == -1) {
1292 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1293 wakaba 1.57 $self->{state} = DATA_STATE;
1294 wakaba 1.1 ## reconsume
1295    
1296     !!!emit ($self->{current_token}); # comment
1297    
1298     redo A;
1299     } else {
1300     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1301 wakaba 1.57 $self->{state} = COMMENT_STATE;
1302 wakaba 1.1 !!!next-input-character;
1303     redo A;
1304     }
1305 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_STATE) {
1306 wakaba 1.1 if ($self->{next_input_character} == 0x003E) { # >
1307 wakaba 1.57 $self->{state} = DATA_STATE;
1308 wakaba 1.1 !!!next-input-character;
1309    
1310     !!!emit ($self->{current_token}); # comment
1311    
1312     redo A;
1313     } elsif ($self->{next_input_character} == 0x002D) { # -
1314 wakaba 1.3 !!!parse-error (type => 'dash in comment');
1315 wakaba 1.1 $self->{current_token}->{data} .= '-'; # comment
1316     ## Stay in the state
1317     !!!next-input-character;
1318     redo A;
1319     } elsif ($self->{next_input_character} == -1) {
1320 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1321 wakaba 1.57 $self->{state} = DATA_STATE;
1322 wakaba 1.1 ## reconsume
1323    
1324     !!!emit ($self->{current_token}); # comment
1325    
1326     redo A;
1327     } else {
1328 wakaba 1.3 !!!parse-error (type => 'dash in comment');
1329 wakaba 1.1 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1330 wakaba 1.57 $self->{state} = COMMENT_STATE;
1331 wakaba 1.1 !!!next-input-character;
1332     redo A;
1333     }
1334 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_STATE) {
1335 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1336     $self->{next_input_character} == 0x000A or # LF
1337     $self->{next_input_character} == 0x000B or # VT
1338     $self->{next_input_character} == 0x000C or # FF
1339     $self->{next_input_character} == 0x0020) { # SP
1340 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1341 wakaba 1.1 !!!next-input-character;
1342     redo A;
1343     } else {
1344 wakaba 1.3 !!!parse-error (type => 'no space before DOCTYPE name');
1345 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1346 wakaba 1.1 ## reconsume
1347     redo A;
1348     }
1349 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1350 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1351     $self->{next_input_character} == 0x000A or # LF
1352     $self->{next_input_character} == 0x000B or # VT
1353     $self->{next_input_character} == 0x000C or # FF
1354     $self->{next_input_character} == 0x0020) { # SP
1355     ## Stay in the state
1356     !!!next-input-character;
1357     redo A;
1358     } elsif ($self->{next_input_character} == 0x003E) { # >
1359 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
1360 wakaba 1.57 $self->{state} = DATA_STATE;
1361 wakaba 1.1 !!!next-input-character;
1362    
1363 wakaba 1.55 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1364 wakaba 1.1
1365     redo A;
1366     } elsif ($self->{next_input_character} == -1) {
1367 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
1368 wakaba 1.57 $self->{state} = DATA_STATE;
1369 wakaba 1.1 ## reconsume
1370    
1371 wakaba 1.55 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1372 wakaba 1.1
1373     redo A;
1374     } else {
1375 wakaba 1.18 $self->{current_token}
1376 wakaba 1.55 = {type => DOCTYPE_TOKEN,
1377 wakaba 1.18 name => chr ($self->{next_input_character}),
1378     correct => 1};
1379 wakaba 1.4 ## ISSUE: "Set the token's name name to the" in the spec
1380 wakaba 1.57 $self->{state} = DOCTYPE_NAME_STATE;
1381 wakaba 1.1 !!!next-input-character;
1382     redo A;
1383     }
1384 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1385 wakaba 1.18 ## ISSUE: Redundant "First," in the spec.
1386 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1387     $self->{next_input_character} == 0x000A or # LF
1388     $self->{next_input_character} == 0x000B or # VT
1389     $self->{next_input_character} == 0x000C or # FF
1390     $self->{next_input_character} == 0x0020) { # SP
1391 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1392 wakaba 1.1 !!!next-input-character;
1393     redo A;
1394     } elsif ($self->{next_input_character} == 0x003E) { # >
1395 wakaba 1.57 $self->{state} = DATA_STATE;
1396 wakaba 1.1 !!!next-input-character;
1397    
1398     !!!emit ($self->{current_token}); # DOCTYPE
1399    
1400     redo A;
1401     } elsif ($self->{next_input_character} == -1) {
1402 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1403 wakaba 1.57 $self->{state} = DATA_STATE;
1404 wakaba 1.1 ## reconsume
1405    
1406 wakaba 1.18 delete $self->{current_token}->{correct};
1407     !!!emit ($self->{current_token}); # DOCTYPE
1408 wakaba 1.1
1409     redo A;
1410     } else {
1411     $self->{current_token}->{name}
1412     .= chr ($self->{next_input_character}); # DOCTYPE
1413     ## Stay in the state
1414     !!!next-input-character;
1415     redo A;
1416     }
1417 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1418 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1419     $self->{next_input_character} == 0x000A or # LF
1420     $self->{next_input_character} == 0x000B or # VT
1421     $self->{next_input_character} == 0x000C or # FF
1422     $self->{next_input_character} == 0x0020) { # SP
1423     ## Stay in the state
1424     !!!next-input-character;
1425     redo A;
1426     } elsif ($self->{next_input_character} == 0x003E) { # >
1427 wakaba 1.57 $self->{state} = DATA_STATE;
1428 wakaba 1.1 !!!next-input-character;
1429    
1430     !!!emit ($self->{current_token}); # DOCTYPE
1431    
1432     redo A;
1433     } elsif ($self->{next_input_character} == -1) {
1434 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1435 wakaba 1.57 $self->{state} = DATA_STATE;
1436 wakaba 1.1 ## reconsume
1437    
1438 wakaba 1.18 delete $self->{current_token}->{correct};
1439     !!!emit ($self->{current_token}); # DOCTYPE
1440    
1441     redo A;
1442     } elsif ($self->{next_input_character} == 0x0050 or # P
1443     $self->{next_input_character} == 0x0070) { # p
1444     !!!next-input-character;
1445     if ($self->{next_input_character} == 0x0055 or # U
1446     $self->{next_input_character} == 0x0075) { # u
1447     !!!next-input-character;
1448     if ($self->{next_input_character} == 0x0042 or # B
1449     $self->{next_input_character} == 0x0062) { # b
1450     !!!next-input-character;
1451     if ($self->{next_input_character} == 0x004C or # L
1452     $self->{next_input_character} == 0x006C) { # l
1453     !!!next-input-character;
1454     if ($self->{next_input_character} == 0x0049 or # I
1455     $self->{next_input_character} == 0x0069) { # i
1456     !!!next-input-character;
1457     if ($self->{next_input_character} == 0x0043 or # C
1458     $self->{next_input_character} == 0x0063) { # c
1459 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1460 wakaba 1.18 !!!next-input-character;
1461     redo A;
1462     }
1463     }
1464     }
1465     }
1466     }
1467    
1468     #
1469     } elsif ($self->{next_input_character} == 0x0053 or # S
1470     $self->{next_input_character} == 0x0073) { # s
1471     !!!next-input-character;
1472     if ($self->{next_input_character} == 0x0059 or # Y
1473     $self->{next_input_character} == 0x0079) { # y
1474     !!!next-input-character;
1475     if ($self->{next_input_character} == 0x0053 or # S
1476     $self->{next_input_character} == 0x0073) { # s
1477     !!!next-input-character;
1478     if ($self->{next_input_character} == 0x0054 or # T
1479     $self->{next_input_character} == 0x0074) { # t
1480     !!!next-input-character;
1481     if ($self->{next_input_character} == 0x0045 or # E
1482     $self->{next_input_character} == 0x0065) { # e
1483     !!!next-input-character;
1484     if ($self->{next_input_character} == 0x004D or # M
1485     $self->{next_input_character} == 0x006D) { # m
1486 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1487 wakaba 1.18 !!!next-input-character;
1488     redo A;
1489     }
1490     }
1491     }
1492     }
1493     }
1494    
1495     #
1496     } else {
1497     !!!next-input-character;
1498     #
1499     }
1500    
1501     !!!parse-error (type => 'string after DOCTYPE name');
1502 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1503 wakaba 1.18 # next-input-character is already done
1504     redo A;
1505 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1506 wakaba 1.18 if ({
1507     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1508     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1509     }->{$self->{next_input_character}}) {
1510     ## Stay in the state
1511     !!!next-input-character;
1512     redo A;
1513     } elsif ($self->{next_input_character} eq 0x0022) { # "
1514     $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1515 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1516 wakaba 1.18 !!!next-input-character;
1517     redo A;
1518     } elsif ($self->{next_input_character} eq 0x0027) { # '
1519     $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1520 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1521 wakaba 1.18 !!!next-input-character;
1522     redo A;
1523     } elsif ($self->{next_input_character} eq 0x003E) { # >
1524     !!!parse-error (type => 'no PUBLIC literal');
1525    
1526 wakaba 1.57 $self->{state} = DATA_STATE;
1527 wakaba 1.18 !!!next-input-character;
1528    
1529     delete $self->{current_token}->{correct};
1530     !!!emit ($self->{current_token}); # DOCTYPE
1531    
1532     redo A;
1533     } elsif ($self->{next_input_character} == -1) {
1534     !!!parse-error (type => 'unclosed DOCTYPE');
1535    
1536 wakaba 1.57 $self->{state} = DATA_STATE;
1537 wakaba 1.18 ## reconsume
1538    
1539     delete $self->{current_token}->{correct};
1540     !!!emit ($self->{current_token}); # DOCTYPE
1541    
1542     redo A;
1543     } else {
1544     !!!parse-error (type => 'string after PUBLIC');
1545 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1546 wakaba 1.18 !!!next-input-character;
1547     redo A;
1548     }
1549 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1550 wakaba 1.18 if ($self->{next_input_character} == 0x0022) { # "
1551 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1552 wakaba 1.18 !!!next-input-character;
1553     redo A;
1554     } elsif ($self->{next_input_character} == -1) {
1555     !!!parse-error (type => 'unclosed PUBLIC literal');
1556    
1557 wakaba 1.57 $self->{state} = DATA_STATE;
1558 wakaba 1.18 ## reconsume
1559    
1560     delete $self->{current_token}->{correct};
1561     !!!emit ($self->{current_token}); # DOCTYPE
1562    
1563     redo A;
1564     } else {
1565     $self->{current_token}->{public_identifier} # DOCTYPE
1566     .= chr $self->{next_input_character};
1567     ## Stay in the state
1568     !!!next-input-character;
1569     redo A;
1570     }
1571 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1572 wakaba 1.18 if ($self->{next_input_character} == 0x0027) { # '
1573 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1574 wakaba 1.18 !!!next-input-character;
1575     redo A;
1576     } elsif ($self->{next_input_character} == -1) {
1577     !!!parse-error (type => 'unclosed PUBLIC literal');
1578    
1579 wakaba 1.57 $self->{state} = DATA_STATE;
1580 wakaba 1.18 ## reconsume
1581    
1582     delete $self->{current_token}->{correct};
1583     !!!emit ($self->{current_token}); # DOCTYPE
1584    
1585     redo A;
1586     } else {
1587     $self->{current_token}->{public_identifier} # DOCTYPE
1588     .= chr $self->{next_input_character};
1589     ## Stay in the state
1590     !!!next-input-character;
1591     redo A;
1592     }
1593 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1594 wakaba 1.18 if ({
1595     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1596     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1597     }->{$self->{next_input_character}}) {
1598     ## Stay in the state
1599     !!!next-input-character;
1600     redo A;
1601     } elsif ($self->{next_input_character} == 0x0022) { # "
1602     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1603 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1604 wakaba 1.18 !!!next-input-character;
1605     redo A;
1606     } elsif ($self->{next_input_character} == 0x0027) { # '
1607     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1608 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1609 wakaba 1.18 !!!next-input-character;
1610     redo A;
1611     } elsif ($self->{next_input_character} == 0x003E) { # >
1612 wakaba 1.57 $self->{state} = DATA_STATE;
1613 wakaba 1.18 !!!next-input-character;
1614    
1615     !!!emit ($self->{current_token}); # DOCTYPE
1616    
1617     redo A;
1618     } elsif ($self->{next_input_character} == -1) {
1619     !!!parse-error (type => 'unclosed DOCTYPE');
1620    
1621 wakaba 1.57 $self->{state} = DATA_STATE;
1622 wakaba 1.26 ## reconsume
1623 wakaba 1.18
1624     delete $self->{current_token}->{correct};
1625     !!!emit ($self->{current_token}); # DOCTYPE
1626    
1627     redo A;
1628     } else {
1629     !!!parse-error (type => 'string after PUBLIC literal');
1630 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1631 wakaba 1.18 !!!next-input-character;
1632     redo A;
1633     }
1634 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1635 wakaba 1.18 if ({
1636     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1637     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1638     }->{$self->{next_input_character}}) {
1639     ## Stay in the state
1640     !!!next-input-character;
1641     redo A;
1642     } elsif ($self->{next_input_character} == 0x0022) { # "
1643     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1644 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1645 wakaba 1.18 !!!next-input-character;
1646     redo A;
1647     } elsif ($self->{next_input_character} == 0x0027) { # '
1648     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1649 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1650 wakaba 1.18 !!!next-input-character;
1651     redo A;
1652     } elsif ($self->{next_input_character} == 0x003E) { # >
1653     !!!parse-error (type => 'no SYSTEM literal');
1654 wakaba 1.57 $self->{state} = DATA_STATE;
1655 wakaba 1.18 !!!next-input-character;
1656    
1657     delete $self->{current_token}->{correct};
1658     !!!emit ($self->{current_token}); # DOCTYPE
1659    
1660     redo A;
1661     } elsif ($self->{next_input_character} == -1) {
1662     !!!parse-error (type => 'unclosed DOCTYPE');
1663    
1664 wakaba 1.57 $self->{state} = DATA_STATE;
1665 wakaba 1.26 ## reconsume
1666 wakaba 1.18
1667     delete $self->{current_token}->{correct};
1668     !!!emit ($self->{current_token}); # DOCTYPE
1669    
1670     redo A;
1671     } else {
1672 wakaba 1.30 !!!parse-error (type => 'string after SYSTEM');
1673 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1674 wakaba 1.18 !!!next-input-character;
1675     redo A;
1676     }
1677 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1678 wakaba 1.18 if ($self->{next_input_character} == 0x0022) { # "
1679 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1680 wakaba 1.18 !!!next-input-character;
1681     redo A;
1682     } elsif ($self->{next_input_character} == -1) {
1683     !!!parse-error (type => 'unclosed SYSTEM literal');
1684    
1685 wakaba 1.57 $self->{state} = DATA_STATE;
1686 wakaba 1.18 ## reconsume
1687    
1688     delete $self->{current_token}->{correct};
1689     !!!emit ($self->{current_token}); # DOCTYPE
1690    
1691     redo A;
1692     } else {
1693     $self->{current_token}->{system_identifier} # DOCTYPE
1694     .= chr $self->{next_input_character};
1695     ## Stay in the state
1696     !!!next-input-character;
1697     redo A;
1698     }
1699 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
1700 wakaba 1.18 if ($self->{next_input_character} == 0x0027) { # '
1701 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1702 wakaba 1.18 !!!next-input-character;
1703     redo A;
1704     } elsif ($self->{next_input_character} == -1) {
1705     !!!parse-error (type => 'unclosed SYSTEM literal');
1706    
1707 wakaba 1.57 $self->{state} = DATA_STATE;
1708 wakaba 1.18 ## reconsume
1709    
1710     delete $self->{current_token}->{correct};
1711 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1712    
1713     redo A;
1714     } else {
1715 wakaba 1.18 $self->{current_token}->{system_identifier} # DOCTYPE
1716     .= chr $self->{next_input_character};
1717     ## Stay in the state
1718     !!!next-input-character;
1719     redo A;
1720     }
1721 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1722 wakaba 1.18 if ({
1723     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1724     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1725     }->{$self->{next_input_character}}) {
1726     ## Stay in the state
1727     !!!next-input-character;
1728     redo A;
1729     } elsif ($self->{next_input_character} == 0x003E) { # >
1730 wakaba 1.57 $self->{state} = DATA_STATE;
1731 wakaba 1.18 !!!next-input-character;
1732    
1733     !!!emit ($self->{current_token}); # DOCTYPE
1734    
1735     redo A;
1736     } elsif ($self->{next_input_character} == -1) {
1737     !!!parse-error (type => 'unclosed DOCTYPE');
1738    
1739 wakaba 1.57 $self->{state} = DATA_STATE;
1740 wakaba 1.26 ## reconsume
1741 wakaba 1.18
1742     delete $self->{current_token}->{correct};
1743     !!!emit ($self->{current_token}); # DOCTYPE
1744    
1745     redo A;
1746     } else {
1747     !!!parse-error (type => 'string after SYSTEM literal');
1748 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1749 wakaba 1.1 !!!next-input-character;
1750     redo A;
1751     }
1752 wakaba 1.57 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
1753 wakaba 1.1 if ($self->{next_input_character} == 0x003E) { # >
1754 wakaba 1.57 $self->{state} = DATA_STATE;
1755 wakaba 1.1 !!!next-input-character;
1756    
1757 wakaba 1.18 delete $self->{current_token}->{correct};
1758 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1759    
1760     redo A;
1761     } elsif ($self->{next_input_character} == -1) {
1762 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1763 wakaba 1.57 $self->{state} = DATA_STATE;
1764 wakaba 1.1 ## reconsume
1765    
1766 wakaba 1.18 delete $self->{current_token}->{correct};
1767 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1768    
1769     redo A;
1770     } else {
1771     ## Stay in the state
1772     !!!next-input-character;
1773     redo A;
1774     }
1775     } else {
1776     die "$0: $self->{state}: Unknown state";
1777     }
1778     } # A
1779    
1780     die "$0: _get_next_token: unexpected case";
1781     } # _get_next_token
1782    
## Attempts to consume a character reference starting at the tokenizer's
## current character, |$self->{next_input_character}|.  Presumably the
## caller has already consumed the leading "&" -- TODO confirm against
## the call sites.
##
##   $in_attr  Only consulted for a named reference that is not
##             terminated by ";": when true and further characters were
##             read after the last matched name, the text read so far is
##             returned literally (prefixed with "&").  Presumably true
##             when called for an attribute value -- TODO confirm.
##
## Returns a hash reference {type => CHARACTER_TOKEN, data => STRING},
## or |undef| when no token is produced.
1783 wakaba 1.26 sub _tokenize_attempt_to_consume_an_entity ($$) {
1784     my ($self, $in_attr) = @_;
1785 wakaba 1.20
## White space, "<", "&" or the -1 pseudo-character: not a reference.
1786     if ({
1787     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1788     0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
1789     }->{$self->{next_input_character}}) {
1790     ## Don't consume
1791     ## No error
1792     return undef;
1793     } elsif ($self->{next_input_character} == 0x0023) { # #
## Numeric character reference ("&#...").
1794 wakaba 1.1 !!!next-input-character;
1795     if ($self->{next_input_character} == 0x0078 or # x
1796     $self->{next_input_character} == 0x0058) { # X
## Hexadecimal form.  |$code| stays undefined until the first
## hexadecimal digit has been seen.
1797 wakaba 1.26 my $code;
1798 wakaba 1.1 X: {
## Each pass (re-entered by |redo X|) remembers the current character
## in |$x_char|, advances, and folds one hexadecimal digit into |$code|.
1799     my $x_char = $self->{next_input_character};
1800     !!!next-input-character;
1801     if (0x0030 <= $self->{next_input_character} and
1802     $self->{next_input_character} <= 0x0039) { # 0..9
1803 wakaba 1.26 $code ||= 0;
1804     $code *= 0x10;
1805     $code += $self->{next_input_character} - 0x0030;
1806 wakaba 1.1 redo X;
1807     } elsif (0x0061 <= $self->{next_input_character} and
1808     $self->{next_input_character} <= 0x0066) { # a..f
1809 wakaba 1.26 $code ||= 0;
1810     $code *= 0x10;
## 0x0061 ("a") - 0x0060 + 9 == 10
1811     $code += $self->{next_input_character} - 0x0060 + 9;
1812 wakaba 1.1 redo X;
1813     } elsif (0x0041 <= $self->{next_input_character} and
1814     $self->{next_input_character} <= 0x0046) { # A..F
1815 wakaba 1.26 $code ||= 0;
1816     $code *= 0x10;
## 0x0041 ("A") - 0x0040 + 9 == 10
1817     $code += $self->{next_input_character} - 0x0040 + 9;
1818 wakaba 1.1 redo X;
1819 wakaba 1.26 } elsif (not defined $code) { # no hexadecimal digit
## Nothing usable after "x"/"X": hand the "x" and the current character
## back (presumably what the back-next-input-character macro does --
## TODO confirm) and make "#" the current character again.
1820 wakaba 1.3 !!!parse-error (type => 'bare hcro');
1821 wakaba 1.37 !!!back-next-input-character ($x_char, $self->{next_input_character});
1822 wakaba 1.1 $self->{next_input_character} = 0x0023; # #
1823     return undef;
1824     } elsif ($self->{next_input_character} == 0x003B) { # ;
1825     !!!next-input-character;
1826     } else {
## Digits were read but the terminating ";" is missing.
1827 wakaba 1.3 !!!parse-error (type => 'no refc');
1828 wakaba 1.1 }
1829    
## Replace code points that are not used as-is: 0 and 0xD800..0xDFFF
## and anything above 0x10FFFF become U+FFFD, CR becomes LF, and
## 0x80..0x9F are mapped through |$c1_entity_char|.  Each case is
## reported as a parse error.
1830 wakaba 1.26 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1831     !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1832     $code = 0xFFFD;
1833     } elsif ($code > 0x10FFFF) {
1834     !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1835     $code = 0xFFFD;
1836     } elsif ($code == 0x000D) {
1837     !!!parse-error (type => 'CR character reference');
1838     $code = 0x000A;
1839     } elsif (0x80 <= $code and $code <= 0x9F) {
1840 wakaba 1.30 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1841 wakaba 1.26 $code = $c1_entity_char->{$code};
1842 wakaba 1.1 }
1843    
1844 wakaba 1.55 return {type => CHARACTER_TOKEN, data => chr $code};
1845 wakaba 1.1 } # X
1846     } elsif (0x0030 <= $self->{next_input_character} and
1847     $self->{next_input_character} <= 0x0039) { # 0..9
## Decimal form: accumulate all consecutive digits into |$code|.
1848     my $code = $self->{next_input_character} - 0x0030;
1849     !!!next-input-character;
1850    
1851     while (0x0030 <= $self->{next_input_character} and
1852     $self->{next_input_character} <= 0x0039) { # 0..9
1853     $code *= 10;
1854     $code += $self->{next_input_character} - 0x0030;
1855    
1856     !!!next-input-character;
1857     }
1858    
1859     if ($self->{next_input_character} == 0x003B) { # ;
1860     !!!next-input-character;
1861     } else {
1862 wakaba 1.3 !!!parse-error (type => 'no refc');
1863 wakaba 1.1 }
1864    
## Same code point replacement rules as in the hexadecimal branch.
1865 wakaba 1.26 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1866     !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1867     $code = 0xFFFD;
1868     } elsif ($code > 0x10FFFF) {
1869     !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1870     $code = 0xFFFD;
1871     } elsif ($code == 0x000D) {
1872     !!!parse-error (type => 'CR character reference');
1873     $code = 0x000A;
1874 wakaba 1.4 } elsif (0x80 <= $code and $code <= 0x9F) {
1875 wakaba 1.30 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1876 wakaba 1.4 $code = $c1_entity_char->{$code};
1877 wakaba 1.1 }
1878    
1879 wakaba 1.55 return {type => CHARACTER_TOKEN, data => chr $code};
1880 wakaba 1.1 } else {
## "&#" followed by neither "x"/"X" nor a digit: hand the current
## character back and make "#" the current character again.
1881 wakaba 1.3 !!!parse-error (type => 'bare nero');
1882 wakaba 1.1 !!!back-next-input-character ($self->{next_input_character});
1883     $self->{next_input_character} = 0x0023; # #
1884     return undef;
1885     }
1886     } elsif ((0x0041 <= $self->{next_input_character} and
1887     $self->{next_input_character} <= 0x005A) or
1888     (0x0061 <= $self->{next_input_character} and
1889     $self->{next_input_character} <= 0x007A)) {
## Named character reference: starts with an ASCII letter.
1890     my $entity_name = chr $self->{next_input_character};
1891     !!!next-input-character;
1892    
## |$entity_name| is the raw text read so far.  |$value| is the text to
## return: the replacement of the last matched name followed by any
## characters read after that match.  |$match| is 0 while nothing has
## matched, 1 for a match that ends with ";", -1 for a match without
## ";", and is doubled (so it becomes < -1) for every character read
## after such a match.
1893     my $value = $entity_name;
1894 wakaba 1.37 my $match = 0;
1895 wakaba 1.16 require Whatpm::_NamedEntityList;
1896     our $EntityChar;
1897 wakaba 1.1
## Stops once |$entity_name| is 10 characters long or the next
## character is not an ASCII letter, a digit or ";".
1898     while (length $entity_name < 10 and
1899     ## NOTE: Some number greater than the maximum length of entity name
1900 wakaba 1.16 ((0x0041 <= $self->{next_input_character} and # a
1901     $self->{next_input_character} <= 0x005A) or # x
1902     (0x0061 <= $self->{next_input_character} and # a
1903     $self->{next_input_character} <= 0x007A) or # z
1904     (0x0030 <= $self->{next_input_character} and # 0
1905     $self->{next_input_character} <= 0x0039) or # 9
1906     $self->{next_input_character} == 0x003B)) { # ;
1907 wakaba 1.1 $entity_name .= chr $self->{next_input_character};
1908 wakaba 1.16 if (defined $EntityChar->{$entity_name}) {
1909     if ($self->{next_input_character} == 0x003B) { # ;
1910 wakaba 1.26 $value = $EntityChar->{$entity_name};
1911 wakaba 1.16 $match = 1;
1912     !!!next-input-character;
1913     last;
1914 wakaba 1.37 } else {
## Matched without ";": remember it but keep reading, since a longer
## name may still match.
1915 wakaba 1.26 $value = $EntityChar->{$entity_name};
1916     $match = -1;
1917 wakaba 1.37 !!!next-input-character;
1918 wakaba 1.16 }
1919 wakaba 1.1 } else {
1920     $value .= chr $self->{next_input_character};
1921 wakaba 1.37 $match *= 2;
1922     !!!next-input-character;
1923 wakaba 1.1 }
1924     }
1925    
1926 wakaba 1.16 if ($match > 0) {
1927 wakaba 1.55 return {type => CHARACTER_TOKEN, data => $value};
1928 wakaba 1.16 } elsif ($match < 0) {
1929 wakaba 1.30 !!!parse-error (type => 'no refc');
1930 wakaba 1.37 if ($in_attr and $match < -1) {
## In the |$in_attr| case, extra characters after the matched name make
## the whole text literal.
1931 wakaba 1.55 return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
1932 wakaba 1.37 } else {
1933 wakaba 1.55 return {type => CHARACTER_TOKEN, data => $value};
1934 wakaba 1.37 }
1935 wakaba 1.1 } else {
## No known name matched: the characters already read are returned as
## literal text after "&".
1936 wakaba 1.3 !!!parse-error (type => 'bare ero');
1937 wakaba 1.1 ## NOTE: No characters are consumed in the spec.
1938 wakaba 1.55 return {type => CHARACTER_TOKEN, data => '&'.$value};
1939 wakaba 1.1 }
1940     } else {
1941     ## no characters are consumed
1942 wakaba 1.3 !!!parse-error (type => 'bare ero');
1943 wakaba 1.1 return undef;
1944     }
1945     } # _tokenize_attempt_to_consume_an_entity
1946    
## Prepares |$self->{document}| for the tree construction stage:
## switches strict error checking off and flags the document as HTML.
## Returns whatever the final |manakai_is_html| call returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  ## NOTE: The caller MUST have set $self->{document} before calling
  ## this method.
  my $doc = $self->{document};
  $doc->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  return $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
1955    
## Ends the tree construction stage by switching strict error checking
## on |$self->{document}| back on.  Returns whatever that call returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## TODO: Turn mutation events on
  return $doc->strict_error_checking (1);
} # _terminate_tree_constructor
1961    
1962     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1963    
1964 wakaba 1.3 { # tree construction stage
1965     my $token;
1966    
## Entry point of the tree construction stage.  Fetches the first token,
## resets the per-parse state kept on |$self| and then runs the initial,
## root element and main phases in that order.  Returns whatever
## |_tree_construction_main| returns.
sub _construct_tree ($) {
  my $self = shift;

  ## It is not defined when an interactive UA renders $self->{document}
  ## to the user, nor when it begins accepting user input.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is the concatenation
  ## of all those characters. # MUST

  !!!next-token;

  ## Reset the tree constructor state.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->{open_elements} = [];
  @{$self}{qw(form_element head_element inner_html_node)} = ();

  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  return $self->_tree_construction_main;
} # _construct_tree
1990    
## Tree construction, initial phase.
##
## Processes tokens from the file-scoped |$token| until the DOCTYPE (or
## the first token that shows there is none) has been handled:
##
##   - DOCTYPE token: reports 'not HTML5' where applicable, appends a
##     document type node to |$self->{document}|, selects the compat
##     mode, fetches the next token and returns.
##   - start tag, end tag, end-of-file, or character data other than
##     leading white space: reports 'no DOCTYPE', selects quirks mode and
##     returns WITHOUT fetching a new token, so that the next phase
##     reprocesses it.
##   - comment: appended to the document; the phase continues.
##
## Dies on an unknown token type.  Returns nothing useful.
sub _tree_construction_initial ($) {
  my $self = shift;

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = defined $token->{name} ? $token->{name} : '';
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier} or
          $doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared ASCII case-insensitively:
        ## |$pubid| is upper-cased, so every literal it is compared with
        ## below MUST be written in upper case as well.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## These two identifiers select quirks mode only when the
          ## system identifier is missing; with a system identifier they
          ## select limited quirks mode.
          ## NOTE(review): the previous code had these two outcomes
          ## swapped -- confirm against the spec revision being targeted.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared in lower case.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Leading white space is ignored.

        unless (length $token->{data}) {
          ## The token was white space only: stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL
} # _tree_construction_initial
2158    
## Tree construction, root element phase.
##
## DOCTYPE tokens are reported ('in html:#DOCTYPE') and skipped, comments
## are appended to the document, and leading white space in character
## data is dropped.  For any other token the
## |$self->{application_cache_selection}| callback is invoked -- with the
## value of the |manifest| attribute for an |html| start tag that has
## one, with |undef| otherwise -- and an |html| element is created,
## appended to the document and pushed onto |$self->{open_elements}|.
## The current token is left in the file-scoped |$token| for the main
## phase to reprocess.  Dies on an unknown token type.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token and stay in the phase
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      $self->{document}->append_child
          ($self->{document}->create_comment ($token->{data}));
      ## Stay in the phase
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is ignored (\x0D is not in the list).
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)// and
          not length $token->{data}) {
        ## Nothing but white space: stay in the phase
        !!!next-token;
        redo B;
      }

      $self->{application_cache_selection}->(undef);

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      my $select_cache = $self->{application_cache_selection};
      if ($token->{tag_name} eq 'html' and
          $token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application"
        $select_cache->($token->{attributes}->{manifest}->{value});
        ## ISSUE: No relative reference resolution?
      } else {
        $select_cache->(undef);
      }

      ## ISSUE: There is an issue in the spec
      #
    } elsif ({
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      $self->{application_cache_selection}->(undef);

      ## ISSUE: There is an issue in the spec
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Create the root element; the current token is reprocessed by the
    ## main phase.
    my $root_element; !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    #redo B;
    return; ## Go to the main phase.
  } # B
} # _tree_construction_root_element
2221    
## Resets |$self->{insertion_mode}| from the stack of open elements.
##
## Walks |$self->{open_elements}| from its last entry towards its first
## one.  Each entry is an array reference whose element [0] is the node
## and whose element [1] is its name.  The first entry whose name has a
## mapping decides the insertion mode.  When the first entry of the stack
## is reached and |$self->{inner_html_node}| is defined and is neither
## |td| nor |th|, that node is examined instead of the stack entry.
##
## Returns nothing useful.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Element name -> insertion mode (steps 4..13).  Built once, before
  ## the loop, because it does not depend on the node being examined.
  my %mode_for = (
    select => IN_SELECT_IM,
    td => IN_CELL_IM,
    th => IN_CELL_IM,
    tr => IN_ROW_IM,
    tbody => IN_TABLE_BODY_IM,
    thead => IN_TABLE_BODY_IM,
    tfoot => IN_TABLE_BODY_IM,
    caption => IN_CAPTION_IM,
    colgroup => IN_COLUMN_GROUP_IM,
    table => IN_TABLE_IM,
    head => IN_BODY_IM, # not in head!
    body => IN_BODY_IM,
    frameset => IN_FRAMESET_IM,
  );

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          #
        } else {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    ## NOTE: Assign and return as two separate statements.  The former
    ## |$self->{insertion_mode} = $new_mode and return| only returned
    ## when the assigned value was true, so a false-valued mode constant
    ## would have been stored without ending the algorithm.
    my $new_mode = $mode_for{$node->[1]};
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
2290    
2291     sub _tree_construction_main ($) {
2292     my $self = shift;
2293    
2294 wakaba 1.1 my $active_formatting_elements = [];
2295    
2296     my $reconstruct_active_formatting_elements = sub { # MUST
2297     my $insert = shift;
2298    
2299     ## Step 1
2300     return unless @$active_formatting_elements;
2301    
2302     ## Step 3
2303     my $i = -1;
2304     my $entry = $active_formatting_elements->[$i];
2305    
2306     ## Step 2
2307     return if $entry->[0] eq '#marker';
2308 wakaba 1.3 for (@{$self->{open_elements}}) {
2309 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
2310     return;
2311     }
2312     }
2313    
2314     S4: {
2315     ## Step 4
2316     last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2317    
2318     ## Step 5
2319     $i--;
2320     $entry = $active_formatting_elements->[$i];
2321    
2322     ## Step 6
2323     if ($entry->[0] eq '#marker') {
2324     #
2325     } else {
2326     my $in_open_elements;
2327 wakaba 1.3 OE: for (@{$self->{open_elements}}) {
2328 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
2329     $in_open_elements = 1;
2330     last OE;
2331     }
2332     }
2333     if ($in_open_elements) {
2334     #
2335     } else {
2336     redo S4;
2337     }
2338     }
2339    
2340     ## Step 7
2341     $i++;
2342     $entry = $active_formatting_elements->[$i];
2343     } # S4
2344    
2345     S7: {
2346     ## Step 8
2347     my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2348    
2349     ## Step 9
2350     $insert->($clone->[0]);
2351 wakaba 1.3 push @{$self->{open_elements}}, $clone;
2352 wakaba 1.1
2353     ## Step 10
2354 wakaba 1.3 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2355 wakaba 1.1
2356     ## Step 11
2357     unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2358     ## Step 7'
2359     $i++;
2360     $entry = $active_formatting_elements->[$i];
2361    
2362     redo S7;
2363     }
2364     } # S7
2365     }; # $reconstruct_active_formatting_elements
2366    
2367     my $clear_up_to_marker = sub {
2368     for (reverse 0..$#$active_formatting_elements) {
2369     if ($active_formatting_elements->[$_]->[0] eq '#marker') {
2370     splice @$active_formatting_elements, $_;
2371     return;
2372     }
2373     }
2374     }; # $clear_up_to_marker
2375    
2376 wakaba 1.25 my $parse_rcdata = sub ($$) {
2377     my ($content_model_flag, $insert) = @_;
2378    
2379     ## Step 1
2380     my $start_tag_name = $token->{tag_name};
2381     my $el;
2382     !!!create-element ($el, $start_tag_name, $token->{attributes});
2383    
2384     ## Step 2
2385     $insert->($el); # /context node/->append_child ($el)
2386    
2387     ## Step 3
2388 wakaba 1.40 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2389 wakaba 1.13 delete $self->{escape}; # MUST
2390 wakaba 1.25
2391     ## Step 4
2392 wakaba 1.1 my $text = '';
2393     !!!next-token;
2394 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2395 wakaba 1.1 $text .= $token->{data};
2396     !!!next-token;
2397 wakaba 1.25 }
2398    
2399     ## Step 5
2400 wakaba 1.1 if (length $text) {
2401 wakaba 1.25 my $text = $self->{document}->create_text_node ($text);
2402     $el->append_child ($text);
2403 wakaba 1.1 }
2404 wakaba 1.25
2405     ## Step 6
2406 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
2407 wakaba 1.25
2408     ## Step 7
2409 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and $token->{tag_name} eq $start_tag_name) {
2410 wakaba 1.1 ## Ignore the token
2411 wakaba 1.40 } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
2412     !!!parse-error (type => 'in CDATA:#'.$token->{type});
2413     } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2414     !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2415 wakaba 1.1 } else {
2416 wakaba 1.40 die "$0: $content_model_flag in parse_rcdata";
2417 wakaba 1.1 }
2418     !!!next-token;
2419 wakaba 1.25 }; # $parse_rcdata
2420 wakaba 1.1
2421 wakaba 1.25 my $script_start_tag = sub ($) {
2422     my $insert = $_[0];
2423 wakaba 1.1 my $script_el;
2424     !!!create-element ($script_el, 'script', $token->{attributes});
2425     ## TODO: mark as "parser-inserted"
2426    
2427 wakaba 1.40 $self->{content_model} = CDATA_CONTENT_MODEL;
2428 wakaba 1.13 delete $self->{escape}; # MUST
2429 wakaba 1.1
2430     my $text = '';
2431     !!!next-token;
2432 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
2433 wakaba 1.1 $text .= $token->{data};
2434     !!!next-token;
2435     } # stop if non-character token or tokenizer stops tokenising
2436     if (length $text) {
2437     $script_el->manakai_append_text ($text);
2438     }
2439    
2440 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
2441 wakaba 1.1
2442 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
2443 wakaba 1.1 $token->{tag_name} eq 'script') {
2444     ## Ignore the token
2445     } else {
2446 wakaba 1.3 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2447 wakaba 1.1 ## ISSUE: And ignore?
2448     ## TODO: mark as "already executed"
2449     }
2450    
2451 wakaba 1.3 if (defined $self->{inner_html_node}) {
2452     ## TODO: mark as "already executed"
2453     } else {
2454 wakaba 1.1 ## TODO: $old_insertion_point = current insertion point
2455     ## TODO: insertion point = just before the next input character
2456 wakaba 1.25
2457     $insert->($script_el);
2458 wakaba 1.1
2459     ## TODO: insertion point = $old_insertion_point (might be "undefined")
2460    
2461     ## TODO: if there is a script that will execute as soon as the parser resumes, then...
2462     }
2463    
2464     !!!next-token;
2465     }; # $script_start_tag
2466    
2467     my $formatting_end_tag = sub {
2468     my $tag_name = shift;
2469    
2470     FET: {
2471     ## Step 1
2472     my $formatting_element;
2473     my $formatting_element_i_in_active;
2474     AFE: for (reverse 0..$#$active_formatting_elements) {
2475     if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
2476     $formatting_element = $active_formatting_elements->[$_];
2477     $formatting_element_i_in_active = $_;
2478     last AFE;
2479     } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
2480     last AFE;
2481     }
2482     } # AFE
2483     unless (defined $formatting_element) {
2484 wakaba 1.3 !!!parse-error (type => 'unmatched end tag:'.$tag_name);
2485 wakaba 1.1 ## Ignore the token
2486     !!!next-token;
2487     return;
2488     }
2489     ## has an element in scope
2490     my $in_scope = 1;
2491     my $formatting_element_i_in_open;
2492 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2493     my $node = $self->{open_elements}->[$_];
2494 wakaba 1.1 if ($node->[0] eq $formatting_element->[0]) {
2495     if ($in_scope) {
2496     $formatting_element_i_in_open = $_;
2497     last INSCOPE;
2498     } else { # in open elements but not in scope
2499 wakaba 1.4 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2500 wakaba 1.1 ## Ignore the token
2501     !!!next-token;
2502     return;
2503     }
2504     } elsif ({
2505     table => 1, caption => 1, td => 1, th => 1,
2506     button => 1, marquee => 1, object => 1, html => 1,
2507     }->{$node->[1]}) {
2508     $in_scope = 0;
2509     }
2510     } # INSCOPE
2511     unless (defined $formatting_element_i_in_open) {
2512 wakaba 1.4 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2513 wakaba 1.1 pop @$active_formatting_elements; # $formatting_element
2514     !!!next-token; ## TODO: ok?
2515     return;
2516     }
2517 wakaba 1.3 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
2518 wakaba 1.4 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2519 wakaba 1.1 }
2520    
2521     ## Step 2
2522     my $furthest_block;
2523     my $furthest_block_i_in_open;
2524 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
2525     my $node = $self->{open_elements}->[$_];
2526 wakaba 1.1 if (not $formatting_category->{$node->[1]} and
2527     #not $phrasing_category->{$node->[1]} and
2528     ($special_category->{$node->[1]} or
2529     $scoping_category->{$node->[1]})) {
2530     $furthest_block = $node;
2531     $furthest_block_i_in_open = $_;
2532     } elsif ($node->[0] eq $formatting_element->[0]) {
2533     last OE;
2534     }
2535     } # OE
2536    
2537     ## Step 3
2538     unless (defined $furthest_block) { # MUST
2539 wakaba 1.3 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
2540 wakaba 1.1 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
2541     !!!next-token;
2542     return;
2543     }
2544    
2545     ## Step 4
2546 wakaba 1.3 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
2547 wakaba 1.1
2548     ## Step 5
2549     my $furthest_block_parent = $furthest_block->[0]->parent_node;
2550     if (defined $furthest_block_parent) {
2551     $furthest_block_parent->remove_child ($furthest_block->[0]);
2552     }
2553    
2554     ## Step 6
2555     my $bookmark_prev_el
2556     = $active_formatting_elements->[$formatting_element_i_in_active - 1]
2557     ->[0];
2558    
2559     ## Step 7
2560     my $node = $furthest_block;
2561     my $node_i_in_open = $furthest_block_i_in_open;
2562     my $last_node = $furthest_block;
2563     S7: {
2564     ## Step 1
2565     $node_i_in_open--;
2566 wakaba 1.3 $node = $self->{open_elements}->[$node_i_in_open];
2567 wakaba 1.1
2568     ## Step 2
2569     my $node_i_in_active;
2570     S7S2: {
2571     for (reverse 0..$#$active_formatting_elements) {
2572     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2573     $node_i_in_active = $_;
2574     last S7S2;
2575     }
2576     }
2577 wakaba 1.3 splice @{$self->{open_elements}}, $node_i_in_open, 1;
2578 wakaba 1.1 redo S7;
2579     } # S7S2
2580    
2581     ## Step 3
2582     last S7 if $node->[0] eq $formatting_element->[0];
2583    
2584     ## Step 4
2585     if ($last_node->[0] eq $furthest_block->[0]) {
2586     $bookmark_prev_el = $node->[0];
2587     }
2588    
2589     ## Step 5
2590     if ($node->[0]->has_child_nodes ()) {
2591     my $clone = [$node->[0]->clone_node (0), $node->[1]];
2592     $active_formatting_elements->[$node_i_in_active] = $clone;
2593 wakaba 1.3 $self->{open_elements}->[$node_i_in_open] = $clone;
2594 wakaba 1.1 $node = $clone;
2595     }
2596    
2597     ## Step 6
2598     $node->[0]->append_child ($last_node->[0]);
2599    
2600     ## Step 7
2601     $last_node = $node;
2602    
2603     ## Step 8
2604     redo S7;
2605     } # S7
2606    
2607     ## Step 8
2608     $common_ancestor_node->[0]->append_child ($last_node->[0]);
2609    
2610     ## Step 9
2611     my $clone = [$formatting_element->[0]->clone_node (0),
2612     $formatting_element->[1]];
2613    
2614     ## Step 10
2615     my @cn = @{$furthest_block->[0]->child_nodes};
2616     $clone->[0]->append_child ($_) for @cn;
2617    
2618     ## Step 11
2619     $furthest_block->[0]->append_child ($clone->[0]);
2620    
2621     ## Step 12
2622     my $i;
2623     AFE: for (reverse 0..$#$active_formatting_elements) {
2624     if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
2625     splice @$active_formatting_elements, $_, 1;
2626     $i-- and last AFE if defined $i;
2627     } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
2628     $i = $_;
2629     }
2630     } # AFE
2631     splice @$active_formatting_elements, $i + 1, 0, $clone;
2632    
2633     ## Step 13
2634     undef $i;
2635 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
2636     if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
2637     splice @{$self->{open_elements}}, $_, 1;
2638 wakaba 1.1 $i-- and last OE if defined $i;
2639 wakaba 1.3 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
2640 wakaba 1.1 $i = $_;
2641     }
2642     } # OE
2643 wakaba 1.3 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
2644 wakaba 1.1
2645     ## Step 14
2646     redo FET;
2647     } # FET
2648     }; # $formatting_end_tag
2649    
2650     my $insert_to_current = sub {
2651 wakaba 1.25 $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
2652 wakaba 1.1 }; # $insert_to_current
2653    
2654     my $insert_to_foster = sub {
2655     my $child = shift;
2656     if ({
2657     table => 1, tbody => 1, tfoot => 1,
2658     thead => 1, tr => 1,
2659 wakaba 1.3 }->{$self->{open_elements}->[-1]->[1]}) {
2660 wakaba 1.1 # MUST
2661     my $foster_parent_element;
2662     my $next_sibling;
2663 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
2664     if ($self->{open_elements}->[$_]->[1] eq 'table') {
2665     my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
2666 wakaba 1.1 if (defined $parent and $parent->node_type == 1) {
2667     $foster_parent_element = $parent;
2668 wakaba 1.3 $next_sibling = $self->{open_elements}->[$_]->[0];
2669 wakaba 1.1 } else {
2670     $foster_parent_element
2671 wakaba 1.3 = $self->{open_elements}->[$_ - 1]->[0];
2672 wakaba 1.1 }
2673     last OE;
2674     }
2675     } # OE
2676 wakaba 1.3 $foster_parent_element = $self->{open_elements}->[0]->[0]
2677 wakaba 1.1 unless defined $foster_parent_element;
2678     $foster_parent_element->insert_before
2679     ($child, $next_sibling);
2680     } else {
2681 wakaba 1.3 $self->{open_elements}->[-1]->[0]->append_child ($child);
2682 wakaba 1.1 }
2683     }; # $insert_to_foster
2684    
2685 wakaba 1.52 my $insert;
2686 wakaba 1.34
2687 wakaba 1.52 B: {
2688 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
2689 wakaba 1.52 !!!parse-error (type => 'DOCTYPE in the middle');
2690     ## Ignore the token
2691     ## Stay in the phase
2692     !!!next-token;
2693     redo B;
2694 wakaba 1.55 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
2695 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2696 wakaba 1.52 #
2697     } else {
2698     ## Generate implied end tags
2699     if ({
2700     dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2701     tbody => 1, tfoot=> 1, thead => 1,
2702     }->{$self->{open_elements}->[-1]->[1]}) {
2703     !!!back-token;
2704 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $self->{open_elements}->[-1]->[1]};
2705 wakaba 1.52 redo B;
2706     }
2707    
2708     if (@{$self->{open_elements}} > 2 or
2709     (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
2710     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2711     } elsif (defined $self->{inner_html_node} and
2712     @{$self->{open_elements}} > 1 and
2713     $self->{open_elements}->[1]->[1] ne 'body') {
2714     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2715 wakaba 1.34 }
2716    
2717 wakaba 1.52 ## ISSUE: There is an issue in the spec.
2718     }
2719    
2720     ## Stop parsing
2721     last B;
2722 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN and
2723 wakaba 1.52 $token->{tag_name} eq 'html') {
2724 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
2725 wakaba 1.52 ## Turn into the main phase
2726     !!!parse-error (type => 'after html:html');
2727 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
2728     } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
2729 wakaba 1.52 ## Turn into the main phase
2730     !!!parse-error (type => 'after html:html');
2731 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
2732 wakaba 1.52 }
2733    
2734     ## ISSUE: "aa<html>" is not a parse error.
2735     ## ISSUE: "<html>" in fragment is not a parse error.
2736     unless ($token->{first_start_tag}) {
2737     !!!parse-error (type => 'not first start tag');
2738     }
2739     my $top_el = $self->{open_elements}->[0]->[0];
2740     for my $attr_name (keys %{$token->{attributes}}) {
2741     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2742     $top_el->set_attribute_ns
2743     (undef, [undef, $attr_name],
2744     $token->{attributes}->{$attr_name}->{value});
2745     }
2746     }
2747     !!!next-token;
2748     redo B;
2749 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
2750 wakaba 1.52 my $comment = $self->{document}->create_comment ($token->{data});
2751 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2752 wakaba 1.52 $self->{document}->append_child ($comment);
2753 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
2754 wakaba 1.52 $self->{open_elements}->[0]->[0]->append_child ($comment);
2755     } else {
2756     $self->{open_elements}->[-1]->[0]->append_child ($comment);
2757     }
2758     !!!next-token;
2759     redo B;
2760 wakaba 1.56 } elsif ($self->{insertion_mode} & HEAD_IMS) {
2761 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
2762 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2763     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
2764     unless (length $token->{data}) {
2765     !!!next-token;
2766     redo B;
2767 wakaba 1.1 }
2768     }
2769 wakaba 1.52
2770 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2771 wakaba 1.52 ## As if <head>
2772     !!!create-element ($self->{head_element}, 'head');
2773     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2774     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2775    
2776     ## Reprocess in the "in head" insertion mode...
2777     pop @{$self->{open_elements}};
2778    
2779     ## Reprocess in the "after head" insertion mode...
2780 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2781 wakaba 1.52 ## As if </noscript>
2782     pop @{$self->{open_elements}};
2783     !!!parse-error (type => 'in noscript:#character');
2784 wakaba 1.1
2785 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
2786     ## As if </head>
2787     pop @{$self->{open_elements}};
2788    
2789     ## Reprocess in the "after head" insertion mode...
2790 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2791 wakaba 1.52 pop @{$self->{open_elements}};
2792    
2793     ## Reprocess in the "after head" insertion mode...
2794 wakaba 1.1 }
2795 wakaba 1.52
2796     ## "after head" insertion mode
2797     ## As if <body>
2798     !!!insert-element ('body');
2799 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
2800 wakaba 1.52 ## reprocess
2801     redo B;
2802 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
2803 wakaba 1.52 if ($token->{tag_name} eq 'head') {
2804 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2805 wakaba 1.52 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
2806     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2807     push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
2808 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2809 wakaba 1.52 !!!next-token;
2810     redo B;
2811 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2812     #
2813     } else {
2814 wakaba 1.52 !!!parse-error (type => 'in head:head'); # or in head noscript
2815     ## Ignore the token
2816     !!!next-token;
2817     redo B;
2818     }
2819 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2820 wakaba 1.52 ## As if <head>
2821     !!!create-element ($self->{head_element}, 'head');
2822     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2823     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2824    
2825 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2826 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
2827 wakaba 1.1 }
2828 wakaba 1.52
2829 wakaba 1.49 if ($token->{tag_name} eq 'base') {
2830 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2831 wakaba 1.49 ## As if </noscript>
2832     pop @{$self->{open_elements}};
2833     !!!parse-error (type => 'in noscript:base');
2834    
2835 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2836 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2837     }
2838    
2839     ## NOTE: There is a "as if in head" code clone.
2840 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2841 wakaba 1.49 !!!parse-error (type => 'after head:'.$token->{tag_name});
2842     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2843     }
2844     !!!insert-element ($token->{tag_name}, $token->{attributes});
2845     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2846     pop @{$self->{open_elements}}
2847 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2848 wakaba 1.49 !!!next-token;
2849     redo B;
2850     } elsif ($token->{tag_name} eq 'link') {
2851 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2852 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2853 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2854     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2855     }
2856     !!!insert-element ($token->{tag_name}, $token->{attributes});
2857     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2858     pop @{$self->{open_elements}}
2859 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2860 wakaba 1.1 !!!next-token;
2861 wakaba 1.25 redo B;
2862 wakaba 1.34 } elsif ($token->{tag_name} eq 'meta') {
2863     ## NOTE: There is a "as if in head" code clone.
2864 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2865 wakaba 1.34 !!!parse-error (type => 'after head:'.$token->{tag_name});
2866     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2867     }
2868     !!!insert-element ($token->{tag_name}, $token->{attributes});
2869     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2870    
2871     unless ($self->{confident}) {
2872     if ($token->{attributes}->{charset}) { ## TODO: And if supported
2873 wakaba 1.63 $self->{change_encoding}
2874     ->($self, $token->{attributes}->{charset}->{value});
2875     } elsif ($token->{attributes}->{content}) {
2876 wakaba 1.35 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2877 wakaba 1.63 if ($token->{attributes}->{content}->{value}
2878 wakaba 1.34 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2879     [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2880     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2881 wakaba 1.63 $self->{change_encoding}
2882     ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
2883     }
2884 wakaba 1.34 }
2885     }
2886    
2887     pop @{$self->{open_elements}}
2888 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2889 wakaba 1.34 !!!next-token;
2890     redo B;
2891 wakaba 1.49 } elsif ($token->{tag_name} eq 'title') {
2892 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2893 wakaba 1.49 ## As if </noscript>
2894     pop @{$self->{open_elements}};
2895     !!!parse-error (type => 'in noscript:title');
2896    
2897 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2898 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2899 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2900 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2901     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2902     }
2903 wakaba 1.49
2904     ## NOTE: There is a "as if in head" code clone.
2905 wakaba 1.31 my $parent = defined $self->{head_element} ? $self->{head_element}
2906     : $self->{open_elements}->[-1]->[0];
2907 wakaba 1.40 $parse_rcdata->(RCDATA_CONTENT_MODEL,
2908     sub { $parent->append_child ($_[0]) });
2909 wakaba 1.25 pop @{$self->{open_elements}}
2910 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2911 wakaba 1.25 redo B;
2912     } elsif ($token->{tag_name} eq 'style') {
2913     ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
2914 wakaba 1.54 ## insertion mode IN_HEAD_IM)
2915 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2916 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2917 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2918     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2919     }
2920 wakaba 1.40 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
2921 wakaba 1.25 pop @{$self->{open_elements}}
2922 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2923 wakaba 1.25 redo B;
2924     } elsif ($token->{tag_name} eq 'noscript') {
2925 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_IM) {
2926 wakaba 1.25 ## NOTE: and scripting is disabled
2927     !!!insert-element ($token->{tag_name}, $token->{attributes});
2928 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
2929 wakaba 1.1 !!!next-token;
2930 wakaba 1.25 redo B;
2931 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2932 wakaba 1.30 !!!parse-error (type => 'in noscript:noscript');
2933 wakaba 1.1 ## Ignore the token
2934 wakaba 1.41 !!!next-token;
2935 wakaba 1.25 redo B;
2936 wakaba 1.1 } else {
2937 wakaba 1.25 #
2938 wakaba 1.1 }
2939 wakaba 1.49 } elsif ($token->{tag_name} eq 'script') {
2940 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2941 wakaba 1.49 ## As if </noscript>
2942     pop @{$self->{open_elements}};
2943     !!!parse-error (type => 'in noscript:script');
2944    
2945 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2946 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2947 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2948 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2949     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2950     }
2951 wakaba 1.49
2952 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2953     $script_start_tag->($insert_to_current);
2954     pop @{$self->{open_elements}}
2955 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2956 wakaba 1.1 redo B;
2957 wakaba 1.49 } elsif ($token->{tag_name} eq 'body' or
2958 wakaba 1.25 $token->{tag_name} eq 'frameset') {
2959 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2960 wakaba 1.49 ## As if </noscript>
2961     pop @{$self->{open_elements}};
2962     !!!parse-error (type => 'in noscript:'.$token->{tag_name});
2963    
2964     ## Reprocess in the "in head" insertion mode...
2965     ## As if </head>
2966     pop @{$self->{open_elements}};
2967    
2968     ## Reprocess in the "after head" insertion mode...
2969 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2970 wakaba 1.49 pop @{$self->{open_elements}};
2971    
2972     ## Reprocess in the "after head" insertion mode...
2973     }
2974    
2975     ## "after head" insertion mode
2976     !!!insert-element ($token->{tag_name}, $token->{attributes});
2977 wakaba 1.54 if ($token->{tag_name} eq 'body') {
2978     $self->{insertion_mode} = IN_BODY_IM;
2979     } elsif ($token->{tag_name} eq 'frameset') {
2980     $self->{insertion_mode} = IN_FRAMESET_IM;
2981     } else {
2982     die "$0: tag name: $self->{tag_name}";
2983     }
2984 wakaba 1.1 !!!next-token;
2985     redo B;
2986     } else {
2987     #
2988     }
2989 wakaba 1.49
2990 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2991 wakaba 1.49 ## As if </noscript>
2992     pop @{$self->{open_elements}};
2993     !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
2994    
2995     ## Reprocess in the "in head" insertion mode...
2996     ## As if </head>
2997 wakaba 1.25 pop @{$self->{open_elements}};
2998 wakaba 1.49
2999     ## Reprocess in the "after head" insertion mode...
3000 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3001 wakaba 1.49 ## As if </head>
3002 wakaba 1.25 pop @{$self->{open_elements}};
3003 wakaba 1.49
3004     ## Reprocess in the "after head" insertion mode...
3005     }
3006    
3007     ## "after head" insertion mode
3008     ## As if <body>
3009     !!!insert-element ('body');
3010 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
3011 wakaba 1.49 ## reprocess
3012     redo B;
3013 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
3014 wakaba 1.49 if ($token->{tag_name} eq 'head') {
3015 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3016 wakaba 1.50 ## As if <head>
3017     !!!create-element ($self->{head_element}, 'head');
3018     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3019     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3020    
3021     ## Reprocess in the "in head" insertion mode...
3022     pop @{$self->{open_elements}};
3023 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3024 wakaba 1.50 !!!next-token;
3025     redo B;
3026 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3027 wakaba 1.49 ## As if </noscript>
3028     pop @{$self->{open_elements}};
3029     !!!parse-error (type => 'in noscript:script');
3030    
3031     ## Reprocess in the "in head" insertion mode...
3032 wakaba 1.50 pop @{$self->{open_elements}};
3033 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3034 wakaba 1.50 !!!next-token;
3035     redo B;
3036 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3037 wakaba 1.49 pop @{$self->{open_elements}};
3038 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3039 wakaba 1.49 !!!next-token;
3040     redo B;
3041     } else {
3042     #
3043     }
3044     } elsif ($token->{tag_name} eq 'noscript') {
3045 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3046 wakaba 1.49 pop @{$self->{open_elements}};
3047 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3048 wakaba 1.49 !!!next-token;
3049     redo B;
3050 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3051 wakaba 1.50 !!!parse-error (type => 'unmatched end tag:noscript');
3052     ## Ignore the token ## ISSUE: An issue in the spec.
3053     !!!next-token;
3054     redo B;
3055 wakaba 1.49 } else {
3056     #
3057     }
3058     } elsif ({
3059 wakaba 1.31 body => 1, html => 1,
3060     }->{$token->{tag_name}}) {
3061 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3062 wakaba 1.50 ## As if <head>
3063     !!!create-element ($self->{head_element}, 'head');
3064     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3065     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3066    
3067 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3068 wakaba 1.50 ## Reprocess in the "in head" insertion mode...
3069 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3070 wakaba 1.49 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3071     ## Ignore the token
3072     !!!next-token;
3073     redo B;
3074     }
3075 wakaba 1.50
3076     #
3077 wakaba 1.49 } elsif ({
3078 wakaba 1.31 p => 1, br => 1,
3079     }->{$token->{tag_name}}) {
3080 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3081 wakaba 1.50 ## As if <head>
3082     !!!create-element ($self->{head_element}, 'head');
3083     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3084     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3085    
3086 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3087 wakaba 1.50 ## Reprocess in the "in head" insertion mode...
3088     }
3089    
3090 wakaba 1.1 #
3091 wakaba 1.25 } else {
3092 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3093     #
3094     } else {
3095 wakaba 1.49 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3096     ## Ignore the token
3097     !!!next-token;
3098     redo B;
3099     }
3100     }
3101    
3102 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3103 wakaba 1.49 ## As if </noscript>
3104     pop @{$self->{open_elements}};
3105     !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3106    
3107     ## Reprocess in the "in head" insertion mode...
3108     ## As if </head>
3109     pop @{$self->{open_elements}};
3110    
3111     ## Reprocess in the "after head" insertion mode...
3112 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3113 wakaba 1.49 ## As if </head>
3114     pop @{$self->{open_elements}};
3115    
3116     ## Reprocess in the "after head" insertion mode...
3117 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3118 wakaba 1.50 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3119     ## Ignore the token ## ISSUE: An issue in the spec.
3120     !!!next-token;
3121     redo B;
3122 wakaba 1.1 }
3123    
3124 wakaba 1.49 ## "after head" insertion mode
3125     ## As if <body>
3126 wakaba 1.52 !!!insert-element ('body');
3127 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
3128 wakaba 1.52 ## reprocess
3129     redo B;
3130     } else {
3131     die "$0: $token->{type}: Unknown token type";
3132     }
3133    
3134     ## ISSUE: An issue in the spec.
3135 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_IMS) {
3136 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
3137 wakaba 1.52 ## NOTE: There is a code clone of "character in body".
3138     $reconstruct_active_formatting_elements->($insert_to_current);
3139    
3140     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3141    
3142     !!!next-token;
3143     redo B;
3144 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
3145 wakaba 1.52 if ({
3146     caption => 1, col => 1, colgroup => 1, tbody => 1,
3147     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3148     }->{$token->{tag_name}}) {
3149 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
3150 wakaba 1.52 ## have an element in table scope
3151     my $tn;
3152     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3153     my $node = $self->{open_elements}->[$_];
3154     if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3155     $tn = $node->[1];
3156     last INSCOPE;
3157     } elsif ({
3158     table => 1, html => 1,
3159     }->{$node->[1]}) {
3160     last INSCOPE;
3161     }
3162     } # INSCOPE
3163     unless (defined $tn) {
3164     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3165     ## Ignore the token
3166     !!!next-token;
3167     redo B;
3168     }
3169    
3170     ## Close the cell
3171     !!!back-token; # <?>
3172 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3173 wakaba 1.52 redo B;
3174 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3175 wakaba 1.52 !!!parse-error (type => 'not closed:caption');
3176    
3177     ## As if </caption>
3178     ## have a table element in table scope
3179     my $i;
3180     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3181     my $node = $self->{open_elements}->[$_];
3182     if ($node->[1] eq 'caption') {
3183     $i = $_;
3184     last INSCOPE;
3185     } elsif ({
3186     table => 1, html => 1,
3187     }->{$node->[1]}) {
3188     last INSCOPE;
3189     }
3190     } # INSCOPE
3191     unless (defined $i) {
3192     !!!parse-error (type => 'unmatched end tag:caption');
3193     ## Ignore the token
3194     !!!next-token;
3195     redo B;
3196     }
3197    
3198     ## generate implied end tags
3199     if ({
3200     dd => 1, dt => 1, li => 1, p => 1,
3201     td => 1, th => 1, tr => 1,
3202     tbody => 1, tfoot=> 1, thead => 1,
3203     }->{$self->{open_elements}->[-1]->[1]}) {
3204     !!!back-token; # <?>
3205 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3206 wakaba 1.52 !!!back-token;
3207 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3208 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3209     redo B;
3210     }
3211    
3212     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3213     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3214     }
3215    
3216     splice @{$self->{open_elements}}, $i;
3217    
3218     $clear_up_to_marker->();
3219    
3220 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3221 wakaba 1.52
3222     ## reprocess
3223     redo B;
3224     } else {
3225     #
3226     }
3227     } else {
3228     #
3229     }
3230 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
3231 wakaba 1.52 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3232 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
3233 wakaba 1.43 ## have an element in table scope
3234 wakaba 1.52 my $i;
3235 wakaba 1.43 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3236     my $node = $self->{open_elements}->[$_];
3237 wakaba 1.52 if ($node->[1] eq $token->{tag_name}) {
3238     $i = $_;
3239 wakaba 1.43 last INSCOPE;
3240     } elsif ({
3241     table => 1, html => 1,
3242     }->{$node->[1]}) {
3243     last INSCOPE;
3244     }
3245     } # INSCOPE
3246 wakaba 1.52 unless (defined $i) {
3247 wakaba 1.43 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3248     ## Ignore the token
3249     !!!next-token;
3250     redo B;
3251     }
3252    
3253 wakaba 1.52 ## generate implied end tags
3254     if ({
3255     dd => 1, dt => 1, li => 1, p => 1,
3256     td => ($token->{tag_name} eq 'th'),
3257     th => ($token->{tag_name} eq 'td'),
3258     tr => 1,
3259     tbody => 1, tfoot=> 1, thead => 1,
3260     }->{$self->{open_elements}->[-1]->[1]}) {
3261     !!!back-token;
3262 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3263 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3264     redo B;
3265     }
3266    
3267     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3268     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3269     }
3270    
3271     splice @{$self->{open_elements}}, $i;
3272    
3273     $clear_up_to_marker->();
3274    
3275 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
3276 wakaba 1.52
3277     !!!next-token;
3278 wakaba 1.43 redo B;
3279 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3280 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3281     ## Ignore the token
3282     !!!next-token;
3283     redo B;
3284     } else {
3285     #
3286     }
3287     } elsif ($token->{tag_name} eq 'caption') {
3288 wakaba 1.54 if ($self->{insertion_mode} == IN_CAPTION_IM) {
3289 wakaba 1.43 ## have a table element in table scope
3290     my $i;
3291     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3292     my $node = $self->{open_elements}->[$_];
3293 wakaba 1.52 if ($node->[1] eq $token->{tag_name}) {
3294 wakaba 1.43 $i = $_;
3295     last INSCOPE;
3296     } elsif ({
3297     table => 1, html => 1,
3298     }->{$node->[1]}) {
3299     last INSCOPE;
3300     }
3301     } # INSCOPE
3302     unless (defined $i) {
3303 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3304 wakaba 1.43 ## Ignore the token
3305     !!!next-token;
3306     redo B;
3307     }
3308    
3309     ## generate implied end tags
3310     if ({
3311     dd => 1, dt => 1, li => 1, p => 1,
3312     td => 1, th => 1, tr => 1,
3313     tbody => 1, tfoot=> 1, thead => 1,
3314     }->{$self->{open_elements}->[-1]->[1]}) {
3315     !!!back-token;
3316 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3317 wakaba 1.43 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3318     redo B;
3319     }
3320 wakaba 1.52
3321     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3322     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3323     }
3324    
3325     splice @{$self->{open_elements}}, $i;
3326    
3327     $clear_up_to_marker->();
3328    
3329 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3330 wakaba 1.52
3331     !!!next-token;
3332     redo B;
3333 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
3334 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3335     ## Ignore the token
3336     !!!next-token;
3337     redo B;
3338     } else {
3339     #
3340     }
3341     } elsif ({
3342     table => 1, tbody => 1, tfoot => 1,
3343     thead => 1, tr => 1,
3344     }->{$token->{tag_name}} and
3345 wakaba 1.54 $self->{insertion_mode} == IN_CELL_IM) {
3346 wakaba 1.52 ## have an element in table scope
3347     my $i;
3348     my $tn;
3349     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3350     my $node = $self->{open_elements}->[$_];
3351     if ($node->[1] eq $token->{tag_name}) {
3352     $i = $_;
3353     last INSCOPE;
3354     } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3355     $tn = $node->[1];
3356     ## NOTE: There is exactly one |td| or |th| element
3357     ## in scope in the stack of open elements by definition.
3358     } elsif ({
3359     table => 1, html => 1,
3360     }->{$node->[1]}) {
3361     last INSCOPE;
3362     }
3363     } # INSCOPE
3364     unless (defined $i) {
3365     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3366     ## Ignore the token
3367     !!!next-token;
3368     redo B;
3369     }
3370    
3371     ## Close the cell
3372     !!!back-token; # </?>
3373 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3374 wakaba 1.52 redo B;
3375     } elsif ($token->{tag_name} eq 'table' and
3376 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
3377 wakaba 1.52 !!!parse-error (type => 'not closed:caption');
3378    
3379     ## As if </caption>
3380     ## have a table element in table scope
3381     my $i;
3382     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3383     my $node = $self->{open_elements}->[$_];
3384     if ($node->[1] eq 'caption') {
3385     $i = $_;
3386     last INSCOPE;
3387     } elsif ({
3388     table => 1, html => 1,
3389     }->{$node->[1]}) {
3390     last INSCOPE;
3391     }
3392     } # INSCOPE
3393     unless (defined $i) {
3394     !!!parse-error (type => 'unmatched end tag:caption');
3395     ## Ignore the token
3396     !!!next-token;
3397     redo B;
3398     }
3399    
3400     ## generate implied end tags
3401     if ({
3402     dd => 1, dt => 1, li => 1, p => 1,
3403     td => 1, th => 1, tr => 1,
3404     tbody => 1, tfoot=> 1, thead => 1,
3405     }->{$self->{open_elements}->[-1]->[1]}) {
3406     !!!back-token; # </table>
3407 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3408 wakaba 1.52 !!!back-token;
3409 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3410 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3411     redo B;
3412     }
3413    
3414     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3415     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3416     }
3417    
3418     splice @{$self->{open_elements}}, $i;
3419    
3420     $clear_up_to_marker->();
3421    
3422 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3423 wakaba 1.52
3424     ## reprocess
3425     redo B;
3426     } elsif ({
3427     body => 1, col => 1, colgroup => 1, html => 1,
3428     }->{$token->{tag_name}}) {
3429 wakaba 1.56 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
3430 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3431     ## Ignore the token
3432     !!!next-token;
3433     redo B;
3434     } else {
3435     #
3436     }
3437     } elsif ({
3438     tbody => 1, tfoot => 1,
3439     thead => 1, tr => 1,
3440     }->{$token->{tag_name}} and
3441 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
3442 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3443     ## Ignore the token
3444     !!!next-token;
3445     redo B;
3446     } else {
3447     #
3448     }
3449     } else {
3450     die "$0: $token->{type}: Unknown token type";
3451     }
3452    
3453     $insert = $insert_to_current;
3454     #
3455 wakaba 1.56 } elsif ($self->{insertion_mode} & TABLE_IMS) {
3456 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
3457 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3458     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3459    
3460     unless (length $token->{data}) {
3461     !!!next-token;
3462     redo B;
3463     }
3464     }
3465    
3466     !!!parse-error (type => 'in table:#character');
3467    
3468     ## As if in body, but insert into foster parent element
3469     ## ISSUE: Spec says that "whenever a node would be inserted
3470     ## into the current node" while characters might not
3471     ## result in a new Text node.
3472     $reconstruct_active_formatting_elements->($insert_to_foster);
3473    
3474     if ({
3475     table => 1, tbody => 1, tfoot => 1,
3476     thead => 1, tr => 1,
3477     }->{$self->{open_elements}->[-1]->[1]}) {
3478     # MUST
3479     my $foster_parent_element;
3480     my $next_sibling;
3481     my $prev_sibling;
3482     OE: for (reverse 0..$#{$self->{open_elements}}) {
3483     if ($self->{open_elements}->[$_]->[1] eq 'table') {
3484     my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3485     if (defined $parent and $parent->node_type == 1) {
3486     $foster_parent_element = $parent;
3487     $next_sibling = $self->{open_elements}->[$_]->[0];
3488     $prev_sibling = $next_sibling->previous_sibling;
3489     } else {
3490     $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3491     $prev_sibling = $foster_parent_element->last_child;
3492     }
3493     last OE;
3494     }
3495     } # OE
3496     $foster_parent_element = $self->{open_elements}->[0]->[0] and
3497     $prev_sibling = $foster_parent_element->last_child
3498     unless defined $foster_parent_element;
3499     if (defined $prev_sibling and
3500     $prev_sibling->node_type == 3) {
3501     $prev_sibling->manakai_append_text ($token->{data});
3502     } else {
3503     $foster_parent_element->insert_before
3504     ($self->{document}->create_text_node ($token->{data}),
3505     $next_sibling);
3506     }
3507     } else {
3508     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3509     }
3510    
3511     !!!next-token;
3512     redo B;
3513 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
3514 wakaba 1.52 if ({
3515 wakaba 1.54 tr => ($self->{insertion_mode} != IN_ROW_IM),
3516 wakaba 1.52 th => 1, td => 1,
3517     }->{$token->{tag_name}}) {
3518 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_IM) {
3519 wakaba 1.52 ## Clear back to table context
3520     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3521     $self->{open_elements}->[-1]->[1] ne 'html') {
3522 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3523 wakaba 1.52 pop @{$self->{open_elements}};
3524 wakaba 1.43 }
3525    
3526 wakaba 1.52 !!!insert-element ('tbody');
3527 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3528 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
3529     }
3530    
3531 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3532 wakaba 1.52 unless ($token->{tag_name} eq 'tr') {
3533     !!!parse-error (type => 'missing start tag:tr');
3534     }
3535 wakaba 1.43
3536 wakaba 1.52 ## Clear back to table body context
3537     while (not {
3538     tbody => 1, tfoot => 1, thead => 1, html => 1,
3539     }->{$self->{open_elements}->[-1]->[1]}) {
3540     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3541     pop @{$self->{open_elements}};
3542     }
3543 wakaba 1.43
3544 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
3545 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
3546     !!!insert-element ($token->{tag_name}, $token->{attributes});
3547     !!!next-token;
3548     redo B;
3549     } else {
3550     !!!insert-element ('tr');
3551     ## reprocess in the "in row" insertion mode
3552     }
3553     }
3554    
3555     ## Clear back to table row context
3556     while (not {
3557     tr => 1, html => 1,
3558     }->{$self->{open_elements}->[-1]->[1]}) {
3559     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3560     pop @{$self->{open_elements}};
3561 wakaba 1.43 }
3562 wakaba 1.52
3563     !!!insert-element ($token->{tag_name}, $token->{attributes});
3564 wakaba 1.54 $self->{insertion_mode} = IN_CELL_IM;
3565 wakaba 1.52
3566     push @$active_formatting_elements, ['#marker', ''];
3567    
3568     !!!next-token;
3569     redo B;
3570     } elsif ({
3571     caption => 1, col => 1, colgroup => 1,
3572     tbody => 1, tfoot => 1, thead => 1,
3573 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3574 wakaba 1.52 }->{$token->{tag_name}}) {
3575 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3576 wakaba 1.52 ## As if </tr>
3577 wakaba 1.43 ## have an element in table scope
3578     my $i;
3579     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3580     my $node = $self->{open_elements}->[$_];
3581 wakaba 1.52 if ($node->[1] eq 'tr') {
3582 wakaba 1.43 $i = $_;
3583     last INSCOPE;
3584     } elsif ({
3585     table => 1, html => 1,
3586     }->{$node->[1]}) {
3587     last INSCOPE;
3588     }
3589     } # INSCOPE
3590 wakaba 1.52 unless (defined $i) {
3591     !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
3592     ## Ignore the token
3593     !!!next-token;
3594 wakaba 1.43 redo B;
3595     }
3596    
3597 wakaba 1.52 ## Clear back to table row context
3598     while (not {
3599     tr => 1, html => 1,
3600     }->{$self->{open_elements}->[-1]->[1]}) {
3601 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3602 wakaba 1.52 pop @{$self->{open_elements}};
3603 wakaba 1.1 }
3604 wakaba 1.43
3605 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3606 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3607 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
3608     ## reprocess
3609     redo B;
3610     } else {
3611     ## reprocess in the "in table body" insertion mode...
3612     }
3613 wakaba 1.1 }
3614 wakaba 1.52
3615 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3616 wakaba 1.52 ## have an element in table scope
3617 wakaba 1.43 my $i;
3618     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3619     my $node = $self->{open_elements}->[$_];
3620 wakaba 1.52 if ({
3621     tbody => 1, thead => 1, tfoot => 1,
3622     }->{$node->[1]}) {
3623 wakaba 1.43 $i = $_;
3624     last INSCOPE;
3625     } elsif ({
3626     table => 1, html => 1,
3627     }->{$node->[1]}) {
3628     last INSCOPE;
3629     }
3630     } # INSCOPE
3631 wakaba 1.52 unless (defined $i) {
3632     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3633     ## Ignore the token
3634     !!!next-token;
3635 wakaba 1.43 redo B;
3636     }
3637 wakaba 1.52
3638     ## Clear back to table body context
3639     while (not {
3640     tbody => 1, tfoot => 1, thead => 1, html => 1,
3641     }->{$self->{open_elements}->[-1]->[1]}) {
3642 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3643 wakaba 1.52 pop @{$self->{open_elements}};
3644 wakaba 1.43 }
3645    
3646 wakaba 1.52 ## As if <{current node}>
3647     ## have an element in table scope
3648     ## true by definition
3649 wakaba 1.43
3650 wakaba 1.52 ## Clear back to table body context
3651     ## nop by definition
3652 wakaba 1.43
3653 wakaba 1.52 pop @{$self->{open_elements}};
3654 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3655 wakaba 1.52 ## reprocess in the "in table" insertion mode...
3656     }
3657    
3658     if ($token->{tag_name} eq 'col') {
3659     ## Clear back to table context
3660     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3661     $self->{open_elements}->[-1]->[1] ne 'html') {
3662     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3663     pop @{$self->{open_elements}};
3664     }
3665 wakaba 1.43
3666 wakaba 1.52 !!!insert-element ('colgroup');
3667 wakaba 1.54 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
3668 wakaba 1.52 ## reprocess
3669 wakaba 1.43 redo B;
3670 wakaba 1.52 } elsif ({
3671     caption => 1,
3672     colgroup => 1,
3673     tbody => 1, tfoot => 1, thead => 1,
3674     }->{$token->{tag_name}}) {
3675     ## Clear back to table context
3676     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3677     $self->{open_elements}->[-1]->[1] ne 'html') {
3678     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3679     pop @{$self->{open_elements}};
3680 wakaba 1.1 }
3681 wakaba 1.52
3682     push @$active_formatting_elements, ['#marker', '']
3683     if $token->{tag_name} eq 'caption';
3684    
3685     !!!insert-element ($token->{tag_name}, $token->{attributes});
3686     $self->{insertion_mode} = {
3687 wakaba 1.54 caption => IN_CAPTION_IM,
3688     colgroup => IN_COLUMN_GROUP_IM,
3689     tbody => IN_TABLE_BODY_IM,
3690     tfoot => IN_TABLE_BODY_IM,
3691     thead => IN_TABLE_BODY_IM,
3692 wakaba 1.52 }->{$token->{tag_name}};
3693 wakaba 1.1 !!!next-token;
3694     redo B;
3695 wakaba 1.52 } else {
3696     die "$0: in table: <>: $token->{tag_name}";
3697 wakaba 1.1 }
3698 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
3699     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3700 wakaba 1.1
3701 wakaba 1.52 ## As if </table>
3702 wakaba 1.1 ## have a table element in table scope
3703     my $i;
3704 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3705     my $node = $self->{open_elements}->[$_];
3706 wakaba 1.52 if ($node->[1] eq 'table') {
3707 wakaba 1.1 $i = $_;
3708     last INSCOPE;
3709     } elsif ({
3710     table => 1, html => 1,
3711     }->{$node->[1]}) {
3712     last INSCOPE;
3713     }
3714     } # INSCOPE
3715     unless (defined $i) {
3716 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:table');
3717     ## Ignore tokens </table><table>
3718 wakaba 1.1 !!!next-token;
3719     redo B;
3720     }
3721    
3722     ## generate implied end tags
3723     if ({
3724     dd => 1, dt => 1, li => 1, p => 1,
3725     td => 1, th => 1, tr => 1,
3726 wakaba 1.31 tbody => 1, tfoot=> 1, thead => 1,
3727 wakaba 1.3 }->{$self->{open_elements}->[-1]->[1]}) {
3728 wakaba 1.52 !!!back-token; # <table>
3729 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'table'};
3730 wakaba 1.1 !!!back-token;
3731 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3732 wakaba 1.3 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3733 wakaba 1.1 redo B;
3734     }
3735    
3736 wakaba 1.52 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3737 wakaba 1.3 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3738 wakaba 1.1 }
3739    
3740 wakaba 1.3 splice @{$self->{open_elements}}, $i;
3741 wakaba 1.1
3742 wakaba 1.52 $self->_reset_insertion_mode;
3743 wakaba 1.1
3744     ## reprocess
3745     redo B;
3746 wakaba 1.58 } else {
3747     !!!parse-error (type => 'in table:'.$token->{tag_name});
3748    
3749     $insert = $insert_to_foster;
3750     #
3751     }
3752     } elsif ($token->{type} == END_TAG_TOKEN) {
3753 wakaba 1.52 if ($token->{tag_name} eq 'tr' and
3754 wakaba 1.54 $self->{insertion_mode} == IN_ROW_IM) {
3755 wakaba 1.52 ## have an element in table scope
3756     my $i;
3757     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3758     my $node = $self->{open_elements}->[$_];
3759     if ($node->[1] eq $token->{tag_name}) {
3760     $i = $_;
3761     last INSCOPE;
3762     } elsif ({
3763     table => 1, html => 1,
3764     }->{$node->[1]}) {
3765     last INSCOPE;
3766     }
3767     } # INSCOPE
3768     unless (defined $i) {
3769     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3770     ## Ignore the token
3771 wakaba 1.42 !!!next-token;
3772     redo B;
3773     }
3774    
3775 wakaba 1.52 ## Clear back to table row context
3776     while (not {
3777     tr => 1, html => 1,
3778     }->{$self->{open_elements}->[-1]->[1]}) {
3779     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3780     pop @{$self->{open_elements}};
3781     }
3782 wakaba 1.42
3783 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3784 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3785 wakaba 1.52 !!!next-token;
3786     redo B;
3787     } elsif ($token->{tag_name} eq 'table') {
3788 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3789 wakaba 1.52 ## As if </tr>
3790     ## have an element in table scope
3791     my $i;
3792     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3793     my $node = $self->{open_elements}->[$_];
3794     if ($node->[1] eq 'tr') {
3795     $i = $_;
3796     last INSCOPE;
3797     } elsif ({
3798     table => 1, html => 1,
3799     }->{$node->[1]}) {
3800     last INSCOPE;
3801 wakaba 1.42 }
3802 wakaba 1.52 } # INSCOPE
3803     unless (defined $i) {
3804     !!!parse-error (type => 'unmatched end tag:'.$token->{type});
3805     ## Ignore the token
3806     !!!next-token;
3807     redo B;
3808 wakaba 1.42 }
3809 wakaba 1.52
3810     ## Clear back to table row context
3811     while (not {
3812     tr => 1, html => 1,
3813     }->{$self->{open_elements}->[-1]->[1]}) {
3814 wakaba 1.46 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3815     pop @{$self->{open_elements}};
3816 wakaba 1.1 }
3817 wakaba 1.46
3818 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3819 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3820 wakaba 1.46 ## reprocess in the "in table body" insertion mode...
3821 wakaba 1.1 }
3822    
3823 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3824 wakaba 1.52 ## have an element in table scope
3825     my $i;
3826     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3827     my $node = $self->{open_elements}->[$_];
3828     if ({
3829     tbody => 1, thead => 1, tfoot => 1,
3830     }->{$node->[1]}) {
3831     $i = $_;
3832     last INSCOPE;
3833     } elsif ({
3834     table => 1, html => 1,
3835     }->{$node->[1]}) {
3836     last INSCOPE;
3837     }
3838     } # INSCOPE
3839     unless (defined $i) {
3840     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3841     ## Ignore the token
3842     !!!next-token;
3843     redo B;
3844 wakaba 1.47 }
3845    
3846     ## Clear back to table body context
3847     while (not {
3848     tbody => 1, tfoot => 1, thead => 1, html => 1,
3849     }->{$self->{open_elements}->[-1]->[1]}) {
3850     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3851     pop @{$self->{open_elements}};
3852     }
3853    
3854 wakaba 1.52 ## As if <{current node}>
3855     ## have an element in table scope
3856     ## true by definition
3857    
3858     ## Clear back to table body context
3859     ## nop by definition
3860    
3861     pop @{$self->{open_elements}};
3862 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3863 wakaba 1.52 ## reprocess in the "in table" insertion mode...
3864     }
3865    
3866     ## have a table element in table scope
3867     my $i;
3868     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3869     my $node = $self->{open_elements}->[$_];
3870     if ($node->[1] eq $token->{tag_name}) {
3871     $i = $_;
3872     last INSCOPE;
3873     } elsif ({
3874     table => 1, html => 1,
3875     }->{$node->[1]}) {
3876     last INSCOPE;
3877 wakaba 1.47 }
3878 wakaba 1.52 } # INSCOPE
3879     unless (defined $i) {
3880     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3881     ## Ignore the token
3882     !!!next-token;
3883     redo B;
3884 wakaba 1.3 }
3885    
3886 wakaba 1.52 ## generate implied end tags
3887     if ({
3888     dd => 1, dt => 1, li => 1, p => 1,
3889     td => 1, th => 1, tr => 1,
3890     tbody => 1, tfoot=> 1, thead => 1,
3891     }->{$self->{open_elements}->[-1]->[1]}) {
3892     !!!back-token;
3893 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3894 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3895     redo B;
3896     }
3897    
3898     if ($self->{open_elements}->[-1]->[1] ne 'table') {
3899 wakaba 1.3 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3900 wakaba 1.1 }
3901 wakaba 1.52
3902     splice @{$self->{open_elements}}, $i;
3903 wakaba 1.1
3904 wakaba 1.52 $self->_reset_insertion_mode;
3905 wakaba 1.47
3906     !!!next-token;
3907     redo B;
3908     } elsif ({
3909 wakaba 1.48 tbody => 1, tfoot => 1, thead => 1,
3910 wakaba 1.52 }->{$token->{tag_name}} and
3911 wakaba 1.56 $self->{insertion_mode} & ROW_IMS) {
3912 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3913 wakaba 1.52 ## have an element in table scope
3914     my $i;
3915     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3916     my $node = $self->{open_elements}->[$_];
3917     if ($node->[1] eq $token->{tag_name}) {
3918     $i = $_;
3919     last INSCOPE;
3920     } elsif ({
3921     table => 1, html => 1,
3922     }->{$node->[1]}) {
3923     last INSCOPE;
3924     }
3925     } # INSCOPE
3926     unless (defined $i) {
3927     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3928     ## Ignore the token
3929     !!!next-token;
3930     redo B;
3931     }
3932    
3933 wakaba 1.48 ## As if </tr>
3934     ## have an element in table scope
3935     my $i;
3936     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3937     my $node = $self->{open_elements}->[$_];
3938     if ($node->[1] eq 'tr') {
3939     $i = $_;
3940     last INSCOPE;
3941     } elsif ({
3942     table => 1, html => 1,
3943     }->{$node->[1]}) {
3944     last INSCOPE;
3945     }
3946     } # INSCOPE
3947 wakaba 1.52 unless (defined $i) {
3948     !!!parse-error (type => 'unmatched end tag:tr');
3949     ## Ignore the token
3950     !!!next-token;
3951     redo B;
3952     }
3953 wakaba 1.48
3954     ## Clear back to table row context
3955     while (not {
3956     tr => 1, html => 1,
3957     }->{$self->{open_elements}->[-1]->[1]}) {
3958     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3959     pop @{$self->{open_elements}};
3960     }
3961    
3962     pop @{$self->{open_elements}}; # tr
3963 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3964 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
3965     }
3966    
3967     ## have an element in table scope
3968     my $i;
3969     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3970     my $node = $self->{open_elements}->[$_];
3971     if ($node->[1] eq $token->{tag_name}) {
3972     $i = $_;
3973     last INSCOPE;
3974     } elsif ({
3975     table => 1, html => 1,
3976     }->{$node->[1]}) {
3977     last INSCOPE;
3978     }
3979     } # INSCOPE
3980     unless (defined $i) {
3981     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3982     ## Ignore the token
3983     !!!next-token;
3984     redo B;
3985     }
3986    
3987     ## Clear back to table body context
3988     while (not {
3989     tbody => 1, tfoot => 1, thead => 1, html => 1,
3990     }->{$self->{open_elements}->[-1]->[1]}) {
3991     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3992     pop @{$self->{open_elements}};
3993     }
3994    
3995     pop @{$self->{open_elements}};
3996 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3997 wakaba 1.52 !!!next-token;
3998     redo B;
3999     } elsif ({
4000     body => 1, caption => 1, col => 1, colgroup => 1,
4001     html => 1, td => 1, th => 1,
4002 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4003     tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
4004 wakaba 1.52 }->{$token->{tag_name}}) {
4005     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4006     ## Ignore the token
4007     !!!next-token;
4008     redo B;
4009 wakaba 1.58 } else {
4010     !!!parse-error (type => 'in table:/'.$token->{tag_name});
4011 wakaba 1.52
4012 wakaba 1.58 $insert = $insert_to_foster;
4013     #
4014     }
4015     } else {
4016     die "$0: $token->{type}: Unknown token type";
4017     }
4018 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4019 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4020 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4021     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4022     unless (length $token->{data}) {
4023     !!!next-token;
4024     redo B;
4025     }
4026     }
4027    
4028     #
4029 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4030 wakaba 1.52 if ($token->{tag_name} eq 'col') {
4031     !!!insert-element ($token->{tag_name}, $token->{attributes});
4032     pop @{$self->{open_elements}};
4033     !!!next-token;
4034     redo B;
4035     } else {
4036     #
4037     }
4038 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4039 wakaba 1.52 if ($token->{tag_name} eq 'colgroup') {
4040     if ($self->{open_elements}->[-1]->[1] eq 'html') {
4041     !!!parse-error (type => 'unmatched end tag:colgroup');
4042     ## Ignore the token
4043     !!!next-token;
4044     redo B;
4045     } else {
4046     pop @{$self->{open_elements}}; # colgroup
4047 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4048 wakaba 1.52 !!!next-token;
4049     redo B;
4050     }
4051     } elsif ($token->{tag_name} eq 'col') {
4052     !!!parse-error (type => 'unmatched end tag:col');
4053     ## Ignore the token
4054     !!!next-token;
4055     redo B;
4056     } else {
4057     #
4058     }
4059     } else {
4060     #
4061     }
4062    
4063     ## As if </colgroup>
4064     if ($self->{open_elements}->[-1]->[1] eq 'html') {
4065     !!!parse-error (type => 'unmatched end tag:colgroup');
4066     ## Ignore the token
4067     !!!next-token;
4068     redo B;
4069     } else {
4070     pop @{$self->{open_elements}}; # colgroup
4071 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4072 wakaba 1.52 ## reprocess
4073     redo B;
4074     }
4075 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
4076 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
4077     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4078     !!!next-token;
4079     redo B;
4080     } elsif ($token->{type} == START_TAG_TOKEN) {
4081 wakaba 1.52 if ($token->{tag_name} eq 'option') {
4082     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4083     ## As if </option>
4084     pop @{$self->{open_elements}};
4085     }
4086    
4087     !!!insert-element ($token->{tag_name}, $token->{attributes});
4088     !!!next-token;
4089     redo B;
4090     } elsif ($token->{tag_name} eq 'optgroup') {
4091     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4092     ## As if </option>
4093     pop @{$self->{open_elements}};
4094     }
4095    
4096     if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4097     ## As if </optgroup>
4098     pop @{$self->{open_elements}};
4099     }
4100    
4101     !!!insert-element ($token->{tag_name}, $token->{attributes});
4102     !!!next-token;
4103     redo B;
4104     } elsif ($token->{tag_name} eq 'select') {
4105     !!!parse-error (type => 'not closed:select');
4106     ## As if </select> instead
4107     ## have an element in table scope
4108     my $i;
4109     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4110     my $node = $self->{open_elements}->[$_];
4111     if ($node->[1] eq $token->{tag_name}) {
4112     $i = $_;
4113     last INSCOPE;
4114     } elsif ({
4115     table => 1, html => 1,
4116     }->{$node->[1]}) {
4117     last INSCOPE;
4118 wakaba 1.47 }
4119 wakaba 1.52 } # INSCOPE
4120     unless (defined $i) {
4121     !!!parse-error (type => 'unmatched end tag:select');
4122     ## Ignore the token
4123     !!!next-token;
4124     redo B;
4125 wakaba 1.47 }
4126 wakaba 1.52
4127     splice @{$self->{open_elements}}, $i;
4128    
4129     $self->_reset_insertion_mode;
4130 wakaba 1.47
4131 wakaba 1.52 !!!next-token;
4132     redo B;
4133 wakaba 1.58 } else {
4134     !!!parse-error (type => 'in select:'.$token->{tag_name});
4135     ## Ignore the token
4136     !!!next-token;
4137     redo B;
4138     }
4139     } elsif ($token->{type} == END_TAG_TOKEN) {
4140 wakaba 1.52 if ($token->{tag_name} eq 'optgroup') {
4141     if ($self->{open_elements}->[-1]->[1] eq 'option' and
4142     $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4143     ## As if </option>
4144     splice @{$self->{open_elements}}, -2;
4145     } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4146     pop @{$self->{open_elements}};
4147     } else {
4148     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4149     ## Ignore the token
4150     }
4151     !!!next-token;
4152     redo B;
4153     } elsif ($token->{tag_name} eq 'option') {
4154     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4155 wakaba 1.47 pop @{$self->{open_elements}};
4156 wakaba 1.52 } else {
4157     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4158     ## Ignore the token
4159 wakaba 1.1 }
4160 wakaba 1.52 !!!next-token;
4161     redo B;
4162     } elsif ($token->{tag_name} eq 'select') {
4163     ## have an element in table scope
4164     my $i;
4165     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4166     my $node = $self->{open_elements}->[$_];
4167     if ($node->[1] eq $token->{tag_name}) {
4168     $i = $_;
4169     last INSCOPE;
4170     } elsif ({
4171     table => 1, html => 1,
4172     }->{$node->[1]}) {
4173     last INSCOPE;
4174 wakaba 1.48 }
4175 wakaba 1.52 } # INSCOPE
4176     unless (defined $i) {
4177     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4178     ## Ignore the token
4179     !!!next-token;
4180 wakaba 1.48 redo B;
4181 wakaba 1.52 }
4182    
4183     splice @{$self->{open_elements}}, $i;
4184    
4185     $self->_reset_insertion_mode;
4186    
4187     !!!next-token;
4188     redo B;
4189     } elsif ({
4190     caption => 1, table => 1, tbody => 1,
4191     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4192     }->{$token->{tag_name}}) {
4193     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4194    
4195     ## have an element in table scope
4196     my $i;
4197     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4198     my $node = $self->{open_elements}->[$_];
4199     if ($node->[1] eq $token->{tag_name}) {
4200     $i = $_;
4201     last INSCOPE;
4202     } elsif ({
4203     table => 1, html => 1,
4204     }->{$node->[1]}) {
4205     last INSCOPE;
4206 wakaba 1.1 }
4207 wakaba 1.52 } # INSCOPE
4208     unless (defined $i) {
4209     ## Ignore the token
4210 wakaba 1.1 !!!next-token;
4211     redo B;
4212     }
4213 wakaba 1.52
4214     ## As if </select>
4215     ## have an element in table scope
4216     undef $i;
4217 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4218     my $node = $self->{open_elements}->[$_];
4219 wakaba 1.52 if ($node->[1] eq 'select') {
4220 wakaba 1.1 $i = $_;
4221     last INSCOPE;
4222     } elsif ({
4223     table => 1, html => 1,
4224 wakaba 1.52 }->{$node->[1]}) {
4225     last INSCOPE;
4226     }
4227     } # INSCOPE
4228     unless (defined $i) {
4229     !!!parse-error (type => 'unmatched end tag:select');
4230     ## Ignore the </select> token
4231     !!!next-token; ## TODO: ok?
4232     redo B;
4233     }
4234    
4235     splice @{$self->{open_elements}}, $i;
4236    
4237     $self->_reset_insertion_mode;
4238    
4239     ## reprocess
4240     redo B;
4241 wakaba 1.58 } else {
4242     !!!parse-error (type => 'in select:/'.$token->{tag_name});
4243 wakaba 1.52 ## Ignore the token
4244     !!!next-token;
4245     redo B;
4246 wakaba 1.58 }
4247     } else {
4248     die "$0: $token->{type}: Unknown token type";
4249     }
4250 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4251 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4252 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4253     my $data = $1;
4254     ## As if in body
4255     $reconstruct_active_formatting_elements->($insert_to_current);
4256    
4257     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4258    
4259     unless (length $token->{data}) {
4260     !!!next-token;
4261     redo B;
4262     }
4263     }
4264    
4265 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4266 wakaba 1.52 !!!parse-error (type => 'after html:#character');
4267    
4268     ## Reprocess in the "main" phase, "after body" insertion mode...
4269     }
4270    
4271     ## "after body" insertion mode
4272     !!!parse-error (type => 'after body:#character');
4273    
4274 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4275 wakaba 1.52 ## reprocess
4276     redo B;
4277 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4278 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4279 wakaba 1.52 !!!parse-error (type => 'after html:'.$token->{tag_name});
4280    
4281     ## Reprocess in the "main" phase, "after body" insertion mode...
4282     }
4283    
4284     ## "after body" insertion mode
4285     !!!parse-error (type => 'after body:'.$token->{tag_name});
4286    
4287 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4288 wakaba 1.52 ## reprocess
4289     redo B;
4290 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4291 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4292 wakaba 1.52 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4293    
4294 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
4295 wakaba 1.52 ## Reprocess in the "main" phase, "after body" insertion mode...
4296     }
4297    
4298     ## "after body" insertion mode
4299     if ($token->{tag_name} eq 'html') {
4300     if (defined $self->{inner_html_node}) {
4301     !!!parse-error (type => 'unmatched end tag:html');
4302     ## Ignore the token
4303     !!!next-token;
4304     redo B;
4305     } else {
4306 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
4307 wakaba 1.52 !!!next-token;
4308     redo B;
4309     }
4310     } else {
4311     !!!parse-error (type => 'after body:/'.$token->{tag_name});
4312    
4313 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4314 wakaba 1.52 ## reprocess
4315     redo B;
4316     }
4317     } else {
4318     die "$0: $token->{type}: Unknown token type";
4319     }
4320 wakaba 1.56 } elsif ($self->{insertion_mode} & FRAME_IMS) {
4321 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4322 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4323     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4324    
4325     unless (length $token->{data}) {
4326     !!!next-token;
4327     redo B;
4328     }
4329     }
4330    
4331     if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4332 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4333 wakaba 1.52 !!!parse-error (type => 'in frameset:#character');
4334 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
4335 wakaba 1.52 !!!parse-error (type => 'after frameset:#character');
4336     } else { # "after html frameset"
4337     !!!parse-error (type => 'after html:#character');
4338    
4339 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4340 wakaba 1.52 ## Reprocess in the "main" phase, "after frameset"...
4341     !!!parse-error (type => 'after frameset:#character');
4342     }
4343    
4344     ## Ignore the token.
4345     if (length $token->{data}) {
4346     ## reprocess the rest of characters
4347     } else {
4348     !!!next-token;
4349     }
4350     redo B;
4351     }
4352    
4353     die qq[$0: Character "$token->{data}"];
4354 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4355 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4356 wakaba 1.52 !!!parse-error (type => 'after html:'.$token->{tag_name});
4357 wakaba 1.1
4358 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4359 wakaba 1.52 ## Process in the "main" phase, "after frameset" insertion mode...
4360     }
4361 wakaba 1.1
4362 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
4363 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4364 wakaba 1.52 !!!insert-element ($token->{tag_name}, $token->{attributes});
4365     !!!next-token;
4366     redo B;
4367     } elsif ($token->{tag_name} eq 'frame' and
4368 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4369 wakaba 1.52 !!!insert-element ($token->{tag_name}, $token->{attributes});
4370     pop @{$self->{open_elements}};
4371     !!!next-token;
4372     redo B;
4373     } elsif ($token->{tag_name} eq 'noframes') {
4374     ## NOTE: As if in body.
4375     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4376     redo B;
4377     } else {
4378 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4379 wakaba 1.52 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4380     } else {
4381     !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4382     }
4383     ## Ignore the token
4384     !!!next-token;
4385     redo B;
4386     }
4387 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4388 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4389 wakaba 1.52 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4390 wakaba 1.1
4391 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4392 wakaba 1.52 ## Process in the "main" phase, "after frameset" insertion mode...
4393     }
4394 wakaba 1.1
4395 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
4396 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4397 wakaba 1.52 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4398     @{$self->{open_elements}} == 1) {
4399     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4400     ## Ignore the token
4401     !!!next-token;
4402     } else {
4403     pop @{$self->{open_elements}};
4404     !!!next-token;
4405     }
4406 wakaba 1.47
4407 wakaba 1.52 if (not defined $self->{inner_html_node} and
4408     $self->{open_elements}->[-1]->[1] ne 'frameset') {
4409 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4410 wakaba 1.52 }
4411     redo B;
4412     } elsif ($token->{tag_name} eq 'html' and
4413 wakaba 1.54 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
4414     $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
4415 wakaba 1.52 !!!next-token;
4416     redo B;
4417     } else {
4418 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4419 wakaba 1.52 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4420     } else {
4421     !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4422     }
4423     ## Ignore the token
4424     !!!next-token;
4425     redo B;
4426     }
4427     } else {
4428     die "$0: $token->{type}: Unknown token type";
4429     }
4430 wakaba 1.47
4431 wakaba 1.52 ## ISSUE: An issue in spec here
4432     } else {
4433     die "$0: $self->{insertion_mode}: Unknown insertion mode";
4434     }
4435 wakaba 1.47
4436 wakaba 1.52 ## "in body" insertion mode
4437 wakaba 1.55 if ($token->{type} == START_TAG_TOKEN) {
4438 wakaba 1.52 if ($token->{tag_name} eq 'script') {
4439     ## NOTE: This is an "as if in head" code clone
4440     $script_start_tag->($insert);
4441 wakaba 1.53 redo B;
4442 wakaba 1.52 } elsif ($token->{tag_name} eq 'style') {
4443     ## NOTE: This is an "as if in head" code clone
4444     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4445 wakaba 1.53 redo B;
4446 wakaba 1.52 } elsif ({
4447     base => 1, link => 1,
4448     }->{$token->{tag_name}}) {
4449     ## NOTE: This is an "as if in head" code clone, only "-t" differs
4450     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4451     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4452     !!!next-token;
4453 wakaba 1.53 redo B;
4454 wakaba 1.52 } elsif ($token->{tag_name} eq 'meta') {
4455     ## NOTE: This is an "as if in head" code clone, only "-t" differs
4456     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4457     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4458 wakaba 1.46
4459 wakaba 1.52 unless ($self->{confident}) {
4460     if ($token->{attributes}->{charset}) { ## TODO: And if supported
4461 wakaba 1.63 $self->{change_encoding}
4462     ->($self, $token->{attributes}->{charset}->{value});
4463     } elsif ($token->{attributes}->{content}) {
4464 wakaba 1.52 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4465 wakaba 1.63 if ($token->{attributes}->{content}->{value}
4466 wakaba 1.52 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
4467     [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4468     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4469 wakaba 1.63 $self->{change_encoding}
4470     ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4471     }
4472 wakaba 1.52 }
4473     }
4474 wakaba 1.1
4475 wakaba 1.52 !!!next-token;
4476 wakaba 1.53 redo B;
4477 wakaba 1.52 } elsif ($token->{tag_name} eq 'title') {
4478     !!!parse-error (type => 'in body:title');
4479     ## NOTE: This is an "as if in head" code clone
4480     $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4481     if (defined $self->{head_element}) {
4482     $self->{head_element}->append_child ($_[0]);
4483     } else {
4484     $insert->($_[0]);
4485     }
4486     });
4487 wakaba 1.53 redo B;
4488 wakaba 1.52 } elsif ($token->{tag_name} eq 'body') {
4489     !!!parse-error (type => 'in body:body');
4490 wakaba 1.46
4491 wakaba 1.52 if (@{$self->{open_elements}} == 1 or
4492     $self->{open_elements}->[1]->[1] ne 'body') {
4493     ## Ignore the token
4494     } else {
4495     my $body_el = $self->{open_elements}->[1]->[0];
4496     for my $attr_name (keys %{$token->{attributes}}) {
4497     unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4498     $body_el->set_attribute_ns
4499     (undef, [undef, $attr_name],
4500     $token->{attributes}->{$attr_name}->{value});
4501     }
4502     }
4503     }
4504     !!!next-token;
4505 wakaba 1.53 redo B;
4506 wakaba 1.52 } elsif ({
4507     address => 1, blockquote => 1, center => 1, dir => 1,
4508     div => 1, dl => 1, fieldset => 1, listing => 1,
4509     menu => 1, ol => 1, p => 1, ul => 1,
4510     pre => 1,
4511     }->{$token->{tag_name}}) {
4512     ## has a p element in scope
4513     INSCOPE: for (reverse @{$self->{open_elements}}) {
4514     if ($_->[1] eq 'p') {
4515     !!!back-token;
4516 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4517 wakaba 1.53 redo B;
4518 wakaba 1.52 } elsif ({
4519     table => 1, caption => 1, td => 1, th => 1,
4520     button => 1, marquee => 1, object => 1, html => 1,
4521     }->{$_->[1]}) {
4522     last INSCOPE;
4523     }
4524     } # INSCOPE
4525    
4526     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4527     if ($token->{tag_name} eq 'pre') {
4528     !!!next-token;
4529 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4530 wakaba 1.52 $token->{data} =~ s/^\x0A//;
4531     unless (length $token->{data}) {
4532 wakaba 1.1 !!!next-token;
4533 wakaba 1.52 }
4534     }
4535     } else {
4536     !!!next-token;
4537     }
4538 wakaba 1.53 redo B;
4539 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
4540     if (defined $self->{form_element}) {
4541     !!!parse-error (type => 'in form:form');
4542     ## Ignore the token
4543     !!!next-token;
4544 wakaba 1.53 redo B;
4545 wakaba 1.52 } else {
4546     ## has a p element in scope
4547     INSCOPE: for (reverse @{$self->{open_elements}}) {
4548     if ($_->[1] eq 'p') {
4549     !!!back-token;
4550 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4551 wakaba 1.53 redo B;
4552 wakaba 1.46 } elsif ({
4553 wakaba 1.52 table => 1, caption => 1, td => 1, th => 1,
4554     button => 1, marquee => 1, object => 1, html => 1,
4555     }->{$_->[1]}) {
4556     last INSCOPE;
4557     }
4558     } # INSCOPE
4559    
4560     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4561     $self->{form_element} = $self->{open_elements}->[-1]->[0];
4562     !!!next-token;
4563 wakaba 1.53 redo B;
4564 wakaba 1.52 }
4565     } elsif ($token->{tag_name} eq 'li') {
4566     ## has a p element in scope
4567     INSCOPE: for (reverse @{$self->{open_elements}}) {
4568     if ($_->[1] eq 'p') {
4569     !!!back-token;
4570 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4571 wakaba 1.53 redo B;
4572 wakaba 1.52 } elsif ({
4573     table => 1, caption => 1, td => 1, th => 1,
4574     button => 1, marquee => 1, object => 1, html => 1,
4575     }->{$_->[1]}) {
4576     last INSCOPE;
4577     }
4578     } # INSCOPE
4579    
4580     ## Step 1
4581     my $i = -1;
4582     my $node = $self->{open_elements}->[$i];
4583     LI: {
4584     ## Step 2
4585     if ($node->[1] eq 'li') {
4586     if ($i != -1) {
4587     !!!parse-error (type => 'end tag missing:'.
4588     $self->{open_elements}->[-1]->[1]);
4589     }
4590     splice @{$self->{open_elements}}, $i;
4591     last LI;
4592     }
4593    
4594     ## Step 3
4595     if (not $formatting_category->{$node->[1]} and
4596     #not $phrasing_category->{$node->[1]} and
4597     ($special_category->{$node->[1]} or
4598     $scoping_category->{$node->[1]}) and
4599     $node->[1] ne 'address' and $node->[1] ne 'div') {
4600     last LI;
4601     }
4602    
4603     ## Step 4
4604     $i--;
4605     $node = $self->{open_elements}->[$i];
4606     redo LI;
4607     } # LI
4608    
4609     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4610     !!!next-token;
4611 wakaba 1.53 redo B;
4612 wakaba 1.52 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
4613     ## has a p element in scope
4614     INSCOPE: for (reverse @{$self->{open_elements}}) {
4615     if ($_->[1] eq 'p') {
4616     !!!back-token;
4617 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4618 wakaba 1.53 redo B;
4619 wakaba 1.52 } elsif ({
4620     table => 1, caption => 1, td => 1, th => 1,
4621     button => 1, marquee => 1, object => 1, html => 1,
4622     }->{$_->[1]}) {
4623     last INSCOPE;
4624     }
4625     } # INSCOPE
4626    
4627     ## Step 1
4628     my $i = -1;
4629     my $node = $self->{open_elements}->[$i];
4630     LI: {
4631     ## Step 2
4632     if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
4633     if ($i != -1) {
4634     !!!parse-error (type => 'end tag missing:'.
4635     $self->{open_elements}->[-1]->[1]);
4636 wakaba 1.1 }
4637 wakaba 1.52 splice @{$self->{open_elements}}, $i;
4638     last LI;
4639     }
4640    
4641     ## Step 3
4642     if (not $formatting_category->{$node->[1]} and
4643     #not $phrasing_category->{$node->[1]} and
4644     ($special_category->{$node->[1]} or
4645     $scoping_category->{$node->[1]}) and
4646     $node->[1] ne 'address' and $node->[1] ne 'div') {
4647     last LI;
4648 wakaba 1.1 }
4649 wakaba 1.52
4650     ## Step 4
4651     $i--;
4652     $node = $self->{open_elements}->[$i];
4653     redo LI;
4654     } # LI
4655    
4656     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4657     !!!next-token;
4658 wakaba 1.53 redo B;
4659 wakaba 1.52 } elsif ($token->{tag_name} eq 'plaintext') {
4660     ## has a p element in scope
4661     INSCOPE: for (reverse @{$self->{open_elements}}) {
4662     if ($_->[1] eq 'p') {
4663     !!!back-token;
4664 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4665 wakaba 1.53 redo B;
4666 wakaba 1.52 } elsif ({
4667     table => 1, caption => 1, td => 1, th => 1,
4668     button => 1, marquee => 1, object => 1, html => 1,
4669     }->{$_->[1]}) {
4670     last INSCOPE;
4671 wakaba 1.46 }
4672 wakaba 1.52 } # INSCOPE
4673    
4674     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4675    
4676     $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
4677    
4678     !!!next-token;
4679 wakaba 1.53 redo B;
4680 wakaba 1.52 } elsif ({
4681     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4682     }->{$token->{tag_name}}) {
4683     ## has a p element in scope
4684     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4685     my $node = $self->{open_elements}->[$_];
4686     if ($node->[1] eq 'p') {
4687     !!!back-token;
4688 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4689 wakaba 1.53 redo B;
4690 wakaba 1.52 } elsif ({
4691     table => 1, caption => 1, td => 1, th => 1,
4692     button => 1, marquee => 1, object => 1, html => 1,
4693     }->{$node->[1]}) {
4694     last INSCOPE;
4695 wakaba 1.46 }
4696 wakaba 1.52 } # INSCOPE
4697    
4698     ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
4699     ## has an element in scope
4700     #my $i;
4701     #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4702     # my $node = $self->{open_elements}->[$_];
4703     # if ({
4704     # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4705     # }->{$node->[1]}) {
4706     # $i = $_;
4707     # last INSCOPE;
4708     # } elsif ({
4709     # table => 1, caption => 1, td => 1, th => 1,
4710     # button => 1, marquee => 1, object => 1, html => 1,
4711     # }->{$node->[1]}) {
4712     # last INSCOPE;
4713     # }
4714     #} # INSCOPE
4715     #
4716     #if (defined $i) {
4717     # !!! parse-error (type => 'in hn:hn');
4718     # splice @{$self->{open_elements}}, $i;
4719     #}
4720    
4721     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4722    
4723     !!!next-token;
4724 wakaba 1.53 redo B;
4725 wakaba 1.52 } elsif ($token->{tag_name} eq 'a') {
4726     AFE: for my $i (reverse 0..$#$active_formatting_elements) {
4727     my $node = $active_formatting_elements->[$i];
4728     if ($node->[1] eq 'a') {
4729     !!!parse-error (type => 'in a:a');
4730    
4731     !!!back-token;
4732 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
4733 wakaba 1.52 $formatting_end_tag->($token->{tag_name});
4734    
4735     AFE2: for (reverse 0..$#$active_formatting_elements) {
4736     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4737     splice @$active_formatting_elements, $_, 1;
4738     last AFE2;
4739 wakaba 1.1 }
4740 wakaba 1.52 } # AFE2
4741     OE: for (reverse 0..$#{$self->{open_elements}}) {
4742     if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
4743     splice @{$self->{open_elements}}, $_, 1;
4744     last OE;
4745 wakaba 1.1 }
4746 wakaba 1.52 } # OE
4747     last AFE;
4748     } elsif ($node->[0] eq '#marker') {
4749     last AFE;
4750     }
4751     } # AFE
4752    
4753     $reconstruct_active_formatting_elements->($insert_to_current);
4754 wakaba 1.1
4755 wakaba 1.52 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4756     push @$active_formatting_elements, $self->{open_elements}->[-1];
4757 wakaba 1.1
4758 wakaba 1.52 !!!next-token;
4759 wakaba 1.53 redo B;
4760 wakaba 1.52 } elsif ({
4761     b => 1, big => 1, em => 1, font => 1, i => 1,
4762     s => 1, small => 1, strike => 1, ## |strike| is a formatting element; the key must match the tag name exactly
4763     strong => 1, tt => 1, u => 1,
4764     }->{$token->{tag_name}}) {
4765     $reconstruct_active_formatting_elements->($insert_to_current);
4766    
4767     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4768     push @$active_formatting_elements, $self->{open_elements}->[-1];
4769    
4770     !!!next-token;
4771 wakaba 1.53 redo B;
4772 wakaba 1.52 } elsif ($token->{tag_name} eq 'nobr') {
4773     $reconstruct_active_formatting_elements->($insert_to_current);
4774 wakaba 1.1
4775 wakaba 1.52 ## has a |nobr| element in scope
4776     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4777     my $node = $self->{open_elements}->[$_];
4778     if ($node->[1] eq 'nobr') {
4779 wakaba 1.58 !!!parse-error (type => 'in nobr:nobr');
4780 wakaba 1.52 !!!back-token;
4781 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
4782 wakaba 1.53 redo B;
4783 wakaba 1.52 } elsif ({
4784     table => 1, caption => 1, td => 1, th => 1,
4785     button => 1, marquee => 1, object => 1, html => 1,
4786     }->{$node->[1]}) {
4787     last INSCOPE;
4788     }
4789     } # INSCOPE
4790    
4791     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4792     push @$active_formatting_elements, $self->{open_elements}->[-1];
4793    
4794     !!!next-token;
4795 wakaba 1.53 redo B;
4796 wakaba 1.52 } elsif ($token->{tag_name} eq 'button') {
4797     ## has a button element in scope
4798     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4799     my $node = $self->{open_elements}->[$_];
4800     if ($node->[1] eq 'button') {
4801     !!!parse-error (type => 'in button:button');
4802     !!!back-token;
4803 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
4804 wakaba 1.53 redo B;
4805 wakaba 1.52 } elsif ({
4806     table => 1, caption => 1, td => 1, th => 1,
4807     button => 1, marquee => 1, object => 1, html => 1,
4808     }->{$node->[1]}) {
4809     last INSCOPE;
4810     }
4811     } # INSCOPE
4812    
4813     $reconstruct_active_formatting_elements->($insert_to_current);
4814    
4815     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4816     push @$active_formatting_elements, ['#marker', ''];
4817 wakaba 1.1
4818 wakaba 1.52 !!!next-token;
4819 wakaba 1.53 redo B;
4820 wakaba 1.52 } elsif ($token->{tag_name} eq 'marquee' or
4821     $token->{tag_name} eq 'object') {
4822     $reconstruct_active_formatting_elements->($insert_to_current);
4823    
4824     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4825     push @$active_formatting_elements, ['#marker', ''];
4826    
4827     !!!next-token;
4828 wakaba 1.53 redo B;
4829 wakaba 1.52 } elsif ($token->{tag_name} eq 'xmp') {
4830     $reconstruct_active_formatting_elements->($insert_to_current);
4831     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4832 wakaba 1.53 redo B;
4833 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
4834     ## has a p element in scope
4835     INSCOPE: for (reverse @{$self->{open_elements}}) {
4836     if ($_->[1] eq 'p') {
4837     !!!back-token;
4838 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4839 wakaba 1.53 redo B;
4840 wakaba 1.52 } elsif ({
4841     table => 1, caption => 1, td => 1, th => 1,
4842     button => 1, marquee => 1, object => 1, html => 1,
4843     }->{$_->[1]}) {
4844     last INSCOPE;
4845     }
4846     } # INSCOPE
4847    
4848     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4849    
4850 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4851 wakaba 1.52
4852     !!!next-token;
4853 wakaba 1.53 redo B;
4854 wakaba 1.52 } elsif ({
4855     area => 1, basefont => 1, bgsound => 1, br => 1,
4856     embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
4857     image => 1,
4858     }->{$token->{tag_name}}) {
4859     if ($token->{tag_name} eq 'image') {
4860     !!!parse-error (type => 'image');
4861     $token->{tag_name} = 'img';
4862     }
4863 wakaba 1.1
4864 wakaba 1.52 ## NOTE: There is an "as if <br>" code clone.
4865     $reconstruct_active_formatting_elements->($insert_to_current);
4866    
4867     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4868     pop @{$self->{open_elements}};
4869    
4870     !!!next-token;
4871 wakaba 1.53 redo B;
4872 wakaba 1.52 } elsif ($token->{tag_name} eq 'hr') {
4873     ## has a p element in scope
4874     INSCOPE: for (reverse @{$self->{open_elements}}) {
4875     if ($_->[1] eq 'p') {
4876     !!!back-token;
4877 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4878 wakaba 1.53 redo B;
4879 wakaba 1.52 } elsif ({
4880     table => 1, caption => 1, td => 1, th => 1,
4881     button => 1, marquee => 1, object => 1, html => 1,
4882     }->{$_->[1]}) {
4883     last INSCOPE;
4884     }
4885     } # INSCOPE
4886    
4887     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4888     pop @{$self->{open_elements}};
4889    
4890     !!!next-token;
4891 wakaba 1.53 redo B;
4892 wakaba 1.52 } elsif ($token->{tag_name} eq 'input') {
4893     $reconstruct_active_formatting_elements->($insert_to_current);
4894    
4895     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4896     ## TODO: associate with $self->{form_element} if defined
4897     pop @{$self->{open_elements}};
4898    
4899     !!!next-token;
4900 wakaba 1.53 redo B;
4901 wakaba 1.52 } elsif ($token->{tag_name} eq 'isindex') {
4902     !!!parse-error (type => 'isindex');
4903    
4904     if (defined $self->{form_element}) {
4905     ## Ignore the token
4906     !!!next-token;
4907 wakaba 1.53 redo B;
4908 wakaba 1.52 } else {
4909     my $at = $token->{attributes};
4910     my $form_attrs;
4911     $form_attrs->{action} = $at->{action} if $at->{action};
4912     my $prompt_attr = $at->{prompt};
4913     $at->{name} = {name => 'name', value => 'isindex'};
4914     delete $at->{action};
4915     delete $at->{prompt};
4916     my @tokens = (
4917 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'form',
4918 wakaba 1.52 attributes => $form_attrs},
4919 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'hr'},
4920     {type => START_TAG_TOKEN, tag_name => 'p'},
4921     {type => START_TAG_TOKEN, tag_name => 'label'},
4922 wakaba 1.52 );
4923     if ($prompt_attr) {
4924 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
4925 wakaba 1.1 } else {
4926 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN,
4927 wakaba 1.52 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
4928     ## TODO: make this configurable
4929 wakaba 1.1 }
4930 wakaba 1.52 push @tokens,
4931 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
4932     #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
4933     {type => END_TAG_TOKEN, tag_name => 'label'},
4934     {type => END_TAG_TOKEN, tag_name => 'p'},
4935     {type => START_TAG_TOKEN, tag_name => 'hr'},
4936     {type => END_TAG_TOKEN, tag_name => 'form'};
4937 wakaba 1.52 $token = shift @tokens;
4938     !!!back-token (@tokens);
4939 wakaba 1.53 redo B;
4940 wakaba 1.52 }
4941     } elsif ($token->{tag_name} eq 'textarea') {
4942     my $tag_name = $token->{tag_name};
4943     my $el;
4944     !!!create-element ($el, $token->{tag_name}, $token->{attributes});
4945    
4946     ## TODO: $self->{form_element} if defined
4947     $self->{content_model} = RCDATA_CONTENT_MODEL;
4948     delete $self->{escape}; # MUST
4949    
4950     $insert->($el);
4951    
4952     my $text = '';
4953     !!!next-token;
4954 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4955 wakaba 1.52 $token->{data} =~ s/^\x0A//;
4956 wakaba 1.51 unless (length $token->{data}) {
4957     !!!next-token;
4958     }
4959     }
4960 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
4961 wakaba 1.52 $text .= $token->{data};
4962     !!!next-token;
4963     }
4964     if (length $text) {
4965     $el->manakai_append_text ($text);
4966     }
4967    
4968     $self->{content_model} = PCDATA_CONTENT_MODEL;
4969 wakaba 1.51
4970 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
4971 wakaba 1.52 $token->{tag_name} eq $tag_name) {
4972     ## Ignore the token
4973     } else {
4974     !!!parse-error (type => 'in RCDATA:#'.$token->{type});
4975 wakaba 1.51 }
4976 wakaba 1.52 !!!next-token;
4977 wakaba 1.53 redo B;
4978 wakaba 1.52 } elsif ({
4979     iframe => 1,
4980     noembed => 1,
4981     noframes => 1,
4982     noscript => 0, ## TODO: 1 if scripting is enabled
4983     }->{$token->{tag_name}}) {
4984 wakaba 1.58 ## NOTE: There is an "as if in body" code clone.
4985 wakaba 1.52 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4986 wakaba 1.53 redo B;
4987 wakaba 1.52 } elsif ($token->{tag_name} eq 'select') {
4988     $reconstruct_active_formatting_elements->($insert_to_current);
4989    
4990     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4991    
4992 wakaba 1.54 $self->{insertion_mode} = IN_SELECT_IM;
4993 wakaba 1.52 !!!next-token;
4994 wakaba 1.53 redo B;
4995 wakaba 1.52 } elsif ({
4996     caption => 1, col => 1, colgroup => 1, frame => 1,
4997     frameset => 1, head => 1, option => 1, optgroup => 1,
4998     tbody => 1, td => 1, tfoot => 1, th => 1,
4999     thead => 1, tr => 1,
5000     }->{$token->{tag_name}}) {
5001     !!!parse-error (type => 'in body:'.$token->{tag_name});
5002     ## Ignore the token
5003     !!!next-token;
5004 wakaba 1.53 redo B;
5005 wakaba 1.52
5006     ## ISSUE: An issue on HTML5 new elements in the spec.
5007     } else {
5008     $reconstruct_active_formatting_elements->($insert_to_current);
5009    
5010     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5011 wakaba 1.51
5012 wakaba 1.52 !!!next-token;
5013 wakaba 1.53 redo B;
5014 wakaba 1.52 }
5015 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
5016 wakaba 1.52 if ($token->{tag_name} eq 'body') {
5017     if (@{$self->{open_elements}} > 1 and
5018     $self->{open_elements}->[1]->[1] eq 'body') {
5019     for (@{$self->{open_elements}}) {
5020     unless ({
5021     dd => 1, dt => 1, li => 1, p => 1, td => 1,
5022     th => 1, tr => 1, body => 1, html => 1,
5023     tbody => 1, tfoot => 1, thead => 1,
5024     }->{$_->[1]}) {
5025     !!!parse-error (type => 'not closed:'.$_->[1]);
5026     }
5027     }
5028 wakaba 1.51
5029 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
5030 wakaba 1.52 !!!next-token;
5031 wakaba 1.53 redo B;
5032 wakaba 1.52 } else {
5033     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5034     ## Ignore the token
5035     !!!next-token;
5036 wakaba 1.53 redo B;
5037 wakaba 1.51 }
5038 wakaba 1.52 } elsif ($token->{tag_name} eq 'html') {
5039     if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
5040     ## ISSUE: There is an issue in the spec.
5041     if ($self->{open_elements}->[-1]->[1] ne 'body') {
5042     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
5043 wakaba 1.1 }
5044 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
5045 wakaba 1.52 ## reprocess
5046 wakaba 1.53 redo B;
5047 wakaba 1.51 } else {
5048 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5049     ## Ignore the token
5050     !!!next-token;
5051 wakaba 1.53 redo B;
5052 wakaba 1.51 }
5053 wakaba 1.52 } elsif ({
5054     address => 1, blockquote => 1, center => 1, dir => 1,
5055     div => 1, dl => 1, fieldset => 1, listing => 1,
5056     menu => 1, ol => 1, pre => 1, ul => 1,
5057     p => 1,
5058     dd => 1, dt => 1, li => 1,
5059     button => 1, marquee => 1, object => 1,
5060     }->{$token->{tag_name}}) {
5061     ## has an element in scope
5062     my $i;
5063     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5064     my $node = $self->{open_elements}->[$_];
5065     if ($node->[1] eq $token->{tag_name}) {
5066     ## generate implied end tags
5067     if ({
5068     dd => ($token->{tag_name} ne 'dd'),
5069     dt => ($token->{tag_name} ne 'dt'),
5070     li => ($token->{tag_name} ne 'li'),
5071     p => ($token->{tag_name} ne 'p'),
5072     td => 1, th => 1, tr => 1,
5073     tbody => 1, tfoot=> 1, thead => 1,
5074     }->{$self->{open_elements}->[-1]->[1]}) {
5075     !!!back-token;
5076 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5077 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5078 wakaba 1.53 redo B;
5079 wakaba 1.52 }
5080     $i = $_;
5081     last INSCOPE unless $token->{tag_name} eq 'p';
5082     } elsif ({
5083     table => 1, caption => 1, td => 1, th => 1,
5084     button => 1, marquee => 1, object => 1, html => 1,
5085     }->{$node->[1]}) {
5086     last INSCOPE;
5087 wakaba 1.51 }
5088 wakaba 1.52 } # INSCOPE
5089    
5090     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5091     if (defined $i) {
5092     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5093 wakaba 1.51 } else {
5094 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5095 wakaba 1.51 }
5096     }
5097    
5098 wakaba 1.52 if (defined $i) {
5099     splice @{$self->{open_elements}}, $i;
5100     } elsif ($token->{tag_name} eq 'p') {
5101     ## As if <p>, then reprocess the current token
5102     my $el;
5103     !!!create-element ($el, 'p');
5104     $insert->($el);
5105 wakaba 1.51 }
5106 wakaba 1.52 $clear_up_to_marker->()
5107     if {
5108     button => 1, marquee => 1, object => 1,
5109     }->{$token->{tag_name}};
5110     !!!next-token;
5111 wakaba 1.53 redo B;
5112 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
5113     ## has an element in scope
5114     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5115     my $node = $self->{open_elements}->[$_];
5116     if ($node->[1] eq $token->{tag_name}) {
5117     ## generate implied end tags
5118     if ({
5119     dd => 1, dt => 1, li => 1, p => 1,
5120     td => 1, th => 1, tr => 1,
5121     tbody => 1, tfoot=> 1, thead => 1,
5122     }->{$self->{open_elements}->[-1]->[1]}) {
5123     !!!back-token;
5124 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5125 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5126 wakaba 1.53 redo B;
5127 wakaba 1.52 }
5128     last INSCOPE;
5129     } elsif ({
5130     table => 1, caption => 1, td => 1, th => 1,
5131     button => 1, marquee => 1, object => 1, html => 1,
5132     }->{$node->[1]}) {
5133     last INSCOPE;
5134     }
5135     } # INSCOPE
5136    
5137     if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5138 wakaba 1.36 pop @{$self->{open_elements}};
5139     } else {
5140 wakaba 1.58 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5141 wakaba 1.52 }
5142    
5143     undef $self->{form_element};
5144     !!!next-token;
5145 wakaba 1.53 redo B;
5146 wakaba 1.52 } elsif ({
5147     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5148     }->{$token->{tag_name}}) {
5149     ## has an element in scope
5150     my $i;
5151     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5152     my $node = $self->{open_elements}->[$_];
5153     if ({
5154     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5155     }->{$node->[1]}) {
5156     ## generate implied end tags
5157     if ({
5158     dd => 1, dt => 1, li => 1, p => 1,
5159     td => 1, th => 1, tr => 1,
5160     tbody => 1, tfoot=> 1, thead => 1,
5161     }->{$self->{open_elements}->[-1]->[1]}) {
5162     !!!back-token;
5163 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5164 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5165 wakaba 1.53 redo B;
5166 wakaba 1.52 }
5167     $i = $_;
5168     last INSCOPE;
5169     } elsif ({
5170     table => 1, caption => 1, td => 1, th => 1,
5171     button => 1, marquee => 1, object => 1, html => 1,
5172     }->{$node->[1]}) {
5173     last INSCOPE;
5174 wakaba 1.51 }
5175 wakaba 1.52 } # INSCOPE
5176    
5177     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5178 wakaba 1.58 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5179 wakaba 1.36 }
5180 wakaba 1.52
5181     splice @{$self->{open_elements}}, $i if defined $i;
5182     !!!next-token;
5183 wakaba 1.53 redo B;
5184 wakaba 1.52 } elsif ({
5185     a => 1,
5186     b => 1, big => 1, em => 1, font => 1, i => 1,
5187     nobr => 1, s => 1, small => 1, strile => 1,
5188     strong => 1, tt => 1, u => 1,
5189     }->{$token->{tag_name}}) {
5190     $formatting_end_tag->($token->{tag_name});
5191 wakaba 1.53 redo B;
5192 wakaba 1.52 } elsif ($token->{tag_name} eq 'br') {
5193     !!!parse-error (type => 'unmatched end tag:br');
5194    
5195     ## As if <br>
5196     $reconstruct_active_formatting_elements->($insert_to_current);
5197    
5198     my $el;
5199     !!!create-element ($el, 'br');
5200     $insert->($el);
5201    
5202     ## Ignore the token.
5203     !!!next-token;
5204 wakaba 1.53 redo B;
5205 wakaba 1.52 } elsif ({
5206     caption => 1, col => 1, colgroup => 1, frame => 1,
5207     frameset => 1, head => 1, option => 1, optgroup => 1,
5208     tbody => 1, td => 1, tfoot => 1, th => 1,
5209     thead => 1, tr => 1,
5210     area => 1, basefont => 1, bgsound => 1,
5211     embed => 1, hr => 1, iframe => 1, image => 1,
5212     img => 1, input => 1, isindex => 1, noembed => 1,
5213     noframes => 1, param => 1, select => 1, spacer => 1,
5214     table => 1, textarea => 1, wbr => 1,
5215     noscript => 0, ## TODO: if scripting is enabled
5216     }->{$token->{tag_name}}) {
5217     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5218     ## Ignore the token
5219     !!!next-token;
5220 wakaba 1.53 redo B;
5221 wakaba 1.52
5222     ## ISSUE: Issue on HTML5 new elements in spec
5223    
5224     } else {
5225     ## Step 1
5226     my $node_i = -1;
5227     my $node = $self->{open_elements}->[$node_i];
5228 wakaba 1.51
5229 wakaba 1.52 ## Step 2
5230     S2: {
5231     if ($node->[1] eq $token->{tag_name}) {
5232     ## Step 1
5233     ## generate implied end tags
5234     if ({
5235     dd => 1, dt => 1, li => 1, p => 1,
5236     td => 1, th => 1, tr => 1,
5237 wakaba 1.55 tbody => 1, tfoot => 1, thead => 1,
5238 wakaba 1.52 }->{$self->{open_elements}->[-1]->[1]}) {
5239     !!!back-token;
5240 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5241 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5242 wakaba 1.53 redo B;
5243 wakaba 1.52 }
5244    
5245     ## Step 2
5246     if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5247 wakaba 1.58 ## NOTE: <x><y></x>
5248 wakaba 1.52 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5249     }
5250    
5251     ## Step 3
5252     splice @{$self->{open_elements}}, $node_i;
5253 wakaba 1.51
5254 wakaba 1.1 !!!next-token;
5255 wakaba 1.52 last S2;
5256 wakaba 1.1 } else {
5257 wakaba 1.52 ## Step 3
5258     if (not $formatting_category->{$node->[1]} and
5259     #not $phrasing_category->{$node->[1]} and
5260     ($special_category->{$node->[1]} or
5261     $scoping_category->{$node->[1]})) {
5262     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5263     ## Ignore the token
5264     !!!next-token;
5265     last S2;
5266     }
5267 wakaba 1.1 }
5268 wakaba 1.52
5269     ## Step 4
5270     $node_i--;
5271     $node = $self->{open_elements}->[$node_i];
5272    
5273     ## Step 5;
5274     redo S2;
5275     } # S2
5276 wakaba 1.53 redo B;
5277 wakaba 1.1 }
5278     }
5279 wakaba 1.52 redo B;
5280 wakaba 1.1 } # B
5281    
5282 wakaba 1.51 ## NOTE: The "trailing end" phase in HTML5 is split into
5283     ## two insertion modes: "after html body" and "after html frameset".
5284     ## NOTE: States in the main stage is preserved while
5285     ## the parser stays in the trailing end phase. # MUST
5286    
5287 wakaba 1.1 ## Stop parsing # MUST
5288    
5289     ## TODO: script stuffs
5290 wakaba 1.3 } # _tree_construct_main
5291    
## Class method: Class->set_inner_html ($node, $string, $onerror)
##
## Parses |$string| as HTML and makes the result the content of |$node|.
##
##   $node     Target node.  For a document node (node type 9) the
##             existing children are removed and the string is handed
##             to |parse_string| together with the node.  For an
##             element node (node type 1) the string is parsed in a
##             scratch document with the element as the inner-HTML
##             context, and the parsed children then replace the
##             element's children.
##   $string   Source text.  It is accessed through a reference to the
##             caller's argument, not copied.
##   $onerror  Optional error callback.  In the element case a default
##             handler that |warn|s is used when this is false; the
##             callback receives the reported options followed by
##             |line| and |column|.
##
## Dies if |$node| is neither a document nor an element.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;

  die "$0: |set_inner_html| is not defined for node of type $nt"
      unless $nt == 9 or $nt == 1;

  if ($nt == 9) {
    ## --- Document node --- # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Take a snapshot of the child list first, since it is being
    ## emptied while we walk it.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$s => $node, $onerror);
  }

  ## --- Element node ---
  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## The fragment is parsed into a scratch document created from the
  ## same implementation as the owner document of |$node|.
  my $this_doc = $node->owner_document;
  my $doc = $this_doc->implementation->create_document;
  $doc->manakai_is_html (1);
  my $p = $class->new;
  $p->{document} = $doc;

  ## Step 9 # MUST
  ## Input stream state shared by the two closures below.
  my ($pos, $line, $column) = (0, 1, 0);
  $p->{set_next_input_character} = sub {
    my $self = shift;

    ## Push the current character onto the history of previous
    ## input characters, dropping the oldest entry.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## End of input is signalled by -1.
    if ($pos >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    my $c = $self->{next_input_character} = ord substr ($$s, $pos++, 1);
    $column++;

    if ($c == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($c == 0x000D) { # CR
      ## CR LF and a lone CR are both reported as a single LF.
      $pos++ if substr ($$s, $pos, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($c > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($c == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $p->{prev_input_character} = [-1, -1, -1];
  $p->{next_input_character} = -1;

  ## Fall back to a handler that prints a warning when the caller
  ## did not supply one.
  my $ponerror = $onerror;
  $ponerror ||= sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $p->{parse_error} = sub {
    $ponerror->(@_, line => $line, column => $column);
  };

  $p->_initialize_tokenizer;
  $p->_initialize_tree_constructor;

  ## Step 2
  ## The initial content model depends on the context element;
  ## anything not listed here starts in PCDATA.
  my $node_ln = $node->local_name;
  my %content_model_of = (
    title => RCDATA_CONTENT_MODEL,
    textarea => RCDATA_CONTENT_MODEL,
    style => CDATA_CONTENT_MODEL,
    script => CDATA_CONTENT_MODEL,
    xmp => CDATA_CONTENT_MODEL,
    iframe => CDATA_CONTENT_MODEL,
    noembed => CDATA_CONTENT_MODEL,
    noframes => CDATA_CONTENT_MODEL,
    noscript => CDATA_CONTENT_MODEL,
    plaintext => PLAINTEXT_CONTENT_MODEL,
  );
  my $initial_model = $content_model_of{$node_ln};
  $p->{content_model} = defined $initial_model
      ? $initial_model : PCDATA_CONTENT_MODEL;
  ## ISSUE: What is "the name of the element"? local name?

  $p->{inner_html_node} = [$node, $node_ln];

  my $xhtml_ns = 'http://www.w3.org/1999/xhtml';

  ## Step 4
  my $root = $doc->create_element_ns ($xhtml_ns, [undef, 'html']);

  ## Step 5 # MUST
  $doc->append_child ($root);

  ## Step 6 # MUST
  push @{$p->{open_elements}}, [$root, 'html'];

  undef $p->{head_element};

  ## Step 7 # MUST
  $p->_reset_insertion_mode;

  ## Step 8 # MUST
  ## Walk from the context element towards the root; the nearest
  ## XHTML-namespace |form| element becomes the form element pointer.
  ANCESTOR: for (my $ancestor = $node;
                 defined $ancestor;
                 $ancestor = $ancestor->parent_node) {
    next ANCESTOR unless $ancestor->node_type == 1;
    my $nsuri = $ancestor->namespace_uri;
    next ANCESTOR unless defined $nsuri and $nsuri eq $xhtml_ns;
    next ANCESTOR unless $ancestor->local_name eq 'form'; ## TODO: case?
    $p->{form_element} = $ancestor;
    last ANCESTOR;
  } # ANCESTOR

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    ## The macro below is written in terms of |$self|.
    my $self = $p;
    !!!next-token;
  }
  $p->_tree_construction_main;

  ## Step 11 # MUST
  my @old_children = @{$node->child_nodes};
  $node->remove_child ($_) for @old_children;
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## Move everything parsed under the scratch root into the context
  ## element, adopting each node into its owner document first.
  my @parsed_children = @{$root->child_nodes};
  for my $child (@parsed_children) {
    $this_doc->adopt_node ($child);
    $node->append_child ($child);
  }
  ## ISSUE: mutation events?

  return $p->_terminate_tree_constructor;
} # set_inner_html
5449    
5450     } # tree construction stage
5451 wakaba 1.1
## Exception class derived from |Error|.
## NOTE(review): presumably thrown to make the caller restart
## parsing; the code that throws and catches it is not in this part
## of the file, so confirm against |parse_byte_string|.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

## A module must end with a true value.
1;
5456 wakaba 1.65 # $Date: 2007/11/11 08:39:42 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24