/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.63 - (hide annotations) (download) (as text)
Sun Nov 11 06:54:36 2007 UTC (16 years, 11 months ago) by wakaba
Branch: MAIN
Changes since 1.62: +98 -21 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	11 Nov 2007 06:54:02 -0000
	* HTML.pm.src (parse_byte_string): New method.
	(parse_char_string): New alias for |parse_string|.
	(main phase): Invoking "change the encoding" algorithm if desired.

	* HTML.pod: Updated.

2007-11-11  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.2 package Whatpm::HTML;
2 wakaba 1.1 use strict;
3 wakaba 1.63 our $VERSION=do{my @r=(q$Revision: 1.62 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4     use Error qw(:try);
5 wakaba 1.1
6 wakaba 1.18 ## ISSUE:
7     ## var doc = implementation.createDocument (null, null, null);
8     ## doc.write ('');
9     ## alert (doc.compatMode);
10 wakaba 1.1
11 wakaba 1.31 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
12     ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
13     ## is not yet clear.
14     ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
15     ## "{U+FEFF}..." in GB18030?
16    
## Start tag names for which a "/" immediately followed by ">" is
## accepted by the tokenizer without reporting a 'nestc' parse error.
my $permitted_slash_tag_name = {
  map { $_ => 1 } qw(
    base link meta hr br img embed param area col input
  )
};
30    
## Replacement code points for the code points 0x80..0x9F, keyed by
## the original code point.  The values coincide with the Windows-1252
## assignments; positions that Windows-1252 leaves unassigned map to
## U+FFFD REPLACEMENT CHARACTER.
## NOTE(review): presumably consulted when resolving numeric character
## references -- the consumer is outside this part of the file.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80-0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88-0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90-0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98-0x9F
  );
  +{ map { (0x80 + $_, $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
65 wakaba 1.1
## Element categories, keyed by lowercase element name.  The category
## names follow the HTML5 parsing algorithm ("special", "scoping",
## "formatting"); any element not listed in one of these tables is
## treated as "phrasing".

## "Special" elements.
my $special_category = {
  address => 1, area => 1, base => 1, basefont => 1, bgsound => 1,
  blockquote => 1, body => 1, br => 1, center => 1, col => 1, colgroup => 1,
  dd => 1, dir => 1, div => 1, dl => 1, dt => 1, embed => 1, fieldset => 1,
  form => 1, frame => 1, frameset => 1, h1 => 1, h2 => 1, h3 => 1,
  h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, iframe => 1, image => 1,
  img => 1, input => 1, isindex => 1, li => 1, link => 1, listing => 1,
  menu => 1, meta => 1, noembed => 1, noframes => 1, noscript => 1,
  ol => 1, optgroup => 1, option => 1, p => 1, param => 1, plaintext => 1,
  pre => 1, script => 1, select => 1, spacer => 1, style => 1, tbody => 1,
  textarea => 1, tfoot => 1, thead => 1, title => 1, tr => 1, ul => 1, wbr => 1,
};

## "Scoping" elements.
my $scoping_category = {
  button => 1, caption => 1, html => 1, marquee => 1, object => 1,
  table => 1, td => 1, th => 1,
};

## "Formatting" elements.
## (The key |strike| was previously misspelled |strile|, so <strike>
## was never recognized as a formatting element.)
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
# $phrasing_category: all other elements
87    
## Parses a byte string into a document.
##
##   $doc = $parser_or_class->parse_byte_string
##       ($charset, $bytes, $doc, $onerror);
##
## Arguments:
##   $charset  - Encoding label used to decode |$bytes|, or |undef|.
##               When defined, the parser is "confident" about the
##               encoding.  When |undef|, |$bytes| is handed to the
##               character-level parser undecoded and the parser is not
##               confident.
##   $bytes    - The input, as a string or a reference to a string.
##   remaining - Passed through unchanged to |parse_char_string|
##               (document node and optional error handler).
##
## Installs |$self->{change_encoding}|, which implements the "change the
## encoding" algorithm.  If that callback throws
## |Whatpm::HTML::RestartParser|, the input is decoded again using the
## charset carried by the exception and parsed a second time.
##
## Returns whatever |parse_char_string| returns.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## Load |Encode| unconditionally: the restart handler below calls
  ## |Encode::decode| even when no initial |$charset| was supplied.
  require Encode;

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name ## TODO: set $doc->input_encoding
    $self->{confident} = 1;
  } else {
    $s = ref $_[0] ? $_[0] : \($_[0]);
    $self->{confident} = 0;
  }

  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    if (defined $self->{input_encoding} and
        $self->{input_encoding} eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected', level => 'w');

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    throw Whatpm::HTML::RestartParser (charset => $charset);
  }; # $self->{change_encoding}

  ## Everything after the byte string is forwarded as is.
  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Re-decode the original bytes with the newly detected charset and
    ## parse again from the beginning, this time with confidence.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: $doc->input_encoding;
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
150    
## |parse_char_string| is another name for |parse_string|.
*parse_char_string = \&parse_string;

## Parses a character string into a document.
##
##   $doc = $parser_or_class->parse_string ($chars, $doc, $onerror);
##
## Arguments:
##   $chars   - The input, as a string or a reference to a string.
##   $doc     - Document object to populate; its |child_nodes| list is
##              emptied first.
##   $onerror - Optional code reference invoked for each parse error
##              with named arguments (|type|, ..., |line|, |column|).
##              The default handler |warn|s.
##
## Returns the document object.
sub parse_string ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  ## Callers such as |parse_byte_string| may have set this already.
  exists $self->{confident} or $self->{confident} = 1;

  ## Read position in |$$s| and the current position used in error
  ## reports.
  my $i = 0;
  my $line = 1;
  my $column = 0;
  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Shift the history of the last three input characters.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## End of input is represented by -1.
    if ($i >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    my $code = ord substr $$s, $i++, 1;
    $self->{next_input_character} = $code;
    $column++;

    if ($code == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($code == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $i++ if substr ($$s, $i, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  my $onerror = $_[2] || sub {
    my %opt = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Every reported error is annotated with the current position.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
209    
## Creates a new parser object with no-op default callbacks.
##
##   $parser = Whatpm::HTML->new;
##
## The callbacks installed here are placeholders; |parse_string| and
## |parse_byte_string| replace |set_next_input_character|,
## |parse_error| and |change_encoding| with real implementations.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_input_character} = sub {
    ## Take the parser from the argument list, as the other
    ## |set_next_input_character| and |change_encoding| callbacks do,
    ## instead of closing over the outer |$self|.  Capturing the outer
    ## variable would make the object refer to itself through this
    ## closure, so it would never be freed.
    my $self = shift;
    $self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
230    
## Content model flag bits: which kinds of markup are recognized in
## character data.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the flags above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_ENTITY | CM_LIMITED_MARKUP }
sub PCDATA_CONTENT_MODEL    () { CM_ENTITY | CM_FULL_MARKUP }
239    
## Tokenizer states, stored in |$self->{state}|.  The values are
## consecutive integers and only have to be distinct.

## Data and tags
sub DATA_STATE                   () { 0 }
sub ENTITY_DATA_STATE            () { 1 }
sub TAG_OPEN_STATE               () { 2 }
sub CLOSE_TAG_OPEN_STATE         () { 3 }
sub TAG_NAME_STATE               () { 4 }

## Attributes
sub BEFORE_ATTRIBUTE_NAME_STATE         () { 5 }
sub ATTRIBUTE_NAME_STATE                () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE          () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE        () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE      () { 11 }
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE     () { 12 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE           () { 14 }
sub COMMENT_START_DASH_STATE      () { 15 }
sub COMMENT_STATE                 () { 16 }
sub COMMENT_END_STATE             () { 17 }
sub COMMENT_END_DASH_STATE        () { 18 }
sub BOGUS_COMMENT_STATE           () { 19 }

## DOCTYPE
sub DOCTYPE_STATE                                 () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE                     () { 21 }
sub DOCTYPE_NAME_STATE                            () { 22 }
sub AFTER_DOCTYPE_NAME_STATE                      () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         () { 31 }
sub BOGUS_DOCTYPE_STATE                           () { 32 }
273    
## Token types, stored in the |type| member of a token.
sub DOCTYPE_TOKEN     () { 1 }
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }
280    
## Insertion mode group bits (|*_IMS|).  A concrete insertion mode
## (|*_IM|, below) is built from zero or more group bits, plus the two
## lowest bits to tell members of the same group apart.
sub AFTER_HTML_IMS () { 1 << 2 }
sub HEAD_IMS       () { 1 << 3 }
sub BODY_IMS       () { 1 << 4 }
sub BODY_TABLE_IMS () { 1 << 5 }
sub TABLE_IMS      () { 1 << 6 }
sub ROW_IMS        () { 1 << 7 }
sub BODY_AFTER_IMS () { 1 << 8 }
sub FRAME_IMS      () { 1 << 9 }

## Concrete insertion modes.
sub AFTER_HTML_BODY_IM     () { AFTER_HTML_IMS | BODY_AFTER_IMS }
sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
sub IN_HEAD_IM             () { HEAD_IMS | 0b00 }
sub IN_HEAD_NOSCRIPT_IM    () { HEAD_IMS | 0b01 }
sub AFTER_HEAD_IM          () { HEAD_IMS | 0b10 }
sub BEFORE_HEAD_IM         () { HEAD_IMS | 0b11 }
sub IN_BODY_IM             () { BODY_IMS }
sub IN_CELL_IM             () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
sub IN_CAPTION_IM          () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
sub IN_ROW_IM              () { TABLE_IMS | ROW_IMS | 0b01 }
sub IN_TABLE_BODY_IM       () { TABLE_IMS | ROW_IMS | 0b10 }
sub IN_TABLE_IM            () { TABLE_IMS }
sub AFTER_BODY_IM          () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM         () { FRAME_IMS | 0b01 }
sub AFTER_FRAMESET_IM      () { FRAME_IMS | 0b10 }
sub IN_SELECT_IM           () { 0b01 }
sub IN_COLUMN_GROUP_IM     () { 0b10 }
307    
308 wakaba 1.1 ## Implementations MUST act as if they used the state machine described in the spec
309    
## Resets the tokenizer to its initial condition: data state, PCDATA
## content model, no token or attribute under construction, empty
## queues, and the first input character loaded.
sub _initialize_tokenizer ($) {
  my $self = shift;
  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  ## |current_token| is a start tag, end tag, comment, or DOCTYPE
  ## token under construction.
  $self->{$_} = undef for qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  );
  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
324    
325     ## A token has:
326 wakaba 1.55 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
327     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
328     ## ->{name} (DOCTYPE_TOKEN)
329     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
330     ## ->{public_identifier} (DOCTYPE_TOKEN)
331     ## ->{system_identifier} (DOCTYPE_TOKEN)
332     ## ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
333     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
334     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
335 wakaba 1.1
336     ## Emitted token MUST immediately be handled by the tree construction state.
337    
338     ## Before each step, UA MAY check to see if either one of the scripts in
339     ## "list of scripts that will execute as soon as possible" or the first
340     ## script in the "list of scripts that will execute asynchronously",
341     ## has completed loading. If one has, then it MUST be executed
342     ## and removed from the list.
343    
344 wakaba 1.59 ## NOTE: HTML5 "Writing HTML documents" section, applied to
345     ## documents and not to user agents and conformance checkers,
346     ## contains some requirements that are not detected by the
347     ## parsing algorithm:
348     ## - Some requirements on character encoding declarations. ## TODO
349     ## - "Elements MUST NOT contain content that their content model disallows."
350     ## ... Some are parse error, some are not (will be reported by c.c.).
351     ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
352     ## - Text (in elements, attributes, and comments) SHOULD NOT contain
353     ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
354    
355     ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
356     ## be detected by the HTML5 parsing algorithm:
357     ## - Text,
358    
359 wakaba 1.1 sub _get_next_token ($) {
360     my $self = shift;
361     if (@{$self->{token}}) {
362     return shift @{$self->{token}};
363     }
364    
365     A: {
366 wakaba 1.57 if ($self->{state} == DATA_STATE) {
367 wakaba 1.1 if ($self->{next_input_character} == 0x0026) { # &
368 wakaba 1.40 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
369 wakaba 1.57 $self->{state} = ENTITY_DATA_STATE;
370 wakaba 1.1 !!!next-input-character;
371     redo A;
372     } else {
373     #
374     }
375 wakaba 1.13 } elsif ($self->{next_input_character} == 0x002D) { # -
376 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
377 wakaba 1.13 unless ($self->{escape}) {
378     if ($self->{prev_input_character}->[0] == 0x002D and # -
379     $self->{prev_input_character}->[1] == 0x0021 and # !
380     $self->{prev_input_character}->[2] == 0x003C) { # <
381     $self->{escape} = 1;
382     }
383     }
384     }
385    
386     #
387 wakaba 1.1 } elsif ($self->{next_input_character} == 0x003C) { # <
388 wakaba 1.40 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
389     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
390 wakaba 1.13 not $self->{escape})) {
391 wakaba 1.57 $self->{state} = TAG_OPEN_STATE;
392 wakaba 1.1 !!!next-input-character;
393     redo A;
394     } else {
395     #
396     }
397 wakaba 1.13 } elsif ($self->{next_input_character} == 0x003E) { # >
398     if ($self->{escape} and
399 wakaba 1.40 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
400 wakaba 1.13 if ($self->{prev_input_character}->[0] == 0x002D and # -
401     $self->{prev_input_character}->[1] == 0x002D) { # -
402     delete $self->{escape};
403     }
404     }
405    
406     #
407 wakaba 1.1 } elsif ($self->{next_input_character} == -1) {
408 wakaba 1.55 !!!emit ({type => END_OF_FILE_TOKEN});
409 wakaba 1.1 last A; ## TODO: ok?
410     }
411     # Anything else
412 wakaba 1.55 my $token = {type => CHARACTER_TOKEN,
413 wakaba 1.1 data => chr $self->{next_input_character}};
414     ## Stay in the data state
415     !!!next-input-character;
416    
417     !!!emit ($token);
418    
419     redo A;
420 wakaba 1.57 } elsif ($self->{state} == ENTITY_DATA_STATE) {
421 wakaba 1.1 ## (cannot happen in CDATA state)
422    
423 wakaba 1.26 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
424 wakaba 1.1
425 wakaba 1.57 $self->{state} = DATA_STATE;
426 wakaba 1.1 # next-input-character is already done
427    
428     unless (defined $token) {
429 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
430 wakaba 1.1 } else {
431     !!!emit ($token);
432     }
433    
434     redo A;
435 wakaba 1.57 } elsif ($self->{state} == TAG_OPEN_STATE) {
436 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
437 wakaba 1.1 if ($self->{next_input_character} == 0x002F) { # /
438     !!!next-input-character;
439 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
440 wakaba 1.1 redo A;
441     } else {
442     ## reconsume
443 wakaba 1.57 $self->{state} = DATA_STATE;
444 wakaba 1.1
445 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
446 wakaba 1.1
447     redo A;
448     }
449 wakaba 1.40 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
450 wakaba 1.1 if ($self->{next_input_character} == 0x0021) { # !
451 wakaba 1.57 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
452 wakaba 1.1 !!!next-input-character;
453     redo A;
454     } elsif ($self->{next_input_character} == 0x002F) { # /
455 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
456 wakaba 1.1 !!!next-input-character;
457     redo A;
458     } elsif (0x0041 <= $self->{next_input_character} and
459     $self->{next_input_character} <= 0x005A) { # A..Z
460     $self->{current_token}
461 wakaba 1.55 = {type => START_TAG_TOKEN,
462 wakaba 1.1 tag_name => chr ($self->{next_input_character} + 0x0020)};
463 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
464 wakaba 1.1 !!!next-input-character;
465     redo A;
466     } elsif (0x0061 <= $self->{next_input_character} and
467     $self->{next_input_character} <= 0x007A) { # a..z
468 wakaba 1.55 $self->{current_token} = {type => START_TAG_TOKEN,
469 wakaba 1.1 tag_name => chr ($self->{next_input_character})};
470 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
471 wakaba 1.1 !!!next-input-character;
472     redo A;
473     } elsif ($self->{next_input_character} == 0x003E) { # >
474 wakaba 1.3 !!!parse-error (type => 'empty start tag');
475 wakaba 1.57 $self->{state} = DATA_STATE;
476 wakaba 1.1 !!!next-input-character;
477    
478 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
479 wakaba 1.1
480     redo A;
481     } elsif ($self->{next_input_character} == 0x003F) { # ?
482 wakaba 1.3 !!!parse-error (type => 'pio');
483 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
484 wakaba 1.1 ## $self->{next_input_character} is intentionally left as is
485     redo A;
486     } else {
487 wakaba 1.3 !!!parse-error (type => 'bare stago');
488 wakaba 1.57 $self->{state} = DATA_STATE;
489 wakaba 1.1 ## reconsume
490    
491 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
492 wakaba 1.1
493     redo A;
494     }
495     } else {
496 wakaba 1.40 die "$0: $self->{content_model} in tag open";
497 wakaba 1.1 }
498 wakaba 1.57 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
499 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
500 wakaba 1.23 if (defined $self->{last_emitted_start_tag_name}) {
501 wakaba 1.30 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
502 wakaba 1.23 my @next_char;
503     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
504     push @next_char, $self->{next_input_character};
505     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
506     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
507     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
508     !!!next-input-character;
509     next TAGNAME;
510     } else {
511     $self->{next_input_character} = shift @next_char; # reconsume
512     !!!back-next-input-character (@next_char);
513 wakaba 1.57 $self->{state} = DATA_STATE;
514 wakaba 1.23
515 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
516 wakaba 1.23
517     redo A;
518     }
519     }
520 wakaba 1.1 push @next_char, $self->{next_input_character};
521 wakaba 1.23
522     unless ($self->{next_input_character} == 0x0009 or # HT
523     $self->{next_input_character} == 0x000A or # LF
524     $self->{next_input_character} == 0x000B or # VT
525     $self->{next_input_character} == 0x000C or # FF
526     $self->{next_input_character} == 0x0020 or # SP
527     $self->{next_input_character} == 0x003E or # >
528     $self->{next_input_character} == 0x002F or # /
529     $self->{next_input_character} == -1) {
530 wakaba 1.1 $self->{next_input_character} = shift @next_char; # reconsume
531     !!!back-next-input-character (@next_char);
532 wakaba 1.57 $self->{state} = DATA_STATE;
533 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
534 wakaba 1.1 redo A;
535 wakaba 1.23 } else {
536     $self->{next_input_character} = shift @next_char;
537     !!!back-next-input-character (@next_char);
538     # and consume...
539 wakaba 1.1 }
540 wakaba 1.23 } else {
541     ## No start tag token has ever been emitted
542     # next-input-character is already done
543 wakaba 1.57 $self->{state} = DATA_STATE;
544 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
545 wakaba 1.1 redo A;
546     }
547     }
548    
549     if (0x0041 <= $self->{next_input_character} and
550     $self->{next_input_character} <= 0x005A) { # A..Z
551 wakaba 1.55 $self->{current_token} = {type => END_TAG_TOKEN,
552 wakaba 1.1 tag_name => chr ($self->{next_input_character} + 0x0020)};
553 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
554 wakaba 1.1 !!!next-input-character;
555     redo A;
556     } elsif (0x0061 <= $self->{next_input_character} and
557     $self->{next_input_character} <= 0x007A) { # a..z
558 wakaba 1.55 $self->{current_token} = {type => END_TAG_TOKEN,
559 wakaba 1.1 tag_name => chr ($self->{next_input_character})};
560 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
561 wakaba 1.1 !!!next-input-character;
562     redo A;
563     } elsif ($self->{next_input_character} == 0x003E) { # >
564 wakaba 1.3 !!!parse-error (type => 'empty end tag');
565 wakaba 1.57 $self->{state} = DATA_STATE;
566 wakaba 1.1 !!!next-input-character;
567     redo A;
568     } elsif ($self->{next_input_character} == -1) {
569 wakaba 1.3 !!!parse-error (type => 'bare etago');
570 wakaba 1.57 $self->{state} = DATA_STATE;
571 wakaba 1.1 # reconsume
572    
573 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
574 wakaba 1.1
575     redo A;
576     } else {
577 wakaba 1.3 !!!parse-error (type => 'bogus end tag');
578 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
579 wakaba 1.1 ## $self->{next_input_character} is intentionally left as is
580     redo A;
581     }
582 wakaba 1.57 } elsif ($self->{state} == TAG_NAME_STATE) {
583 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
584     $self->{next_input_character} == 0x000A or # LF
585     $self->{next_input_character} == 0x000B or # VT
586     $self->{next_input_character} == 0x000C or # FF
587     $self->{next_input_character} == 0x0020) { # SP
588 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
589 wakaba 1.1 !!!next-input-character;
590     redo A;
591     } elsif ($self->{next_input_character} == 0x003E) { # >
592 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
593 wakaba 1.28 $self->{current_token}->{first_start_tag}
594     = not defined $self->{last_emitted_start_tag_name};
595 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
596 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
597 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
598 wakaba 1.1 if ($self->{current_token}->{attributes}) {
599 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
600 wakaba 1.1 }
601     } else {
602     die "$0: $self->{current_token}->{type}: Unknown token type";
603     }
604 wakaba 1.57 $self->{state} = DATA_STATE;
605 wakaba 1.1 !!!next-input-character;
606    
607     !!!emit ($self->{current_token}); # start tag or end tag
608    
609     redo A;
610     } elsif (0x0041 <= $self->{next_input_character} and
611     $self->{next_input_character} <= 0x005A) { # A..Z
612     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
613     # start tag or end tag
614     ## Stay in this state
615     !!!next-input-character;
616     redo A;
617 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
618 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
619 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
620 wakaba 1.28 $self->{current_token}->{first_start_tag}
621     = not defined $self->{last_emitted_start_tag_name};
622 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
623 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
624 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
625 wakaba 1.1 if ($self->{current_token}->{attributes}) {
626 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
627 wakaba 1.1 }
628     } else {
629     die "$0: $self->{current_token}->{type}: Unknown token type";
630     }
631 wakaba 1.57 $self->{state} = DATA_STATE;
632 wakaba 1.1 # reconsume
633    
634     !!!emit ($self->{current_token}); # start tag or end tag
635    
636     redo A;
637     } elsif ($self->{next_input_character} == 0x002F) { # /
638     !!!next-input-character;
639     if ($self->{next_input_character} == 0x003E and # >
640 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
641 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
642     # permitted slash
643     #
644     } else {
645 wakaba 1.3 !!!parse-error (type => 'nestc');
646 wakaba 1.1 }
647 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
648 wakaba 1.1 # next-input-character is already done
649     redo A;
650     } else {
651     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
652     # start tag or end tag
653     ## Stay in the state
654     !!!next-input-character;
655     redo A;
656     }
657 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
658 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
659     $self->{next_input_character} == 0x000A or # LF
660     $self->{next_input_character} == 0x000B or # VT
661     $self->{next_input_character} == 0x000C or # FF
662     $self->{next_input_character} == 0x0020) { # SP
663     ## Stay in the state
664     !!!next-input-character;
665     redo A;
666     } elsif ($self->{next_input_character} == 0x003E) { # >
667 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
668 wakaba 1.28 $self->{current_token}->{first_start_tag}
669     = not defined $self->{last_emitted_start_tag_name};
670 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
671 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
672 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
673 wakaba 1.1 if ($self->{current_token}->{attributes}) {
674 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
675 wakaba 1.1 }
676     } else {
677     die "$0: $self->{current_token}->{type}: Unknown token type";
678     }
679 wakaba 1.57 $self->{state} = DATA_STATE;
680 wakaba 1.1 !!!next-input-character;
681    
682     !!!emit ($self->{current_token}); # start tag or end tag
683    
684     redo A;
685     } elsif (0x0041 <= $self->{next_input_character} and
686     $self->{next_input_character} <= 0x005A) { # A..Z
687     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
688     value => ''};
689 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
690 wakaba 1.1 !!!next-input-character;
691     redo A;
692     } elsif ($self->{next_input_character} == 0x002F) { # /
693     !!!next-input-character;
694     if ($self->{next_input_character} == 0x003E and # >
695 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
696 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
697     # permitted slash
698     #
699     } else {
700 wakaba 1.3 !!!parse-error (type => 'nestc');
701 wakaba 1.1 }
702     ## Stay in the state
703     # next-input-character is already done
704     redo A;
705 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
706 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
707 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
708 wakaba 1.28 $self->{current_token}->{first_start_tag}
709     = not defined $self->{last_emitted_start_tag_name};
710 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
711 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
712 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
713 wakaba 1.1 if ($self->{current_token}->{attributes}) {
714 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
715 wakaba 1.1 }
716     } else {
717     die "$0: $self->{current_token}->{type}: Unknown token type";
718     }
719 wakaba 1.57 $self->{state} = DATA_STATE;
720 wakaba 1.1 # reconsume
721    
722     !!!emit ($self->{current_token}); # start tag or end tag
723    
724     redo A;
725     } else {
726     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
727     value => ''};
728 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
729 wakaba 1.1 !!!next-input-character;
730     redo A;
731     }
732 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
733 wakaba 1.1 my $before_leave = sub {
734     if (exists $self->{current_token}->{attributes} # start tag or end tag
735     ->{$self->{current_attribute}->{name}}) { # MUST
736 wakaba 1.39 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
737 wakaba 1.1 ## Discard $self->{current_attribute} # MUST
738     } else {
739     $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
740     = $self->{current_attribute};
741     }
742     }; # $before_leave
743    
744     if ($self->{next_input_character} == 0x0009 or # HT
745     $self->{next_input_character} == 0x000A or # LF
746     $self->{next_input_character} == 0x000B or # VT
747     $self->{next_input_character} == 0x000C or # FF
748     $self->{next_input_character} == 0x0020) { # SP
749     $before_leave->();
750 wakaba 1.57 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
751 wakaba 1.1 !!!next-input-character;
752     redo A;
753     } elsif ($self->{next_input_character} == 0x003D) { # =
754     $before_leave->();
755 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
756 wakaba 1.1 !!!next-input-character;
757     redo A;
758     } elsif ($self->{next_input_character} == 0x003E) { # >
759     $before_leave->();
760 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
761 wakaba 1.28 $self->{current_token}->{first_start_tag}
762     = not defined $self->{last_emitted_start_tag_name};
763 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
764 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
765 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
766 wakaba 1.1 if ($self->{current_token}->{attributes}) {
767 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
768 wakaba 1.1 }
769     } else {
770     die "$0: $self->{current_token}->{type}: Unknown token type";
771     }
772 wakaba 1.57 $self->{state} = DATA_STATE;
773 wakaba 1.1 !!!next-input-character;
774    
775     !!!emit ($self->{current_token}); # start tag or end tag
776    
777     redo A;
778     } elsif (0x0041 <= $self->{next_input_character} and
779     $self->{next_input_character} <= 0x005A) { # A..Z
780     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
781     ## Stay in the state
782     !!!next-input-character;
783     redo A;
784     } elsif ($self->{next_input_character} == 0x002F) { # /
785     $before_leave->();
786     !!!next-input-character;
787     if ($self->{next_input_character} == 0x003E and # >
788 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
789 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
790     # permitted slash
791     #
792     } else {
793 wakaba 1.3 !!!parse-error (type => 'nestc');
794 wakaba 1.1 }
795 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
796 wakaba 1.1 # next-input-character is already done
797     redo A;
798 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
799 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
800 wakaba 1.1 $before_leave->();
801 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
802 wakaba 1.28 $self->{current_token}->{first_start_tag}
803     = not defined $self->{last_emitted_start_tag_name};
804 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
805 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
806 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
807 wakaba 1.1 if ($self->{current_token}->{attributes}) {
808 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
809 wakaba 1.1 }
810     } else {
811     die "$0: $self->{current_token}->{type}: Unknown token type";
812     }
813 wakaba 1.57 $self->{state} = DATA_STATE;
814 wakaba 1.1 # reconsume
815    
816     !!!emit ($self->{current_token}); # start tag or end tag
817    
818     redo A;
819     } else {
820     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
821     ## Stay in the state
822     !!!next-input-character;
823     redo A;
824     }
825 wakaba 1.57 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
826 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
827     $self->{next_input_character} == 0x000A or # LF
828     $self->{next_input_character} == 0x000B or # VT
829     $self->{next_input_character} == 0x000C or # FF
830     $self->{next_input_character} == 0x0020) { # SP
831     ## Stay in the state
832     !!!next-input-character;
833     redo A;
834     } elsif ($self->{next_input_character} == 0x003D) { # =
835 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
836 wakaba 1.1 !!!next-input-character;
837     redo A;
838     } elsif ($self->{next_input_character} == 0x003E) { # >
839 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
840 wakaba 1.28 $self->{current_token}->{first_start_tag}
841     = not defined $self->{last_emitted_start_tag_name};
842 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
843 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
844 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
845 wakaba 1.1 if ($self->{current_token}->{attributes}) {
846 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
847 wakaba 1.1 }
848     } else {
849     die "$0: $self->{current_token}->{type}: Unknown token type";
850     }
851 wakaba 1.57 $self->{state} = DATA_STATE;
852 wakaba 1.1 !!!next-input-character;
853    
854     !!!emit ($self->{current_token}); # start tag or end tag
855    
856     redo A;
857     } elsif (0x0041 <= $self->{next_input_character} and
858     $self->{next_input_character} <= 0x005A) { # A..Z
859     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
860     value => ''};
861 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
862 wakaba 1.1 !!!next-input-character;
863     redo A;
864     } elsif ($self->{next_input_character} == 0x002F) { # /
865     !!!next-input-character;
866     if ($self->{next_input_character} == 0x003E and # >
867 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
868 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
869     # permitted slash
870     #
871     } else {
872 wakaba 1.3 !!!parse-error (type => 'nestc');
873 wakaba 1.33 ## TODO: Different error type for <aa / bb> than <aa/>
874 wakaba 1.1 }
875 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
876 wakaba 1.1 # next-input-character is already done
877     redo A;
878 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
879 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
880 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
881 wakaba 1.28 $self->{current_token}->{first_start_tag}
882     = not defined $self->{last_emitted_start_tag_name};
883 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
884 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
885 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
886 wakaba 1.1 if ($self->{current_token}->{attributes}) {
887 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
888 wakaba 1.1 }
889     } else {
890     die "$0: $self->{current_token}->{type}: Unknown token type";
891     }
892 wakaba 1.57 $self->{state} = DATA_STATE;
893 wakaba 1.1 # reconsume
894    
895     !!!emit ($self->{current_token}); # start tag or end tag
896    
897     redo A;
898     } else {
899     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
900     value => ''};
901 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
902 wakaba 1.1 !!!next-input-character;
903     redo A;
904     }
905 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
906 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
907     $self->{next_input_character} == 0x000A or # LF
908     $self->{next_input_character} == 0x000B or # VT
909     $self->{next_input_character} == 0x000C or # FF
910     $self->{next_input_character} == 0x0020) { # SP
911     ## Stay in the state
912     !!!next-input-character;
913     redo A;
914     } elsif ($self->{next_input_character} == 0x0022) { # "
915 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
916 wakaba 1.1 !!!next-input-character;
917     redo A;
918     } elsif ($self->{next_input_character} == 0x0026) { # &
919 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
920 wakaba 1.1 ## reconsume
921     redo A;
922     } elsif ($self->{next_input_character} == 0x0027) { # '
923 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
924 wakaba 1.1 !!!next-input-character;
925     redo A;
926     } elsif ($self->{next_input_character} == 0x003E) { # >
927 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
928 wakaba 1.28 $self->{current_token}->{first_start_tag}
929     = not defined $self->{last_emitted_start_tag_name};
930 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
931 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
932 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
933 wakaba 1.1 if ($self->{current_token}->{attributes}) {
934 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
935 wakaba 1.1 }
936     } else {
937     die "$0: $self->{current_token}->{type}: Unknown token type";
938     }
939 wakaba 1.57 $self->{state} = DATA_STATE;
940 wakaba 1.1 !!!next-input-character;
941    
942     !!!emit ($self->{current_token}); # start tag or end tag
943    
944     redo A;
945 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
946 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
947 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
948 wakaba 1.28 $self->{current_token}->{first_start_tag}
949     = not defined $self->{last_emitted_start_tag_name};
950 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
951 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
952 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
953 wakaba 1.1 if ($self->{current_token}->{attributes}) {
954 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
955 wakaba 1.1 }
956     } else {
957     die "$0: $self->{current_token}->{type}: Unknown token type";
958     }
959 wakaba 1.57 $self->{state} = DATA_STATE;
960 wakaba 1.1 ## reconsume
961    
962     !!!emit ($self->{current_token}); # start tag or end tag
963    
964     redo A;
965     } else {
966     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
967 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
968 wakaba 1.1 !!!next-input-character;
969     redo A;
970     }
971 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
972 wakaba 1.1 if ($self->{next_input_character} == 0x0022) { # "
973 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
974 wakaba 1.1 !!!next-input-character;
975     redo A;
976     } elsif ($self->{next_input_character} == 0x0026) { # &
977 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
978     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
979 wakaba 1.1 !!!next-input-character;
980     redo A;
981     } elsif ($self->{next_input_character} == -1) {
982 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
983 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
984 wakaba 1.28 $self->{current_token}->{first_start_tag}
985     = not defined $self->{last_emitted_start_tag_name};
986 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
987 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
988 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
989 wakaba 1.1 if ($self->{current_token}->{attributes}) {
990 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
991 wakaba 1.1 }
992     } else {
993     die "$0: $self->{current_token}->{type}: Unknown token type";
994     }
995 wakaba 1.57 $self->{state} = DATA_STATE;
996 wakaba 1.1 ## reconsume
997    
998     !!!emit ($self->{current_token}); # start tag or end tag
999    
1000     redo A;
1001     } else {
1002     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1003     ## Stay in the state
1004     !!!next-input-character;
1005     redo A;
1006     }
1007 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1008 wakaba 1.1 if ($self->{next_input_character} == 0x0027) { # '
1009 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1010 wakaba 1.1 !!!next-input-character;
1011     redo A;
1012     } elsif ($self->{next_input_character} == 0x0026) { # &
1013 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
1014     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1015 wakaba 1.1 !!!next-input-character;
1016     redo A;
1017     } elsif ($self->{next_input_character} == -1) {
1018 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
1019 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1020 wakaba 1.28 $self->{current_token}->{first_start_tag}
1021     = not defined $self->{last_emitted_start_tag_name};
1022 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1023 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1024 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1025 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1026 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1027 wakaba 1.1 }
1028     } else {
1029     die "$0: $self->{current_token}->{type}: Unknown token type";
1030     }
1031 wakaba 1.57 $self->{state} = DATA_STATE;
1032 wakaba 1.1 ## reconsume
1033    
1034     !!!emit ($self->{current_token}); # start tag or end tag
1035    
1036     redo A;
1037     } else {
1038     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1039     ## Stay in the state
1040     !!!next-input-character;
1041     redo A;
1042     }
1043 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1044 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1045     $self->{next_input_character} == 0x000A or # LF
1046     $self->{next_input_character} == 0x000B or # VT
1047     $self->{next_input_character} == 0x000C or # FF
1048     $self->{next_input_character} == 0x0020) { # SP
1049 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1050 wakaba 1.1 !!!next-input-character;
1051     redo A;
1052     } elsif ($self->{next_input_character} == 0x0026) { # &
1053 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
1054     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1055 wakaba 1.1 !!!next-input-character;
1056     redo A;
1057     } elsif ($self->{next_input_character} == 0x003E) { # >
1058 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1059 wakaba 1.28 $self->{current_token}->{first_start_tag}
1060     = not defined $self->{last_emitted_start_tag_name};
1061 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1062 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1063 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1064 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1065 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1066 wakaba 1.1 }
1067     } else {
1068     die "$0: $self->{current_token}->{type}: Unknown token type";
1069     }
1070 wakaba 1.57 $self->{state} = DATA_STATE;
1071 wakaba 1.1 !!!next-input-character;
1072    
1073     !!!emit ($self->{current_token}); # start tag or end tag
1074    
1075     redo A;
1076 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
1077 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1078 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1079 wakaba 1.28 $self->{current_token}->{first_start_tag}
1080     = not defined $self->{last_emitted_start_tag_name};
1081 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1082 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1083 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1084 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1085 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1086 wakaba 1.1 }
1087     } else {
1088     die "$0: $self->{current_token}->{type}: Unknown token type";
1089     }
1090 wakaba 1.57 $self->{state} = DATA_STATE;
1091 wakaba 1.1 ## reconsume
1092    
1093     !!!emit ($self->{current_token}); # start tag or end tag
1094    
1095     redo A;
1096     } else {
1097     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1098     ## Stay in the state
1099     !!!next-input-character;
1100     redo A;
1101     }
1102 wakaba 1.57 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1103 wakaba 1.26 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
1104 wakaba 1.1
1105     unless (defined $token) {
1106     $self->{current_attribute}->{value} .= '&';
1107     } else {
1108     $self->{current_attribute}->{value} .= $token->{data};
1109     ## ISSUE: spec says "append the returned character token to the current attribute's value"
1110     }
1111    
1112     $self->{state} = $self->{last_attribute_value_state};
1113     # next-input-character is already done
1114     redo A;
1115 wakaba 1.57 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1116 wakaba 1.1 ## (only happen if PCDATA state)
1117    
1118 wakaba 1.55 my $token = {type => COMMENT_TOKEN, data => ''};
1119 wakaba 1.1
1120     BC: {
1121     if ($self->{next_input_character} == 0x003E) { # >
1122 wakaba 1.57 $self->{state} = DATA_STATE;
1123 wakaba 1.1 !!!next-input-character;
1124    
1125     !!!emit ($token);
1126    
1127     redo A;
1128     } elsif ($self->{next_input_character} == -1) {
1129 wakaba 1.57 $self->{state} = DATA_STATE;
1130 wakaba 1.1 ## reconsume
1131    
1132     !!!emit ($token);
1133    
1134     redo A;
1135     } else {
1136     $token->{data} .= chr ($self->{next_input_character});
1137     !!!next-input-character;
1138     redo BC;
1139     }
1140     } # BC
1141 wakaba 1.57 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1142 wakaba 1.1 ## (only happen if PCDATA state)
1143    
1144     my @next_char;
1145     push @next_char, $self->{next_input_character};
1146    
1147     if ($self->{next_input_character} == 0x002D) { # -
1148     !!!next-input-character;
1149     push @next_char, $self->{next_input_character};
1150     if ($self->{next_input_character} == 0x002D) { # -
1151 wakaba 1.55 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1152 wakaba 1.57 $self->{state} = COMMENT_START_STATE;
1153 wakaba 1.1 !!!next-input-character;
1154     redo A;
1155     }
1156     } elsif ($self->{next_input_character} == 0x0044 or # D
1157     $self->{next_input_character} == 0x0064) { # d
1158     !!!next-input-character;
1159     push @next_char, $self->{next_input_character};
1160     if ($self->{next_input_character} == 0x004F or # O
1161     $self->{next_input_character} == 0x006F) { # o
1162     !!!next-input-character;
1163     push @next_char, $self->{next_input_character};
1164     if ($self->{next_input_character} == 0x0043 or # C
1165     $self->{next_input_character} == 0x0063) { # c
1166     !!!next-input-character;
1167     push @next_char, $self->{next_input_character};
1168     if ($self->{next_input_character} == 0x0054 or # T
1169     $self->{next_input_character} == 0x0074) { # t
1170     !!!next-input-character;
1171     push @next_char, $self->{next_input_character};
1172     if ($self->{next_input_character} == 0x0059 or # Y
1173     $self->{next_input_character} == 0x0079) { # y
1174     !!!next-input-character;
1175     push @next_char, $self->{next_input_character};
1176     if ($self->{next_input_character} == 0x0050 or # P
1177     $self->{next_input_character} == 0x0070) { # p
1178     !!!next-input-character;
1179     push @next_char, $self->{next_input_character};
1180     if ($self->{next_input_character} == 0x0045 or # E
1181     $self->{next_input_character} == 0x0065) { # e
1182     ## ISSUE: What a stupid code this is!
1183 wakaba 1.57 $self->{state} = DOCTYPE_STATE;
1184 wakaba 1.1 !!!next-input-character;
1185     redo A;
1186     }
1187     }
1188     }
1189     }
1190     }
1191     }
1192     }
1193    
1194 wakaba 1.30 !!!parse-error (type => 'bogus comment');
1195 wakaba 1.1 $self->{next_input_character} = shift @next_char;
1196     !!!back-next-input-character (@next_char);
1197 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1198 wakaba 1.1 redo A;
1199    
1200     ## ISSUE: typos in spec: chacacters, is is a parse error
1201     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1202 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_STATE) {
1203 wakaba 1.23 if ($self->{next_input_character} == 0x002D) { # -
1204 wakaba 1.57 $self->{state} = COMMENT_START_DASH_STATE;
1205 wakaba 1.23 !!!next-input-character;
1206     redo A;
1207     } elsif ($self->{next_input_character} == 0x003E) { # >
1208     !!!parse-error (type => 'bogus comment');
1209 wakaba 1.57 $self->{state} = DATA_STATE;
1210 wakaba 1.23 !!!next-input-character;
1211    
1212     !!!emit ($self->{current_token}); # comment
1213    
1214     redo A;
1215     } elsif ($self->{next_input_character} == -1) {
1216     !!!parse-error (type => 'unclosed comment');
1217 wakaba 1.57 $self->{state} = DATA_STATE;
1218 wakaba 1.23 ## reconsume
1219    
1220     !!!emit ($self->{current_token}); # comment
1221    
1222     redo A;
1223     } else {
1224     $self->{current_token}->{data} # comment
1225     .= chr ($self->{next_input_character});
1226 wakaba 1.57 $self->{state} = COMMENT_STATE;
1227 wakaba 1.23 !!!next-input-character;
1228     redo A;
1229     }
1230 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1231 wakaba 1.23 if ($self->{next_input_character} == 0x002D) { # -
1232 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
1233 wakaba 1.23 !!!next-input-character;
1234     redo A;
1235     } elsif ($self->{next_input_character} == 0x003E) { # >
1236     !!!parse-error (type => 'bogus comment');
1237 wakaba 1.57 $self->{state} = DATA_STATE;
1238 wakaba 1.23 !!!next-input-character;
1239    
1240     !!!emit ($self->{current_token}); # comment
1241    
1242     redo A;
1243     } elsif ($self->{next_input_character} == -1) {
1244     !!!parse-error (type => 'unclosed comment');
1245 wakaba 1.57 $self->{state} = DATA_STATE;
1246 wakaba 1.23 ## reconsume
1247    
1248     !!!emit ($self->{current_token}); # comment
1249    
1250     redo A;
1251     } else {
1252     $self->{current_token}->{data} # comment
1253 wakaba 1.33 .= '-' . chr ($self->{next_input_character});
1254 wakaba 1.57 $self->{state} = COMMENT_STATE;
1255 wakaba 1.23 !!!next-input-character;
1256     redo A;
1257     }
1258 wakaba 1.57 } elsif ($self->{state} == COMMENT_STATE) {
1259 wakaba 1.1 if ($self->{next_input_character} == 0x002D) { # -
1260 wakaba 1.57 $self->{state} = COMMENT_END_DASH_STATE;
1261 wakaba 1.1 !!!next-input-character;
1262     redo A;
1263     } elsif ($self->{next_input_character} == -1) {
1264 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1265 wakaba 1.57 $self->{state} = DATA_STATE;
1266 wakaba 1.1 ## reconsume
1267    
1268     !!!emit ($self->{current_token}); # comment
1269    
1270     redo A;
1271     } else {
1272     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1273     ## Stay in the state
1274     !!!next-input-character;
1275     redo A;
1276     }
1277 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1278 wakaba 1.1 if ($self->{next_input_character} == 0x002D) { # -
1279 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
1280 wakaba 1.1 !!!next-input-character;
1281     redo A;
1282     } elsif ($self->{next_input_character} == -1) {
1283 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1284 wakaba 1.57 $self->{state} = DATA_STATE;
1285 wakaba 1.1 ## reconsume
1286    
1287     !!!emit ($self->{current_token}); # comment
1288    
1289     redo A;
1290     } else {
1291     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1292 wakaba 1.57 $self->{state} = COMMENT_STATE;
1293 wakaba 1.1 !!!next-input-character;
1294     redo A;
1295     }
1296 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_STATE) {
1297 wakaba 1.1 if ($self->{next_input_character} == 0x003E) { # >
1298 wakaba 1.57 $self->{state} = DATA_STATE;
1299 wakaba 1.1 !!!next-input-character;
1300    
1301     !!!emit ($self->{current_token}); # comment
1302    
1303     redo A;
1304     } elsif ($self->{next_input_character} == 0x002D) { # -
1305 wakaba 1.3 !!!parse-error (type => 'dash in comment');
1306 wakaba 1.1 $self->{current_token}->{data} .= '-'; # comment
1307     ## Stay in the state
1308     !!!next-input-character;
1309     redo A;
1310     } elsif ($self->{next_input_character} == -1) {
1311 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1312 wakaba 1.57 $self->{state} = DATA_STATE;
1313 wakaba 1.1 ## reconsume
1314    
1315     !!!emit ($self->{current_token}); # comment
1316    
1317     redo A;
1318     } else {
1319 wakaba 1.3 !!!parse-error (type => 'dash in comment');
1320 wakaba 1.1 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1321 wakaba 1.57 $self->{state} = COMMENT_STATE;
1322 wakaba 1.1 !!!next-input-character;
1323     redo A;
1324     }
1325 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_STATE) {
1326 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1327     $self->{next_input_character} == 0x000A or # LF
1328     $self->{next_input_character} == 0x000B or # VT
1329     $self->{next_input_character} == 0x000C or # FF
1330     $self->{next_input_character} == 0x0020) { # SP
1331 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1332 wakaba 1.1 !!!next-input-character;
1333     redo A;
1334     } else {
1335 wakaba 1.3 !!!parse-error (type => 'no space before DOCTYPE name');
1336 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1337 wakaba 1.1 ## reconsume
1338     redo A;
1339     }
1340 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1341 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1342     $self->{next_input_character} == 0x000A or # LF
1343     $self->{next_input_character} == 0x000B or # VT
1344     $self->{next_input_character} == 0x000C or # FF
1345     $self->{next_input_character} == 0x0020) { # SP
1346     ## Stay in the state
1347     !!!next-input-character;
1348     redo A;
1349     } elsif ($self->{next_input_character} == 0x003E) { # >
1350 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
1351 wakaba 1.57 $self->{state} = DATA_STATE;
1352 wakaba 1.1 !!!next-input-character;
1353    
1354 wakaba 1.55 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1355 wakaba 1.1
1356     redo A;
1357     } elsif ($self->{next_input_character} == -1) {
1358 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
1359 wakaba 1.57 $self->{state} = DATA_STATE;
1360 wakaba 1.1 ## reconsume
1361    
1362 wakaba 1.55 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1363 wakaba 1.1
1364     redo A;
1365     } else {
1366 wakaba 1.18 $self->{current_token}
1367 wakaba 1.55 = {type => DOCTYPE_TOKEN,
1368 wakaba 1.18 name => chr ($self->{next_input_character}),
1369     correct => 1};
1370 wakaba 1.4 ## ISSUE: "Set the token's name name to the" in the spec
1371 wakaba 1.57 $self->{state} = DOCTYPE_NAME_STATE;
1372 wakaba 1.1 !!!next-input-character;
1373     redo A;
1374     }
1375 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1376 wakaba 1.18 ## ISSUE: Redundant "First," in the spec.
1377 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1378     $self->{next_input_character} == 0x000A or # LF
1379     $self->{next_input_character} == 0x000B or # VT
1380     $self->{next_input_character} == 0x000C or # FF
1381     $self->{next_input_character} == 0x0020) { # SP
1382 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1383 wakaba 1.1 !!!next-input-character;
1384     redo A;
1385     } elsif ($self->{next_input_character} == 0x003E) { # >
1386 wakaba 1.57 $self->{state} = DATA_STATE;
1387 wakaba 1.1 !!!next-input-character;
1388    
1389     !!!emit ($self->{current_token}); # DOCTYPE
1390    
1391     redo A;
1392     } elsif ($self->{next_input_character} == -1) {
1393 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1394 wakaba 1.57 $self->{state} = DATA_STATE;
1395 wakaba 1.1 ## reconsume
1396    
1397 wakaba 1.18 delete $self->{current_token}->{correct};
1398     !!!emit ($self->{current_token}); # DOCTYPE
1399 wakaba 1.1
1400     redo A;
1401     } else {
1402     $self->{current_token}->{name}
1403     .= chr ($self->{next_input_character}); # DOCTYPE
1404     ## Stay in the state
1405     !!!next-input-character;
1406     redo A;
1407     }
1408 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1409 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1410     $self->{next_input_character} == 0x000A or # LF
1411     $self->{next_input_character} == 0x000B or # VT
1412     $self->{next_input_character} == 0x000C or # FF
1413     $self->{next_input_character} == 0x0020) { # SP
1414     ## Stay in the state
1415     !!!next-input-character;
1416     redo A;
1417     } elsif ($self->{next_input_character} == 0x003E) { # >
1418 wakaba 1.57 $self->{state} = DATA_STATE;
1419 wakaba 1.1 !!!next-input-character;
1420    
1421     !!!emit ($self->{current_token}); # DOCTYPE
1422    
1423     redo A;
1424     } elsif ($self->{next_input_character} == -1) {
1425 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1426 wakaba 1.57 $self->{state} = DATA_STATE;
1427 wakaba 1.1 ## reconsume
1428    
1429 wakaba 1.18 delete $self->{current_token}->{correct};
1430     !!!emit ($self->{current_token}); # DOCTYPE
1431    
1432     redo A;
1433     } elsif ($self->{next_input_character} == 0x0050 or # P
1434     $self->{next_input_character} == 0x0070) { # p
1435     !!!next-input-character;
1436     if ($self->{next_input_character} == 0x0055 or # U
1437     $self->{next_input_character} == 0x0075) { # u
1438     !!!next-input-character;
1439     if ($self->{next_input_character} == 0x0042 or # B
1440     $self->{next_input_character} == 0x0062) { # b
1441     !!!next-input-character;
1442     if ($self->{next_input_character} == 0x004C or # L
1443     $self->{next_input_character} == 0x006C) { # l
1444     !!!next-input-character;
1445     if ($self->{next_input_character} == 0x0049 or # I
1446     $self->{next_input_character} == 0x0069) { # i
1447     !!!next-input-character;
1448     if ($self->{next_input_character} == 0x0043 or # C
1449     $self->{next_input_character} == 0x0063) { # c
1450 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1451 wakaba 1.18 !!!next-input-character;
1452     redo A;
1453     }
1454     }
1455     }
1456     }
1457     }
1458    
1459     #
1460     } elsif ($self->{next_input_character} == 0x0053 or # S
1461     $self->{next_input_character} == 0x0073) { # s
1462     !!!next-input-character;
1463     if ($self->{next_input_character} == 0x0059 or # Y
1464     $self->{next_input_character} == 0x0079) { # y
1465     !!!next-input-character;
1466     if ($self->{next_input_character} == 0x0053 or # S
1467     $self->{next_input_character} == 0x0073) { # s
1468     !!!next-input-character;
1469     if ($self->{next_input_character} == 0x0054 or # T
1470     $self->{next_input_character} == 0x0074) { # t
1471     !!!next-input-character;
1472     if ($self->{next_input_character} == 0x0045 or # E
1473     $self->{next_input_character} == 0x0065) { # e
1474     !!!next-input-character;
1475     if ($self->{next_input_character} == 0x004D or # M
1476     $self->{next_input_character} == 0x006D) { # m
1477 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1478 wakaba 1.18 !!!next-input-character;
1479     redo A;
1480     }
1481     }
1482     }
1483     }
1484     }
1485    
1486     #
1487     } else {
1488     !!!next-input-character;
1489     #
1490     }
1491    
1492     !!!parse-error (type => 'string after DOCTYPE name');
1493 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1494 wakaba 1.18 # next-input-character is already done
1495     redo A;
1496 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1497 wakaba 1.18 if ({
1498     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1499     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1500     }->{$self->{next_input_character}}) {
1501     ## Stay in the state
1502     !!!next-input-character;
1503     redo A;
1504     } elsif ($self->{next_input_character} eq 0x0022) { # "
1505     $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1506 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1507 wakaba 1.18 !!!next-input-character;
1508     redo A;
1509     } elsif ($self->{next_input_character} eq 0x0027) { # '
1510     $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1511 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1512 wakaba 1.18 !!!next-input-character;
1513     redo A;
1514     } elsif ($self->{next_input_character} eq 0x003E) { # >
1515     !!!parse-error (type => 'no PUBLIC literal');
1516    
1517 wakaba 1.57 $self->{state} = DATA_STATE;
1518 wakaba 1.18 !!!next-input-character;
1519    
1520     delete $self->{current_token}->{correct};
1521     !!!emit ($self->{current_token}); # DOCTYPE
1522    
1523     redo A;
1524     } elsif ($self->{next_input_character} == -1) {
1525     !!!parse-error (type => 'unclosed DOCTYPE');
1526    
1527 wakaba 1.57 $self->{state} = DATA_STATE;
1528 wakaba 1.18 ## reconsume
1529    
1530     delete $self->{current_token}->{correct};
1531     !!!emit ($self->{current_token}); # DOCTYPE
1532    
1533     redo A;
1534     } else {
1535     !!!parse-error (type => 'string after PUBLIC');
1536 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1537 wakaba 1.18 !!!next-input-character;
1538     redo A;
1539     }
1540 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1541 wakaba 1.18 if ($self->{next_input_character} == 0x0022) { # "
1542 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1543 wakaba 1.18 !!!next-input-character;
1544     redo A;
1545     } elsif ($self->{next_input_character} == -1) {
1546     !!!parse-error (type => 'unclosed PUBLIC literal');
1547    
1548 wakaba 1.57 $self->{state} = DATA_STATE;
1549 wakaba 1.18 ## reconsume
1550    
1551     delete $self->{current_token}->{correct};
1552     !!!emit ($self->{current_token}); # DOCTYPE
1553    
1554     redo A;
1555     } else {
1556     $self->{current_token}->{public_identifier} # DOCTYPE
1557     .= chr $self->{next_input_character};
1558     ## Stay in the state
1559     !!!next-input-character;
1560     redo A;
1561     }
1562 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1563 wakaba 1.18 if ($self->{next_input_character} == 0x0027) { # '
1564 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1565 wakaba 1.18 !!!next-input-character;
1566     redo A;
1567     } elsif ($self->{next_input_character} == -1) {
1568     !!!parse-error (type => 'unclosed PUBLIC literal');
1569    
1570 wakaba 1.57 $self->{state} = DATA_STATE;
1571 wakaba 1.18 ## reconsume
1572    
1573     delete $self->{current_token}->{correct};
1574     !!!emit ($self->{current_token}); # DOCTYPE
1575    
1576     redo A;
1577     } else {
1578     $self->{current_token}->{public_identifier} # DOCTYPE
1579     .= chr $self->{next_input_character};
1580     ## Stay in the state
1581     !!!next-input-character;
1582     redo A;
1583     }
1584 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1585 wakaba 1.18 if ({
1586     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1587     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1588     }->{$self->{next_input_character}}) {
1589     ## Stay in the state
1590     !!!next-input-character;
1591     redo A;
1592     } elsif ($self->{next_input_character} == 0x0022) { # "
1593     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1594 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1595 wakaba 1.18 !!!next-input-character;
1596     redo A;
1597     } elsif ($self->{next_input_character} == 0x0027) { # '
1598     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1599 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1600 wakaba 1.18 !!!next-input-character;
1601     redo A;
1602     } elsif ($self->{next_input_character} == 0x003E) { # >
1603 wakaba 1.57 $self->{state} = DATA_STATE;
1604 wakaba 1.18 !!!next-input-character;
1605    
1606     !!!emit ($self->{current_token}); # DOCTYPE
1607    
1608     redo A;
1609     } elsif ($self->{next_input_character} == -1) {
1610     !!!parse-error (type => 'unclosed DOCTYPE');
1611    
1612 wakaba 1.57 $self->{state} = DATA_STATE;
1613 wakaba 1.26 ## reconsume
1614 wakaba 1.18
1615     delete $self->{current_token}->{correct};
1616     !!!emit ($self->{current_token}); # DOCTYPE
1617    
1618     redo A;
1619     } else {
1620     !!!parse-error (type => 'string after PUBLIC literal');
1621 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1622 wakaba 1.18 !!!next-input-character;
1623     redo A;
1624     }
1625 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1626 wakaba 1.18 if ({
1627     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1628     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1629     }->{$self->{next_input_character}}) {
1630     ## Stay in the state
1631     !!!next-input-character;
1632     redo A;
1633     } elsif ($self->{next_input_character} == 0x0022) { # "
1634     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1635 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1636 wakaba 1.18 !!!next-input-character;
1637     redo A;
1638     } elsif ($self->{next_input_character} == 0x0027) { # '
1639     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1640 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1641 wakaba 1.18 !!!next-input-character;
1642     redo A;
1643     } elsif ($self->{next_input_character} == 0x003E) { # >
1644     !!!parse-error (type => 'no SYSTEM literal');
1645 wakaba 1.57 $self->{state} = DATA_STATE;
1646 wakaba 1.18 !!!next-input-character;
1647    
1648     delete $self->{current_token}->{correct};
1649     !!!emit ($self->{current_token}); # DOCTYPE
1650    
1651     redo A;
1652     } elsif ($self->{next_input_character} == -1) {
1653     !!!parse-error (type => 'unclosed DOCTYPE');
1654    
1655 wakaba 1.57 $self->{state} = DATA_STATE;
1656 wakaba 1.26 ## reconsume
1657 wakaba 1.18
1658     delete $self->{current_token}->{correct};
1659     !!!emit ($self->{current_token}); # DOCTYPE
1660    
1661     redo A;
1662     } else {
1663 wakaba 1.30 !!!parse-error (type => 'string after SYSTEM');
1664 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1665 wakaba 1.18 !!!next-input-character;
1666     redo A;
1667     }
1668 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1669 wakaba 1.18 if ($self->{next_input_character} == 0x0022) { # "
1670 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1671 wakaba 1.18 !!!next-input-character;
1672     redo A;
1673     } elsif ($self->{next_input_character} == -1) {
1674     !!!parse-error (type => 'unclosed SYSTEM literal');
1675    
1676 wakaba 1.57 $self->{state} = DATA_STATE;
1677 wakaba 1.18 ## reconsume
1678    
1679     delete $self->{current_token}->{correct};
1680     !!!emit ($self->{current_token}); # DOCTYPE
1681    
1682     redo A;
1683     } else {
1684     $self->{current_token}->{system_identifier} # DOCTYPE
1685     .= chr $self->{next_input_character};
1686     ## Stay in the state
1687     !!!next-input-character;
1688     redo A;
1689     }
1690 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
1691 wakaba 1.18 if ($self->{next_input_character} == 0x0027) { # '
1692 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1693 wakaba 1.18 !!!next-input-character;
1694     redo A;
1695     } elsif ($self->{next_input_character} == -1) {
1696     !!!parse-error (type => 'unclosed SYSTEM literal');
1697    
1698 wakaba 1.57 $self->{state} = DATA_STATE;
1699 wakaba 1.18 ## reconsume
1700    
1701     delete $self->{current_token}->{correct};
1702 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1703    
1704     redo A;
1705     } else {
1706 wakaba 1.18 $self->{current_token}->{system_identifier} # DOCTYPE
1707     .= chr $self->{next_input_character};
1708     ## Stay in the state
1709     !!!next-input-character;
1710     redo A;
1711     }
1712 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1713 wakaba 1.18 if ({
1714     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1715     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1716     }->{$self->{next_input_character}}) {
1717     ## Stay in the state
1718     !!!next-input-character;
1719     redo A;
1720     } elsif ($self->{next_input_character} == 0x003E) { # >
1721 wakaba 1.57 $self->{state} = DATA_STATE;
1722 wakaba 1.18 !!!next-input-character;
1723    
1724     !!!emit ($self->{current_token}); # DOCTYPE
1725    
1726     redo A;
1727     } elsif ($self->{next_input_character} == -1) {
1728     !!!parse-error (type => 'unclosed DOCTYPE');
1729    
1730 wakaba 1.57 $self->{state} = DATA_STATE;
1731 wakaba 1.26 ## reconsume
1732 wakaba 1.18
1733     delete $self->{current_token}->{correct};
1734     !!!emit ($self->{current_token}); # DOCTYPE
1735    
1736     redo A;
1737     } else {
1738     !!!parse-error (type => 'string after SYSTEM literal');
1739 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1740 wakaba 1.1 !!!next-input-character;
1741     redo A;
1742     }
1743 wakaba 1.57 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
1744 wakaba 1.1 if ($self->{next_input_character} == 0x003E) { # >
1745 wakaba 1.57 $self->{state} = DATA_STATE;
1746 wakaba 1.1 !!!next-input-character;
1747    
1748 wakaba 1.18 delete $self->{current_token}->{correct};
1749 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1750    
1751     redo A;
1752     } elsif ($self->{next_input_character} == -1) {
1753 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1754 wakaba 1.57 $self->{state} = DATA_STATE;
1755 wakaba 1.1 ## reconsume
1756    
1757 wakaba 1.18 delete $self->{current_token}->{correct};
1758 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1759    
1760     redo A;
1761     } else {
1762     ## Stay in the state
1763     !!!next-input-character;
1764     redo A;
1765     }
1766     } else {
1767     die "$0: $self->{state}: Unknown state";
1768     }
1769     } # A
1770    
1771     die "$0: _get_next_token: unexpected case";
1772     } # _get_next_token
1773    
## Try to consume a character reference ("entity") starting at the
## current input character.
##
## Arguments:
##   $self    - the parser object; |$self->{next_input_character}| is
##              inspected and advanced.
##   $in_attr - when true, a named reference that matched without a
##              trailing ";" and was then followed by further name
##              characters is returned as the literal text "&name"
##              instead of being expanded.
##
## Returns |undef| on the paths that leave the reference unconsumed;
## otherwise a hash reference {type => CHARACTER_TOKEN, data => $string}.
## NOTE(review): the "&" itself appears to have been consumed by the
## caller before this method runs -- confirm against the call sites.
1774 wakaba 1.26 sub _tokenize_attempt_to_consume_an_entity ($$) {
1775     my ($self, $in_attr) = @_;
1776 wakaba 1.20
         ## White space, "<", "&" and EOF (-1) cannot start a reference.
1777     if ({
1778     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1779     0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
1780     }->{$self->{next_input_character}}) {
1781     ## Don't consume
1782     ## No error
1783     return undef;
1784     } elsif ($self->{next_input_character} == 0x0023) { # #
         ## Numeric character reference: "&#x..." (hexadecimal) or
         ## "&#..." (decimal).
1785 wakaba 1.1 !!!next-input-character;
1786     if ($self->{next_input_character} == 0x0078 or # x
1787     $self->{next_input_character} == 0x0058) { # X
         ## $code stays undefined until the first hexadecimal digit has
         ## been accumulated; that is how "no digit at all" is detected.
1788 wakaba 1.26 my $code;
1789 wakaba 1.1 X: {
1790     my $x_char = $self->{next_input_character};
1791     !!!next-input-character;
1792     if (0x0030 <= $self->{next_input_character} and
1793     $self->{next_input_character} <= 0x0039) { # 0..9
1794 wakaba 1.26 $code ||= 0;
1795     $code *= 0x10;
1796     $code += $self->{next_input_character} - 0x0030;
1797 wakaba 1.1 redo X;
1798     } elsif (0x0061 <= $self->{next_input_character} and
1799     $self->{next_input_character} <= 0x0066) { # a..f
1800 wakaba 1.26 $code ||= 0;
1801     $code *= 0x10;
1802     $code += $self->{next_input_character} - 0x0060 + 9;
1803 wakaba 1.1 redo X;
1804     } elsif (0x0041 <= $self->{next_input_character} and
1805     $self->{next_input_character} <= 0x0046) { # A..F
1806 wakaba 1.26 $code ||= 0;
1807     $code *= 0x10;
1808     $code += $self->{next_input_character} - 0x0040 + 9;
1809 wakaba 1.1 redo X;
1810 wakaba 1.26 } elsif (not defined $code) { # no hexadecimal digit
         ## "&#x" without digits: the "x" and the current character are
         ## handed to the push-back macro and "#" becomes the next input
         ## character again (presumably un-reading them -- the macro
         ## body is not visible here).
1811 wakaba 1.3 !!!parse-error (type => 'bare hcro');
1812 wakaba 1.37 !!!back-next-input-character ($x_char, $self->{next_input_character});
1813 wakaba 1.1 $self->{next_input_character} = 0x0023; # #
1814     return undef;
1815     } elsif ($self->{next_input_character} == 0x003B) { # ;
1816     !!!next-input-character;
1817     } else {
1818 wakaba 1.3 !!!parse-error (type => 'no refc');
1819 wakaba 1.1 }
1820    
         ## Sanitise the code point: 0, surrogates and values above
         ## U+10FFFF become U+FFFD, CR becomes LF, and 0x80..0x9F are
         ## remapped through the |$c1_entity_char| table.
1821 wakaba 1.26 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1822     !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1823     $code = 0xFFFD;
1824     } elsif ($code > 0x10FFFF) {
1825     !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1826     $code = 0xFFFD;
1827     } elsif ($code == 0x000D) {
1828     !!!parse-error (type => 'CR character reference');
1829     $code = 0x000A;
1830     } elsif (0x80 <= $code and $code <= 0x9F) {
1831 wakaba 1.30 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1832 wakaba 1.26 $code = $c1_entity_char->{$code};
1833 wakaba 1.1 }
1834    
1835 wakaba 1.55 return {type => CHARACTER_TOKEN, data => chr $code};
1836 wakaba 1.1 } # X
1837     } elsif (0x0030 <= $self->{next_input_character} and
1838     $self->{next_input_character} <= 0x0039) { # 0..9
         ## Decimal form: accumulate digits, then apply the same
         ## sanitising rules as the hexadecimal branch above.
1839     my $code = $self->{next_input_character} - 0x0030;
1840     !!!next-input-character;
1841    
1842     while (0x0030 <= $self->{next_input_character} and
1843     $self->{next_input_character} <= 0x0039) { # 0..9
1844     $code *= 10;
1845     $code += $self->{next_input_character} - 0x0030;
1846    
1847     !!!next-input-character;
1848     }
1849    
1850     if ($self->{next_input_character} == 0x003B) { # ;
1851     !!!next-input-character;
1852     } else {
1853 wakaba 1.3 !!!parse-error (type => 'no refc');
1854 wakaba 1.1 }
1855    
1856 wakaba 1.26 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1857     !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1858     $code = 0xFFFD;
1859     } elsif ($code > 0x10FFFF) {
1860     !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1861     $code = 0xFFFD;
1862     } elsif ($code == 0x000D) {
1863     !!!parse-error (type => 'CR character reference');
1864     $code = 0x000A;
1865 wakaba 1.4 } elsif (0x80 <= $code and $code <= 0x9F) {
1866 wakaba 1.30 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1867 wakaba 1.4 $code = $c1_entity_char->{$code};
1868 wakaba 1.1 }
1869    
1870 wakaba 1.55 return {type => CHARACTER_TOKEN, data => chr $code};
1871 wakaba 1.1 } else {
         ## "&#" followed by neither "x"/"X" nor a digit.
1872 wakaba 1.3 !!!parse-error (type => 'bare nero');
1873 wakaba 1.1 !!!back-next-input-character ($self->{next_input_character});
1874     $self->{next_input_character} = 0x0023; # #
1875     return undef;
1876     }
1877     } elsif ((0x0041 <= $self->{next_input_character} and
1878     $self->{next_input_character} <= 0x005A) or
1879     (0x0061 <= $self->{next_input_character} and
1880     $self->{next_input_character} <= 0x007A)) {
         ## Named reference.  |$entity_name| is the raw text consumed so
         ## far; |$value| is the replacement text of the longest name
         ## matched so far plus any characters read after that match.
         ## |$match| is 0 while nothing matched, 1 after a match that
         ## ended with ";", -1 after a match without ";", and is doubled
         ## for every non-matching character read afterwards (so a value
         ## below -1 means "matched without ';' and then kept reading").
1881     my $entity_name = chr $self->{next_input_character};
1882     !!!next-input-character;
1883    
1884     my $value = $entity_name;
1885 wakaba 1.37 my $match = 0;
1886 wakaba 1.16 require Whatpm::_NamedEntityList;
1887     our $EntityChar;
1888 wakaba 1.1
1889     while (length $entity_name < 10 and
1890     ## NOTE: Some number greater than the maximum length of entity name
1891 wakaba 1.16 ((0x0041 <= $self->{next_input_character} and # A
1892     $self->{next_input_character} <= 0x005A) or # Z
1893     (0x0061 <= $self->{next_input_character} and # a
1894     $self->{next_input_character} <= 0x007A) or # z
1895     (0x0030 <= $self->{next_input_character} and # 0
1896     $self->{next_input_character} <= 0x0039) or # 9
1897     $self->{next_input_character} == 0x003B)) { # ;
1898 wakaba 1.1 $entity_name .= chr $self->{next_input_character};
1899 wakaba 1.16 if (defined $EntityChar->{$entity_name}) {
1900     if ($self->{next_input_character} == 0x003B) { # ;
1901 wakaba 1.26 $value = $EntityChar->{$entity_name};
1902 wakaba 1.16 $match = 1;
1903     !!!next-input-character;
1904     last;
1905 wakaba 1.37 } else {
1906 wakaba 1.26 $value = $EntityChar->{$entity_name};
1907     $match = -1;
1908 wakaba 1.37 !!!next-input-character;
1909 wakaba 1.16 }
1910 wakaba 1.1 } else {
1911     $value .= chr $self->{next_input_character};
1912 wakaba 1.37 $match *= 2;
1913     !!!next-input-character;
1914 wakaba 1.1 }
1915     }
1916    
1917 wakaba 1.16 if ($match > 0) {
1918 wakaba 1.55 return {type => CHARACTER_TOKEN, data => $value};
1919 wakaba 1.16 } elsif ($match < 0) {
1920 wakaba 1.30 !!!parse-error (type => 'no refc');
1921 wakaba 1.37 if ($in_attr and $match < -1) {
1922 wakaba 1.55 return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
1923 wakaba 1.37 } else {
1924 wakaba 1.55 return {type => CHARACTER_TOKEN, data => $value};
1925 wakaba 1.37 }
1926 wakaba 1.1 } else {
1927 wakaba 1.3 !!!parse-error (type => 'bare ero');
1928 wakaba 1.1 ## NOTE: No characters are consumed in the spec.
1929 wakaba 1.55 return {type => CHARACTER_TOKEN, data => '&'.$value};
1930 wakaba 1.1 }
1931     } else {
1932     ## no characters are consumed
1933 wakaba 1.3 !!!parse-error (type => 'bare ero');
1934 wakaba 1.1 return undef;
1935     }
1936     } # _tokenize_attempt_to_consume_an_entity
1937    
## Prepare the target document for tree construction: relax strict
## error checking and flag the document as an HTML document.
##
## NOTE: $self->{document} MUST be set before this method is called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  $document->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $document->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
1946    
## Undo the relaxation applied when tree construction started:
## strict error checking on the document is switched back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  ## TODO: Turn mutation events on
  return $self->{document}->strict_error_checking (1);
} # _terminate_tree_constructor
1952    
1953     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1954    
1955 wakaba 1.3 { # tree construction stage
1956     my $token;
1957    
## Entry point of the tree construction stage: fetches the first token,
## resets the per-parse tree construction state and then runs the
## initial, root element and main phases in that order.
sub _construct_tree ($) {
  my $self = shift;

  ## The specification does not define when an interactive UA makes
  ## $self->{document} available to the user or starts accepting
  ## user input.

  ## "Append a character" means: gather the character together with all
  ## immediately following characters and insert a single Text node
  ## holding their concatenation. # MUST

  !!!next-token;

  ## Fresh state for this parse.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
1981    
## Tree construction, "initial" phase.
##
## Consumes tokens until the DOCTYPE (or the first token that proves
## there is none) has been handled:
##
##   - a DOCTYPE token creates and appends the DocumentType node and
##     selects the document's compatibility mode ("quirks",
##     "limited quirks", or left untouched);
##   - comments are appended to the document, leading white space is
##     dropped;
##   - any other token reports "no DOCTYPE", forces quirks mode and is
##     left in the enclosing lexical |$token| to be reprocessed by the
##     root element phase.
##
## Returns nothing.  Dies on an unknown token type.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are matched ASCII case-insensitively:
        ## the identifier is upper-cased here, so every literal it is
        ## compared against below MUST be written in upper case too.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        ## NOTE(review): "EXPERIMETNAL" below is the spelling used by
        ## the legacy quirks list itself -- do not "correct" it.
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## HTML 4.01 Transitional/Frameset: a missing system
          ## identifier means quirks mode, a present one means limited
          ## quirks mode.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## System identifiers are matched case-insensitively as well;
        ## this one is lower-cased to match the lower-case literal.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL
} # _tree_construction_initial
2149    
## Tree construction, "root element" phase.
##
## Skips DOCTYPE tokens (reported as errors), appends comments to the
## document and drops leading white space.  On the first other token it
## runs the application cache selection callback, creates the |html|
## root element, pushes it onto the stack of open elements and returns
## so that the main phase reprocesses the current token.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  ROOT: {
    my $type = $token->{type};

    if ($type == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## The token is ignored; remain in this phase.
      !!!next-token;
      redo ROOT;
    }

    if ($type == COMMENT_TOKEN) {
      $self->{document}->append_child
          ($self->{document}->create_comment ($token->{data}));
      ## Remain in this phase.
      !!!next-token;
      redo ROOT;
    }

    ## Manifest value handed to the cache selection callback; it stays
    ## undefined unless an |html| start tag carries a |manifest|
    ## attribute.
    my $manifest;

    if ($type == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Leading white space is dropped.
        unless (length $token->{data}) {
          ## Nothing left in the token: remain in this phase.
          !!!next-token;
          redo ROOT;
        }
      }
    } elsif ($type == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html' and
          $token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application"
        $manifest = $token->{attributes}->{manifest}->{value};
        ## ISSUE: No relative reference resolution?
      }
      ## ISSUE: There is an issue in the spec
    } elsif ($type == END_TAG_TOKEN or $type == END_OF_FILE_TOKEN) {
      ## ISSUE: There is an issue in the spec
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    $self->{application_cache_selection}->($manifest);

    my $root_element; !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    ## The current token is reprocessed by the main phase.
    return;
  } # ROOT
} # _tree_construction_root_element
2212    
## "Reset the insertion mode appropriately".
##
## Walks the stack of open elements ($self->{open_elements}, entries
## are [node, local name] pairs) from the most recently opened element
## downwards and sets $self->{insertion_mode} according to the first
## element name that determines a mode.  When the bottom of the stack
## is reached and $self->{inner_html_node} is set to something other
## than a |td|/|th| entry, that entry is examined instead.
##
## Returns nothing.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          #
        } else {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    my $new_mode = {
      select => IN_SELECT_IM,
      td => IN_CELL_IM,
      th => IN_CELL_IM,
      tr => IN_ROW_IM,
      tbody => IN_TABLE_BODY_IM,
      thead => IN_TABLE_BODY_IM,
      tfoot => IN_TABLE_BODY_IM,
      caption => IN_CAPTION_IM,
      colgroup => IN_COLUMN_GROUP_IM,
      table => IN_TABLE_IM,
      head => IN_BODY_IM, # not in head!
      body => IN_BODY_IM,
      frameset => IN_FRAMESET_IM,
    }->{$node->[1]};
    ## An explicit block is used instead of |ASSIGN and return|: that
    ## idiom would skip the |return| whenever the assigned mode
    ## constant happened to be false (e.g. 0).
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
2281    
2282     sub _tree_construction_main ($) {
2283     my $self = shift;
2284    
2285 wakaba 1.1 my $active_formatting_elements = [];
2286    
## Reconstruct the active formatting elements.
##
## Takes one argument, |$insert|: a code reference that is called with
## each freshly cloned element node so that the caller decides where the
## clone is attached in the tree.  Each clone is also pushed onto
## |$self->{open_elements}| and replaces the entry it was cloned from in
## |$active_formatting_elements|.
my $reconstruct_active_formatting_elements = sub { # MUST
  my ($insert) = @_;

  ## True if the given node is the node of some open element.
  my $is_open = sub {
    my $node = shift;
    for my $open (@{$self->{open_elements}}) {
      return 1 if $node eq $open->[0];
    }
    return 0;
  };

  ## Step 1: Nothing to do for an empty list.
  return unless @$active_formatting_elements;

  ## Step 3: Start from the last entry (negative index from the end).
  my $i = -1;
  my $entry = $active_formatting_elements->[$i];

  ## Step 2: Nothing to do if the last entry is a marker or is still open.
  return if $entry->[0] eq '#marker';
  return if $is_open->($entry->[0]);

  ## Steps 4-7: Walk backwards until the first entry, a marker or an
  ## entry that is still open is reached.
  REWIND: while (1) {
    ## Step 4: No entries before this one; go straight to step 8.
    last REWIND
        if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5
    $i--;
    $entry = $active_formatting_elements->[$i];

    ## Step 6: Keep rewinding while the entry is neither a marker nor open.
    next REWIND
        unless $entry->[0] eq '#marker' or $is_open->($entry->[0]);

    ## Step 7: Advance to the first entry that has to be re-created.
    $i++;
    $entry = $active_formatting_elements->[$i];
    last REWIND;
  } # REWIND

  ## Steps 8-11: Clone every entry from |$i| up to the end of the list.
  CREATE: while (1) {
    ## Step 8: Shallow clone of the element, same tag name.
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10: The clone (now the current node) replaces the entry.
    $active_formatting_elements->[$i] = $clone;

    ## Step 11: Done once the last entry of the list has been replaced.
    last CREATE if $clone->[0] eq $active_formatting_elements->[-1]->[0];

    ## Step 7'
    $i++;
    $entry = $active_formatting_elements->[$i];
  } # CREATE
}; # $reconstruct_active_formatting_elements
2357    
## Truncate the list of active formatting elements at its last marker:
## the marker itself and every entry after it are removed.  The list is
## left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  my $idx = $#$active_formatting_elements;
  while ($idx >= 0) {
    if ($active_formatting_elements->[$idx]->[0] eq '#marker') {
      splice @$active_formatting_elements, $idx;
      return;
    }
    $idx--;
  }
}; # $clear_up_to_marker
2366    
## Generic CDATA/RCDATA element parsing for the current start tag token.
##
## Arguments:
##   $content_model_flag - CDATA_CONTENT_MODEL or RCDATA_CONTENT_MODEL;
##                         any other value dies at step 7 unless the
##                         matching end tag was seen.
##   $insert             - code reference called with the new element.
##
## Consumes tokens up to and including the token that follows the
## element's character data.
my $parse_rcdata = sub ($$) {
  my $content_model_flag = shift;
  my $insert = shift;

  ## Step 1: Create the element for the start tag being processed.
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $start_tag_name, $token->{attributes});

  ## Step 2: Let the caller attach it (/context node/->append_child ($el)).
  $insert->($el);

  ## Step 3: Switch the tokenizer's content model (CDATA or RCDATA).
  $self->{content_model} = $content_model_flag;
  delete $self->{escape}; # MUST

  ## Step 4: Collect the data of all consecutive character tokens.
  my @chunks;
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $data = join '', @chunks;

  ## Step 5: Only create a text node when there is some text.
  if (length $data) {
    my $text_node = $self->{document}->create_text_node ($data);
    $el->append_child ($text_node);
  }

  ## Step 6: Back to the ordinary content model.
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7: Anything other than the matching end tag is a parse error.
  unless ($token->{type} == END_TAG_TOKEN and
          $token->{tag_name} eq $start_tag_name) {
    if ($content_model_flag == CDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in CDATA:#'.$token->{type});
    } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in RCDATA:#'.$token->{type});
    } else {
      die "$0: $content_model_flag in parse_rcdata";
    }
  }
  ## The token (matching end tag or not) is consumed either way.
  !!!next-token;
}; # $parse_rcdata
2411 wakaba 1.1
## Process a |script| start tag token.
##
## Takes one argument, |$insert|: a code reference called with the
## |script| element once its text content has been collected.  It is not
## called when |$self->{inner_html_node}| is defined.
##
## Consumes tokens up to and including the token that follows the
## script's character data.
my $script_start_tag = sub ($) {
  my ($insert) = @_;

  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  ## Script content is tokenized as CDATA.
  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Gather character data until a non-character token is seen (or the
  ## tokenizer stops tokenising).
  my @chunks;
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) {
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $script_text = join '', @chunks;
  $script_el->manakai_append_text ($script_text) if length $script_text;

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## The matching end tag is silently ignored; anything else is an error.
  unless ($token->{type} == END_TAG_TOKEN and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
2457    
## Process an end tag whose tag name is that of a formatting element.
## The step numbering below appears to follow HTML5's "adoption agency"
## steps for such end tags.
##
## Takes one argument, |$tag_name|: the tag name of the end tag.
##
## The closure reads and rewrites |$active_formatting_elements| and
## |$self->{open_elements}|, moves nodes in the tree, and consumes the
## current token before returning.  After a complete pass (step 14) the
## whole procedure is run again from step 1.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the nearest entry with this tag name, searching backwards and
    ## giving up at the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        ## A scoping element lies between the current node and the
        ## formatting element (if it is open at all).
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Remove exactly the matched entry.  It is the nearest entry with
      ## this tag name, which is not necessarily the last entry of the
      ## list, so |pop| could remove an unrelated entry instead.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## Scanning from the current node towards the formatting element and
    ## overwriting on each hit leaves the qualifying node that is closest
    ## to the formatting element in the stack.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    ## No furthest block: pop the stack up to and including the
    ## formatting element and drop its entry from the list.
    unless (defined $furthest_block) { # MUST
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is kept as "the node of the entry that precedes the
    ## insertion position".
    ## NOTE(review): when |$formatting_element_i_in_active| is 0 the index
    ## below is -1, which in Perl addresses the last entry -- confirm that
    ## this is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## Nodes without an entry in the list of active formatting elements
      ## are removed from the stack and skipped.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      ## A node that already has children is replaced by a shallow clone
      ## in both the list and the stack.
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first, since it is being emptied while the
    ## children are moved into the clone.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element's entry and insert the clone right
    ## after the bookmark entry.  |$i| is decremented when the removal
    ## happens below the bookmark, because the bookmark shifts down by one.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Same bookkeeping on the stack of open elements, relative to the
    ## furthest block.
    ## NOTE(review): the splice length is 1 here (replace) whereas step 12
    ## uses 0 (insert); an entry directly after the furthest block, if
    ## any, is therefore dropped from the stack -- confirm intended.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    splice @{$self->{open_elements}}, $i + 1, 1, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2640    
## Insertion callback: append the given node to the current node, i.e.
## the node of the last entry of |$self->{open_elements}|.
my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  return $current_node->append_child ($_[0]);
}; # $insert_to_current
2644    
## Insertion callback with foster parenting.
##
## When the current node is a |table|, |tbody|, |tfoot|, |thead| or |tr|
## element, the child is not appended to it.  Instead it is inserted
## into the parent of the last open |table| element, just before that
## table.  If that table has no element parent, the child is appended to
## the element preceding the table in the stack of open elements.  If no
## |table| is open at all, it is appended to the first open element.
## In every other case the child is appended to the current node.
my $insert_to_foster = sub {
  my ($child) = @_;
  my $open = $self->{open_elements};
  my $current = $open->[-1];

  my %table_context = map { $_ => 1 } qw(table tbody tfoot thead tr);
  return $current->[0]->append_child ($child)
      unless $table_context{$current->[1]};

  # MUST
  my ($foster_parent_element, $next_sibling);
  for my $entry_i (reverse 0..$#$open) {
    my $entry = $open->[$entry_i];
    next unless $entry->[1] eq 'table';

    my $parent = $entry->[0]->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      ## The table is attached to an element: insert right before it.
      ($foster_parent_element, $next_sibling) = ($parent, $entry->[0]);
    } else {
      ## Otherwise use the element just above the table in the stack;
      ## |$next_sibling| stays undefined.
      $foster_parent_element = $open->[$entry_i - 1]->[0];
    }
    last;
  }
  $foster_parent_element = $open->[0]->[0]
      unless defined $foster_parent_element;
  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2675    
2676 wakaba 1.52 my $insert;
2677 wakaba 1.34
2678 wakaba 1.52 B: {
2679 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
2680 wakaba 1.52 !!!parse-error (type => 'DOCTYPE in the middle');
2681     ## Ignore the token
2682     ## Stay in the phase
2683     !!!next-token;
2684     redo B;
2685 wakaba 1.55 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
2686 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2687 wakaba 1.52 #
2688     } else {
2689     ## Generate implied end tags
2690     if ({
2691     dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2692     tbody => 1, tfoot=> 1, thead => 1,
2693     }->{$self->{open_elements}->[-1]->[1]}) {
2694     !!!back-token;
2695 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $self->{open_elements}->[-1]->[1]};
2696 wakaba 1.52 redo B;
2697     }
2698    
2699     if (@{$self->{open_elements}} > 2 or
2700     (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
2701     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2702     } elsif (defined $self->{inner_html_node} and
2703     @{$self->{open_elements}} > 1 and
2704     $self->{open_elements}->[1]->[1] ne 'body') {
2705     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2706 wakaba 1.34 }
2707    
2708 wakaba 1.52 ## ISSUE: There is an issue in the spec.
2709     }
2710    
2711     ## Stop parsing
2712     last B;
2713 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN and
2714 wakaba 1.52 $token->{tag_name} eq 'html') {
2715 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
2716 wakaba 1.52 ## Turn into the main phase
2717     !!!parse-error (type => 'after html:html');
2718 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
2719     } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
2720 wakaba 1.52 ## Turn into the main phase
2721     !!!parse-error (type => 'after html:html');
2722 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
2723 wakaba 1.52 }
2724    
2725     ## ISSUE: "aa<html>" is not a parse error.
2726     ## ISSUE: "<html>" in fragment is not a parse error.
2727     unless ($token->{first_start_tag}) {
2728     !!!parse-error (type => 'not first start tag');
2729     }
2730     my $top_el = $self->{open_elements}->[0]->[0];
2731     for my $attr_name (keys %{$token->{attributes}}) {
2732     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2733     $top_el->set_attribute_ns
2734     (undef, [undef, $attr_name],
2735     $token->{attributes}->{$attr_name}->{value});
2736     }
2737     }
2738     !!!next-token;
2739     redo B;
2740 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
2741 wakaba 1.52 my $comment = $self->{document}->create_comment ($token->{data});
2742 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2743 wakaba 1.52 $self->{document}->append_child ($comment);
2744 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
2745 wakaba 1.52 $self->{open_elements}->[0]->[0]->append_child ($comment);
2746     } else {
2747     $self->{open_elements}->[-1]->[0]->append_child ($comment);
2748     }
2749     !!!next-token;
2750     redo B;
2751 wakaba 1.56 } elsif ($self->{insertion_mode} & HEAD_IMS) {
2752 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
2753 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2754     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
2755     unless (length $token->{data}) {
2756     !!!next-token;
2757     redo B;
2758 wakaba 1.1 }
2759     }
2760 wakaba 1.52
2761 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2762 wakaba 1.52 ## As if <head>
2763     !!!create-element ($self->{head_element}, 'head');
2764     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2765     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2766    
2767     ## Reprocess in the "in head" insertion mode...
2768     pop @{$self->{open_elements}};
2769    
2770     ## Reprocess in the "after head" insertion mode...
2771 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2772 wakaba 1.52 ## As if </noscript>
2773     pop @{$self->{open_elements}};
2774     !!!parse-error (type => 'in noscript:#character');
2775 wakaba 1.1
2776 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
2777     ## As if </head>
2778     pop @{$self->{open_elements}};
2779    
2780     ## Reprocess in the "after head" insertion mode...
2781 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2782 wakaba 1.52 pop @{$self->{open_elements}};
2783    
2784     ## Reprocess in the "after head" insertion mode...
2785 wakaba 1.1 }
2786 wakaba 1.52
2787     ## "after head" insertion mode
2788     ## As if <body>
2789     !!!insert-element ('body');
2790 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
2791 wakaba 1.52 ## reprocess
2792     redo B;
2793 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
2794 wakaba 1.52 if ($token->{tag_name} eq 'head') {
2795 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2796 wakaba 1.52 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
2797     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2798     push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
2799 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2800 wakaba 1.52 !!!next-token;
2801     redo B;
2802 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2803     #
2804     } else {
2805 wakaba 1.52 !!!parse-error (type => 'in head:head'); # or in head noscript
2806     ## Ignore the token
2807     !!!next-token;
2808     redo B;
2809     }
2810 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2811 wakaba 1.52 ## As if <head>
2812     !!!create-element ($self->{head_element}, 'head');
2813     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2814     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2815    
2816 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2817 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
2818 wakaba 1.1 }
2819 wakaba 1.52
2820 wakaba 1.49 if ($token->{tag_name} eq 'base') {
2821 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2822 wakaba 1.49 ## As if </noscript>
2823     pop @{$self->{open_elements}};
2824     !!!parse-error (type => 'in noscript:base');
2825    
2826 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2827 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2828     }
2829    
2830     ## NOTE: There is a "as if in head" code clone.
2831 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2832 wakaba 1.49 !!!parse-error (type => 'after head:'.$token->{tag_name});
2833     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2834     }
2835     !!!insert-element ($token->{tag_name}, $token->{attributes});
2836     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2837     pop @{$self->{open_elements}}
2838 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2839 wakaba 1.49 !!!next-token;
2840     redo B;
2841     } elsif ($token->{tag_name} eq 'link') {
2842 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2843 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2844 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2845     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2846     }
2847     !!!insert-element ($token->{tag_name}, $token->{attributes});
2848     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2849     pop @{$self->{open_elements}}
2850 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2851 wakaba 1.1 !!!next-token;
2852 wakaba 1.25 redo B;
2853 wakaba 1.34 } elsif ($token->{tag_name} eq 'meta') {
2854     ## NOTE: There is a "as if in head" code clone.
2855 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2856 wakaba 1.34 !!!parse-error (type => 'after head:'.$token->{tag_name});
2857     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2858     }
2859     !!!insert-element ($token->{tag_name}, $token->{attributes});
2860     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2861    
2862     unless ($self->{confident}) {
2863     if ($token->{attributes}->{charset}) { ## TODO: And if supported
2864 wakaba 1.63 $self->{change_encoding}
2865     ->($self, $token->{attributes}->{charset}->{value});
2866     } elsif ($token->{attributes}->{content}) {
2867 wakaba 1.35 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2868 wakaba 1.63 if ($token->{attributes}->{content}->{value}
2869 wakaba 1.34 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2870     [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2871     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2872 wakaba 1.63 $self->{change_encoding}
2873     ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
2874     }
2875 wakaba 1.34 }
2876     }
2877    
2878     pop @{$self->{open_elements}}
2879 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2880 wakaba 1.34 !!!next-token;
2881     redo B;
2882 wakaba 1.49 } elsif ($token->{tag_name} eq 'title') {
2883 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2884 wakaba 1.49 ## As if </noscript>
2885     pop @{$self->{open_elements}};
2886     !!!parse-error (type => 'in noscript:title');
2887    
2888 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2889 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2890 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2891 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2892     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2893     }
2894 wakaba 1.49
2895     ## NOTE: There is a "as if in head" code clone.
2896 wakaba 1.31 my $parent = defined $self->{head_element} ? $self->{head_element}
2897     : $self->{open_elements}->[-1]->[0];
2898 wakaba 1.40 $parse_rcdata->(RCDATA_CONTENT_MODEL,
2899     sub { $parent->append_child ($_[0]) });
2900 wakaba 1.25 pop @{$self->{open_elements}}
2901 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2902 wakaba 1.25 redo B;
2903     } elsif ($token->{tag_name} eq 'style') {
2904     ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
2905 wakaba 1.54 ## insertion mode IN_HEAD_IM)
2906 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2907 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2908 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2909     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2910     }
2911 wakaba 1.40 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
2912 wakaba 1.25 pop @{$self->{open_elements}}
2913 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2914 wakaba 1.25 redo B;
2915     } elsif ($token->{tag_name} eq 'noscript') {
2916 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_IM) {
2917 wakaba 1.25 ## NOTE: and scripting is disabled
2918     !!!insert-element ($token->{tag_name}, $token->{attributes});
2919 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
2920 wakaba 1.1 !!!next-token;
2921 wakaba 1.25 redo B;
2922 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2923 wakaba 1.30 !!!parse-error (type => 'in noscript:noscript');
2924 wakaba 1.1 ## Ignore the token
2925 wakaba 1.41 !!!next-token;
2926 wakaba 1.25 redo B;
2927 wakaba 1.1 } else {
2928 wakaba 1.25 #
2929 wakaba 1.1 }
2930 wakaba 1.49 } elsif ($token->{tag_name} eq 'script') {
2931 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2932 wakaba 1.49 ## As if </noscript>
2933     pop @{$self->{open_elements}};
2934     !!!parse-error (type => 'in noscript:script');
2935    
2936 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2937 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2938 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2939 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2940     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2941     }
2942 wakaba 1.49
2943 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2944     $script_start_tag->($insert_to_current);
2945     pop @{$self->{open_elements}}
2946 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2947 wakaba 1.1 redo B;
2948 wakaba 1.49 } elsif ($token->{tag_name} eq 'body' or
2949 wakaba 1.25 $token->{tag_name} eq 'frameset') {
2950 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2951 wakaba 1.49 ## As if </noscript>
2952     pop @{$self->{open_elements}};
2953     !!!parse-error (type => 'in noscript:'.$token->{tag_name});
2954    
2955     ## Reprocess in the "in head" insertion mode...
2956     ## As if </head>
2957     pop @{$self->{open_elements}};
2958    
2959     ## Reprocess in the "after head" insertion mode...
2960 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2961 wakaba 1.49 pop @{$self->{open_elements}};
2962    
2963     ## Reprocess in the "after head" insertion mode...
2964     }
2965    
2966     ## "after head" insertion mode
2967     !!!insert-element ($token->{tag_name}, $token->{attributes});
2968 wakaba 1.54 if ($token->{tag_name} eq 'body') {
2969     $self->{insertion_mode} = IN_BODY_IM;
2970     } elsif ($token->{tag_name} eq 'frameset') {
2971     $self->{insertion_mode} = IN_FRAMESET_IM;
2972     } else {
2973     die "$0: tag name: $self->{tag_name}";
2974     }
2975 wakaba 1.1 !!!next-token;
2976     redo B;
2977     } else {
2978     #
2979     }
2980 wakaba 1.49
2981 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2982 wakaba 1.49 ## As if </noscript>
2983     pop @{$self->{open_elements}};
2984     !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
2985    
2986     ## Reprocess in the "in head" insertion mode...
2987     ## As if </head>
2988 wakaba 1.25 pop @{$self->{open_elements}};
2989 wakaba 1.49
2990     ## Reprocess in the "after head" insertion mode...
2991 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2992 wakaba 1.49 ## As if </head>
2993 wakaba 1.25 pop @{$self->{open_elements}};
2994 wakaba 1.49
2995     ## Reprocess in the "after head" insertion mode...
2996     }
2997    
2998     ## "after head" insertion mode
2999     ## As if <body>
3000     !!!insert-element ('body');
3001 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
3002 wakaba 1.49 ## reprocess
3003     redo B;
3004 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
3005 wakaba 1.49 if ($token->{tag_name} eq 'head') {
3006 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3007 wakaba 1.50 ## As if <head>
3008     !!!create-element ($self->{head_element}, 'head');
3009     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3010     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3011    
3012     ## Reprocess in the "in head" insertion mode...
3013     pop @{$self->{open_elements}};
3014 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3015 wakaba 1.50 !!!next-token;
3016     redo B;
3017 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3018 wakaba 1.49 ## As if </noscript>
3019     pop @{$self->{open_elements}};
3020     !!!parse-error (type => 'in noscript:script');
3021    
3022     ## Reprocess in the "in head" insertion mode...
3023 wakaba 1.50 pop @{$self->{open_elements}};
3024 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3025 wakaba 1.50 !!!next-token;
3026     redo B;
3027 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3028 wakaba 1.49 pop @{$self->{open_elements}};
3029 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3030 wakaba 1.49 !!!next-token;
3031     redo B;
3032     } else {
3033     #
3034     }
3035     } elsif ($token->{tag_name} eq 'noscript') {
3036 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3037 wakaba 1.49 pop @{$self->{open_elements}};
3038 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3039 wakaba 1.49 !!!next-token;
3040     redo B;
3041 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3042 wakaba 1.50 !!!parse-error (type => 'unmatched end tag:noscript');
3043     ## Ignore the token ## ISSUE: An issue in the spec.
3044     !!!next-token;
3045     redo B;
3046 wakaba 1.49 } else {
3047     #
3048     }
3049     } elsif ({
3050 wakaba 1.31 body => 1, html => 1,
3051     }->{$token->{tag_name}}) {
3052 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3053 wakaba 1.50 ## As if <head>
3054     !!!create-element ($self->{head_element}, 'head');
3055     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3056     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3057    
3058 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3059 wakaba 1.50 ## Reprocess in the "in head" insertion mode...
3060 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3061 wakaba 1.49 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3062     ## Ignore the token
3063     !!!next-token;
3064     redo B;
3065     }
3066 wakaba 1.50
3067     #
3068 wakaba 1.49 } elsif ({
3069 wakaba 1.31 p => 1, br => 1,
3070     }->{$token->{tag_name}}) {
3071 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3072 wakaba 1.50 ## As if <head>
3073     !!!create-element ($self->{head_element}, 'head');
3074     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3075     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3076    
3077 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3078 wakaba 1.50 ## Reprocess in the "in head" insertion mode...
3079     }
3080    
3081 wakaba 1.1 #
3082 wakaba 1.25 } else {
3083 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3084     #
3085     } else {
3086 wakaba 1.49 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3087     ## Ignore the token
3088     !!!next-token;
3089     redo B;
3090     }
3091     }
3092    
3093 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3094 wakaba 1.49 ## As if </noscript>
3095     pop @{$self->{open_elements}};
3096     !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3097    
3098     ## Reprocess in the "in head" insertion mode...
3099     ## As if </head>
3100     pop @{$self->{open_elements}};
3101    
3102     ## Reprocess in the "after head" insertion mode...
3103 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3104 wakaba 1.49 ## As if </head>
3105     pop @{$self->{open_elements}};
3106    
3107     ## Reprocess in the "after head" insertion mode...
3108 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3109 wakaba 1.50 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3110     ## Ignore the token ## ISSUE: An issue in the spec.
3111     !!!next-token;
3112     redo B;
3113 wakaba 1.1 }
3114    
3115 wakaba 1.49 ## "after head" insertion mode
3116     ## As if <body>
3117 wakaba 1.52 !!!insert-element ('body');
3118 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
3119 wakaba 1.52 ## reprocess
3120     redo B;
3121     } else {
3122     die "$0: $token->{type}: Unknown token type";
3123     }
3124    
3125     ## ISSUE: An issue in the spec.
3126 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_IMS) {
3127 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
3128 wakaba 1.52 ## NOTE: There is a code clone of "character in body".
3129     $reconstruct_active_formatting_elements->($insert_to_current);
3130    
3131     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3132    
3133     !!!next-token;
3134     redo B;
3135 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
3136 wakaba 1.52 if ({
3137     caption => 1, col => 1, colgroup => 1, tbody => 1,
3138     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3139     }->{$token->{tag_name}}) {
3140 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
3141 wakaba 1.52 ## have an element in table scope
3142     my $tn;
3143     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3144     my $node = $self->{open_elements}->[$_];
3145     if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3146     $tn = $node->[1];
3147     last INSCOPE;
3148     } elsif ({
3149     table => 1, html => 1,
3150     }->{$node->[1]}) {
3151     last INSCOPE;
3152     }
3153     } # INSCOPE
3154     unless (defined $tn) {
3155     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3156     ## Ignore the token
3157     !!!next-token;
3158     redo B;
3159     }
3160    
3161     ## Close the cell
3162     !!!back-token; # <?>
3163 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3164 wakaba 1.52 redo B;
3165 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3166 wakaba 1.52 !!!parse-error (type => 'not closed:caption');
3167    
3168     ## As if </caption>
3169     ## have a table element in table scope
3170     my $i;
3171     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3172     my $node = $self->{open_elements}->[$_];
3173     if ($node->[1] eq 'caption') {
3174     $i = $_;
3175     last INSCOPE;
3176     } elsif ({
3177     table => 1, html => 1,
3178     }->{$node->[1]}) {
3179     last INSCOPE;
3180     }
3181     } # INSCOPE
3182     unless (defined $i) {
3183     !!!parse-error (type => 'unmatched end tag:caption');
3184     ## Ignore the token
3185     !!!next-token;
3186     redo B;
3187     }
3188    
3189     ## generate implied end tags
3190     if ({
3191     dd => 1, dt => 1, li => 1, p => 1,
3192     td => 1, th => 1, tr => 1,
3193     tbody => 1, tfoot=> 1, thead => 1,
3194     }->{$self->{open_elements}->[-1]->[1]}) {
3195     !!!back-token; # <?>
3196 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3197 wakaba 1.52 !!!back-token;
3198 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3199 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3200     redo B;
3201     }
3202    
3203     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3204     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3205     }
3206    
3207     splice @{$self->{open_elements}}, $i;
3208    
3209     $clear_up_to_marker->();
3210    
3211 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3212 wakaba 1.52
3213     ## reprocess
3214     redo B;
3215     } else {
3216     #
3217     }
3218     } else {
3219     #
3220     }
3221 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
3222 wakaba 1.52 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3223 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
3224 wakaba 1.43 ## have an element in table scope
3225 wakaba 1.52 my $i;
3226 wakaba 1.43 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3227     my $node = $self->{open_elements}->[$_];
3228 wakaba 1.52 if ($node->[1] eq $token->{tag_name}) {
3229     $i = $_;
3230 wakaba 1.43 last INSCOPE;
3231     } elsif ({
3232     table => 1, html => 1,
3233     }->{$node->[1]}) {
3234     last INSCOPE;
3235     }
3236     } # INSCOPE
3237 wakaba 1.52 unless (defined $i) {
3238 wakaba 1.43 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3239     ## Ignore the token
3240     !!!next-token;
3241     redo B;
3242     }
3243    
3244 wakaba 1.52 ## generate implied end tags
3245     if ({
3246     dd => 1, dt => 1, li => 1, p => 1,
3247     td => ($token->{tag_name} eq 'th'),
3248     th => ($token->{tag_name} eq 'td'),
3249     tr => 1,
3250     tbody => 1, tfoot=> 1, thead => 1,
3251     }->{$self->{open_elements}->[-1]->[1]}) {
3252     !!!back-token;
3253 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3254 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3255     redo B;
3256     }
3257    
3258     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3259     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3260     }
3261    
3262     splice @{$self->{open_elements}}, $i;
3263    
3264     $clear_up_to_marker->();
3265    
3266 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
3267 wakaba 1.52
3268     !!!next-token;
3269 wakaba 1.43 redo B;
3270 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3271 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3272     ## Ignore the token
3273     !!!next-token;
3274     redo B;
3275     } else {
3276     #
3277     }
3278     } elsif ($token->{tag_name} eq 'caption') {
3279 wakaba 1.54 if ($self->{insertion_mode} == IN_CAPTION_IM) {
3280 wakaba 1.43 ## have a table element in table scope
3281     my $i;
3282     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3283     my $node = $self->{open_elements}->[$_];
3284 wakaba 1.52 if ($node->[1] eq $token->{tag_name}) {
3285 wakaba 1.43 $i = $_;
3286     last INSCOPE;
3287     } elsif ({
3288     table => 1, html => 1,
3289     }->{$node->[1]}) {
3290     last INSCOPE;
3291     }
3292     } # INSCOPE
3293     unless (defined $i) {
3294 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3295 wakaba 1.43 ## Ignore the token
3296     !!!next-token;
3297     redo B;
3298     }
3299    
3300     ## generate implied end tags
3301     if ({
3302     dd => 1, dt => 1, li => 1, p => 1,
3303     td => 1, th => 1, tr => 1,
3304     tbody => 1, tfoot=> 1, thead => 1,
3305     }->{$self->{open_elements}->[-1]->[1]}) {
3306     !!!back-token;
3307 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3308 wakaba 1.43 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3309     redo B;
3310     }
3311 wakaba 1.52
3312     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3313     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3314     }
3315    
3316     splice @{$self->{open_elements}}, $i;
3317    
3318     $clear_up_to_marker->();
3319    
3320 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3321 wakaba 1.52
3322     !!!next-token;
3323     redo B;
3324 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
3325 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3326     ## Ignore the token
3327     !!!next-token;
3328     redo B;
3329     } else {
3330     #
3331     }
3332     } elsif ({
3333     table => 1, tbody => 1, tfoot => 1,
3334     thead => 1, tr => 1,
3335     }->{$token->{tag_name}} and
3336 wakaba 1.54 $self->{insertion_mode} == IN_CELL_IM) {
3337 wakaba 1.52 ## have an element in table scope
3338     my $i;
3339     my $tn;
3340     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3341     my $node = $self->{open_elements}->[$_];
3342     if ($node->[1] eq $token->{tag_name}) {
3343     $i = $_;
3344     last INSCOPE;
3345     } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3346     $tn = $node->[1];
3347     ## NOTE: There is exactly one |td| or |th| element
3348     ## in scope in the stack of open elements by definition.
3349     } elsif ({
3350     table => 1, html => 1,
3351     }->{$node->[1]}) {
3352     last INSCOPE;
3353     }
3354     } # INSCOPE
3355     unless (defined $i) {
3356     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3357     ## Ignore the token
3358     !!!next-token;
3359     redo B;
3360     }
3361    
3362     ## Close the cell
3363     !!!back-token; # </?>
3364 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3365 wakaba 1.52 redo B;
3366     } elsif ($token->{tag_name} eq 'table' and
3367 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
3368 wakaba 1.52 !!!parse-error (type => 'not closed:caption');
3369    
3370     ## As if </caption>
3371     ## have a table element in table scope
3372     my $i;
3373     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3374     my $node = $self->{open_elements}->[$_];
3375     if ($node->[1] eq 'caption') {
3376     $i = $_;
3377     last INSCOPE;
3378     } elsif ({
3379     table => 1, html => 1,
3380     }->{$node->[1]}) {
3381     last INSCOPE;
3382     }
3383     } # INSCOPE
3384     unless (defined $i) {
3385     !!!parse-error (type => 'unmatched end tag:caption');
3386     ## Ignore the token
3387     !!!next-token;
3388     redo B;
3389     }
3390    
3391     ## generate implied end tags
3392     if ({
3393     dd => 1, dt => 1, li => 1, p => 1,
3394     td => 1, th => 1, tr => 1,
3395     tbody => 1, tfoot=> 1, thead => 1,
3396     }->{$self->{open_elements}->[-1]->[1]}) {
3397     !!!back-token; # </table>
3398 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3399 wakaba 1.52 !!!back-token;
3400 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3401 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3402     redo B;
3403     }
3404    
3405     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3406     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3407     }
3408    
3409     splice @{$self->{open_elements}}, $i;
3410    
3411     $clear_up_to_marker->();
3412    
3413 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3414 wakaba 1.52
3415     ## reprocess
3416     redo B;
3417     } elsif ({
3418     body => 1, col => 1, colgroup => 1, html => 1,
3419     }->{$token->{tag_name}}) {
3420 wakaba 1.56 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
3421 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3422     ## Ignore the token
3423     !!!next-token;
3424     redo B;
3425     } else {
3426     #
3427     }
3428     } elsif ({
3429     tbody => 1, tfoot => 1,
3430     thead => 1, tr => 1,
3431     }->{$token->{tag_name}} and
3432 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
3433 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3434     ## Ignore the token
3435     !!!next-token;
3436     redo B;
3437     } else {
3438     #
3439     }
3440     } else {
3441     die "$0: $token->{type}: Unknown token type";
3442     }
3443    
3444     $insert = $insert_to_current;
3445     #
3446 wakaba 1.56 } elsif ($self->{insertion_mode} & TABLE_IMS) {
3447 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
3448 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3449     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3450    
3451     unless (length $token->{data}) {
3452     !!!next-token;
3453     redo B;
3454     }
3455     }
3456    
3457     !!!parse-error (type => 'in table:#character');
3458    
3459     ## As if in body, but insert into foster parent element
3460     ## ISSUE: Spec says that "whenever a node would be inserted
3461     ## into the current node" while characters might not
3462     ## result in a new Text node.
3463     $reconstruct_active_formatting_elements->($insert_to_foster);
3464    
3465     if ({
3466     table => 1, tbody => 1, tfoot => 1,
3467     thead => 1, tr => 1,
3468     }->{$self->{open_elements}->[-1]->[1]}) {
3469     # MUST
3470     my $foster_parent_element;
3471     my $next_sibling;
3472     my $prev_sibling;
3473     OE: for (reverse 0..$#{$self->{open_elements}}) {
3474     if ($self->{open_elements}->[$_]->[1] eq 'table') {
3475     my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3476     if (defined $parent and $parent->node_type == 1) {
3477     $foster_parent_element = $parent;
3478     $next_sibling = $self->{open_elements}->[$_]->[0];
3479     $prev_sibling = $next_sibling->previous_sibling;
3480     } else {
3481     $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3482     $prev_sibling = $foster_parent_element->last_child;
3483     }
3484     last OE;
3485     }
3486     } # OE
3487     $foster_parent_element = $self->{open_elements}->[0]->[0] and
3488     $prev_sibling = $foster_parent_element->last_child
3489     unless defined $foster_parent_element;
3490     if (defined $prev_sibling and
3491     $prev_sibling->node_type == 3) {
3492     $prev_sibling->manakai_append_text ($token->{data});
3493     } else {
3494     $foster_parent_element->insert_before
3495     ($self->{document}->create_text_node ($token->{data}),
3496     $next_sibling);
3497     }
3498     } else {
3499     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3500     }
3501    
3502     !!!next-token;
3503     redo B;
3504 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
3505 wakaba 1.52 if ({
3506 wakaba 1.54 tr => ($self->{insertion_mode} != IN_ROW_IM),
3507 wakaba 1.52 th => 1, td => 1,
3508     }->{$token->{tag_name}}) {
3509 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_IM) {
3510 wakaba 1.52 ## Clear back to table context
3511     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3512     $self->{open_elements}->[-1]->[1] ne 'html') {
3513 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3514 wakaba 1.52 pop @{$self->{open_elements}};
3515 wakaba 1.43 }
3516    
3517 wakaba 1.52 !!!insert-element ('tbody');
3518 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3519 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
3520     }
3521    
3522 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3523 wakaba 1.52 unless ($token->{tag_name} eq 'tr') {
3524     !!!parse-error (type => 'missing start tag:tr');
3525     }
3526 wakaba 1.43
3527 wakaba 1.52 ## Clear back to table body context
3528     while (not {
3529     tbody => 1, tfoot => 1, thead => 1, html => 1,
3530     }->{$self->{open_elements}->[-1]->[1]}) {
3531     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3532     pop @{$self->{open_elements}};
3533     }
3534 wakaba 1.43
3535 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
3536 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
3537     !!!insert-element ($token->{tag_name}, $token->{attributes});
3538     !!!next-token;
3539     redo B;
3540     } else {
3541     !!!insert-element ('tr');
3542     ## reprocess in the "in row" insertion mode
3543     }
3544     }
3545    
3546     ## Clear back to table row context
3547     while (not {
3548     tr => 1, html => 1,
3549     }->{$self->{open_elements}->[-1]->[1]}) {
3550     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3551     pop @{$self->{open_elements}};
3552 wakaba 1.43 }
3553 wakaba 1.52
3554     !!!insert-element ($token->{tag_name}, $token->{attributes});
3555 wakaba 1.54 $self->{insertion_mode} = IN_CELL_IM;
3556 wakaba 1.52
3557     push @$active_formatting_elements, ['#marker', ''];
3558    
3559     !!!next-token;
3560     redo B;
3561     } elsif ({
3562     caption => 1, col => 1, colgroup => 1,
3563     tbody => 1, tfoot => 1, thead => 1,
3564 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3565 wakaba 1.52 }->{$token->{tag_name}}) {
3566 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3567 wakaba 1.52 ## As if </tr>
3568 wakaba 1.43 ## have an element in table scope
3569     my $i;
3570     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3571     my $node = $self->{open_elements}->[$_];
3572 wakaba 1.52 if ($node->[1] eq 'tr') {
3573 wakaba 1.43 $i = $_;
3574     last INSCOPE;
3575     } elsif ({
3576     table => 1, html => 1,
3577     }->{$node->[1]}) {
3578     last INSCOPE;
3579     }
3580     } # INSCOPE
3581 wakaba 1.52 unless (defined $i) {
3582     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3583     ## No |tr| element was found in table scope by the INSCOPE loop above: report the error and ignore the token
3584     !!!next-token;
3585 wakaba 1.43 redo B;
3586     }
3587    
3588 wakaba 1.52 ## Clear back to table row context
3589     while (not {
3590     tr => 1, html => 1,
3591     }->{$self->{open_elements}->[-1]->[1]}) {
3592 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3593 wakaba 1.52 pop @{$self->{open_elements}};
3594 wakaba 1.1 }
3595 wakaba 1.43
3596 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3597 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3598 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
3599     ## reprocess
3600     redo B;
3601     } else {
3602     ## reprocess in the "in table body" insertion mode...
3603     }
3604 wakaba 1.1 }
3605 wakaba 1.52
3606 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3607 wakaba 1.52 ## have an element in table scope
3608 wakaba 1.43 my $i;
3609     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3610     my $node = $self->{open_elements}->[$_];
3611 wakaba 1.52 if ({
3612     tbody => 1, thead => 1, tfoot => 1,
3613     }->{$node->[1]}) {
3614 wakaba 1.43 $i = $_;
3615     last INSCOPE;
3616     } elsif ({
3617     table => 1, html => 1,
3618     }->{$node->[1]}) {
3619     last INSCOPE;
3620     }
3621     } # INSCOPE
3622 wakaba 1.52 unless (defined $i) {
3623     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3624     ## Ignore the token
3625     !!!next-token;
3626 wakaba 1.43 redo B;
3627     }
3628 wakaba 1.52
3629     ## Clear back to table body context
3630     while (not {
3631     tbody => 1, tfoot => 1, thead => 1, html => 1,
3632     }->{$self->{open_elements}->[-1]->[1]}) {
3633 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3634 wakaba 1.52 pop @{$self->{open_elements}};
3635 wakaba 1.43 }
3636    
3637 wakaba 1.52 ## As if <{current node}>
3638     ## have an element in table scope
3639     ## true by definition
3640 wakaba 1.43
3641 wakaba 1.52 ## Clear back to table body context
3642     ## nop by definition
3643 wakaba 1.43
3644 wakaba 1.52 pop @{$self->{open_elements}};
3645 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3646 wakaba 1.52 ## reprocess in "in table" insertion mode...
3647     }
3648    
3649     if ($token->{tag_name} eq 'col') {
3650     ## Clear back to table context
3651     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3652     $self->{open_elements}->[-1]->[1] ne 'html') {
3653     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3654     pop @{$self->{open_elements}};
3655     }
3656 wakaba 1.43
3657 wakaba 1.52 !!!insert-element ('colgroup');
3658 wakaba 1.54 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
3659 wakaba 1.52 ## reprocess
3660 wakaba 1.43 redo B;
3661 wakaba 1.52 } elsif ({
3662     caption => 1,
3663     colgroup => 1,
3664     tbody => 1, tfoot => 1, thead => 1,
3665     }->{$token->{tag_name}}) {
3666     ## Clear back to table context
3667     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3668     $self->{open_elements}->[-1]->[1] ne 'html') {
3669     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3670     pop @{$self->{open_elements}};
3671 wakaba 1.1 }
3672 wakaba 1.52
3673     push @$active_formatting_elements, ['#marker', '']
3674     if $token->{tag_name} eq 'caption';
3675    
3676     !!!insert-element ($token->{tag_name}, $token->{attributes});
3677     $self->{insertion_mode} = {
3678 wakaba 1.54 caption => IN_CAPTION_IM,
3679     colgroup => IN_COLUMN_GROUP_IM,
3680     tbody => IN_TABLE_BODY_IM,
3681     tfoot => IN_TABLE_BODY_IM,
3682     thead => IN_TABLE_BODY_IM,
3683 wakaba 1.52 }->{$token->{tag_name}};
3684 wakaba 1.1 !!!next-token;
3685     redo B;
3686 wakaba 1.52 } else {
3687     die "$0: in table: <>: $token->{tag_name}";
3688 wakaba 1.1 }
3689 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
3690     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3691 wakaba 1.1
3692 wakaba 1.52 ## As if </table>
3693 wakaba 1.1 ## have a table element in table scope
3694     my $i;
3695 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3696     my $node = $self->{open_elements}->[$_];
3697 wakaba 1.52 if ($node->[1] eq 'table') {
3698 wakaba 1.1 $i = $_;
3699     last INSCOPE;
3700     } elsif ({
3701     table => 1, html => 1,
3702     }->{$node->[1]}) {
3703     last INSCOPE;
3704     }
3705     } # INSCOPE
3706     unless (defined $i) {
3707 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:table');
3708     ## Ignore tokens </table><table>
3709 wakaba 1.1 !!!next-token;
3710     redo B;
3711     }
3712    
3713     ## generate implied end tags
3714     if ({
3715     dd => 1, dt => 1, li => 1, p => 1,
3716     td => 1, th => 1, tr => 1,
3717 wakaba 1.31 tbody => 1, tfoot=> 1, thead => 1,
3718 wakaba 1.3 }->{$self->{open_elements}->[-1]->[1]}) {
3719 wakaba 1.52 !!!back-token; # <table>
3720 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'table'};
3721 wakaba 1.1 !!!back-token;
3722 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3723 wakaba 1.3 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3724 wakaba 1.1 redo B;
3725     }
3726    
3727 wakaba 1.52 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3728 wakaba 1.3 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3729 wakaba 1.1 }
3730    
3731 wakaba 1.3 splice @{$self->{open_elements}}, $i;
3732 wakaba 1.1
3733 wakaba 1.52 $self->_reset_insertion_mode;
3734 wakaba 1.1
3735     ## reprocess
3736     redo B;
3737 wakaba 1.58 } else {
3738     !!!parse-error (type => 'in table:'.$token->{tag_name});
3739    
3740     $insert = $insert_to_foster;
3741     #
3742     }
3743     } elsif ($token->{type} == END_TAG_TOKEN) {
3744 wakaba 1.52 if ($token->{tag_name} eq 'tr' and
3745 wakaba 1.54 $self->{insertion_mode} == IN_ROW_IM) {
3746 wakaba 1.52 ## have an element in table scope
3747     my $i;
3748     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3749     my $node = $self->{open_elements}->[$_];
3750     if ($node->[1] eq $token->{tag_name}) {
3751     $i = $_;
3752     last INSCOPE;
3753     } elsif ({
3754     table => 1, html => 1,
3755     }->{$node->[1]}) {
3756     last INSCOPE;
3757     }
3758     } # INSCOPE
3759     unless (defined $i) {
3760     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3761     ## Ignore the token
3762 wakaba 1.42 !!!next-token;
3763     redo B;
3764     }
3765    
3766 wakaba 1.52 ## Clear back to table row context
3767     while (not {
3768     tr => 1, html => 1,
3769     }->{$self->{open_elements}->[-1]->[1]}) {
3770     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3771     pop @{$self->{open_elements}};
3772     }
3773 wakaba 1.42
3774 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3775 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3776 wakaba 1.52 !!!next-token;
3777     redo B;
3778     } elsif ($token->{tag_name} eq 'table') {
3779 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3780 wakaba 1.52 ## As if </tr>
3781     ## have an element in table scope
3782     my $i;
3783     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3784     my $node = $self->{open_elements}->[$_];
3785     if ($node->[1] eq 'tr') {
3786     $i = $_;
3787     last INSCOPE;
3788     } elsif ({
3789     table => 1, html => 1,
3790     }->{$node->[1]}) {
3791     last INSCOPE;
3792 wakaba 1.42 }
3793 wakaba 1.52 } # INSCOPE
3794     unless (defined $i) {
3795     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3796     ## No |tr| element was found in table scope by the INSCOPE loop above: report the error (named after the tag, not the numeric token type) and ignore the token
3797     !!!next-token;
3798     redo B;
3799 wakaba 1.42 }
3800 wakaba 1.52
3801     ## Clear back to table row context
3802     while (not {
3803     tr => 1, html => 1,
3804     }->{$self->{open_elements}->[-1]->[1]}) {
3805 wakaba 1.46 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3806     pop @{$self->{open_elements}};
3807 wakaba 1.1 }
3808 wakaba 1.46
3809 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3810 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3811 wakaba 1.46 ## reprocess in the "in table body" insertion mode...
3812 wakaba 1.1 }
3813    
3814 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3815 wakaba 1.52 ## have an element in table scope
3816     my $i;
3817     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3818     my $node = $self->{open_elements}->[$_];
3819     if ({
3820     tbody => 1, thead => 1, tfoot => 1,
3821     }->{$node->[1]}) {
3822     $i = $_;
3823     last INSCOPE;
3824     } elsif ({
3825     table => 1, html => 1,
3826     }->{$node->[1]}) {
3827     last INSCOPE;
3828     }
3829     } # INSCOPE
3830     unless (defined $i) {
3831     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3832     ## Ignore the token
3833     !!!next-token;
3834     redo B;
3835 wakaba 1.47 }
3836    
3837     ## Clear back to table body context
3838     while (not {
3839     tbody => 1, tfoot => 1, thead => 1, html => 1,
3840     }->{$self->{open_elements}->[-1]->[1]}) {
3841     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3842     pop @{$self->{open_elements}};
3843     }
3844    
3845 wakaba 1.52 ## As if <{current node}>
3846     ## have an element in table scope
3847     ## true by definition
3848    
3849     ## Clear back to table body context
3850     ## nop by definition
3851    
3852     pop @{$self->{open_elements}};
3853 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3854 wakaba 1.52 ## reprocess in the "in table" insertion mode...
3855     }
3856    
3857     ## have a table element in table scope
3858     my $i;
3859     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3860     my $node = $self->{open_elements}->[$_];
3861     if ($node->[1] eq $token->{tag_name}) {
3862     $i = $_;
3863     last INSCOPE;
3864     } elsif ({
3865     table => 1, html => 1,
3866     }->{$node->[1]}) {
3867     last INSCOPE;
3868 wakaba 1.47 }
3869 wakaba 1.52 } # INSCOPE
3870     unless (defined $i) {
3871     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3872     ## Ignore the token
3873     !!!next-token;
3874     redo B;
3875 wakaba 1.3 }
3876    
3877 wakaba 1.52 ## generate implied end tags
3878     if ({
3879     dd => 1, dt => 1, li => 1, p => 1,
3880     td => 1, th => 1, tr => 1,
3881     tbody => 1, tfoot=> 1, thead => 1,
3882     }->{$self->{open_elements}->[-1]->[1]}) {
3883     !!!back-token;
3884 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3885 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3886     redo B;
3887     }
3888    
3889     if ($self->{open_elements}->[-1]->[1] ne 'table') {
3890 wakaba 1.3 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3891 wakaba 1.1 }
3892 wakaba 1.52
3893     splice @{$self->{open_elements}}, $i;
3894 wakaba 1.1
3895 wakaba 1.52 $self->_reset_insertion_mode;
3896 wakaba 1.47
3897     !!!next-token;
3898     redo B;
3899     } elsif ({
3900 wakaba 1.48 tbody => 1, tfoot => 1, thead => 1,
3901 wakaba 1.52 }->{$token->{tag_name}} and
3902 wakaba 1.56 $self->{insertion_mode} & ROW_IMS) {
3903 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3904 wakaba 1.52 ## have an element in table scope
3905     my $i;
3906     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3907     my $node = $self->{open_elements}->[$_];
3908     if ($node->[1] eq $token->{tag_name}) {
3909     $i = $_;
3910     last INSCOPE;
3911     } elsif ({
3912     table => 1, html => 1,
3913     }->{$node->[1]}) {
3914     last INSCOPE;
3915     }
3916     } # INSCOPE
3917     unless (defined $i) {
3918     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3919     ## Ignore the token
3920     !!!next-token;
3921     redo B;
3922     }
3923    
3924 wakaba 1.48 ## As if </tr>
3925     ## have an element in table scope
3926     my $i;
3927     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3928     my $node = $self->{open_elements}->[$_];
3929     if ($node->[1] eq 'tr') {
3930     $i = $_;
3931     last INSCOPE;
3932     } elsif ({
3933     table => 1, html => 1,
3934     }->{$node->[1]}) {
3935     last INSCOPE;
3936     }
3937     } # INSCOPE
3938 wakaba 1.52 unless (defined $i) {
3939     !!!parse-error (type => 'unmatched end tag:tr');
3940     ## Ignore the token
3941     !!!next-token;
3942     redo B;
3943     }
3944 wakaba 1.48
3945     ## Clear back to table row context
3946     while (not {
3947     tr => 1, html => 1,
3948     }->{$self->{open_elements}->[-1]->[1]}) {
3949     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3950     pop @{$self->{open_elements}};
3951     }
3952    
3953     pop @{$self->{open_elements}}; # tr
3954 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3955 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
3956     }
3957    
3958     ## have an element in table scope
3959     my $i;
3960     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3961     my $node = $self->{open_elements}->[$_];
3962     if ($node->[1] eq $token->{tag_name}) {
3963     $i = $_;
3964     last INSCOPE;
3965     } elsif ({
3966     table => 1, html => 1,
3967     }->{$node->[1]}) {
3968     last INSCOPE;
3969     }
3970     } # INSCOPE
3971     unless (defined $i) {
3972     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3973     ## Ignore the token
3974     !!!next-token;
3975     redo B;
3976     }
3977    
3978     ## Clear back to table body context
3979     while (not {
3980     tbody => 1, tfoot => 1, thead => 1, html => 1,
3981     }->{$self->{open_elements}->[-1]->[1]}) {
3982     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3983     pop @{$self->{open_elements}};
3984     }
3985    
3986     pop @{$self->{open_elements}};
3987 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3988 wakaba 1.52 !!!next-token;
3989     redo B;
3990     } elsif ({
3991     body => 1, caption => 1, col => 1, colgroup => 1,
3992     html => 1, td => 1, th => 1,
3993 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3994     tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
3995 wakaba 1.52 }->{$token->{tag_name}}) {
3996     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3997     ## Ignore the token
3998     !!!next-token;
3999     redo B;
4000 wakaba 1.58 } else {
4001     !!!parse-error (type => 'in table:/'.$token->{tag_name});
4002 wakaba 1.52
4003 wakaba 1.58 $insert = $insert_to_foster;
4004     #
4005     }
4006     } else {
4007     die "$0: $token->{type}: Unknown token type";
4008     }
4009 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4010 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4011 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4012     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4013     unless (length $token->{data}) {
4014     !!!next-token;
4015     redo B;
4016     }
4017     }
4018    
4019     #
4020 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4021 wakaba 1.52 if ($token->{tag_name} eq 'col') {
4022     !!!insert-element ($token->{tag_name}, $token->{attributes});
4023     pop @{$self->{open_elements}};
4024     !!!next-token;
4025     redo B;
4026     } else {
4027     #
4028     }
4029 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4030 wakaba 1.52 if ($token->{tag_name} eq 'colgroup') {
4031     if ($self->{open_elements}->[-1]->[1] eq 'html') {
4032     !!!parse-error (type => 'unmatched end tag:colgroup');
4033     ## Ignore the token
4034     !!!next-token;
4035     redo B;
4036     } else {
4037     pop @{$self->{open_elements}}; # colgroup
4038 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4039 wakaba 1.52 !!!next-token;
4040     redo B;
4041     }
4042     } elsif ($token->{tag_name} eq 'col') {
4043     !!!parse-error (type => 'unmatched end tag:col');
4044     ## Ignore the token
4045     !!!next-token;
4046     redo B;
4047     } else {
4048     #
4049     }
4050     } else {
4051     #
4052     }
4053    
4054     ## As if </colgroup>
4055     if ($self->{open_elements}->[-1]->[1] eq 'html') {
4056     !!!parse-error (type => 'unmatched end tag:colgroup');
4057     ## Ignore the token
4058     !!!next-token;
4059     redo B;
4060     } else {
4061     pop @{$self->{open_elements}}; # colgroup
4062 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4063 wakaba 1.52 ## reprocess
4064     redo B;
4065     }
4066 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
4067 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
4068     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4069     !!!next-token;
4070     redo B;
4071     } elsif ($token->{type} == START_TAG_TOKEN) {
4072 wakaba 1.52 if ($token->{tag_name} eq 'option') {
4073     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4074     ## As if </option>
4075     pop @{$self->{open_elements}};
4076     }
4077    
4078     !!!insert-element ($token->{tag_name}, $token->{attributes});
4079     !!!next-token;
4080     redo B;
4081     } elsif ($token->{tag_name} eq 'optgroup') {
4082     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4083     ## As if </option>
4084     pop @{$self->{open_elements}};
4085     }
4086    
4087     if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4088     ## As if </optgroup>
4089     pop @{$self->{open_elements}};
4090     }
4091    
4092     !!!insert-element ($token->{tag_name}, $token->{attributes});
4093     !!!next-token;
4094     redo B;
4095     } elsif ($token->{tag_name} eq 'select') {
4096     !!!parse-error (type => 'not closed:select');
4097     ## As if </select> instead
4098     ## have an element in table scope
4099     my $i;
4100     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4101     my $node = $self->{open_elements}->[$_];
4102     if ($node->[1] eq $token->{tag_name}) {
4103     $i = $_;
4104     last INSCOPE;
4105     } elsif ({
4106     table => 1, html => 1,
4107     }->{$node->[1]}) {
4108     last INSCOPE;
4109 wakaba 1.47 }
4110 wakaba 1.52 } # INSCOPE
4111     unless (defined $i) {
4112     !!!parse-error (type => 'unmatched end tag:select');
4113     ## Ignore the token
4114     !!!next-token;
4115     redo B;
4116 wakaba 1.47 }
4117 wakaba 1.52
4118     splice @{$self->{open_elements}}, $i;
4119    
4120     $self->_reset_insertion_mode;
4121 wakaba 1.47
4122 wakaba 1.52 !!!next-token;
4123     redo B;
4124 wakaba 1.58 } else {
4125     !!!parse-error (type => 'in select:'.$token->{tag_name});
4126     ## Ignore the token
4127     !!!next-token;
4128     redo B;
4129     }
4130     } elsif ($token->{type} == END_TAG_TOKEN) {
4131 wakaba 1.52 if ($token->{tag_name} eq 'optgroup') {
4132     if ($self->{open_elements}->[-1]->[1] eq 'option' and
4133     $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4134     ## As if </option>
4135     splice @{$self->{open_elements}}, -2;
4136     } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4137     pop @{$self->{open_elements}};
4138     } else {
4139     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4140     ## Ignore the token
4141     }
4142     !!!next-token;
4143     redo B;
4144     } elsif ($token->{tag_name} eq 'option') {
4145     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4146 wakaba 1.47 pop @{$self->{open_elements}};
4147 wakaba 1.52 } else {
4148     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4149     ## Ignore the token
4150 wakaba 1.1 }
4151 wakaba 1.52 !!!next-token;
4152     redo B;
4153     } elsif ($token->{tag_name} eq 'select') {
4154     ## have an element in table scope
4155     my $i;
4156     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4157     my $node = $self->{open_elements}->[$_];
4158     if ($node->[1] eq $token->{tag_name}) {
4159     $i = $_;
4160     last INSCOPE;
4161     } elsif ({
4162     table => 1, html => 1,
4163     }->{$node->[1]}) {
4164     last INSCOPE;
4165 wakaba 1.48 }
4166 wakaba 1.52 } # INSCOPE
4167     unless (defined $i) {
4168     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4169     ## Ignore the token
4170     !!!next-token;
4171 wakaba 1.48 redo B;
4172 wakaba 1.52 }
4173    
4174     splice @{$self->{open_elements}}, $i;
4175    
4176     $self->_reset_insertion_mode;
4177    
4178     !!!next-token;
4179     redo B;
4180     } elsif ({
4181     caption => 1, table => 1, tbody => 1,
4182     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4183     }->{$token->{tag_name}}) {
4184     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4185    
4186     ## have an element in table scope
4187     my $i;
4188     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4189     my $node = $self->{open_elements}->[$_];
4190     if ($node->[1] eq $token->{tag_name}) {
4191     $i = $_;
4192     last INSCOPE;
4193     } elsif ({
4194     table => 1, html => 1,
4195     }->{$node->[1]}) {
4196     last INSCOPE;
4197 wakaba 1.1 }
4198 wakaba 1.52 } # INSCOPE
4199     unless (defined $i) {
4200     ## Ignore the token
4201 wakaba 1.1 !!!next-token;
4202     redo B;
4203     }
4204 wakaba 1.52
4205     ## As if </select>
4206     ## have an element in table scope
4207     undef $i;
4208 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4209     my $node = $self->{open_elements}->[$_];
4210 wakaba 1.52 if ($node->[1] eq 'select') {
4211 wakaba 1.1 $i = $_;
4212     last INSCOPE;
4213     } elsif ({
4214     table => 1, html => 1,
4215 wakaba 1.52 }->{$node->[1]}) {
4216     last INSCOPE;
4217     }
4218     } # INSCOPE
4219     unless (defined $i) {
4220     !!!parse-error (type => 'unmatched end tag:select');
4221     ## Ignore the </select> token
4222     !!!next-token; ## TODO: ok?
4223     redo B;
4224     }
4225    
4226     splice @{$self->{open_elements}}, $i;
4227    
4228     $self->_reset_insertion_mode;
4229    
4230     ## reprocess
4231     redo B;
4232 wakaba 1.58 } else {
4233     !!!parse-error (type => 'in select:/'.$token->{tag_name});
4234 wakaba 1.52 ## Ignore the token
4235     !!!next-token;
4236     redo B;
4237 wakaba 1.58 }
4238     } else {
4239     die "$0: $token->{type}: Unknown token type";
4240     }
4241 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4242 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4243 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4244     my $data = $1;
4245     ## As if in body
4246     $reconstruct_active_formatting_elements->($insert_to_current);
4247    
4248     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4249    
4250     unless (length $token->{data}) {
4251     !!!next-token;
4252     redo B;
4253     }
4254     }
4255    
4256 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4257 wakaba 1.52 !!!parse-error (type => 'after html:#character');
4258    
4259     ## Reprocess in the "main" phase, "after body" insertion mode...
4260     }
4261    
4262     ## "after body" insertion mode
4263     !!!parse-error (type => 'after body:#character');
4264    
4265 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4266 wakaba 1.52 ## reprocess
4267     redo B;
4268 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4269 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4270 wakaba 1.52 !!!parse-error (type => 'after html:'.$token->{tag_name});
4271    
4272     ## Reprocess in the "main" phase, "after body" insertion mode...
4273     }
4274    
4275     ## "after body" insertion mode
4276     !!!parse-error (type => 'after body:'.$token->{tag_name});
4277    
4278 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4279 wakaba 1.52 ## reprocess
4280     redo B;
4281 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4282 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4283 wakaba 1.52 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4284    
4285 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
4286 wakaba 1.52 ## Reprocess in the "main" phase, "after body" insertion mode...
4287     }
4288    
4289     ## "after body" insertion mode
4290     if ($token->{tag_name} eq 'html') {
4291     if (defined $self->{inner_html_node}) {
4292     !!!parse-error (type => 'unmatched end tag:html');
4293     ## Ignore the token
4294     !!!next-token;
4295     redo B;
4296     } else {
4297 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
4298 wakaba 1.52 !!!next-token;
4299     redo B;
4300     }
4301     } else {
4302     !!!parse-error (type => 'after body:/'.$token->{tag_name});
4303    
4304 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4305 wakaba 1.52 ## reprocess
4306     redo B;
4307     }
4308     } else {
4309     die "$0: $token->{type}: Unknown token type";
4310     }
4311 wakaba 1.56 } elsif ($self->{insertion_mode} & FRAME_IMS) {
4312 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4313 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4314     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4315    
4316     unless (length $token->{data}) {
4317     !!!next-token;
4318     redo B;
4319     }
4320     }
4321    
4322     if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4323 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4324 wakaba 1.52 !!!parse-error (type => 'in frameset:#character');
4325 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
4326 wakaba 1.52 !!!parse-error (type => 'after frameset:#character');
4327     } else { # "after html frameset"
4328     !!!parse-error (type => 'after html:#character');
4329    
4330 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4331 wakaba 1.52 ## Reprocess in the "main" phase, "after frameset"...
4332     !!!parse-error (type => 'after frameset:#character');
4333     }
4334    
4335     ## Ignore the token.
4336     if (length $token->{data}) {
4337     ## reprocess the rest of characters
4338     } else {
4339     !!!next-token;
4340     }
4341     redo B;
4342     }
4343    
4344     die qq[$0: Character "$token->{data}"];
4345 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4346 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4347 wakaba 1.52 !!!parse-error (type => 'after html:'.$token->{tag_name});
4348 wakaba 1.1
4349 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4350 wakaba 1.52 ## Process in the "main" phase, "after frameset" insertion mode...
4351     }
4352 wakaba 1.1
4353 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
4354 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4355 wakaba 1.52 !!!insert-element ($token->{tag_name}, $token->{attributes});
4356     !!!next-token;
4357     redo B;
4358     } elsif ($token->{tag_name} eq 'frame' and
4359 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4360 wakaba 1.52 !!!insert-element ($token->{tag_name}, $token->{attributes});
4361     pop @{$self->{open_elements}};
4362     !!!next-token;
4363     redo B;
4364     } elsif ($token->{tag_name} eq 'noframes') {
4365     ## NOTE: As if in body.
4366     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4367     redo B;
4368     } else {
4369 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4370 wakaba 1.52 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4371     } else {
4372     !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4373     }
4374     ## Ignore the token
4375     !!!next-token;
4376     redo B;
4377     }
4378 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4379 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4380 wakaba 1.52 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4381 wakaba 1.1
4382 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4383 wakaba 1.52 ## Process in the "main" phase, "after frameset" insertion mode...
4384     }
4385 wakaba 1.1
4386 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
4387 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4388 wakaba 1.52 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4389     @{$self->{open_elements}} == 1) {
4390     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4391     ## Ignore the token
4392     !!!next-token;
4393     } else {
4394     pop @{$self->{open_elements}};
4395     !!!next-token;
4396     }
4397 wakaba 1.47
4398 wakaba 1.52 if (not defined $self->{inner_html_node} and
4399     $self->{open_elements}->[-1]->[1] ne 'frameset') {
4400 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4401 wakaba 1.52 }
4402     redo B;
4403     } elsif ($token->{tag_name} eq 'html' and
4404 wakaba 1.54 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
4405     $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
4406 wakaba 1.52 !!!next-token;
4407     redo B;
4408     } else {
4409 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4410 wakaba 1.52 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4411     } else {
4412     !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4413     }
4414     ## Ignore the token
4415     !!!next-token;
4416     redo B;
4417     }
4418     } else {
4419     die "$0: $token->{type}: Unknown token type";
4420     }
4421 wakaba 1.47
4422 wakaba 1.52 ## ISSUE: An issue in spec here
4423     } else {
4424     die "$0: $self->{insertion_mode}: Unknown insertion mode";
4425     }
4426 wakaba 1.47
4427 wakaba 1.52 ## "in body" insertion mode
4428 wakaba 1.55 if ($token->{type} == START_TAG_TOKEN) {
4429 wakaba 1.52 if ($token->{tag_name} eq 'script') {
4430     ## NOTE: This is an "as if in head" code clone
4431     $script_start_tag->($insert);
4432 wakaba 1.53 redo B;
4433 wakaba 1.52 } elsif ($token->{tag_name} eq 'style') {
4434     ## NOTE: This is an "as if in head" code clone
4435     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4436 wakaba 1.53 redo B;
4437 wakaba 1.52 } elsif ({
4438     base => 1, link => 1,
4439     }->{$token->{tag_name}}) {
4440     ## NOTE: This is an "as if in head" code clone, only "-t" differs
4441     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4442     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4443     !!!next-token;
4444 wakaba 1.53 redo B;
4445 wakaba 1.52 } elsif ($token->{tag_name} eq 'meta') {
4446     ## NOTE: This is an "as if in head" code clone, only "-t" differs
4447     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4448     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4449 wakaba 1.46
4450 wakaba 1.52 unless ($self->{confident}) {
4451     if ($token->{attributes}->{charset}) { ## TODO: And if supported
4452 wakaba 1.63 $self->{change_encoding}
4453     ->($self, $token->{attributes}->{charset}->{value});
4454     } elsif ($token->{attributes}->{content}) {
4455 wakaba 1.52 ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
4456 wakaba 1.63 if ($token->{attributes}->{content}->{value}
4457 wakaba 1.52 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
4458     [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4459     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4460 wakaba 1.63 $self->{change_encoding}
4461     ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4462     }
4463 wakaba 1.52 }
4464     }
4465 wakaba 1.1
4466 wakaba 1.52 !!!next-token;
4467 wakaba 1.53 redo B;
4468 wakaba 1.52 } elsif ($token->{tag_name} eq 'title') {
4469     !!!parse-error (type => 'in body:title');
4470     ## NOTE: This is an "as if in head" code clone
4471     $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4472     if (defined $self->{head_element}) {
4473     $self->{head_element}->append_child ($_[0]);
4474     } else {
4475     $insert->($_[0]);
4476     }
4477     });
4478 wakaba 1.53 redo B;
4479 wakaba 1.52 } elsif ($token->{tag_name} eq 'body') {
4480     !!!parse-error (type => 'in body:body');
4481 wakaba 1.46
4482 wakaba 1.52 if (@{$self->{open_elements}} == 1 or
4483     $self->{open_elements}->[1]->[1] ne 'body') {
4484     ## Ignore the token
4485     } else {
4486     my $body_el = $self->{open_elements}->[1]->[0];
4487     for my $attr_name (keys %{$token->{attributes}}) {
4488     unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4489     $body_el->set_attribute_ns
4490     (undef, [undef, $attr_name],
4491     $token->{attributes}->{$attr_name}->{value});
4492     }
4493     }
4494     }
4495     !!!next-token;
4496 wakaba 1.53 redo B;
4497 wakaba 1.52 } elsif ({
4498     address => 1, blockquote => 1, center => 1, dir => 1,
4499     div => 1, dl => 1, fieldset => 1, listing => 1,
4500     menu => 1, ol => 1, p => 1, ul => 1,
4501     pre => 1,
4502     }->{$token->{tag_name}}) {
4503     ## has a p element in scope
4504     INSCOPE: for (reverse @{$self->{open_elements}}) {
4505     if ($_->[1] eq 'p') {
4506     !!!back-token;
4507 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4508 wakaba 1.53 redo B;
4509 wakaba 1.52 } elsif ({
4510     table => 1, caption => 1, td => 1, th => 1,
4511     button => 1, marquee => 1, object => 1, html => 1,
4512     }->{$_->[1]}) {
4513     last INSCOPE;
4514     }
4515     } # INSCOPE
4516    
4517     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4518     if ($token->{tag_name} eq 'pre') {
4519     !!!next-token;
4520 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4521 wakaba 1.52 $token->{data} =~ s/^\x0A//;
4522     unless (length $token->{data}) {
4523 wakaba 1.1 !!!next-token;
4524 wakaba 1.52 }
4525     }
4526     } else {
4527     !!!next-token;
4528     }
4529 wakaba 1.53 redo B;
4530 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
4531     if (defined $self->{form_element}) {
4532     !!!parse-error (type => 'in form:form');
4533     ## Ignore the token
4534     !!!next-token;
4535 wakaba 1.53 redo B;
4536 wakaba 1.52 } else {
4537     ## has a p element in scope
4538     INSCOPE: for (reverse @{$self->{open_elements}}) {
4539     if ($_->[1] eq 'p') {
4540     !!!back-token;
4541 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4542 wakaba 1.53 redo B;
4543 wakaba 1.46 } elsif ({
4544 wakaba 1.52 table => 1, caption => 1, td => 1, th => 1,
4545     button => 1, marquee => 1, object => 1, html => 1,
4546     }->{$_->[1]}) {
4547     last INSCOPE;
4548     }
4549     } # INSCOPE
4550    
4551     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4552     $self->{form_element} = $self->{open_elements}->[-1]->[0];
4553     !!!next-token;
4554 wakaba 1.53 redo B;
4555 wakaba 1.52 }
4556     } elsif ($token->{tag_name} eq 'li') {
4557     ## has a p element in scope
4558     INSCOPE: for (reverse @{$self->{open_elements}}) {
4559     if ($_->[1] eq 'p') {
4560     !!!back-token;
4561 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4562 wakaba 1.53 redo B;
4563 wakaba 1.52 } elsif ({
4564     table => 1, caption => 1, td => 1, th => 1,
4565     button => 1, marquee => 1, object => 1, html => 1,
4566     }->{$_->[1]}) {
4567     last INSCOPE;
4568     }
4569     } # INSCOPE
4570    
4571     ## Step 1
4572     my $i = -1;
4573     my $node = $self->{open_elements}->[$i];
4574     LI: {
4575     ## Step 2
4576     if ($node->[1] eq 'li') {
4577     if ($i != -1) {
4578     !!!parse-error (type => 'end tag missing:'.
4579     $self->{open_elements}->[-1]->[1]);
4580     }
4581     splice @{$self->{open_elements}}, $i;
4582     last LI;
4583     }
4584    
4585     ## Step 3
4586     if (not $formatting_category->{$node->[1]} and
4587     #not $phrasing_category->{$node->[1]} and
4588     ($special_category->{$node->[1]} or
4589     $scoping_category->{$node->[1]}) and
4590     $node->[1] ne 'address' and $node->[1] ne 'div') {
4591     last LI;
4592     }
4593    
4594     ## Step 4
4595     $i--;
4596     $node = $self->{open_elements}->[$i];
4597     redo LI;
4598     } # LI
4599    
4600     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4601     !!!next-token;
4602 wakaba 1.53 redo B;
4603 wakaba 1.52 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
4604     ## has a p element in scope
4605     INSCOPE: for (reverse @{$self->{open_elements}}) {
4606     if ($_->[1] eq 'p') {
4607     !!!back-token;
4608 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4609 wakaba 1.53 redo B;
4610 wakaba 1.52 } elsif ({
4611     table => 1, caption => 1, td => 1, th => 1,
4612     button => 1, marquee => 1, object => 1, html => 1,
4613     }->{$_->[1]}) {
4614     last INSCOPE;
4615     }
4616     } # INSCOPE
4617    
4618     ## Step 1
4619     my $i = -1;
4620     my $node = $self->{open_elements}->[$i];
4621     LI: {
4622     ## Step 2
4623     if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
4624     if ($i != -1) {
4625     !!!parse-error (type => 'end tag missing:'.
4626     $self->{open_elements}->[-1]->[1]);
4627 wakaba 1.1 }
4628 wakaba 1.52 splice @{$self->{open_elements}}, $i;
4629     last LI;
4630     }
4631    
4632     ## Step 3
4633     if (not $formatting_category->{$node->[1]} and
4634     #not $phrasing_category->{$node->[1]} and
4635     ($special_category->{$node->[1]} or
4636     $scoping_category->{$node->[1]}) and
4637     $node->[1] ne 'address' and $node->[1] ne 'div') {
4638     last LI;
4639 wakaba 1.1 }
4640 wakaba 1.52
4641     ## Step 4
4642     $i--;
4643     $node = $self->{open_elements}->[$i];
4644     redo LI;
4645     } # LI
4646    
4647     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4648     !!!next-token;
4649 wakaba 1.53 redo B;
4650 wakaba 1.52 } elsif ($token->{tag_name} eq 'plaintext') {
4651     ## has a p element in scope
4652     INSCOPE: for (reverse @{$self->{open_elements}}) {
4653     if ($_->[1] eq 'p') {
4654     !!!back-token;
4655 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4656 wakaba 1.53 redo B;
4657 wakaba 1.52 } elsif ({
4658     table => 1, caption => 1, td => 1, th => 1,
4659     button => 1, marquee => 1, object => 1, html => 1,
4660     }->{$_->[1]}) {
4661     last INSCOPE;
4662 wakaba 1.46 }
4663 wakaba 1.52 } # INSCOPE
4664    
4665     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4666    
4667     $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
4668    
4669     !!!next-token;
4670 wakaba 1.53 redo B;
4671 wakaba 1.52 } elsif ({
4672     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4673     }->{$token->{tag_name}}) {
4674     ## has a p element in scope
4675     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4676     my $node = $self->{open_elements}->[$_];
4677     if ($node->[1] eq 'p') {
4678     !!!back-token;
4679 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4680 wakaba 1.53 redo B;
4681 wakaba 1.52 } elsif ({
4682     table => 1, caption => 1, td => 1, th => 1,
4683     button => 1, marquee => 1, object => 1, html => 1,
4684     }->{$node->[1]}) {
4685     last INSCOPE;
4686 wakaba 1.46 }
4687 wakaba 1.52 } # INSCOPE
4688    
4689     ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
4690     ## has an element in scope
4691     #my $i;
4692     #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4693     # my $node = $self->{open_elements}->[$_];
4694     # if ({
4695     # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4696     # }->{$node->[1]}) {
4697     # $i = $_;
4698     # last INSCOPE;
4699     # } elsif ({
4700     # table => 1, caption => 1, td => 1, th => 1,
4701     # button => 1, marquee => 1, object => 1, html => 1,
4702     # }->{$node->[1]}) {
4703     # last INSCOPE;
4704     # }
4705     #} # INSCOPE
4706     #
4707     #if (defined $i) {
4708     # !!! parse-error (type => 'in hn:hn');
4709     # splice @{$self->{open_elements}}, $i;
4710     #}
4711    
4712     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4713    
4714     !!!next-token;
4715 wakaba 1.53 redo B;
4716 wakaba 1.52 } elsif ($token->{tag_name} eq 'a') {
4717     AFE: for my $i (reverse 0..$#$active_formatting_elements) {
4718     my $node = $active_formatting_elements->[$i];
4719     if ($node->[1] eq 'a') {
4720     !!!parse-error (type => 'in a:a');
4721    
4722     !!!back-token;
4723 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
4724 wakaba 1.52 $formatting_end_tag->($token->{tag_name});
4725    
4726     AFE2: for (reverse 0..$#$active_formatting_elements) {
4727     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4728     splice @$active_formatting_elements, $_, 1;
4729     last AFE2;
4730 wakaba 1.1 }
4731 wakaba 1.52 } # AFE2
4732     OE: for (reverse 0..$#{$self->{open_elements}}) {
4733     if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
4734     splice @{$self->{open_elements}}, $_, 1;
4735     last OE;
4736 wakaba 1.1 }
4737 wakaba 1.52 } # OE
4738     last AFE;
4739     } elsif ($node->[0] eq '#marker') {
4740     last AFE;
4741     }
4742     } # AFE
4743    
4744     $reconstruct_active_formatting_elements->($insert_to_current);
4745 wakaba 1.1
4746 wakaba 1.52 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4747     push @$active_formatting_elements, $self->{open_elements}->[-1];
4748 wakaba 1.1
4749 wakaba 1.52 !!!next-token;
4750 wakaba 1.53 redo B;
4751 wakaba 1.52 } elsif ({
4752     b => 1, big => 1, em => 1, font => 1, i => 1,
4753     s => 1, small => 1, strike => 1, ## NOTE: |strike| (was misspelled "strile") is a formatting element
4754     strong => 1, tt => 1, u => 1,
4755     }->{$token->{tag_name}}) {
4756     $reconstruct_active_formatting_elements->($insert_to_current);
4757    
4758     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4759     push @$active_formatting_elements, $self->{open_elements}->[-1];
4760    
4761     !!!next-token;
4762 wakaba 1.53 redo B;
4763 wakaba 1.52 } elsif ($token->{tag_name} eq 'nobr') {
4764     $reconstruct_active_formatting_elements->($insert_to_current);
4765 wakaba 1.1
4766 wakaba 1.52 ## has a |nobr| element in scope
4767     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4768     my $node = $self->{open_elements}->[$_];
4769     if ($node->[1] eq 'nobr') {
4770 wakaba 1.58 !!!parse-error (type => 'in nobr:nobr');
4771 wakaba 1.52 !!!back-token;
4772 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
4773 wakaba 1.53 redo B;
4774 wakaba 1.52 } elsif ({
4775     table => 1, caption => 1, td => 1, th => 1,
4776     button => 1, marquee => 1, object => 1, html => 1,
4777     }->{$node->[1]}) {
4778     last INSCOPE;
4779     }
4780     } # INSCOPE
4781    
4782     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4783     push @$active_formatting_elements, $self->{open_elements}->[-1];
4784    
4785     !!!next-token;
4786 wakaba 1.53 redo B;
4787 wakaba 1.52 } elsif ($token->{tag_name} eq 'button') {
4788     ## has a button element in scope
4789     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4790     my $node = $self->{open_elements}->[$_];
4791     if ($node->[1] eq 'button') {
4792     !!!parse-error (type => 'in button:button');
4793     !!!back-token;
4794 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
4795 wakaba 1.53 redo B;
4796 wakaba 1.52 } elsif ({
4797     table => 1, caption => 1, td => 1, th => 1,
4798     button => 1, marquee => 1, object => 1, html => 1,
4799     }->{$node->[1]}) {
4800     last INSCOPE;
4801     }
4802     } # INSCOPE
4803    
4804     $reconstruct_active_formatting_elements->($insert_to_current);
4805    
4806     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4807     push @$active_formatting_elements, ['#marker', ''];
4808 wakaba 1.1
4809 wakaba 1.52 !!!next-token;
4810 wakaba 1.53 redo B;
4811 wakaba 1.52 } elsif ($token->{tag_name} eq 'marquee' or
4812     $token->{tag_name} eq 'object') {
4813     $reconstruct_active_formatting_elements->($insert_to_current);
4814    
4815     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4816     push @$active_formatting_elements, ['#marker', ''];
4817    
4818     !!!next-token;
4819 wakaba 1.53 redo B;
4820 wakaba 1.52 } elsif ($token->{tag_name} eq 'xmp') {
4821     $reconstruct_active_formatting_elements->($insert_to_current);
4822     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4823 wakaba 1.53 redo B;
4824 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
4825     ## has a p element in scope
4826     INSCOPE: for (reverse @{$self->{open_elements}}) {
4827     if ($_->[1] eq 'p') {
4828     !!!back-token;
4829 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4830 wakaba 1.53 redo B;
4831 wakaba 1.52 } elsif ({
4832     table => 1, caption => 1, td => 1, th => 1,
4833     button => 1, marquee => 1, object => 1, html => 1,
4834     }->{$_->[1]}) {
4835     last INSCOPE;
4836     }
4837     } # INSCOPE
4838    
4839     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4840    
4841 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4842 wakaba 1.52
4843     !!!next-token;
4844 wakaba 1.53 redo B;
4845 wakaba 1.52 } elsif ({
4846     area => 1, basefont => 1, bgsound => 1, br => 1,
4847     embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
4848     image => 1,
4849     }->{$token->{tag_name}}) {
4850     if ($token->{tag_name} eq 'image') {
4851     !!!parse-error (type => 'image');
4852     $token->{tag_name} = 'img';
4853     }
4854 wakaba 1.1
4855 wakaba 1.52 ## NOTE: There is an "as if <br>" code clone.
4856     $reconstruct_active_formatting_elements->($insert_to_current);
4857    
4858     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4859     pop @{$self->{open_elements}};
4860    
4861     !!!next-token;
4862 wakaba 1.53 redo B;
4863 wakaba 1.52 } elsif ($token->{tag_name} eq 'hr') {
4864     ## has a p element in scope
4865     INSCOPE: for (reverse @{$self->{open_elements}}) {
4866     if ($_->[1] eq 'p') {
4867     !!!back-token;
4868 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4869 wakaba 1.53 redo B;
4870 wakaba 1.52 } elsif ({
4871     table => 1, caption => 1, td => 1, th => 1,
4872     button => 1, marquee => 1, object => 1, html => 1,
4873     }->{$_->[1]}) {
4874     last INSCOPE;
4875     }
4876     } # INSCOPE
4877    
4878     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4879     pop @{$self->{open_elements}};
4880    
4881     !!!next-token;
4882 wakaba 1.53 redo B;
4883 wakaba 1.52 } elsif ($token->{tag_name} eq 'input') {
4884     $reconstruct_active_formatting_elements->($insert_to_current);
4885    
4886     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4887     ## TODO: associate with $self->{form_element} if defined
4888     pop @{$self->{open_elements}};
4889    
4890     !!!next-token;
4891 wakaba 1.53 redo B;
4892 wakaba 1.52 } elsif ($token->{tag_name} eq 'isindex') {
4893     !!!parse-error (type => 'isindex');
4894    
4895     if (defined $self->{form_element}) {
4896     ## Ignore the token
4897     !!!next-token;
4898 wakaba 1.53 redo B;
4899 wakaba 1.52 } else {
4900     my $at = $token->{attributes};
4901     my $form_attrs;
4902     $form_attrs->{action} = $at->{action} if $at->{action};
4903     my $prompt_attr = $at->{prompt};
4904     $at->{name} = {name => 'name', value => 'isindex'};
4905     delete $at->{action};
4906     delete $at->{prompt};
4907     my @tokens = (
4908 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'form',
4909 wakaba 1.52 attributes => $form_attrs},
4910 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'hr'},
4911     {type => START_TAG_TOKEN, tag_name => 'p'},
4912     {type => START_TAG_TOKEN, tag_name => 'label'},
4913 wakaba 1.52 );
4914     if ($prompt_attr) {
4915 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
4916 wakaba 1.1 } else {
4917 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN,
4918 wakaba 1.52 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
4919     ## TODO: make this configurable
4920 wakaba 1.1 }
4921 wakaba 1.52 push @tokens,
4922 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
4923     #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
4924     {type => END_TAG_TOKEN, tag_name => 'label'},
4925     {type => END_TAG_TOKEN, tag_name => 'p'},
4926     {type => START_TAG_TOKEN, tag_name => 'hr'},
4927     {type => END_TAG_TOKEN, tag_name => 'form'};
4928 wakaba 1.52 $token = shift @tokens;
4929     !!!back-token (@tokens);
4930 wakaba 1.53 redo B;
4931 wakaba 1.52 }
4932     } elsif ($token->{tag_name} eq 'textarea') {
4933     my $tag_name = $token->{tag_name};
4934     my $el;
4935     !!!create-element ($el, $token->{tag_name}, $token->{attributes});
4936    
4937     ## TODO: $self->{form_element} if defined
4938     $self->{content_model} = RCDATA_CONTENT_MODEL;
4939     delete $self->{escape}; # MUST
4940    
4941     $insert->($el);
4942    
4943     my $text = '';
4944     !!!next-token;
4945 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4946 wakaba 1.52 $token->{data} =~ s/^\x0A//;
4947 wakaba 1.51 unless (length $token->{data}) {
4948     !!!next-token;
4949     }
4950     }
4951 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
4952 wakaba 1.52 $text .= $token->{data};
4953     !!!next-token;
4954     }
4955     if (length $text) {
4956     $el->manakai_append_text ($text);
4957     }
4958    
4959     $self->{content_model} = PCDATA_CONTENT_MODEL;
4960 wakaba 1.51
4961 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
4962 wakaba 1.52 $token->{tag_name} eq $tag_name) {
4963     ## Ignore the token
4964     } else {
4965     !!!parse-error (type => 'in RCDATA:#'.$token->{type});
4966 wakaba 1.51 }
4967 wakaba 1.52 !!!next-token;
4968 wakaba 1.53 redo B;
4969 wakaba 1.52 } elsif ({
4970     iframe => 1,
4971     noembed => 1,
4972     noframes => 1,
4973     noscript => 0, ## TODO: 1 if scripting is enabled
4974     }->{$token->{tag_name}}) {
4975 wakaba 1.58 ## NOTE: There is an "as if in body" code clone.
4976 wakaba 1.52 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4977 wakaba 1.53 redo B;
4978 wakaba 1.52 } elsif ($token->{tag_name} eq 'select') {
4979     $reconstruct_active_formatting_elements->($insert_to_current);
4980    
4981     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4982    
4983 wakaba 1.54 $self->{insertion_mode} = IN_SELECT_IM;
4984 wakaba 1.52 !!!next-token;
4985 wakaba 1.53 redo B;
4986 wakaba 1.52 } elsif ({
4987     caption => 1, col => 1, colgroup => 1, frame => 1,
4988     frameset => 1, head => 1, option => 1, optgroup => 1,
4989     tbody => 1, td => 1, tfoot => 1, th => 1,
4990     thead => 1, tr => 1,
4991     }->{$token->{tag_name}}) {
4992     !!!parse-error (type => 'in body:'.$token->{tag_name});
4993     ## Ignore the token
4994     !!!next-token;
4995 wakaba 1.53 redo B;
4996 wakaba 1.52
4997     ## ISSUE: An issue on HTML5 new elements in the spec.
4998     } else {
4999     $reconstruct_active_formatting_elements->($insert_to_current);
5000    
5001     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5002 wakaba 1.51
5003 wakaba 1.52 !!!next-token;
5004 wakaba 1.53 redo B;
5005 wakaba 1.52 }
5006 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
5007 wakaba 1.52 if ($token->{tag_name} eq 'body') {
5008     if (@{$self->{open_elements}} > 1 and
5009     $self->{open_elements}->[1]->[1] eq 'body') {
5010     for (@{$self->{open_elements}}) {
5011     unless ({
5012     dd => 1, dt => 1, li => 1, p => 1, td => 1,
5013     th => 1, tr => 1, body => 1, html => 1,
5014     tbody => 1, tfoot => 1, thead => 1,
5015     }->{$_->[1]}) {
5016     !!!parse-error (type => 'not closed:'.$_->[1]);
5017     }
5018     }
5019 wakaba 1.51
5020 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
5021 wakaba 1.52 !!!next-token;
5022 wakaba 1.53 redo B;
5023 wakaba 1.52 } else {
5024     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5025     ## Ignore the token
5026     !!!next-token;
5027 wakaba 1.53 redo B;
5028 wakaba 1.51 }
5029 wakaba 1.52 } elsif ($token->{tag_name} eq 'html') {
5030     if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
5031     ## ISSUE: There is an issue in the spec.
5032     if ($self->{open_elements}->[-1]->[1] ne 'body') {
5033     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
5034 wakaba 1.1 }
5035 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
5036 wakaba 1.52 ## reprocess
5037 wakaba 1.53 redo B;
5038 wakaba 1.51 } else {
5039 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5040     ## Ignore the token
5041     !!!next-token;
5042 wakaba 1.53 redo B;
5043 wakaba 1.51 }
5044 wakaba 1.52 } elsif ({
5045     address => 1, blockquote => 1, center => 1, dir => 1,
5046     div => 1, dl => 1, fieldset => 1, listing => 1,
5047     menu => 1, ol => 1, pre => 1, ul => 1,
5048     p => 1,
5049     dd => 1, dt => 1, li => 1,
5050     button => 1, marquee => 1, object => 1,
5051     }->{$token->{tag_name}}) {
5052     ## has an element in scope
5053     my $i;
5054     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5055     my $node = $self->{open_elements}->[$_];
5056     if ($node->[1] eq $token->{tag_name}) {
5057     ## generate implied end tags
5058     if ({
5059     dd => ($token->{tag_name} ne 'dd'),
5060     dt => ($token->{tag_name} ne 'dt'),
5061     li => ($token->{tag_name} ne 'li'),
5062     p => ($token->{tag_name} ne 'p'),
5063     td => 1, th => 1, tr => 1,
5064     tbody => 1, tfoot=> 1, thead => 1,
5065     }->{$self->{open_elements}->[-1]->[1]}) {
5066     !!!back-token;
5067 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5068 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5069 wakaba 1.53 redo B;
5070 wakaba 1.52 }
5071     $i = $_;
5072     last INSCOPE unless $token->{tag_name} eq 'p';
5073     } elsif ({
5074     table => 1, caption => 1, td => 1, th => 1,
5075     button => 1, marquee => 1, object => 1, html => 1,
5076     }->{$node->[1]}) {
5077     last INSCOPE;
5078 wakaba 1.51 }
5079 wakaba 1.52 } # INSCOPE
5080    
5081     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5082     if (defined $i) {
5083     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5084 wakaba 1.51 } else {
5085 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5086 wakaba 1.51 }
5087     }
5088    
5089 wakaba 1.52 if (defined $i) {
5090     splice @{$self->{open_elements}}, $i;
5091     } elsif ($token->{tag_name} eq 'p') {
5092     ## As if <p>, then reprocess the current token
5093     my $el;
5094     !!!create-element ($el, 'p');
5095     $insert->($el);
5096 wakaba 1.51 }
5097 wakaba 1.52 $clear_up_to_marker->()
5098     if {
5099     button => 1, marquee => 1, object => 1,
5100     }->{$token->{tag_name}};
5101     !!!next-token;
5102 wakaba 1.53 redo B;
5103 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
5104     ## has an element in scope
5105     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5106     my $node = $self->{open_elements}->[$_];
5107     if ($node->[1] eq $token->{tag_name}) {
5108     ## generate implied end tags
5109     if ({
5110     dd => 1, dt => 1, li => 1, p => 1,
5111     td => 1, th => 1, tr => 1,
5112     tbody => 1, tfoot=> 1, thead => 1,
5113     }->{$self->{open_elements}->[-1]->[1]}) {
5114     !!!back-token;
5115 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5116 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5117 wakaba 1.53 redo B;
5118 wakaba 1.52 }
5119     last INSCOPE;
5120     } elsif ({
5121     table => 1, caption => 1, td => 1, th => 1,
5122     button => 1, marquee => 1, object => 1, html => 1,
5123     }->{$node->[1]}) {
5124     last INSCOPE;
5125     }
5126     } # INSCOPE
5127    
5128     if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5129 wakaba 1.36 pop @{$self->{open_elements}};
5130     } else {
5131 wakaba 1.58 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5132 wakaba 1.52 }
5133    
5134     undef $self->{form_element};
5135     !!!next-token;
5136 wakaba 1.53 redo B;
5137 wakaba 1.52 } elsif ({
5138     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5139     }->{$token->{tag_name}}) {
5140     ## has an element in scope
5141     my $i;
5142     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5143     my $node = $self->{open_elements}->[$_];
5144     if ({
5145     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5146     }->{$node->[1]}) {
5147     ## generate implied end tags
5148     if ({
5149     dd => 1, dt => 1, li => 1, p => 1,
5150     td => 1, th => 1, tr => 1,
5151     tbody => 1, tfoot=> 1, thead => 1,
5152     }->{$self->{open_elements}->[-1]->[1]}) {
5153     !!!back-token;
5154 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5155 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5156 wakaba 1.53 redo B;
5157 wakaba 1.52 }
5158     $i = $_;
5159     last INSCOPE;
5160     } elsif ({
5161     table => 1, caption => 1, td => 1, th => 1,
5162     button => 1, marquee => 1, object => 1, html => 1,
5163     }->{$node->[1]}) {
5164     last INSCOPE;
5165 wakaba 1.51 }
5166 wakaba 1.52 } # INSCOPE
5167    
5168     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5169 wakaba 1.58 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5170 wakaba 1.36 }
5171 wakaba 1.52
5172     splice @{$self->{open_elements}}, $i if defined $i;
5173     !!!next-token;
5174 wakaba 1.53 redo B;
5175 wakaba 1.52 } elsif ({
5176     a => 1,
5177     b => 1, big => 1, em => 1, font => 1, i => 1,
5178     nobr => 1, s => 1, small => 1, strile => 1,
5179     strong => 1, tt => 1, u => 1,
5180     }->{$token->{tag_name}}) {
5181     $formatting_end_tag->($token->{tag_name});
5182 wakaba 1.53 redo B;
5183 wakaba 1.52 } elsif ($token->{tag_name} eq 'br') {
5184     !!!parse-error (type => 'unmatched end tag:br');
5185    
5186     ## As if <br>
5187     $reconstruct_active_formatting_elements->($insert_to_current);
5188    
5189     my $el;
5190     !!!create-element ($el, 'br');
5191     $insert->($el);
5192    
5193     ## Ignore the token.
5194     !!!next-token;
5195 wakaba 1.53 redo B;
5196 wakaba 1.52 } elsif ({
5197     caption => 1, col => 1, colgroup => 1, frame => 1,
5198     frameset => 1, head => 1, option => 1, optgroup => 1,
5199     tbody => 1, td => 1, tfoot => 1, th => 1,
5200     thead => 1, tr => 1,
5201     area => 1, basefont => 1, bgsound => 1,
5202     embed => 1, hr => 1, iframe => 1, image => 1,
5203     img => 1, input => 1, isindex => 1, noembed => 1,
5204     noframes => 1, param => 1, select => 1, spacer => 1,
5205     table => 1, textarea => 1, wbr => 1,
5206     noscript => 0, ## TODO: if scripting is enabled
5207     }->{$token->{tag_name}}) {
5208     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5209     ## Ignore the token
5210     !!!next-token;
5211 wakaba 1.53 redo B;
5212 wakaba 1.52
5213     ## ISSUE: Issue on HTML5 new elements in spec
5214    
5215     } else {
5216     ## Step 1
5217     my $node_i = -1;
5218     my $node = $self->{open_elements}->[$node_i];
5219 wakaba 1.51
5220 wakaba 1.52 ## Step 2
5221     S2: {
5222     if ($node->[1] eq $token->{tag_name}) {
5223     ## Step 1
5224     ## generate implied end tags
5225     if ({
5226     dd => 1, dt => 1, li => 1, p => 1,
5227     td => 1, th => 1, tr => 1,
5228 wakaba 1.55 tbody => 1, tfoot => 1, thead => 1,
5229 wakaba 1.52 }->{$self->{open_elements}->[-1]->[1]}) {
5230     !!!back-token;
5231 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5232 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5233 wakaba 1.53 redo B;
5234 wakaba 1.52 }
5235    
5236     ## Step 2
5237     if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5238 wakaba 1.58 ## NOTE: <x><y></x>
5239 wakaba 1.52 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5240     }
5241    
5242     ## Step 3
5243     splice @{$self->{open_elements}}, $node_i;
5244 wakaba 1.51
5245 wakaba 1.1 !!!next-token;
5246 wakaba 1.52 last S2;
5247 wakaba 1.1 } else {
5248 wakaba 1.52 ## Step 3
5249     if (not $formatting_category->{$node->[1]} and
5250     #not $phrasing_category->{$node->[1]} and
5251     ($special_category->{$node->[1]} or
5252     $scoping_category->{$node->[1]})) {
5253     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5254     ## Ignore the token
5255     !!!next-token;
5256     last S2;
5257     }
5258 wakaba 1.1 }
5259 wakaba 1.52
5260     ## Step 4
5261     $node_i--;
5262     $node = $self->{open_elements}->[$node_i];
5263    
5264     ## Step 5;
5265     redo S2;
5266     } # S2
5267 wakaba 1.53 redo B;
5268 wakaba 1.1 }
5269     }
5270 wakaba 1.52 redo B;
5271 wakaba 1.1 } # B
5272    
5273 wakaba 1.51 ## NOTE: The "trailing end" phase in HTML5 is split into
5274     ## two insertion modes: "after html body" and "after html frameset".
5275     ## NOTE: States in the main stage are preserved while
5276     ## the parser stays in the trailing end phase. # MUST
5277    
5278 wakaba 1.1 ## Stop parsing # MUST
5279    
5280     ## TODO: script stuffs
5281 wakaba 1.3 } # _tree_construct_main
5282    
## set_inner_html ($class, $node, $string, $onerror)
##
## Replaces the children of |$node| with the result of parsing
## |$string| as HTML.
##
##   $node    - A DOM node.  Node type 9 (document) and node type 1
##              (element) are handled; any other node type |die|s.
##   $string  - The markup to parse.  It is accessed through a
##              reference to the caller's argument, so it is not copied.
##   $onerror - Optional code reference invoked for parse errors.  In
##              the element case the arguments of each error are
##              followed by |line| and |column| pairs, and errors are
##              reported with |warn| when no handler is given.
5283     sub set_inner_html ($$$) {
5284     my $class = shift;
5285     my $node = shift;
5286     my $s = \$_[0];
5287     my $onerror = $_[1];
5288    
5289 wakaba 1.63 ## ISSUE: Should {confident} be true?
5290    
5291 wakaba 1.3 my $nt = $node->node_type;
5292     if ($nt == 9) {
         ## Document: remove every existing child, then parse the string
         ## directly into the document with |parse_string|.
5293     # MUST
5294    
5295     ## Step 1 # MUST
5296     ## TODO: If the document has an active parser, ...
5297     ## ISSUE: There is an issue in the spec.
5298    
5299     ## Step 2 # MUST
         ## The child list is copied into |@cn| first, so the loop iterates
         ## over the copy while children are being removed.
5300     my @cn = @{$node->child_nodes};
5301     for (@cn) {
5302     $node->remove_child ($_);
5303     }
5304    
5305     ## Step 3, 4, 5 # MUST
5306     $class->parse_string ($$s => $node, $onerror);
5307     } elsif ($nt == 1) {
         ## Element: parse into a newly created document with a new parser
         ## object, then move the resulting nodes into |$node|.
5308     ## TODO: If non-html element
5309    
5310     ## NOTE: Most of this code is copied from |parse_string|
5311    
5312     ## Step 1 # MUST
5313 wakaba 1.14 my $this_doc = $node->owner_document;
5314     my $doc = $this_doc->implementation->create_document;
5315 wakaba 1.18 $doc->manakai_is_html (1);
5316 wakaba 1.3 my $p = $class->new;
5317     $p->{document} = $doc;
5318    
5319     ## Step 9 # MUST
         ## |$i| is the read offset into the string; |$line| and |$column|
         ## are the position that is attached to parse errors below.
5320     my $i = 0;
5321     my $line = 1;
5322     my $column = 0;
5323     $p->{set_next_input_character} = sub {
5324     my $self = shift;
5325 wakaba 1.14
         ## Shift the history of previous characters: drop the last entry
         ## and put the current character at the front.
5326     pop @{$self->{prev_input_character}};
5327     unshift @{$self->{prev_input_character}}, $self->{next_input_character};
5328    
         ## At the end of the string the next character becomes -1.  (-1 is
         ## a true value, so the |return| after |and| is always executed.)
5329 wakaba 1.3 $self->{next_input_character} = -1 and return if $i >= length $$s;
5330     $self->{next_input_character} = ord substr $$s, $i++, 1;
5331     $column++;
5332 wakaba 1.4
5333     if ($self->{next_input_character} == 0x000A) { # LF
5334     $line++;
5335     $column = 0;
5336     } elsif ($self->{next_input_character} == 0x000D) { # CR
         ## A CR, or a CR immediately followed by LF, is reported as one LF.
5337 wakaba 1.15 $i++ if substr ($$s, $i, 1) eq "\x0A";
5338 wakaba 1.3 $self->{next_input_character} = 0x000A; # LF # MUST
5339     $line++;
5340 wakaba 1.4 $column = 0;
5341 wakaba 1.3 } elsif ($self->{next_input_character} > 0x10FFFF) {
5342     $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5343     } elsif ($self->{next_input_character} == 0x0000) { # NULL
5344 wakaba 1.14 !!!parse-error (type => 'NULL');
5345 wakaba 1.3 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5346     }
5347     };
5348 wakaba 1.14 $p->{prev_input_character} = [-1, -1, -1];
5349     $p->{next_input_character} = -1;
5350 wakaba 1.3
         ## Fall back to |warn| when the caller supplied no error handler.
5351     my $ponerror = $onerror || sub {
5352     my (%opt) = @_;
5353     warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
5354     };
         ## Every reported error gets the current |line| and |column| appended.
5355     $p->{parse_error} = sub {
5356     $ponerror->(@_, line => $line, column => $column);
5357     };
5358    
5359     $p->_initialize_tokenizer;
5360     $p->_initialize_tree_constructor;
5361    
5362     ## Step 2
         ## The initial content model is chosen by the local name of |$node|;
         ## any name that is not listed gets PCDATA.
5363     my $node_ln = $node->local_name;
5364 wakaba 1.40 $p->{content_model} = {
5365     title => RCDATA_CONTENT_MODEL,
5366     textarea => RCDATA_CONTENT_MODEL,
5367     style => CDATA_CONTENT_MODEL,
5368     script => CDATA_CONTENT_MODEL,
5369     xmp => CDATA_CONTENT_MODEL,
5370     iframe => CDATA_CONTENT_MODEL,
5371     noembed => CDATA_CONTENT_MODEL,
5372     noframes => CDATA_CONTENT_MODEL,
5373     noscript => CDATA_CONTENT_MODEL,
5374     plaintext => PLAINTEXT_CONTENT_MODEL,
5375     }->{$node_ln};
5376     $p->{content_model} = PCDATA_CONTENT_MODEL
5377     unless defined $p->{content_model};
5378     ## ISSUE: What is "the name of the element"? local name?
5379 wakaba 1.3
5380     $p->{inner_html_node} = [$node, $node_ln];
5381    
5382     ## Step 4
5383     my $root = $doc->create_element_ns
5384     ('http://www.w3.org/1999/xhtml', [undef, 'html']);
5385    
5386     ## Step 5 # MUST
5387     $doc->append_child ($root);
5388    
5389     ## Step 6 # MUST
5390     push @{$p->{open_elements}}, [$root, 'html'];
5391    
5392     undef $p->{head_element};
5393    
5394     ## Step 7 # MUST
5395     $p->_reset_insertion_mode;
5396    
5397     ## Step 8 # MUST
         ## Walk from |$node| up through its ancestors and store the first
         ## XHTML-namespace |form| element found in |$p->{form_element}|.
5398     my $anode = $node;
5399     AN: while (defined $anode) {
5400     if ($anode->node_type == 1) {
5401     my $nsuri = $anode->namespace_uri;
5402     if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
5403     if ($anode->local_name eq 'form') { ## TODO: case?
5404     $p->{form_element} = $anode;
5405     last AN;
5406     }
5407     }
5408     }
5409     $anode = $anode->parent_node;
5410     } # AN
5411    
5412     ## Step 3 # MUST
5413     ## Step 10 # MUST
         ## NOTE(review): |!!!next-token| presumably expands to code that
         ## refers to |$self|, hence the local alias of |$p| -- confirm
         ## against the macro definition.
5414     {
5415     my $self = $p;
5416     !!!next-token;
5417     }
5418     $p->_tree_construction_main;
5419    
5420     ## Step 11 # MUST
5421     my @cn = @{$node->child_nodes};
5422     for (@cn) {
5423     $node->remove_child ($_);
5424     }
5425     ## ISSUE: mutation events? read-only?
5426    
5427     ## Step 12 # MUST
         ## Move the parsed nodes: each child of the temporary root is adopted
         ## into the original document and then appended to |$node|.
5428     @cn = @{$root->child_nodes};
5429     for (@cn) {
5430 wakaba 1.14 $this_doc->adopt_node ($_);
5431 wakaba 1.3 $node->append_child ($_);
5432     }
5433 wakaba 1.14 ## ISSUE: mutation events?
5434 wakaba 1.3
5435     $p->_terminate_tree_constructor;
5436     } else {
5437     die "$0: |set_inner_html| is not defined for node of type $nt";
5438     }
5439     } # set_inner_html
5440    
5441     } # tree construction stage
5442 wakaba 1.1
## Whatpm::HTML::RestartParser: an exception class that inherits from
## |Error| and adds no methods of its own.
## NOTE(review): presumably thrown to restart parsing after the
## "change the encoding" algorithm -- confirm against |parse_byte_string|.
5443 wakaba 1.63 package Whatpm::HTML::RestartParser;
5444     push our @ISA, 'Error';
5445    
5446 wakaba 1.1 1;
5447 wakaba 1.63 # $Date: 2007/11/11 04:59:35 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24