/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.70 - (hide annotations) (download) (as text)
Sat Mar 1 00:42:52 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.69: +10 -4 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	1 Mar 2008 00:26:59 -0000
2008-03-01  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Updated (HTML5 revision 1286).

	* content-model-2.dat: Updated (HTML5 revision 1275).

++ whatpm/Whatpm/ChangeLog	1 Mar 2008 00:19:36 -0000
2008-03-01  Wakaba  <wakaba@suika.fam.cx>

	* _NamedEntityList.pm: Updated (HTML5 revision 1286).

	* HTML.pm.src: |charset| in |content| attribute is
	case-insensitive (HTML5 revision 1270).

++ whatpm/Whatpm/HTML/ChangeLog	1 Mar 2008 00:07:44 -0000
2008-03-01  Wakaba  <wakaba@suika.fam.cx>

	* Serializer.pm (get_inner_html): Escape NBSP (HTML5 revision
	1277).

++ whatpm/Whatpm/ContentChecker/ChangeLog	29 Feb 2008 23:29:54 -0000
2008-03-01  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm: Sectioning root category added.  |blockquote|
	is no longer a sectioning content.

1 wakaba 1.2 package Whatpm::HTML;
2 wakaba 1.1 use strict;
3 wakaba 1.70 our $VERSION=do{my @r=(q$Revision: 1.69 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.63 use Error qw(:try);
5 wakaba 1.1
6 wakaba 1.18 ## ISSUE:
7     ## var doc = implementation.createDocument (null, null, null);
8     ## doc.write ('');
9     ## alert (doc.compatMode);
10 wakaba 1.1
11 wakaba 1.31 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
12     ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
13     ## is not yet clear.
14     ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
15     ## "{U+FEFF}..." in GB18030?
16    
17 wakaba 1.70 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
18     ## TODO: 1252 parse error (revision 1264)
19     ## TODO: 8859-11 = 874 (revision 1271)
20    
## Tag names of start tags on which a "/" immediately followed by ">"
## (e.g. |<br/>|) is accepted by the tokenizer without reporting a
## |nestc| parse error.
my $permitted_slash_tag_name = {
  map { $_ => 1 } qw(
    base link meta hr br img embed param area col input
  ),
};
34    
## Lookup table keyed by the code points 0x80..0x9F.  Each value is the
## code point stored for that key; five keys (0x81, 0x8D, 0x8F, 0x90 and
## 0x9D) map to U+FFFD.
my $c1_entity_char = do {
  my @mapped = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  my %table;
  @table{0x80 .. 0x9F} = @mapped;
  \%table;
}; # $c1_entity_char
69 wakaba 1.1
## Set of element names belonging to the "special" category (the value
## for every listed name is 1).
my $special_category = {
  map { $_ => 1 } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  ),
};
## Set of element names belonging to the "scoping" category (the value
## for every listed name is 1).
my $scoping_category = {
  map { $_ => 1 } qw(button caption html marquee object table td th),
};
## Set of element names belonging to the "formatting" category (the
## value for every listed name is 1).
##
## NOTE: The key used to be misspelled |strile|, so that |strike|
## elements were never recognized as formatting elements.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
90     # $phrasing_category: all other elements
91    
## Decode a byte string and parse it with |parse_char_string|.
##
##   $doc = $parser_or_class->parse_byte_string
##       ($charset, $bytes, @rest);
##
## - Invocant: a parser object, or a class name (in which case a new
##   parser is created by |new|).
## - $charset: encoding label used to decode $bytes.  If |undef|, the
##   encoding is guessed from the first 1024 bytes by
##   |Whatpm::Charset::UniversalCharDet|, falling back to
##   |windows-1252|; the result is then marked as not confident.
## - $bytes: the byte string, or a reference to it.
## - @rest: passed through to |parse_char_string| after the decoded
##   string.
##
## If |$self->{change_encoding}| is invoked during parsing with a label
## different from the current one, a |Whatpm::HTML::RestartParser|
## exception is thrown; it is caught here and the input is decoded and
## parsed again with the new encoding.
##
## Returns the value returned by |parse_char_string|.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## |Encode::decode| is used by both branches below and by the restart
  ## handler, so the module has to be loaded unconditionally.
  require Encode; ## TODO: decode(utf8) don't delete BOM

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 1;
  } else {
    ## TODO: Implement HTML5 detection algorithm
    require Whatpm::Charset::UniversalCharDet;
    $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
        (substr ($$bytes_s, 0, 1024));
    $charset ||= 'windows-1252';
    $s = \ (Encode::decode ($charset, $$bytes_s));
    ## Lowercased, as in the branch above: |change_encoding| compares
    ## this value with a lowercased label.
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 0;
  }

  ## Invoked as |$self->{change_encoding}->($self, $charset)|.
  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    if (defined $self->{input_encoding} and
        $self->{input_encoding} eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
        ':'.$charset, level => 'w');

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    throw Whatpm::HTML::RestartParser (charset => $charset);
  }; # $self->{change_encoding}

  ## Remaining arguments, without the leading byte string.
  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Decode the original bytes again using the requested encoding and
    ## parse from the beginning.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: normalize
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
161    
162     *parse_char_string = \&parse_string;
163    
## Parse a character string into the given document.
##
##   $doc = $parser_or_class->parse_string ($string, $document, $onerror);
##
## - Invocant: a parser object, or a class name (a new parser is then
##   created by |new|).
## - $string: the character string, or a reference to it.
## - $document: document object; its child node list is emptied first.
## - $onerror: optional code reference called for each parse error
##   with the error's named arguments followed by |line| and |column|.
##   By default the error is reported with |warn|.
##
## Returns the document.
sub parse_string ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  exists $self->{confident} or $self->{confident} = 1;
  if (defined $self->{input_encoding}) {
    $self->{document}->input_encoding ($self->{input_encoding});
  }

  my $i = 0;      # index of the next unread character in $$s
  my $line = 1;
  my $column = 0;
  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Shift the history of the last three input characters.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## End of input is represented by -1.
    if ($i >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    my $c = ord substr $$s, $i++, 1;
    $self->{next_input_character} = $c;
    $column++;

    if ($c == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($c == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $i++ if substr ($$s, $i, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($c > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($c == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Append the current position to whatever the reporter was given.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
222    
## Create a new parser object.
##
##   $parser = Whatpm::HTML->new;
##
## The object is a blessed hash holding four default callbacks:
## - |set_next_input_character|: sets |next_input_character| of this
##   parser to -1.
## - |parse_error|, |change_encoding|, |application_cache_selection|:
##   do nothing.
##
## Returns the new object.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The closure below is stored inside the object itself.  Capturing
  ## |$self| directly would create a reference cycle and the parser
  ## would never be freed, so a weakened copy is captured instead.
  require Scalar::Util;
  my $parser = $self;
  Scalar::Util::weaken ($parser);

  $self->{set_next_input_character} = sub {
    $parser->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
243    
## Content model feature bits
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, as combinations of the feature bits
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
252    
## Tokenizer states (values of |$self->{state}|)
sub DATA_STATE                                    () {  0 }
sub ENTITY_DATA_STATE                             () {  1 }
sub TAG_OPEN_STATE                                () {  2 }
sub CLOSE_TAG_OPEN_STATE                          () {  3 }
sub TAG_NAME_STATE                                () {  4 }
sub BEFORE_ATTRIBUTE_NAME_STATE                   () {  5 }
sub ATTRIBUTE_NAME_STATE                          () {  6 }
sub AFTER_ATTRIBUTE_NAME_STATE                    () {  7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE                  () {  8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           () {  9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE                () { 11 }
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE               () { 12 }
sub MARKUP_DECLARATION_OPEN_STATE                 () { 13 }
sub COMMENT_START_STATE                           () { 14 }
sub COMMENT_START_DASH_STATE                      () { 15 }
sub COMMENT_STATE                                 () { 16 }
sub COMMENT_END_STATE                             () { 17 }
sub COMMENT_END_DASH_STATE                        () { 18 }
sub BOGUS_COMMENT_STATE                           () { 19 }
sub DOCTYPE_STATE                                 () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE                     () { 21 }
sub DOCTYPE_NAME_STATE                            () { 22 }
sub AFTER_DOCTYPE_NAME_STATE                      () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         () { 31 }
sub BOGUS_DOCTYPE_STATE                           () { 32 }
286    
## Token types (values of |$token->{type}|)
sub DOCTYPE_TOKEN     () { 1 }
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }
293    
## Insertion mode group bits (|*_IMS|)
sub AFTER_HTML_IMS () { 1 << 2 }
sub HEAD_IMS       () { 1 << 3 }
sub BODY_IMS       () { 1 << 4 }
sub BODY_TABLE_IMS () { 1 << 5 }
sub TABLE_IMS      () { 1 << 6 }
sub ROW_IMS        () { 1 << 7 }
sub BODY_AFTER_IMS () { 1 << 8 }
sub FRAME_IMS      () { 1 << 9 }

## Insertion modes (|*_IM|): group bits combined with a value in the
## two lowest bits
sub AFTER_HTML_BODY_IM     () { BODY_AFTER_IMS | AFTER_HTML_IMS }
sub AFTER_HTML_FRAMESET_IM () { FRAME_IMS | AFTER_HTML_IMS }
sub IN_HEAD_IM             () { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM    () { HEAD_IMS | 1 }
sub AFTER_HEAD_IM          () { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM         () { HEAD_IMS | 3 }
sub IN_BODY_IM             () { BODY_IMS }
sub IN_CELL_IM             () { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM          () { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM              () { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM       () { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM            () { TABLE_IMS }
sub AFTER_BODY_IM          () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM         () { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM      () { FRAME_IMS | 2 }
sub IN_SELECT_IM           () { 1 }
sub IN_COLUMN_GROUP_IM     () { 2 }
320    
321 wakaba 1.1 ## Implementations MUST act as if they used the state machine described in the spec
322    
## Reset the tokenizer: the state becomes the data state, the content
## model becomes PCDATA, the token and attribute under construction and
## the remembered start tag name / attribute value state are cleared,
## and the first input character is read.
sub _initialize_tokenizer ($) {
  my $self = shift;
  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  $self->{current_token} = undef; # start tag, end tag, comment, or DOCTYPE
  $self->{$_} = undef
      for qw(current_attribute last_emitted_start_tag_name
             last_attribute_value_state);
  ## NOTE(review): |char| is presumably the queue of pushed-back input
  ## characters consulted by the |!!!| macros -- confirm against the
  ## macro definitions.
  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;
  ## Queue of tokens already emitted but not yet returned by
  ## |_get_next_token|.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
337    
338     ## A token has:
339 wakaba 1.55 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
340     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
341     ## ->{name} (DOCTYPE_TOKEN)
342     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
343     ## ->{public_identifier} (DOCTYPE_TOKEN)
344     ## ->{system_identifier} (DOCTYPE_TOKEN)
345     ## ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
346     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
347 wakaba 1.66 ## ->{name}
348     ## ->{value}
349     ## ->{has_reference} == 1 or 0
350 wakaba 1.55 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
351 wakaba 1.1
352     ## Emitted token MUST immediately be handled by the tree construction state.
353    
354     ## Before each step, UA MAY check to see if either one of the scripts in
355     ## "list of scripts that will execute as soon as possible" or the first
356     ## script in the "list of scripts that will execute asynchronously",
357     ## has completed loading. If one has, then it MUST be executed
358     ## and removed from the list.
359    
360 wakaba 1.59 ## NOTE: HTML5 "Writing HTML documents" section, applied to
361     ## documents and not to user agents and conformance checkers,
362     ## contains some requirements that are not detected by the
363     ## parsing algorithm:
364     ## - Some requirements on character encoding declarations. ## TODO
365     ## - "Elements MUST NOT contain content that their content model disallows."
366     ## ... Some are parse error, some are not (will be reported by c.c.).
367     ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
368     ## - Text (in elements, attributes, and comments) SHOULD NOT contain
369     ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
370    
371     ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
372     ## be detected by the HTML5 parsing algorithm:
373     ## - Text,
374    
375 wakaba 1.1 sub _get_next_token ($) {
376     my $self = shift;
377     if (@{$self->{token}}) {
378     return shift @{$self->{token}};
379     }
380    
381     A: {
382 wakaba 1.57 if ($self->{state} == DATA_STATE) {
383 wakaba 1.1 if ($self->{next_input_character} == 0x0026) { # &
384 wakaba 1.40 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
385 wakaba 1.57 $self->{state} = ENTITY_DATA_STATE;
386 wakaba 1.1 !!!next-input-character;
387     redo A;
388     } else {
389     #
390     }
391 wakaba 1.13 } elsif ($self->{next_input_character} == 0x002D) { # -
392 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
393 wakaba 1.13 unless ($self->{escape}) {
394     if ($self->{prev_input_character}->[0] == 0x002D and # -
395     $self->{prev_input_character}->[1] == 0x0021 and # !
396     $self->{prev_input_character}->[2] == 0x003C) { # <
397     $self->{escape} = 1;
398     }
399     }
400     }
401    
402     #
403 wakaba 1.1 } elsif ($self->{next_input_character} == 0x003C) { # <
404 wakaba 1.40 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
405     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
406 wakaba 1.13 not $self->{escape})) {
407 wakaba 1.57 $self->{state} = TAG_OPEN_STATE;
408 wakaba 1.1 !!!next-input-character;
409     redo A;
410     } else {
411     #
412     }
413 wakaba 1.13 } elsif ($self->{next_input_character} == 0x003E) { # >
414     if ($self->{escape} and
415 wakaba 1.40 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
416 wakaba 1.13 if ($self->{prev_input_character}->[0] == 0x002D and # -
417     $self->{prev_input_character}->[1] == 0x002D) { # -
418     delete $self->{escape};
419     }
420     }
421    
422     #
423 wakaba 1.1 } elsif ($self->{next_input_character} == -1) {
424 wakaba 1.55 !!!emit ({type => END_OF_FILE_TOKEN});
425 wakaba 1.1 last A; ## TODO: ok?
426     }
427     # Anything else
428 wakaba 1.55 my $token = {type => CHARACTER_TOKEN,
429 wakaba 1.1 data => chr $self->{next_input_character}};
430     ## Stay in the data state
431     !!!next-input-character;
432    
433     !!!emit ($token);
434    
435     redo A;
436 wakaba 1.57 } elsif ($self->{state} == ENTITY_DATA_STATE) {
437 wakaba 1.1 ## (cannot happen in CDATA state)
438    
439 wakaba 1.26 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
440 wakaba 1.1
441 wakaba 1.57 $self->{state} = DATA_STATE;
442 wakaba 1.1 # next-input-character is already done
443    
444     unless (defined $token) {
445 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
446 wakaba 1.1 } else {
447     !!!emit ($token);
448     }
449    
450     redo A;
451 wakaba 1.57 } elsif ($self->{state} == TAG_OPEN_STATE) {
452 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.1 if ($self->{next_input_character} == 0x002F) { # /
454     !!!next-input-character;
455 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
456 wakaba 1.1 redo A;
457     } else {
458     ## reconsume
459 wakaba 1.57 $self->{state} = DATA_STATE;
460 wakaba 1.1
461 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
462 wakaba 1.1
463     redo A;
464     }
465 wakaba 1.40 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
466 wakaba 1.1 if ($self->{next_input_character} == 0x0021) { # !
467 wakaba 1.57 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
468 wakaba 1.1 !!!next-input-character;
469     redo A;
470     } elsif ($self->{next_input_character} == 0x002F) { # /
471 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
472 wakaba 1.1 !!!next-input-character;
473     redo A;
474     } elsif (0x0041 <= $self->{next_input_character} and
475     $self->{next_input_character} <= 0x005A) { # A..Z
476     $self->{current_token}
477 wakaba 1.55 = {type => START_TAG_TOKEN,
478 wakaba 1.1 tag_name => chr ($self->{next_input_character} + 0x0020)};
479 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
480 wakaba 1.1 !!!next-input-character;
481     redo A;
482     } elsif (0x0061 <= $self->{next_input_character} and
483     $self->{next_input_character} <= 0x007A) { # a..z
484 wakaba 1.55 $self->{current_token} = {type => START_TAG_TOKEN,
485 wakaba 1.1 tag_name => chr ($self->{next_input_character})};
486 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
487 wakaba 1.1 !!!next-input-character;
488     redo A;
489     } elsif ($self->{next_input_character} == 0x003E) { # >
490 wakaba 1.3 !!!parse-error (type => 'empty start tag');
491 wakaba 1.57 $self->{state} = DATA_STATE;
492 wakaba 1.1 !!!next-input-character;
493    
494 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
495 wakaba 1.1
496     redo A;
497     } elsif ($self->{next_input_character} == 0x003F) { # ?
498 wakaba 1.3 !!!parse-error (type => 'pio');
499 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
500 wakaba 1.1 ## $self->{next_input_character} is intentionally left as is
501     redo A;
502     } else {
503 wakaba 1.3 !!!parse-error (type => 'bare stago');
504 wakaba 1.57 $self->{state} = DATA_STATE;
505 wakaba 1.1 ## reconsume
506    
507 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
508 wakaba 1.1
509     redo A;
510     }
511     } else {
512 wakaba 1.40 die "$0: $self->{content_model} in tag open";
513 wakaba 1.1 }
514 wakaba 1.57 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
515 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
516 wakaba 1.23 if (defined $self->{last_emitted_start_tag_name}) {
517 wakaba 1.30 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
518 wakaba 1.23 my @next_char;
519     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
520     push @next_char, $self->{next_input_character};
521     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
522     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
523     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
524     !!!next-input-character;
525     next TAGNAME;
526     } else {
527     $self->{next_input_character} = shift @next_char; # reconsume
528     !!!back-next-input-character (@next_char);
529 wakaba 1.57 $self->{state} = DATA_STATE;
530 wakaba 1.23
531 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
532 wakaba 1.23
533     redo A;
534     }
535     }
536 wakaba 1.1 push @next_char, $self->{next_input_character};
537 wakaba 1.23
538     unless ($self->{next_input_character} == 0x0009 or # HT
539     $self->{next_input_character} == 0x000A or # LF
540     $self->{next_input_character} == 0x000B or # VT
541     $self->{next_input_character} == 0x000C or # FF
542     $self->{next_input_character} == 0x0020 or # SP
543     $self->{next_input_character} == 0x003E or # >
544     $self->{next_input_character} == 0x002F or # /
545     $self->{next_input_character} == -1) {
546 wakaba 1.1 $self->{next_input_character} = shift @next_char; # reconsume
547     !!!back-next-input-character (@next_char);
548 wakaba 1.57 $self->{state} = DATA_STATE;
549 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
550 wakaba 1.1 redo A;
551 wakaba 1.23 } else {
552     $self->{next_input_character} = shift @next_char;
553     !!!back-next-input-character (@next_char);
554     # and consume...
555 wakaba 1.1 }
556 wakaba 1.23 } else {
557     ## No start tag token has ever been emitted
558     # next-input-character is already done
559 wakaba 1.57 $self->{state} = DATA_STATE;
560 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
561 wakaba 1.1 redo A;
562     }
563     }
564    
565     if (0x0041 <= $self->{next_input_character} and
566     $self->{next_input_character} <= 0x005A) { # A..Z
567 wakaba 1.55 $self->{current_token} = {type => END_TAG_TOKEN,
568 wakaba 1.1 tag_name => chr ($self->{next_input_character} + 0x0020)};
569 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
570 wakaba 1.1 !!!next-input-character;
571     redo A;
572     } elsif (0x0061 <= $self->{next_input_character} and
573     $self->{next_input_character} <= 0x007A) { # a..z
574 wakaba 1.55 $self->{current_token} = {type => END_TAG_TOKEN,
575 wakaba 1.1 tag_name => chr ($self->{next_input_character})};
576 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
577 wakaba 1.1 !!!next-input-character;
578     redo A;
579     } elsif ($self->{next_input_character} == 0x003E) { # >
580 wakaba 1.3 !!!parse-error (type => 'empty end tag');
581 wakaba 1.57 $self->{state} = DATA_STATE;
582 wakaba 1.1 !!!next-input-character;
583     redo A;
584     } elsif ($self->{next_input_character} == -1) {
585 wakaba 1.3 !!!parse-error (type => 'bare etago');
586 wakaba 1.57 $self->{state} = DATA_STATE;
587 wakaba 1.1 # reconsume
588    
589 wakaba 1.55 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
590 wakaba 1.1
591     redo A;
592     } else {
593 wakaba 1.3 !!!parse-error (type => 'bogus end tag');
594 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
595 wakaba 1.1 ## $self->{next_input_character} is intentionally left as is
596     redo A;
597     }
598 wakaba 1.57 } elsif ($self->{state} == TAG_NAME_STATE) {
599 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
600     $self->{next_input_character} == 0x000A or # LF
601     $self->{next_input_character} == 0x000B or # VT
602     $self->{next_input_character} == 0x000C or # FF
603     $self->{next_input_character} == 0x0020) { # SP
604 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
605 wakaba 1.1 !!!next-input-character;
606     redo A;
607     } elsif ($self->{next_input_character} == 0x003E) { # >
608 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
609 wakaba 1.28 $self->{current_token}->{first_start_tag}
610     = not defined $self->{last_emitted_start_tag_name};
611 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
612 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
613 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
614 wakaba 1.1 if ($self->{current_token}->{attributes}) {
615 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
616 wakaba 1.1 }
617     } else {
618     die "$0: $self->{current_token}->{type}: Unknown token type";
619     }
620 wakaba 1.57 $self->{state} = DATA_STATE;
621 wakaba 1.1 !!!next-input-character;
622    
623     !!!emit ($self->{current_token}); # start tag or end tag
624    
625     redo A;
626     } elsif (0x0041 <= $self->{next_input_character} and
627     $self->{next_input_character} <= 0x005A) { # A..Z
628     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
629     # start tag or end tag
630     ## Stay in this state
631     !!!next-input-character;
632     redo A;
633 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
634 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
635 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
636 wakaba 1.28 $self->{current_token}->{first_start_tag}
637     = not defined $self->{last_emitted_start_tag_name};
638 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
639 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
640 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
641 wakaba 1.1 if ($self->{current_token}->{attributes}) {
642 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
643 wakaba 1.1 }
644     } else {
645     die "$0: $self->{current_token}->{type}: Unknown token type";
646     }
647 wakaba 1.57 $self->{state} = DATA_STATE;
648 wakaba 1.1 # reconsume
649    
650     !!!emit ($self->{current_token}); # start tag or end tag
651    
652     redo A;
653     } elsif ($self->{next_input_character} == 0x002F) { # /
654     !!!next-input-character;
655     if ($self->{next_input_character} == 0x003E and # >
656 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
657 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
658     # permitted slash
659     #
660     } else {
661 wakaba 1.3 !!!parse-error (type => 'nestc');
662 wakaba 1.1 }
663 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
664 wakaba 1.1 # next-input-character is already done
665     redo A;
666     } else {
667     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
668     # start tag or end tag
669     ## Stay in the state
670     !!!next-input-character;
671     redo A;
672     }
673 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
674 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
675     $self->{next_input_character} == 0x000A or # LF
676     $self->{next_input_character} == 0x000B or # VT
677     $self->{next_input_character} == 0x000C or # FF
678     $self->{next_input_character} == 0x0020) { # SP
679     ## Stay in the state
680     !!!next-input-character;
681     redo A;
682     } elsif ($self->{next_input_character} == 0x003E) { # >
683 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
684 wakaba 1.28 $self->{current_token}->{first_start_tag}
685     = not defined $self->{last_emitted_start_tag_name};
686 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
687 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
688 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
689 wakaba 1.1 if ($self->{current_token}->{attributes}) {
690 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
691 wakaba 1.1 }
692     } else {
693     die "$0: $self->{current_token}->{type}: Unknown token type";
694     }
695 wakaba 1.57 $self->{state} = DATA_STATE;
696 wakaba 1.1 !!!next-input-character;
697    
698     !!!emit ($self->{current_token}); # start tag or end tag
699    
700     redo A;
701     } elsif (0x0041 <= $self->{next_input_character} and
702     $self->{next_input_character} <= 0x005A) { # A..Z
703     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
704     value => ''};
705 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
706 wakaba 1.1 !!!next-input-character;
707     redo A;
708     } elsif ($self->{next_input_character} == 0x002F) { # /
709     !!!next-input-character;
710     if ($self->{next_input_character} == 0x003E and # >
711 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
712 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
713     # permitted slash
714     #
715     } else {
716 wakaba 1.3 !!!parse-error (type => 'nestc');
717 wakaba 1.1 }
718     ## Stay in the state
719     # next-input-character is already done
720     redo A;
721 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
722 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
723 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
724 wakaba 1.28 $self->{current_token}->{first_start_tag}
725     = not defined $self->{last_emitted_start_tag_name};
726 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
727 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
728 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
729 wakaba 1.1 if ($self->{current_token}->{attributes}) {
730 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
731 wakaba 1.1 }
732     } else {
733     die "$0: $self->{current_token}->{type}: Unknown token type";
734     }
735 wakaba 1.57 $self->{state} = DATA_STATE;
736 wakaba 1.1 # reconsume
737    
738     !!!emit ($self->{current_token}); # start tag or end tag
739    
740     redo A;
741     } else {
742     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
743     value => ''};
744 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
745 wakaba 1.1 !!!next-input-character;
746     redo A;
747     }
748 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
749 wakaba 1.1 my $before_leave = sub {
750     if (exists $self->{current_token}->{attributes} # start tag or end tag
751     ->{$self->{current_attribute}->{name}}) { # MUST
752 wakaba 1.39 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
753 wakaba 1.1 ## Discard $self->{current_attribute} # MUST
754     } else {
755     $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
756     = $self->{current_attribute};
757     }
758     }; # $before_leave
759    
760     if ($self->{next_input_character} == 0x0009 or # HT
761     $self->{next_input_character} == 0x000A or # LF
762     $self->{next_input_character} == 0x000B or # VT
763     $self->{next_input_character} == 0x000C or # FF
764     $self->{next_input_character} == 0x0020) { # SP
765     $before_leave->();
766 wakaba 1.57 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
767 wakaba 1.1 !!!next-input-character;
768     redo A;
769     } elsif ($self->{next_input_character} == 0x003D) { # =
770     $before_leave->();
771 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
772 wakaba 1.1 !!!next-input-character;
773     redo A;
774     } elsif ($self->{next_input_character} == 0x003E) { # >
775     $before_leave->();
776 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
777 wakaba 1.28 $self->{current_token}->{first_start_tag}
778     = not defined $self->{last_emitted_start_tag_name};
779 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
780 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
781 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
782 wakaba 1.1 if ($self->{current_token}->{attributes}) {
783 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
784 wakaba 1.1 }
785     } else {
786     die "$0: $self->{current_token}->{type}: Unknown token type";
787     }
788 wakaba 1.57 $self->{state} = DATA_STATE;
789 wakaba 1.1 !!!next-input-character;
790    
791     !!!emit ($self->{current_token}); # start tag or end tag
792    
793     redo A;
794     } elsif (0x0041 <= $self->{next_input_character} and
795     $self->{next_input_character} <= 0x005A) { # A..Z
796     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
797     ## Stay in the state
798     !!!next-input-character;
799     redo A;
800     } elsif ($self->{next_input_character} == 0x002F) { # /
801     $before_leave->();
802     !!!next-input-character;
803     if ($self->{next_input_character} == 0x003E and # >
804 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
805 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
806     # permitted slash
807     #
808     } else {
809 wakaba 1.3 !!!parse-error (type => 'nestc');
810 wakaba 1.1 }
811 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
812 wakaba 1.1 # next-input-character is already done
813     redo A;
814 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
815 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
816 wakaba 1.1 $before_leave->();
817 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
818 wakaba 1.28 $self->{current_token}->{first_start_tag}
819     = not defined $self->{last_emitted_start_tag_name};
820 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
821 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
822 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
823 wakaba 1.1 if ($self->{current_token}->{attributes}) {
824 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
825 wakaba 1.1 }
826     } else {
827     die "$0: $self->{current_token}->{type}: Unknown token type";
828     }
829 wakaba 1.57 $self->{state} = DATA_STATE;
830 wakaba 1.1 # reconsume
831    
832     !!!emit ($self->{current_token}); # start tag or end tag
833    
834     redo A;
835     } else {
836     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
837     ## Stay in the state
838     !!!next-input-character;
839     redo A;
840     }
841 wakaba 1.57 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
842 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
843     $self->{next_input_character} == 0x000A or # LF
844     $self->{next_input_character} == 0x000B or # VT
845     $self->{next_input_character} == 0x000C or # FF
846     $self->{next_input_character} == 0x0020) { # SP
847     ## Stay in the state
848     !!!next-input-character;
849     redo A;
850     } elsif ($self->{next_input_character} == 0x003D) { # =
851 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
852 wakaba 1.1 !!!next-input-character;
853     redo A;
854     } elsif ($self->{next_input_character} == 0x003E) { # >
855 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
856 wakaba 1.28 $self->{current_token}->{first_start_tag}
857     = not defined $self->{last_emitted_start_tag_name};
858 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
859 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
860 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
861 wakaba 1.1 if ($self->{current_token}->{attributes}) {
862 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
863 wakaba 1.1 }
864     } else {
865     die "$0: $self->{current_token}->{type}: Unknown token type";
866     }
867 wakaba 1.57 $self->{state} = DATA_STATE;
868 wakaba 1.1 !!!next-input-character;
869    
870     !!!emit ($self->{current_token}); # start tag or end tag
871    
872     redo A;
873     } elsif (0x0041 <= $self->{next_input_character} and
874     $self->{next_input_character} <= 0x005A) { # A..Z
875     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
876     value => ''};
877 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
878 wakaba 1.1 !!!next-input-character;
879     redo A;
880     } elsif ($self->{next_input_character} == 0x002F) { # /
881     !!!next-input-character;
882     if ($self->{next_input_character} == 0x003E and # >
883 wakaba 1.55 $self->{current_token}->{type} == START_TAG_TOKEN and
884 wakaba 1.1 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
885     # permitted slash
886     #
887     } else {
888 wakaba 1.3 !!!parse-error (type => 'nestc');
889 wakaba 1.33 ## TODO: Different error type for <aa / bb> than <aa/>
890 wakaba 1.1 }
891 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
892 wakaba 1.1 # next-input-character is already done
893     redo A;
894 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
895 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
896 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
897 wakaba 1.28 $self->{current_token}->{first_start_tag}
898     = not defined $self->{last_emitted_start_tag_name};
899 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
900 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
901 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
902 wakaba 1.1 if ($self->{current_token}->{attributes}) {
903 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
904 wakaba 1.1 }
905     } else {
906     die "$0: $self->{current_token}->{type}: Unknown token type";
907     }
908 wakaba 1.57 $self->{state} = DATA_STATE;
909 wakaba 1.1 # reconsume
910    
911     !!!emit ($self->{current_token}); # start tag or end tag
912    
913     redo A;
914     } else {
915     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
916     value => ''};
917 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
918 wakaba 1.1 !!!next-input-character;
919     redo A;
920     }
921 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
922 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
923     $self->{next_input_character} == 0x000A or # LF
924     $self->{next_input_character} == 0x000B or # VT
925     $self->{next_input_character} == 0x000C or # FF
926     $self->{next_input_character} == 0x0020) { # SP
927     ## Stay in the state
928     !!!next-input-character;
929     redo A;
930     } elsif ($self->{next_input_character} == 0x0022) { # "
931 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
932 wakaba 1.1 !!!next-input-character;
933     redo A;
934     } elsif ($self->{next_input_character} == 0x0026) { # &
935 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
936 wakaba 1.1 ## reconsume
937     redo A;
938     } elsif ($self->{next_input_character} == 0x0027) { # '
939 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
940 wakaba 1.1 !!!next-input-character;
941     redo A;
942     } elsif ($self->{next_input_character} == 0x003E) { # >
943 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
944 wakaba 1.28 $self->{current_token}->{first_start_tag}
945     = not defined $self->{last_emitted_start_tag_name};
946 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
947 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
948 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
949 wakaba 1.1 if ($self->{current_token}->{attributes}) {
950 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
951 wakaba 1.1 }
952     } else {
953     die "$0: $self->{current_token}->{type}: Unknown token type";
954     }
955 wakaba 1.57 $self->{state} = DATA_STATE;
956 wakaba 1.1 !!!next-input-character;
957    
958     !!!emit ($self->{current_token}); # start tag or end tag
959    
960     redo A;
961 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
962 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
963 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
964 wakaba 1.28 $self->{current_token}->{first_start_tag}
965     = not defined $self->{last_emitted_start_tag_name};
966 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
967 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
968 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
969 wakaba 1.1 if ($self->{current_token}->{attributes}) {
970 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
971 wakaba 1.1 }
972     } else {
973     die "$0: $self->{current_token}->{type}: Unknown token type";
974     }
975 wakaba 1.57 $self->{state} = DATA_STATE;
976 wakaba 1.1 ## reconsume
977    
978     !!!emit ($self->{current_token}); # start tag or end tag
979    
980     redo A;
981     } else {
982     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
983 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
984 wakaba 1.1 !!!next-input-character;
985     redo A;
986     }
987 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
988 wakaba 1.1 if ($self->{next_input_character} == 0x0022) { # "
989 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
990 wakaba 1.1 !!!next-input-character;
991     redo A;
992     } elsif ($self->{next_input_character} == 0x0026) { # &
993 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
994     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
995 wakaba 1.1 !!!next-input-character;
996     redo A;
997     } elsif ($self->{next_input_character} == -1) {
998 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
999 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1000 wakaba 1.28 $self->{current_token}->{first_start_tag}
1001     = not defined $self->{last_emitted_start_tag_name};
1002 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1003 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1004 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1005 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1006 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1007 wakaba 1.1 }
1008     } else {
1009     die "$0: $self->{current_token}->{type}: Unknown token type";
1010     }
1011 wakaba 1.57 $self->{state} = DATA_STATE;
1012 wakaba 1.1 ## reconsume
1013    
1014     !!!emit ($self->{current_token}); # start tag or end tag
1015    
1016     redo A;
1017     } else {
1018     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1019     ## Stay in the state
1020     !!!next-input-character;
1021     redo A;
1022     }
1023 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1024 wakaba 1.1 if ($self->{next_input_character} == 0x0027) { # '
1025 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1026 wakaba 1.1 !!!next-input-character;
1027     redo A;
1028     } elsif ($self->{next_input_character} == 0x0026) { # &
1029 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
1030     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1031 wakaba 1.1 !!!next-input-character;
1032     redo A;
1033     } elsif ($self->{next_input_character} == -1) {
1034 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
1035 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1036 wakaba 1.28 $self->{current_token}->{first_start_tag}
1037     = not defined $self->{last_emitted_start_tag_name};
1038 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1039 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1040 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1041 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1042 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1043 wakaba 1.1 }
1044     } else {
1045     die "$0: $self->{current_token}->{type}: Unknown token type";
1046     }
1047 wakaba 1.57 $self->{state} = DATA_STATE;
1048 wakaba 1.1 ## reconsume
1049    
1050     !!!emit ($self->{current_token}); # start tag or end tag
1051    
1052     redo A;
1053     } else {
1054     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1055     ## Stay in the state
1056     !!!next-input-character;
1057     redo A;
1058     }
1059 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1060 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1061     $self->{next_input_character} == 0x000A or # LF
1062     $self->{next_input_character} == 0x000B or # VT
1063     $self->{next_input_character} == 0x000C or # FF
1064     $self->{next_input_character} == 0x0020) { # SP
1065 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1066 wakaba 1.1 !!!next-input-character;
1067     redo A;
1068     } elsif ($self->{next_input_character} == 0x0026) { # &
1069 wakaba 1.57 $self->{last_attribute_value_state} = $self->{state};
1070     $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1071 wakaba 1.1 !!!next-input-character;
1072     redo A;
1073     } elsif ($self->{next_input_character} == 0x003E) { # >
1074 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1075 wakaba 1.28 $self->{current_token}->{first_start_tag}
1076     = not defined $self->{last_emitted_start_tag_name};
1077 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1078 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1079 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1080 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1081 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1082 wakaba 1.1 }
1083     } else {
1084     die "$0: $self->{current_token}->{type}: Unknown token type";
1085     }
1086 wakaba 1.57 $self->{state} = DATA_STATE;
1087 wakaba 1.1 !!!next-input-character;
1088    
1089     !!!emit ($self->{current_token}); # start tag or end tag
1090    
1091     redo A;
1092 wakaba 1.17 } elsif ($self->{next_input_character} == -1) {
1093 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1094 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1095 wakaba 1.28 $self->{current_token}->{first_start_tag}
1096     = not defined $self->{last_emitted_start_tag_name};
1097 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1098 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1099 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1100 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1101 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1102 wakaba 1.1 }
1103     } else {
1104     die "$0: $self->{current_token}->{type}: Unknown token type";
1105     }
1106 wakaba 1.57 $self->{state} = DATA_STATE;
1107 wakaba 1.1 ## reconsume
1108    
1109     !!!emit ($self->{current_token}); # start tag or end tag
1110    
1111     redo A;
1112     } else {
1113     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1114     ## Stay in the state
1115     !!!next-input-character;
1116     redo A;
1117     }
1118 wakaba 1.57 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1119 wakaba 1.26 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
1120 wakaba 1.1
1121     unless (defined $token) {
1122     $self->{current_attribute}->{value} .= '&';
1123     } else {
1124     $self->{current_attribute}->{value} .= $token->{data};
1125 wakaba 1.66 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1126 wakaba 1.1 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1127     }
1128    
1129     $self->{state} = $self->{last_attribute_value_state};
1130     # next-input-character is already done
1131     redo A;
1132 wakaba 1.57 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1133 wakaba 1.1 ## (only happen if PCDATA state)
1134    
1135 wakaba 1.55 my $token = {type => COMMENT_TOKEN, data => ''};
1136 wakaba 1.1
1137     BC: {
1138     if ($self->{next_input_character} == 0x003E) { # >
1139 wakaba 1.57 $self->{state} = DATA_STATE;
1140 wakaba 1.1 !!!next-input-character;
1141    
1142     !!!emit ($token);
1143    
1144     redo A;
1145     } elsif ($self->{next_input_character} == -1) {
1146 wakaba 1.57 $self->{state} = DATA_STATE;
1147 wakaba 1.1 ## reconsume
1148    
1149     !!!emit ($token);
1150    
1151     redo A;
1152     } else {
1153     $token->{data} .= chr ($self->{next_input_character});
1154     !!!next-input-character;
1155     redo BC;
1156     }
1157     } # BC
1158 wakaba 1.57 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1159 wakaba 1.1 ## (only happens in the PCDATA state)
1160    
1161     my @next_char;
1162     push @next_char, $self->{next_input_character};
1163    
1164     if ($self->{next_input_character} == 0x002D) { # -
1165     !!!next-input-character;
1166     push @next_char, $self->{next_input_character};
1167     if ($self->{next_input_character} == 0x002D) { # -
1168 wakaba 1.55 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1169 wakaba 1.57 $self->{state} = COMMENT_START_STATE;
1170 wakaba 1.1 !!!next-input-character;
1171     redo A;
1172     }
1173     } elsif ($self->{next_input_character} == 0x0044 or # D
1174     $self->{next_input_character} == 0x0064) { # d
1175     !!!next-input-character;
1176     push @next_char, $self->{next_input_character};
1177     if ($self->{next_input_character} == 0x004F or # O
1178     $self->{next_input_character} == 0x006F) { # o
1179     !!!next-input-character;
1180     push @next_char, $self->{next_input_character};
1181     if ($self->{next_input_character} == 0x0043 or # C
1182     $self->{next_input_character} == 0x0063) { # c
1183     !!!next-input-character;
1184     push @next_char, $self->{next_input_character};
1185     if ($self->{next_input_character} == 0x0054 or # T
1186     $self->{next_input_character} == 0x0074) { # t
1187     !!!next-input-character;
1188     push @next_char, $self->{next_input_character};
1189     if ($self->{next_input_character} == 0x0059 or # Y
1190     $self->{next_input_character} == 0x0079) { # y
1191     !!!next-input-character;
1192     push @next_char, $self->{next_input_character};
1193     if ($self->{next_input_character} == 0x0050 or # P
1194     $self->{next_input_character} == 0x0070) { # p
1195     !!!next-input-character;
1196     push @next_char, $self->{next_input_character};
1197     if ($self->{next_input_character} == 0x0045 or # E
1198     $self->{next_input_character} == 0x0065) { # e
1199     ## ISSUE: What stupid code this is!
1200 wakaba 1.57 $self->{state} = DOCTYPE_STATE;
1201 wakaba 1.1 !!!next-input-character;
1202     redo A;
1203     }
1204     }
1205     }
1206     }
1207     }
1208     }
1209     }
1210    
1211 wakaba 1.30 !!!parse-error (type => 'bogus comment');
1212 wakaba 1.1 $self->{next_input_character} = shift @next_char;
1213     !!!back-next-input-character (@next_char);
1214 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1215 wakaba 1.1 redo A;
1216    
1217     ## ISSUE: typos in spec: chacacters, is is a parse error
1218     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1219 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_STATE) {
1220 wakaba 1.23 if ($self->{next_input_character} == 0x002D) { # -
1221 wakaba 1.57 $self->{state} = COMMENT_START_DASH_STATE;
1222 wakaba 1.23 !!!next-input-character;
1223     redo A;
1224     } elsif ($self->{next_input_character} == 0x003E) { # >
1225     !!!parse-error (type => 'bogus comment');
1226 wakaba 1.57 $self->{state} = DATA_STATE;
1227 wakaba 1.23 !!!next-input-character;
1228    
1229     !!!emit ($self->{current_token}); # comment
1230    
1231     redo A;
1232     } elsif ($self->{next_input_character} == -1) {
1233     !!!parse-error (type => 'unclosed comment');
1234 wakaba 1.57 $self->{state} = DATA_STATE;
1235 wakaba 1.23 ## reconsume
1236    
1237     !!!emit ($self->{current_token}); # comment
1238    
1239     redo A;
1240     } else {
1241     $self->{current_token}->{data} # comment
1242     .= chr ($self->{next_input_character});
1243 wakaba 1.57 $self->{state} = COMMENT_STATE;
1244 wakaba 1.23 !!!next-input-character;
1245     redo A;
1246     }
1247 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1248 wakaba 1.23 if ($self->{next_input_character} == 0x002D) { # -
1249 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
1250 wakaba 1.23 !!!next-input-character;
1251     redo A;
1252     } elsif ($self->{next_input_character} == 0x003E) { # >
1253     !!!parse-error (type => 'bogus comment');
1254 wakaba 1.57 $self->{state} = DATA_STATE;
1255 wakaba 1.23 !!!next-input-character;
1256    
1257     !!!emit ($self->{current_token}); # comment
1258    
1259     redo A;
1260     } elsif ($self->{next_input_character} == -1) {
1261     !!!parse-error (type => 'unclosed comment');
1262 wakaba 1.57 $self->{state} = DATA_STATE;
1263 wakaba 1.23 ## reconsume
1264    
1265     !!!emit ($self->{current_token}); # comment
1266    
1267     redo A;
1268     } else {
1269     $self->{current_token}->{data} # comment
1270 wakaba 1.33 .= '-' . chr ($self->{next_input_character});
1271 wakaba 1.57 $self->{state} = COMMENT_STATE;
1272 wakaba 1.23 !!!next-input-character;
1273     redo A;
1274     }
1275 wakaba 1.57 } elsif ($self->{state} == COMMENT_STATE) {
1276 wakaba 1.1 if ($self->{next_input_character} == 0x002D) { # -
1277 wakaba 1.57 $self->{state} = COMMENT_END_DASH_STATE;
1278 wakaba 1.1 !!!next-input-character;
1279     redo A;
1280     } elsif ($self->{next_input_character} == -1) {
1281 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1282 wakaba 1.57 $self->{state} = DATA_STATE;
1283 wakaba 1.1 ## reconsume
1284    
1285     !!!emit ($self->{current_token}); # comment
1286    
1287     redo A;
1288     } else {
1289     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1290     ## Stay in the state
1291     !!!next-input-character;
1292     redo A;
1293     }
1294 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1295 wakaba 1.1 if ($self->{next_input_character} == 0x002D) { # -
1296 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
1297 wakaba 1.1 !!!next-input-character;
1298     redo A;
1299     } elsif ($self->{next_input_character} == -1) {
1300 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1301 wakaba 1.57 $self->{state} = DATA_STATE;
1302 wakaba 1.1 ## reconsume
1303    
1304     !!!emit ($self->{current_token}); # comment
1305    
1306     redo A;
1307     } else {
1308     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1309 wakaba 1.57 $self->{state} = COMMENT_STATE;
1310 wakaba 1.1 !!!next-input-character;
1311     redo A;
1312     }
1313 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_STATE) {
1314 wakaba 1.1 if ($self->{next_input_character} == 0x003E) { # >
1315 wakaba 1.57 $self->{state} = DATA_STATE;
1316 wakaba 1.1 !!!next-input-character;
1317    
1318     !!!emit ($self->{current_token}); # comment
1319    
1320     redo A;
1321     } elsif ($self->{next_input_character} == 0x002D) { # -
1322 wakaba 1.3 !!!parse-error (type => 'dash in comment');
1323 wakaba 1.1 $self->{current_token}->{data} .= '-'; # comment
1324     ## Stay in the state
1325     !!!next-input-character;
1326     redo A;
1327     } elsif ($self->{next_input_character} == -1) {
1328 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
1329 wakaba 1.57 $self->{state} = DATA_STATE;
1330 wakaba 1.1 ## reconsume
1331    
1332     !!!emit ($self->{current_token}); # comment
1333    
1334     redo A;
1335     } else {
1336 wakaba 1.3 !!!parse-error (type => 'dash in comment');
1337 wakaba 1.1 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1338 wakaba 1.57 $self->{state} = COMMENT_STATE;
1339 wakaba 1.1 !!!next-input-character;
1340     redo A;
1341     }
1342 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_STATE) {
1343 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1344     $self->{next_input_character} == 0x000A or # LF
1345     $self->{next_input_character} == 0x000B or # VT
1346     $self->{next_input_character} == 0x000C or # FF
1347     $self->{next_input_character} == 0x0020) { # SP
1348 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1349 wakaba 1.1 !!!next-input-character;
1350     redo A;
1351     } else {
1352 wakaba 1.3 !!!parse-error (type => 'no space before DOCTYPE name');
1353 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1354 wakaba 1.1 ## reconsume
1355     redo A;
1356     }
1357 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1358 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1359     $self->{next_input_character} == 0x000A or # LF
1360     $self->{next_input_character} == 0x000B or # VT
1361     $self->{next_input_character} == 0x000C or # FF
1362     $self->{next_input_character} == 0x0020) { # SP
1363     ## Stay in the state
1364     !!!next-input-character;
1365     redo A;
1366     } elsif ($self->{next_input_character} == 0x003E) { # >
1367 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
1368 wakaba 1.57 $self->{state} = DATA_STATE;
1369 wakaba 1.1 !!!next-input-character;
1370    
1371 wakaba 1.55 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1372 wakaba 1.1
1373     redo A;
1374     } elsif ($self->{next_input_character} == -1) {
1375 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
1376 wakaba 1.57 $self->{state} = DATA_STATE;
1377 wakaba 1.1 ## reconsume
1378    
1379 wakaba 1.55 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1380 wakaba 1.1
1381     redo A;
1382     } else {
1383 wakaba 1.18 $self->{current_token}
1384 wakaba 1.55 = {type => DOCTYPE_TOKEN,
1385 wakaba 1.18 name => chr ($self->{next_input_character}),
1386     correct => 1};
1387 wakaba 1.4 ## ISSUE: "Set the token's name name to the" in the spec
1388 wakaba 1.57 $self->{state} = DOCTYPE_NAME_STATE;
1389 wakaba 1.1 !!!next-input-character;
1390     redo A;
1391     }
1392 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1393 wakaba 1.18 ## ISSUE: Redundant "First," in the spec.
1394 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1395     $self->{next_input_character} == 0x000A or # LF
1396     $self->{next_input_character} == 0x000B or # VT
1397     $self->{next_input_character} == 0x000C or # FF
1398     $self->{next_input_character} == 0x0020) { # SP
1399 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1400 wakaba 1.1 !!!next-input-character;
1401     redo A;
1402     } elsif ($self->{next_input_character} == 0x003E) { # >
1403 wakaba 1.57 $self->{state} = DATA_STATE;
1404 wakaba 1.1 !!!next-input-character;
1405    
1406     !!!emit ($self->{current_token}); # DOCTYPE
1407    
1408     redo A;
1409     } elsif ($self->{next_input_character} == -1) {
1410 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1411 wakaba 1.57 $self->{state} = DATA_STATE;
1412 wakaba 1.1 ## reconsume
1413    
1414 wakaba 1.18 delete $self->{current_token}->{correct};
1415     !!!emit ($self->{current_token}); # DOCTYPE
1416 wakaba 1.1
1417     redo A;
1418     } else {
1419     $self->{current_token}->{name}
1420     .= chr ($self->{next_input_character}); # DOCTYPE
1421     ## Stay in the state
1422     !!!next-input-character;
1423     redo A;
1424     }
1425 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1426 wakaba 1.1 if ($self->{next_input_character} == 0x0009 or # HT
1427     $self->{next_input_character} == 0x000A or # LF
1428     $self->{next_input_character} == 0x000B or # VT
1429     $self->{next_input_character} == 0x000C or # FF
1430     $self->{next_input_character} == 0x0020) { # SP
1431     ## Stay in the state
1432     !!!next-input-character;
1433     redo A;
1434     } elsif ($self->{next_input_character} == 0x003E) { # >
1435 wakaba 1.57 $self->{state} = DATA_STATE;
1436 wakaba 1.1 !!!next-input-character;
1437    
1438     !!!emit ($self->{current_token}); # DOCTYPE
1439    
1440     redo A;
1441     } elsif ($self->{next_input_character} == -1) {
1442 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1443 wakaba 1.57 $self->{state} = DATA_STATE;
1444 wakaba 1.1 ## reconsume
1445    
1446 wakaba 1.18 delete $self->{current_token}->{correct};
1447     !!!emit ($self->{current_token}); # DOCTYPE
1448    
1449     redo A;
1450     } elsif ($self->{next_input_character} == 0x0050 or # P
1451     $self->{next_input_character} == 0x0070) { # p
1452     !!!next-input-character;
1453     if ($self->{next_input_character} == 0x0055 or # U
1454     $self->{next_input_character} == 0x0075) { # u
1455     !!!next-input-character;
1456     if ($self->{next_input_character} == 0x0042 or # B
1457     $self->{next_input_character} == 0x0062) { # b
1458     !!!next-input-character;
1459     if ($self->{next_input_character} == 0x004C or # L
1460     $self->{next_input_character} == 0x006C) { # l
1461     !!!next-input-character;
1462     if ($self->{next_input_character} == 0x0049 or # I
1463     $self->{next_input_character} == 0x0069) { # i
1464     !!!next-input-character;
1465     if ($self->{next_input_character} == 0x0043 or # C
1466     $self->{next_input_character} == 0x0063) { # c
1467 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1468 wakaba 1.18 !!!next-input-character;
1469     redo A;
1470     }
1471     }
1472     }
1473     }
1474     }
1475    
1476     #
1477     } elsif ($self->{next_input_character} == 0x0053 or # S
1478     $self->{next_input_character} == 0x0073) { # s
1479     !!!next-input-character;
1480     if ($self->{next_input_character} == 0x0059 or # Y
1481     $self->{next_input_character} == 0x0079) { # y
1482     !!!next-input-character;
1483     if ($self->{next_input_character} == 0x0053 or # S
1484     $self->{next_input_character} == 0x0073) { # s
1485     !!!next-input-character;
1486     if ($self->{next_input_character} == 0x0054 or # T
1487     $self->{next_input_character} == 0x0074) { # t
1488     !!!next-input-character;
1489     if ($self->{next_input_character} == 0x0045 or # E
1490     $self->{next_input_character} == 0x0065) { # e
1491     !!!next-input-character;
1492     if ($self->{next_input_character} == 0x004D or # M
1493     $self->{next_input_character} == 0x006D) { # m
1494 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1495 wakaba 1.18 !!!next-input-character;
1496     redo A;
1497     }
1498     }
1499     }
1500     }
1501     }
1502    
1503     #
1504     } else {
1505     !!!next-input-character;
1506     #
1507     }
1508    
1509     !!!parse-error (type => 'string after DOCTYPE name');
1510 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1511 wakaba 1.18 # next-input-character is already done
1512     redo A;
1513 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1514 wakaba 1.18 if ({
1515     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1516     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1517     }->{$self->{next_input_character}}) {
1518     ## Stay in the state
1519     !!!next-input-character;
1520     redo A;
1521     } elsif ($self->{next_input_character} eq 0x0022) { # "
1522     $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1523 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1524 wakaba 1.18 !!!next-input-character;
1525     redo A;
1526     } elsif ($self->{next_input_character} eq 0x0027) { # '
1527     $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1528 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1529 wakaba 1.18 !!!next-input-character;
1530     redo A;
1531     } elsif ($self->{next_input_character} eq 0x003E) { # >
1532     !!!parse-error (type => 'no PUBLIC literal');
1533    
1534 wakaba 1.57 $self->{state} = DATA_STATE;
1535 wakaba 1.18 !!!next-input-character;
1536    
1537     delete $self->{current_token}->{correct};
1538     !!!emit ($self->{current_token}); # DOCTYPE
1539    
1540     redo A;
1541     } elsif ($self->{next_input_character} == -1) {
1542     !!!parse-error (type => 'unclosed DOCTYPE');
1543    
1544 wakaba 1.57 $self->{state} = DATA_STATE;
1545 wakaba 1.18 ## reconsume
1546    
1547     delete $self->{current_token}->{correct};
1548     !!!emit ($self->{current_token}); # DOCTYPE
1549    
1550     redo A;
1551     } else {
1552     !!!parse-error (type => 'string after PUBLIC');
1553 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1554 wakaba 1.18 !!!next-input-character;
1555     redo A;
1556     }
1557 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1558 wakaba 1.18 if ($self->{next_input_character} == 0x0022) { # "
1559 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1560 wakaba 1.18 !!!next-input-character;
1561     redo A;
1562 wakaba 1.69 } elsif ($self->{next_input_character} == 0x003E) { # >
1563     !!!parse-error (type => 'unclosed PUBLIC literal');
1564    
1565     $self->{state} = DATA_STATE;
1566     !!!next-input-character;
1567    
1568     delete $self->{current_token}->{correct};
1569     !!!emit ($self->{current_token}); # DOCTYPE
1570    
1571     redo A;
1572 wakaba 1.18 } elsif ($self->{next_input_character} == -1) {
1573     !!!parse-error (type => 'unclosed PUBLIC literal');
1574    
1575 wakaba 1.57 $self->{state} = DATA_STATE;
1576 wakaba 1.18 ## reconsume
1577    
1578     delete $self->{current_token}->{correct};
1579     !!!emit ($self->{current_token}); # DOCTYPE
1580    
1581     redo A;
1582     } else {
1583     $self->{current_token}->{public_identifier} # DOCTYPE
1584     .= chr $self->{next_input_character};
1585     ## Stay in the state
1586     !!!next-input-character;
1587     redo A;
1588     }
1589 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1590 wakaba 1.18 if ($self->{next_input_character} == 0x0027) { # '
1591 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1592 wakaba 1.18 !!!next-input-character;
1593     redo A;
1594 wakaba 1.69 } elsif ($self->{next_input_character} == 0x003E) { # >
1595     !!!parse-error (type => 'unclosed PUBLIC literal');
1596    
1597     $self->{state} = DATA_STATE;
1598     !!!next-input-character;
1599    
1600     delete $self->{current_token}->{correct};
1601     !!!emit ($self->{current_token}); # DOCTYPE
1602    
1603     redo A;
1604 wakaba 1.18 } elsif ($self->{next_input_character} == -1) {
1605     !!!parse-error (type => 'unclosed PUBLIC literal');
1606    
1607 wakaba 1.57 $self->{state} = DATA_STATE;
1608 wakaba 1.18 ## reconsume
1609    
1610     delete $self->{current_token}->{correct};
1611     !!!emit ($self->{current_token}); # DOCTYPE
1612    
1613     redo A;
1614     } else {
1615     $self->{current_token}->{public_identifier} # DOCTYPE
1616     .= chr $self->{next_input_character};
1617     ## Stay in the state
1618     !!!next-input-character;
1619     redo A;
1620     }
1621 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1622 wakaba 1.18 if ({
1623     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1624     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1625     }->{$self->{next_input_character}}) {
1626     ## Stay in the state
1627     !!!next-input-character;
1628     redo A;
1629     } elsif ($self->{next_input_character} == 0x0022) { # "
1630     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1631 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1632 wakaba 1.18 !!!next-input-character;
1633     redo A;
1634     } elsif ($self->{next_input_character} == 0x0027) { # '
1635     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1636 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1637 wakaba 1.18 !!!next-input-character;
1638     redo A;
1639     } elsif ($self->{next_input_character} == 0x003E) { # >
1640 wakaba 1.57 $self->{state} = DATA_STATE;
1641 wakaba 1.18 !!!next-input-character;
1642    
1643     !!!emit ($self->{current_token}); # DOCTYPE
1644    
1645     redo A;
1646     } elsif ($self->{next_input_character} == -1) {
1647     !!!parse-error (type => 'unclosed DOCTYPE');
1648    
1649 wakaba 1.57 $self->{state} = DATA_STATE;
1650 wakaba 1.26 ## reconsume
1651 wakaba 1.18
1652     delete $self->{current_token}->{correct};
1653     !!!emit ($self->{current_token}); # DOCTYPE
1654    
1655     redo A;
1656     } else {
1657     !!!parse-error (type => 'string after PUBLIC literal');
1658 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1659 wakaba 1.18 !!!next-input-character;
1660     redo A;
1661     }
1662 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1663 wakaba 1.18 if ({
1664     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1665     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1666     }->{$self->{next_input_character}}) {
1667     ## Stay in the state
1668     !!!next-input-character;
1669     redo A;
1670     } elsif ($self->{next_input_character} == 0x0022) { # "
1671     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1672 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1673 wakaba 1.18 !!!next-input-character;
1674     redo A;
1675     } elsif ($self->{next_input_character} == 0x0027) { # '
1676     $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1677 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1678 wakaba 1.18 !!!next-input-character;
1679     redo A;
1680     } elsif ($self->{next_input_character} == 0x003E) { # >
1681     !!!parse-error (type => 'no SYSTEM literal');
1682 wakaba 1.57 $self->{state} = DATA_STATE;
1683 wakaba 1.18 !!!next-input-character;
1684    
1685     delete $self->{current_token}->{correct};
1686     !!!emit ($self->{current_token}); # DOCTYPE
1687    
1688     redo A;
1689     } elsif ($self->{next_input_character} == -1) {
1690     !!!parse-error (type => 'unclosed DOCTYPE');
1691    
1692 wakaba 1.57 $self->{state} = DATA_STATE;
1693 wakaba 1.26 ## reconsume
1694 wakaba 1.18
1695     delete $self->{current_token}->{correct};
1696     !!!emit ($self->{current_token}); # DOCTYPE
1697    
1698     redo A;
1699     } else {
1700 wakaba 1.30 !!!parse-error (type => 'string after SYSTEM');
1701 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1702 wakaba 1.18 !!!next-input-character;
1703     redo A;
1704     }
1705 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1706 wakaba 1.18 if ($self->{next_input_character} == 0x0022) { # "
1707 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1708 wakaba 1.18 !!!next-input-character;
1709     redo A;
1710 wakaba 1.69 } elsif ($self->{next_input_character} == 0x003E) { # >
1711     !!!parse-error (type => 'unclosed PUBLIC literal');
1712    
1713     $self->{state} = DATA_STATE;
1714     !!!next-input-character;
1715    
1716     delete $self->{current_token}->{correct};
1717     !!!emit ($self->{current_token}); # DOCTYPE
1718    
1719     redo A;
1720 wakaba 1.18 } elsif ($self->{next_input_character} == -1) {
1721     !!!parse-error (type => 'unclosed SYSTEM literal');
1722    
1723 wakaba 1.57 $self->{state} = DATA_STATE;
1724 wakaba 1.18 ## reconsume
1725    
1726     delete $self->{current_token}->{correct};
1727     !!!emit ($self->{current_token}); # DOCTYPE
1728    
1729     redo A;
1730     } else {
1731     $self->{current_token}->{system_identifier} # DOCTYPE
1732     .= chr $self->{next_input_character};
1733     ## Stay in the state
1734     !!!next-input-character;
1735     redo A;
1736     }
1737 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
1738 wakaba 1.18 if ($self->{next_input_character} == 0x0027) { # '
1739 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1740 wakaba 1.18 !!!next-input-character;
1741     redo A;
1742 wakaba 1.69 } elsif ($self->{next_input_character} == 0x003E) { # >
1743     !!!parse-error (type => 'unclosed PUBLIC literal');
1744    
1745     $self->{state} = DATA_STATE;
1746     !!!next-input-character;
1747    
1748     delete $self->{current_token}->{correct};
1749     !!!emit ($self->{current_token}); # DOCTYPE
1750    
1751     redo A;
1752 wakaba 1.18 } elsif ($self->{next_input_character} == -1) {
1753     !!!parse-error (type => 'unclosed SYSTEM literal');
1754    
1755 wakaba 1.57 $self->{state} = DATA_STATE;
1756 wakaba 1.18 ## reconsume
1757    
1758     delete $self->{current_token}->{correct};
1759 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1760    
1761     redo A;
1762     } else {
1763 wakaba 1.18 $self->{current_token}->{system_identifier} # DOCTYPE
1764     .= chr $self->{next_input_character};
1765     ## Stay in the state
1766     !!!next-input-character;
1767     redo A;
1768     }
1769 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1770 wakaba 1.18 if ({
1771     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1772     #0x000D => 1, # HT, LF, VT, FF, SP, CR
1773     }->{$self->{next_input_character}}) {
1774     ## Stay in the state
1775     !!!next-input-character;
1776     redo A;
1777     } elsif ($self->{next_input_character} == 0x003E) { # >
1778 wakaba 1.57 $self->{state} = DATA_STATE;
1779 wakaba 1.18 !!!next-input-character;
1780    
1781     !!!emit ($self->{current_token}); # DOCTYPE
1782    
1783     redo A;
1784     } elsif ($self->{next_input_character} == -1) {
1785     !!!parse-error (type => 'unclosed DOCTYPE');
1786    
1787 wakaba 1.57 $self->{state} = DATA_STATE;
1788 wakaba 1.26 ## reconsume
1789 wakaba 1.18
1790     delete $self->{current_token}->{correct};
1791     !!!emit ($self->{current_token}); # DOCTYPE
1792    
1793     redo A;
1794     } else {
1795     !!!parse-error (type => 'string after SYSTEM literal');
1796 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
1797 wakaba 1.1 !!!next-input-character;
1798     redo A;
1799     }
1800 wakaba 1.57 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
1801 wakaba 1.1 if ($self->{next_input_character} == 0x003E) { # >
1802 wakaba 1.57 $self->{state} = DATA_STATE;
1803 wakaba 1.1 !!!next-input-character;
1804    
1805 wakaba 1.18 delete $self->{current_token}->{correct};
1806 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1807    
1808     redo A;
1809     } elsif ($self->{next_input_character} == -1) {
1810 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
1811 wakaba 1.57 $self->{state} = DATA_STATE;
1812 wakaba 1.1 ## reconsume
1813    
1814 wakaba 1.18 delete $self->{current_token}->{correct};
1815 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
1816    
1817     redo A;
1818     } else {
1819     ## Stay in the state
1820     !!!next-input-character;
1821     redo A;
1822     }
1823     } else {
1824     die "$0: $self->{state}: Unknown state";
1825     }
1826     } # A
1827    
1828     die "$0: _get_next_token: unexpected case";
1829     } # _get_next_token
1830    
## Attempt to consume a character reference (numeric or named), starting
## at the code point held in |$self->{next_input_character}|.
## NOTE(review): presumably invoked by the tokenizer right after an "&"
## has been consumed; the caller is not visible here -- confirm.
##
##   $self    - The parser object.
##   $in_attr - If true, a named reference that matched without its
##              terminating ";" and was then followed by further name
##              characters is returned as literal text ("&" plus the
##              consumed name) rather than as the replacement character.
##
## Returns |undef| if no reference is recognized (in some of those cases
## after reporting a parse error), or a hash reference describing a
## CHARACTER_TOKEN otherwise.  Tokens produced from a recognized reference
## carry |has_reference => 1|.
sub _tokenize_attempt_to_consume_an_entity ($$) {
  my ($self, $in_attr) = @_;

  ## Code points after which nothing is consumed and no error is
  ## reported: HT, LF, VT, FF, SP, "<", "&" and -1.  CR (0x000D) is
  ## not in the list.
  my %not_a_reference = (
    0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1,
    0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1,
  );
  if ($not_a_reference{$self->{next_input_character}}) {
    return undef;
  }

  ## Numeric character reference.  Every path of this branch either
  ## restarts the |X| block or returns.
  if ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;

    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      ## Hexadecimal form.  |$code| remains undefined until the first
      ## hexadecimal digit has been seen.
      my $code;
      X: {
        ## On the first pass this is the "x" or "X", which is pushed
        ## back if no digit follows.
        my $x_char = $self->{next_input_character};
        !!!next-input-character;

        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $code = ($code || 0) * 0x10
              + ($self->{next_input_character} - 0x0030);
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          $code = ($code || 0) * 0x10
              + ($self->{next_input_character} - 0x0061 + 10);
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          $code = ($code || 0) * 0x10
              + ($self->{next_input_character} - 0x0041 + 10);
          redo X;
        } elsif (not defined $code) { # no hexadecimal digit
          !!!parse-error (type => 'bare hcro');
          !!!back-next-input-character ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error (type => 'no refc');
        }

        ## Replace code points that are not acceptable as they are.
        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
          !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
          $code = 0xFFFD;
        } elsif ($code > 0x10FFFF) {
          !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
          $code = 0xFFFD;
        } elsif ($code == 0x000D) {
          !!!parse-error (type => 'CR character reference');
          $code = 0x000A;
        } elsif (0x80 <= $code and $code <= 0x9F) {
          !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
          $code = $c1_entity_char->{$code};
        }

        return {
          type => CHARACTER_TOKEN,
          data => chr $code,
          has_reference => 1,
        };
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      ## Decimal form.
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code = $code * 10 + ($self->{next_input_character} - 0x0030);
        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error (type => 'no refc');
      }

      ## Same replacements as for the hexadecimal form.
      if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
        !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
        $code = 0xFFFD;
      } elsif ($code > 0x10FFFF) {
        !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
        $code = 0xFFFD;
      } elsif ($code == 0x000D) {
        !!!parse-error (type => 'CR character reference');
        $code = 0x000A;
      } elsif (0x80 <= $code and $code <= 0x9F) {
        !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
        $code = $c1_entity_char->{$code};
      }

      return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
    } else {
      ## "&#" followed by neither a digit nor "x"/"X": push the current
      ## character back and make "#" the current character again.
      !!!parse-error (type => 'bare nero');
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  }

  ## Named character reference.  Every path of this branch returns.
  if ((0x0041 <= $self->{next_input_character} and
       $self->{next_input_character} <= 0x005A) or
      (0x0061 <= $self->{next_input_character} and
       $self->{next_input_character} <= 0x007A)) {
    ## |$name| is the text consumed so far.  |$replacement| is the value
    ## of the latest matching entry of |$EntityChar| followed by any
    ## characters consumed after it, or simply the consumed text if
    ## nothing has matched.
    my $name = chr $self->{next_input_character};
    !!!next-input-character;

    my $replacement = $name;
    ## |$match_state|: 0 = nothing matched; 1 = matched including ";";
    ## -1 = matched without ";"; below -1 = matched without ";" and
    ## further characters were consumed afterwards.
    my $match_state = 0;
    require Whatpm::_NamedEntityList;
    our $EntityChar;

    ## NOTE: 10 is some number greater than the maximum length of
    ## entity name.
    while (length ($name) < 10 and
           ((0x0041 <= $self->{next_input_character} and # A
             $self->{next_input_character} <= 0x005A) or # Z
            (0x0061 <= $self->{next_input_character} and # a
             $self->{next_input_character} <= 0x007A) or # z
            (0x0030 <= $self->{next_input_character} and # 0
             $self->{next_input_character} <= 0x0039) or # 9
            $self->{next_input_character} == 0x003B)) { # ;
      $name .= chr $self->{next_input_character};
      if (defined $EntityChar->{$name}) {
        ## Decide about the terminator before the character is consumed.
        my $terminated = ($self->{next_input_character} == 0x003B); # ;
        $replacement = $EntityChar->{$name};
        $match_state = $terminated ? 1 : -1;
        !!!next-input-character;
        last if $terminated;
      } else {
        $replacement .= chr $self->{next_input_character};
        $match_state *= 2;
        !!!next-input-character;
      }
    }

    if ($match_state > 0) {
      return {type => CHARACTER_TOKEN, data => $replacement,
              has_reference => 1};
    }

    if ($match_state < 0) {
      !!!parse-error (type => 'no refc');
      if ($in_attr and $match_state < -1) {
        return {type => CHARACTER_TOKEN, data => '&'.$name};
      }
      return {type => CHARACTER_TOKEN, data => $replacement,
              has_reference => 1};
    }

    !!!parse-error (type => 'bare ero');
    ## NOTE: "No characters are consumed" in the spec.
    return {type => CHARACTER_TOKEN, data => '&'.$replacement};
  }

  ## Anything else: no characters are consumed.
  !!!parse-error (type => 'bare ero');
  return undef;
} # _tokenize_attempt_to_consume_an_entity
1995    
## Prepare |$self->{document}| for tree construction: strict error
## checking is switched off and the document is flagged as an HTML
## document.
## NOTE: |$self->{document}| MUST be specified before this method is
## called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2004    
## Counterpart of |_initialize_tree_constructor|: switch strict error
## checking of |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  ## TODO: Turn mutation events on
  $self->{document}->strict_error_checking (1);
} # _terminate_tree_constructor
2010    
2011     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2012    
2013 wakaba 1.3 { # tree construction stage
2014     my $token;
2015    
## Entry point of the tree construction stage: fetch the first token,
## reset the per-parse state kept on |$self|, then run the initial
## phase, the root element phase and the main phase in that order.
sub _construct_tree ($) {
  my $self = shift;

  ## When an interactive UA makes |$self->{document}| available to the
  ## user, or when it begins accepting user input, is not defined.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is the concatenation
  ## of all those characters. # MUST

  !!!next-token;

  ## Reset the tree constructor state.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->{open_elements} = [];
  @$self{qw(form_element head_element inner_html_node)} = (undef) x 3;

  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
2039    
## Tree construction, initial phase.
##
## Works on the file-scoped |$token| (the current token).  Leading
## white space character tokens are skipped and comments are appended
## to the document until either
##
##   - a DOCTYPE token is seen: a document type node is created from it
##     and appended to the document, the compatibility mode of the
##     document is chosen from the DOCTYPE, and the following token is
##     fetched; or
##   - any other token is seen: a 'no DOCTYPE' parse error is reported,
##     the document is put into quirks mode, and the token is left as
##     the current token for reprocessing.
##
## In both cases the method then returns, so that the caller can go on
## to the root element phase.  Dies on a token of unknown type.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared case-insensitively: it is
        ## converted to upper case here, so every string it is compared
        ## with below has to be written in upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## NOTE(review): the spelling "EXPERIMETNAL" is kept exactly as
          ## found in this list -- confirm against the specification.
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## HTML 4.01 Frameset/Transitional: limited quirks if a system
          ## identifier is given, quirks if it is missing.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared case-insensitively as well
        ## (lower-cased here).  A match selects quirks mode regardless
        ## of the mode chosen above.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the leading white space; if nothing else is left in
        ## the token, ignore the token as a whole.

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL
} # _tree_construction_initial
2207    
## Tree construction, root element phase.
##
## Works on the file-scoped |$token| (the current token).  DOCTYPE tokens
## are reported and ignored, comments are appended to the document and
## leading white space is dropped.  For the first token of any other
## kind, the |$self->{application_cache_selection}| callback is invoked
## (with the value of the |manifest| attribute if the token is an |html|
## start tag having one, with |undef| otherwise), an |html| element is
## created, appended to the document and pushed onto the stack of open
## elements, and the method returns with the token left as the current
## token for reprocessing.  Dies on a token of unknown type.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token and stay in the phase
      !!!next-token;
      redo B;
    }

    if ($token->{type} == COMMENT_TOKEN) {
      $self->{document}->append_child
          ($self->{document}->create_comment ($token->{data}));
      ## Stay in the phase
      !!!next-token;
      redo B;
    }

    if ($token->{type} == CHARACTER_TOKEN) {
      ## Strip leading white space (CR, \x0D, is not in the set).  If
      ## the token becomes empty, ignore it and stay in the phase.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)// and
          not length $token->{data}) {
        !!!next-token;
        redo B;
      }

      $self->{application_cache_selection}->(undef);
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html' and
          $token->{attributes}->{manifest}) {
        $self->{application_cache_selection}
            ->($token->{attributes}->{manifest}->{value});
        ## ISSUE: No relative reference resolution?
      } else {
        $self->{application_cache_selection}->(undef);
      }

      ## ISSUE: There is an issue in the spec
    } elsif ($token->{type} == END_TAG_TOKEN or
             $token->{type} == END_OF_FILE_TOKEN) {
      $self->{application_cache_selection}->(undef);

      ## ISSUE: There is an issue in the spec
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Create the root element.  The current token is not consumed; it
    ## is reprocessed by the main phase.
    my $root_element; !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    return; ## Go to the main phase.
  } # B
} # _tree_construction_root_element
2270    
2271     sub _reset_insertion_mode ($) {
         ## Recompute |$self->{insertion_mode}| from the stack of open
         ## elements.
         ##
         ## Starting at the last entry of |$self->{open_elements}| and moving
         ## toward the first, the local name stored in slot [1] of each entry
         ## is looked up in a name-to-mode table.  The first hit sets the
         ## insertion mode and the sub returns.  An |html| entry selects
         ## BEFORE_HEAD_IM or AFTER_HEAD_IM depending on whether
         ## |$self->{head_element}| is defined.  When the first stack entry
         ## has been examined without a hit, IN_BODY_IM is used.
         ##
         ## Argument: |$self| only.  Returns nothing.
2272     my $self = shift;
2273    
2274     ## Step 1
2275     my $last;
2276    
2277     ## Step 2
2278     my $i = -1;
2279     my $node = $self->{open_elements}->[$i];
2280    
2281     ## Step 3
2282     S3: {
2283 wakaba 1.29 ## ISSUE: Oops! "If node is the first node in the stack of open
2284     ## elements, then set last to true. If the context element of the
2285     ## HTML fragment parsing algorithm is neither a td element nor a
2286     ## th element, then set node to the context element. (fragment case)":
2287     ## The second "if" is in the scope of the first "if"!?
         ## As implemented, the context element (|inner_html_node|) is only
         ## consulted once the first stack entry has been reached.
2288     if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
2289     $last = 1;
2290     if (defined $self->{inner_html_node}) {
2291     if ($self->{inner_html_node}->[1] eq 'td' or
2292     $self->{inner_html_node}->[1] eq 'th') {
2293     #
2294     } else {
2295     $node = $self->{inner_html_node};
2296     }
2297 wakaba 1.3 }
2298     }
2299    
2300     ## Step 4..13
2301     my $new_mode = {
2302 wakaba 1.54 select => IN_SELECT_IM,
2303     td => IN_CELL_IM,
2304     th => IN_CELL_IM,
2305     tr => IN_ROW_IM,
2306     tbody => IN_TABLE_BODY_IM,
2307     thead => IN_TABLE_BODY_IM,
2308     tfoot => IN_TABLE_BODY_IM,
2309     caption => IN_CAPTION_IM,
2310     colgroup => IN_COLUMN_GROUP_IM,
2311     table => IN_TABLE_IM,
2312     head => IN_BODY_IM, # not in head!
2313     body => IN_BODY_IM,
2314     frameset => IN_FRAMESET_IM,
2315 wakaba 1.3 }->{$node->[1]};
         ## Parsed as |(assignment and return) if ...|: the |return| runs
         ## only when the assigned mode value is true.
         ## NOTE(review): relies on every *_IM constant being non-zero --
         ## their definitions are not visible here; confirm.
2316     $self->{insertion_mode} = $new_mode and return if defined $new_mode;
2317    
2318     ## Step 14
2319     if ($node->[1] eq 'html') {
2320     unless (defined $self->{head_element}) {
2321 wakaba 1.54 $self->{insertion_mode} = BEFORE_HEAD_IM;
2322 wakaba 1.3 } else {
2323 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
2324 wakaba 1.3 }
2325     return;
2326     }
2327    
2328     ## Step 15
         ## Same |assign and return| idiom as above.
2329 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM and return if $last;
2330 wakaba 1.3
2331     ## Step 16
         ## |$i| is a negative index, so decrementing it moves one entry
         ## closer to the start of the stack.
2332     $i--;
2333     $node = $self->{open_elements}->[$i];
2334    
2335     ## Step 17
2336     redo S3;
2337     } # S3
2338     } # _reset_insertion_mode
2339    
2340     sub _tree_construction_main ($) {
2341     my $self = shift;
2342    
2343 wakaba 1.1 my $active_formatting_elements = [];
2344    
2345     my $reconstruct_active_formatting_elements = sub { # MUST
         ## Reconstruct the active formatting elements.
         ##
         ## Argument: |$insert|, a code reference that is called with each
         ## newly cloned node and is responsible for placing it in the tree.
         ##
         ## Does nothing when the list is empty, or when its last entry is
         ## a '#marker' or is already present in |$self->{open_elements}|.
         ## Otherwise it rewinds to the entry following the most recent
         ## marker / open element (or to the first entry), then walks
         ## forward again: each entry is cloned, the clone is passed to
         ## |$insert|, pushed onto the open-element stack, and stored in the
         ## list in place of the entry it was cloned from.
         ##
         ## List and stack entries are |[$node, $local_name]| pairs; a
         ## marker has the string '#marker' in slot [0].
2346     my $insert = shift;
2347    
2348     ## Step 1
2349     return unless @$active_formatting_elements;
2350    
2351     ## Step 3
         ## |$i| is kept as a negative index (counted from the end).
2352     my $i = -1;
2353     my $entry = $active_formatting_elements->[$i];
2354    
2355     ## Step 2
2356     return if $entry->[0] eq '#marker';
2357 wakaba 1.3 for (@{$self->{open_elements}}) {
2358 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
2359     return;
2360     }
2361     }
2362    
2363     S4: {
2364     ## Step 4
         ## Reached the first entry: start cloning from it (Step 7 is
         ## skipped because |last| leaves the block).
2365     last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2366    
2367     ## Step 5
2368     $i--;
2369     $entry = $active_formatting_elements->[$i];
2370    
2371     ## Step 6
         ## Keep rewinding until a marker or an entry that is still open is
         ## found; then fall through to Step 7.
2372     if ($entry->[0] eq '#marker') {
2373     #
2374     } else {
2375     my $in_open_elements;
2376 wakaba 1.3 OE: for (@{$self->{open_elements}}) {
2377 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
2378     $in_open_elements = 1;
2379     last OE;
2380     }
2381     }
2382     if ($in_open_elements) {
2383     #
2384     } else {
2385     redo S4;
2386     }
2387     }
2388    
2389     ## Step 7
2390     $i++;
2391     $entry = $active_formatting_elements->[$i];
2392     } # S4
2393    
2394     S7: {
2395     ## Step 8
         ## NOTE(review): |clone_node (0)| is presumably a shallow clone
         ## (no children) -- confirm against the DOM implementation.
2396     my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2397    
2398     ## Step 9
2399     $insert->($clone->[0]);
2400 wakaba 1.3 push @{$self->{open_elements}}, $clone;
2401 wakaba 1.1
2402     ## Step 10
         ## The list entry and the stack entry are the same array reference.
2403 wakaba 1.3 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2404 wakaba 1.1
2405     ## Step 11
         ## Stop once the clone just created is the last entry of the list.
2406     unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2407     ## Step 7'
2408     $i++;
2409     $entry = $active_formatting_elements->[$i];
2410    
2411     redo S7;
2412     }
2413     } # S7
2414     }; # $reconstruct_active_formatting_elements
2415    
2416     my $clear_up_to_marker = sub {
         ## Clear the list of active formatting elements up to the last
         ## marker.  Scans from the end of the list for the most recent
         ## entry whose slot [0] is '#marker' and removes that entry and
         ## everything after it (two-argument |splice|).  Takes no
         ## arguments.  If the list holds no marker it is left unchanged.
2417     for (reverse 0..$#$active_formatting_elements) {
2418     if ($active_formatting_elements->[$_]->[0] eq '#marker') {
2419     splice @$active_formatting_elements, $_;
2420     return;
2421     }
2422     }
2423     }; # $clear_up_to_marker
2424    
2425 wakaba 1.25 my $parse_rcdata = sub ($$) {
         ## Generic CDATA/RCDATA element parsing for the start tag currently
         ## in |$token|.
         ##
         ## Arguments:
         ##   $content_model_flag - CDATA_CONTENT_MODEL or
         ##                         RCDATA_CONTENT_MODEL; anything else dies
         ##                         in Step 7 unless the matching end tag
         ##                         was seen.
         ##   $insert             - code reference called with the new
         ##                         element to place it in the tree.
         ##
         ## Creates the element, switches the tokenizer's content model,
         ## gathers all following character tokens into one text node child,
         ## restores the PCDATA content model and finally advances past the
         ## token that ended the text run.  Returns nothing useful.
         ##
         ## The |($$)| prototype has no effect on calls made through the
         ## code reference.
2426     my ($content_model_flag, $insert) = @_;
2427    
2428     ## Step 1
2429     my $start_tag_name = $token->{tag_name};
2430     my $el;
2431     !!!create-element ($el, $start_tag_name, $token->{attributes});
2432    
2433     ## Step 2
2434     $insert->($el); # /context node/->append_child ($el)
2435    
2436     ## Step 3
2437 wakaba 1.40 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2438 wakaba 1.13 delete $self->{escape}; # MUST
2439 wakaba 1.25
2440     ## Step 4
2441 wakaba 1.1 my $text = '';
2442     !!!next-token;
2443 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2444 wakaba 1.1 $text .= $token->{data};
2445     !!!next-token;
2446 wakaba 1.25 }
2447    
2448     ## Step 5
         ## The inner |my $text| (the text node) shadows the string of the
         ## same name for the rest of this block only.
2449 wakaba 1.1 if (length $text) {
2450 wakaba 1.25 my $text = $self->{document}->create_text_node ($text);
2451     $el->append_child ($text);
2452 wakaba 1.1 }
2453 wakaba 1.25
2454     ## Step 6
2455 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
2456 wakaba 1.25
2457     ## Step 7
         ## Anything other than the matching end tag is reported as a parse
         ## error whose type names the content model and the token type.
2458 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and $token->{tag_name} eq $start_tag_name) {
2459 wakaba 1.1 ## Ignore the token
2460 wakaba 1.40 } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
2461     !!!parse-error (type => 'in CDATA:#'.$token->{type});
2462     } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2463     !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2464 wakaba 1.1 } else {
2465 wakaba 1.40 die "$0: $content_model_flag in parse_rcdata";
2466 wakaba 1.1 }
         ## The terminating token is consumed in every non-fatal case, even
         ## when it was not the expected end tag.
2467     !!!next-token;
2468 wakaba 1.25 }; # $parse_rcdata
2469 wakaba 1.1
2470 wakaba 1.25 my $script_start_tag = sub ($) {
         ## Handle a |script| start tag held in |$token|.
         ##
         ## Argument: a code reference (|$insert|) that places the finished
         ## |script| element in the tree.
         ##
         ## Creates the element with the token's attributes, reads the
         ## following character tokens under the CDATA content model,
         ## appends them as text, restores the PCDATA content model and
         ## reports a parse error unless the run ended with a |script| end
         ## tag.  The element is passed to |$insert| only when
         ## |$self->{inner_html_node}| is undefined; otherwise it is created
         ## but not inserted.  The terminating token is then consumed.
         ##
         ## The |($)| prototype has no effect on calls made through the
         ## code reference.
2471     my $insert = $_[0];
2472 wakaba 1.1 my $script_el;
2473     !!!create-element ($script_el, 'script', $token->{attributes});
2474     ## TODO: mark as "parser-inserted"
2475    
2476 wakaba 1.40 $self->{content_model} = CDATA_CONTENT_MODEL;
2477 wakaba 1.13 delete $self->{escape}; # MUST
2478 wakaba 1.1
2479     my $text = '';
2480     !!!next-token;
2481 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
2482 wakaba 1.1 $text .= $token->{data};
2483     !!!next-token;
2484     } # stop if non-character token or tokenizer stops tokenising
2485     if (length $text) {
2486     $script_el->manakai_append_text ($text);
2487     }
2488    
2489 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
2490 wakaba 1.1
2491 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
2492 wakaba 1.1 $token->{tag_name} eq 'script') {
2493     ## Ignore the token
2494     } else {
2495 wakaba 1.3 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2496 wakaba 1.1 ## ISSUE: And ignore?
2497     ## TODO: mark as "already executed"
2498     }
2499    
         ## Fragment case (|inner_html_node| defined): nothing is inserted.
2500 wakaba 1.3 if (defined $self->{inner_html_node}) {
2501     ## TODO: mark as "already executed"
2502     } else {
2503 wakaba 1.1 ## TODO: $old_insertion_point = current insertion point
2504     ## TODO: insertion point = just before the next input character
2505 wakaba 1.25
2506     $insert->($script_el);
2507 wakaba 1.1
2508     ## TODO: insertion point = $old_insertion_point (might be "undefined")
2509    
2510     ## TODO: if there is a script that will execute as soon as the parser resume, then...
2511     }
2512    
2513     !!!next-token;
2514     }; # $script_start_tag
2515    
2516     my $formatting_end_tag = sub {
         ## End tag handling for formatting elements (the "adoption agency"
         ## steps).
         ##
         ## Argument: |$tag_name|, the local name of the end tag being
         ## processed (the end tag token itself is in |$token|).
         ##
         ## Each pass through the |FET| block:
         ##   1. finds the last entry named |$tag_name| in the list of
         ##      active formatting elements (not looking past a marker) and
         ##      checks that it is on the open-element stack and in scope;
         ##   2. finds the furthest block: the entry closest to the
         ##      formatting element, among those above it on the stack,
         ##      that is special or scoping and not formatting;
         ##   3. if there is none, pops the stack down to and including the
         ##      formatting element, drops it from the list, consumes the
         ##      token and returns;
         ##   4-13. otherwise re-parents the furthest block, clones the
         ##      formatting element into it and updates the list and the
         ##      stack accordingly;
         ##   14. starts over.
         ##
         ## Every exit path consumes the end tag token via the next-token
         ## macro.  List and stack entries are |[$node, $local_name]|
         ## pairs.  |$formatting_category|, |$special_category| and
         ## |$scoping_category| are lookup hashes defined outside this
         ## closure.
2517     my $tag_name = shift;
2518    
2519     FET: {
2520     ## Step 1
2521     my $formatting_element;
2522     my $formatting_element_i_in_active;
2523     AFE: for (reverse 0..$#$active_formatting_elements) {
2524     if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
2525     $formatting_element = $active_formatting_elements->[$_];
2526     $formatting_element_i_in_active = $_;
2527     last AFE;
2528     } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
2529     last AFE;
2530     }
2531     } # AFE
2532     unless (defined $formatting_element) {
2533 wakaba 1.3 !!!parse-error (type => 'unmatched end tag:'.$tag_name);
2534 wakaba 1.1 ## Ignore the token
2535     !!!next-token;
2536     return;
2537     }
2538     ## has an element in scope
         ## |$in_scope| is cleared as soon as a scope-limiting element is
         ## passed while walking down from the current node.
2539     my $in_scope = 1;
2540     my $formatting_element_i_in_open;
2541 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2542     my $node = $self->{open_elements}->[$_];
2543 wakaba 1.1 if ($node->[0] eq $formatting_element->[0]) {
2544     if ($in_scope) {
2545     $formatting_element_i_in_open = $_;
2546     last INSCOPE;
2547     } else { # in open elements but not in scope
2548 wakaba 1.4 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2549 wakaba 1.1 ## Ignore the token
2550     !!!next-token;
2551     return;
2552     }
2553     } elsif ({
2554     table => 1, caption => 1, td => 1, th => 1,
2555     button => 1, marquee => 1, object => 1, html => 1,
2556     }->{$node->[1]}) {
2557     $in_scope = 0;
2558     }
2559     } # INSCOPE
2560     unless (defined $formatting_element_i_in_open) {
2561 wakaba 1.4 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
         ## Remove exactly the entry found in Step 1.  It is the last entry
         ## with this tag name but need not be the last entry of the list,
         ## so a plain |pop| could discard an unrelated formatting element.
2562 wakaba 1.1 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
2563     !!!next-token; ## TODO: ok?
2564     return;
2565     }
2566 wakaba 1.3 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
2567 wakaba 1.4 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2568 wakaba 1.1 }
2569    
2570     ## Step 2
         ## The loop runs from the current node down to the formatting
         ## element and keeps overwriting |$furthest_block|, so the entry
         ## that survives is the qualifying one nearest the formatting
         ## element.
2571     my $furthest_block;
2572     my $furthest_block_i_in_open;
2573 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
2574     my $node = $self->{open_elements}->[$_];
2575 wakaba 1.1 if (not $formatting_category->{$node->[1]} and
2576     #not $phrasing_category->{$node->[1]} and
2577     ($special_category->{$node->[1]} or
2578     $scoping_category->{$node->[1]})) {
2579     $furthest_block = $node;
2580     $furthest_block_i_in_open = $_;
2581     } elsif ($node->[0] eq $formatting_element->[0]) {
2582     last OE;
2583     }
2584     } # OE
2585    
2586     ## Step 3
2587     unless (defined $furthest_block) { # MUST
2588 wakaba 1.3 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
2589 wakaba 1.1 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
2590     !!!next-token;
2591     return;
2592     }
2593    
2594     ## Step 4
2595 wakaba 1.3 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
2596 wakaba 1.1
2597     ## Step 5
2598     my $furthest_block_parent = $furthest_block->[0]->parent_node;
2599     if (defined $furthest_block_parent) {
2600     $furthest_block_parent->remove_child ($furthest_block->[0]);
2601     }
2602    
2603     ## Step 6
         ## The bookmark is remembered as "the node after which the clone
         ## goes" in the list of active formatting elements.
         ## NOTE(review): when the formatting element is the first entry
         ## (index 0) the subscript becomes -1 and selects the LAST entry of
         ## the list -- confirm whether that is intended.
2604     my $bookmark_prev_el
2605     = $active_formatting_elements->[$formatting_element_i_in_active - 1]
2606     ->[0];
2607    
2608     ## Step 7
2609     my $node = $furthest_block;
2610     my $node_i_in_open = $furthest_block_i_in_open;
2611     my $last_node = $furthest_block;
2612     S7: {
2613     ## Step 1
2614     $node_i_in_open--;
2615 wakaba 1.3 $node = $self->{open_elements}->[$node_i_in_open];
2616 wakaba 1.1
2617     ## Step 2
         ## If |$node| is in the list, |last S7S2| skips the removal below;
         ## otherwise the node is dropped from the stack and the outer loop
         ## restarts with the entry before it.
2618     my $node_i_in_active;
2619     S7S2: {
2620     for (reverse 0..$#$active_formatting_elements) {
2621     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2622     $node_i_in_active = $_;
2623     last S7S2;
2624     }
2625     }
2626 wakaba 1.3 splice @{$self->{open_elements}}, $node_i_in_open, 1;
2627 wakaba 1.1 redo S7;
2628     } # S7S2
2629    
2630     ## Step 3
2631     last S7 if $node->[0] eq $formatting_element->[0];
2632    
2633     ## Step 4
2634     if ($last_node->[0] eq $furthest_block->[0]) {
2635     $bookmark_prev_el = $node->[0];
2636     }
2637    
2638     ## Step 5
         ## A node that already has children is replaced, in both the list
         ## and the stack, by a clone made with |clone_node (0)|.
2639     if ($node->[0]->has_child_nodes ()) {
2640     my $clone = [$node->[0]->clone_node (0), $node->[1]];
2641     $active_formatting_elements->[$node_i_in_active] = $clone;
2642 wakaba 1.3 $self->{open_elements}->[$node_i_in_open] = $clone;
2643 wakaba 1.1 $node = $clone;
2644     }
2645    
2646     ## Step 6
2647     $node->[0]->append_child ($last_node->[0]);
2648    
2649     ## Step 7
2650     $last_node = $node;
2651    
2652     ## Step 8
2653     redo S7;
2654     } # S7
2655    
2656     ## Step 8
2657     $common_ancestor_node->[0]->append_child ($last_node->[0]);
2658    
2659     ## Step 9
2660     my $clone = [$formatting_element->[0]->clone_node (0),
2661     $formatting_element->[1]];
2662    
2663     ## Step 10
         ## Copy the child list first: appending to the clone changes the
         ## furthest block's children while they are being moved.
2664     my @cn = @{$furthest_block->[0]->child_nodes};
2665     $clone->[0]->append_child ($_) for @cn;
2666    
2667     ## Step 11
2668     $furthest_block->[0]->append_child ($clone->[0]);
2669    
2670     ## Step 12
         ## Remove the formatting element from the list and insert the
         ## clone after the bookmark.  If the bookmark was seen first (it
         ## has the higher index) its index shifts down by one when the
         ## formatting element is spliced out.
2671     my $i;
2672     AFE: for (reverse 0..$#$active_formatting_elements) {
2673     if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
2674     splice @$active_formatting_elements, $_, 1;
2675     $i-- and last AFE if defined $i;
2676     } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
2677     $i = $_;
2678     }
2679     } # AFE
2680     splice @$active_formatting_elements, $i + 1, 0, $clone;
2681    
2682     ## Step 13
         ## Remove the formatting element from the stack and INSERT the
         ## clone immediately after the furthest block.  The splice length
         ## is 0 so that an entry already following the furthest block is
         ## kept on the stack instead of being overwritten.
2683     undef $i;
2684 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
2685     if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
2686     splice @{$self->{open_elements}}, $_, 1;
2687 wakaba 1.1 $i-- and last OE if defined $i;
2688 wakaba 1.3 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
2689 wakaba 1.1 $i = $_;
2690     }
2691     } # OE
2692 wakaba 1.3 splice @{$self->{open_elements}}, $i + 1, 0, $clone;
2693 wakaba 1.1
2694     ## Step 14
2695     redo FET;
2696     } # FET
2697     }; # $formatting_end_tag
2698    
2699     my $insert_to_current = sub {
         ## Insertion callback: append the node given as the first argument
         ## to the current node, i.e. the node of the last entry in
         ## |$self->{open_elements}|.
2700 wakaba 1.25 $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
2701 wakaba 1.1 }; # $insert_to_current
2702    
2703     my $insert_to_foster = sub {
         ## Insertion callback with foster parenting.
         ##
         ## Argument: |$child|, the node to insert.
         ##
         ## When the current node is a |table|, |tbody|, |tfoot|, |thead| or
         ## |tr|, the child is not appended to it.  Instead the nearest
         ## |table| on the open-element stack is located.  If that table has
         ## an element parent (|node_type| 1), the child is inserted into
         ## the parent just before the table.  Otherwise it goes into the
         ## element preceding the table on the stack.  If no |table| is on
         ## the stack, the first stack entry is used.  In every other case
         ## the child is simply appended to the current node.
2704     my $child = shift;
2705     if ({
2706     table => 1, tbody => 1, tfoot => 1,
2707     thead => 1, tr => 1,
2708 wakaba 1.3 }->{$self->{open_elements}->[-1]->[1]}) {
2709 wakaba 1.1 # MUST
2710     my $foster_parent_element;
2711     my $next_sibling;
2712 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
2713     if ($self->{open_elements}->[$_]->[1] eq 'table') {
2714     my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
2715 wakaba 1.1 if (defined $parent and $parent->node_type == 1) {
2716     $foster_parent_element = $parent;
2717 wakaba 1.3 $next_sibling = $self->{open_elements}->[$_]->[0];
2718 wakaba 1.1 } else {
2719     $foster_parent_element
2720 wakaba 1.3 = $self->{open_elements}->[$_ - 1]->[0];
2721 wakaba 1.1 }
2722     last OE;
2723     }
2724     } # OE
2725 wakaba 1.3 $foster_parent_element = $self->{open_elements}->[0]->[0]
2726 wakaba 1.1 unless defined $foster_parent_element;
         ## |$next_sibling| stays undefined unless the table's parent was
         ## used.  NOTE(review): |insert_before| with an undefined reference
         ## node presumably appends -- confirm against the DOM
         ## implementation.
2727     $foster_parent_element->insert_before
2728     ($child, $next_sibling);
2729     } else {
2730 wakaba 1.3 $self->{open_elements}->[-1]->[0]->append_child ($child);
2731 wakaba 1.1 }
2732     }; # $insert_to_foster
2733    
2734 wakaba 1.52 my $insert;
2735 wakaba 1.34
2736 wakaba 1.52 B: {
2737 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
2738 wakaba 1.52 !!!parse-error (type => 'DOCTYPE in the middle');
2739     ## Ignore the token
2740     ## Stay in the phase
2741     !!!next-token;
2742     redo B;
2743 wakaba 1.55 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
2744 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2745 wakaba 1.52 #
2746     } else {
2747     ## Generate implied end tags
2748     if ({
2749     dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2750     tbody => 1, tfoot=> 1, thead => 1,
2751     }->{$self->{open_elements}->[-1]->[1]}) {
2752     !!!back-token;
2753 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $self->{open_elements}->[-1]->[1]};
2754 wakaba 1.52 redo B;
2755     }
2756    
2757     if (@{$self->{open_elements}} > 2 or
2758     (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
2759     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2760     } elsif (defined $self->{inner_html_node} and
2761     @{$self->{open_elements}} > 1 and
2762     $self->{open_elements}->[1]->[1] ne 'body') {
2763     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2764 wakaba 1.34 }
2765    
2766 wakaba 1.52 ## ISSUE: There is an issue in the spec.
2767     }
2768    
2769     ## Stop parsing
2770     last B;
2771 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN and
2772 wakaba 1.52 $token->{tag_name} eq 'html') {
2773 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
2774 wakaba 1.52 ## Turn into the main phase
2775     !!!parse-error (type => 'after html:html');
2776 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
2777     } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
2778 wakaba 1.52 ## Turn into the main phase
2779     !!!parse-error (type => 'after html:html');
2780 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
2781 wakaba 1.52 }
2782    
2783     ## ISSUE: "aa<html>" is not a parse error.
2784     ## ISSUE: "<html>" in fragment is not a parse error.
2785     unless ($token->{first_start_tag}) {
2786     !!!parse-error (type => 'not first start tag');
2787     }
2788     my $top_el = $self->{open_elements}->[0]->[0];
2789     for my $attr_name (keys %{$token->{attributes}}) {
2790     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2791     $top_el->set_attribute_ns
2792     (undef, [undef, $attr_name],
2793     $token->{attributes}->{$attr_name}->{value});
2794     }
2795     }
2796     !!!next-token;
2797     redo B;
2798 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
2799 wakaba 1.52 my $comment = $self->{document}->create_comment ($token->{data});
2800 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2801 wakaba 1.52 $self->{document}->append_child ($comment);
2802 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
2803 wakaba 1.52 $self->{open_elements}->[0]->[0]->append_child ($comment);
2804     } else {
2805     $self->{open_elements}->[-1]->[0]->append_child ($comment);
2806     }
2807     !!!next-token;
2808     redo B;
2809 wakaba 1.56 } elsif ($self->{insertion_mode} & HEAD_IMS) {
2810 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
2811 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2812     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
2813     unless (length $token->{data}) {
2814     !!!next-token;
2815     redo B;
2816 wakaba 1.1 }
2817     }
2818 wakaba 1.52
2819 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2820 wakaba 1.52 ## As if <head>
2821     !!!create-element ($self->{head_element}, 'head');
2822     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2823     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2824    
2825     ## Reprocess in the "in head" insertion mode...
2826     pop @{$self->{open_elements}};
2827    
2828     ## Reprocess in the "after head" insertion mode...
2829 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2830 wakaba 1.52 ## As if </noscript>
2831     pop @{$self->{open_elements}};
2832     !!!parse-error (type => 'in noscript:#character');
2833 wakaba 1.1
2834 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
2835     ## As if </head>
2836     pop @{$self->{open_elements}};
2837    
2838     ## Reprocess in the "after head" insertion mode...
2839 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2840 wakaba 1.52 pop @{$self->{open_elements}};
2841    
2842     ## Reprocess in the "after head" insertion mode...
2843 wakaba 1.1 }
2844 wakaba 1.52
2845     ## "after head" insertion mode
2846     ## As if <body>
2847     !!!insert-element ('body');
2848 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
2849 wakaba 1.52 ## reprocess
2850     redo B;
2851 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
2852 wakaba 1.52 if ($token->{tag_name} eq 'head') {
2853 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2854 wakaba 1.52 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
2855     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2856     push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
2857 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2858 wakaba 1.52 !!!next-token;
2859     redo B;
2860 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2861     #
2862     } else {
2863 wakaba 1.52 !!!parse-error (type => 'in head:head'); # or in head noscript
2864     ## Ignore the token
2865     !!!next-token;
2866     redo B;
2867     }
2868 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2869 wakaba 1.52 ## As if <head>
2870     !!!create-element ($self->{head_element}, 'head');
2871     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2872     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2873    
2874 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2875 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
2876 wakaba 1.1 }
2877 wakaba 1.52
2878 wakaba 1.49 if ($token->{tag_name} eq 'base') {
2879 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2880 wakaba 1.49 ## As if </noscript>
2881     pop @{$self->{open_elements}};
2882     !!!parse-error (type => 'in noscript:base');
2883    
2884 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2885 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2886     }
2887    
2888     ## NOTE: There is a "as if in head" code clone.
2889 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2890 wakaba 1.49 !!!parse-error (type => 'after head:'.$token->{tag_name});
2891     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2892     }
2893     !!!insert-element ($token->{tag_name}, $token->{attributes});
2894     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2895     pop @{$self->{open_elements}}
2896 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2897 wakaba 1.49 !!!next-token;
2898     redo B;
2899     } elsif ($token->{tag_name} eq 'link') {
2900 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2901 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2902 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2903     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2904     }
2905     !!!insert-element ($token->{tag_name}, $token->{attributes});
2906     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2907     pop @{$self->{open_elements}}
2908 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2909 wakaba 1.1 !!!next-token;
2910 wakaba 1.25 redo B;
2911 wakaba 1.34 } elsif ($token->{tag_name} eq 'meta') {
2912     ## NOTE: There is a "as if in head" code clone.
2913 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2914 wakaba 1.34 !!!parse-error (type => 'after head:'.$token->{tag_name});
2915     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2916     }
2917     !!!insert-element ($token->{tag_name}, $token->{attributes});
2918 wakaba 1.66 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2919 wakaba 1.34
2920     unless ($self->{confident}) {
2921     if ($token->{attributes}->{charset}) { ## TODO: And if supported
2922 wakaba 1.63 $self->{change_encoding}
2923     ->($self, $token->{attributes}->{charset}->{value});
2924 wakaba 1.66
2925     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
2926     ->set_user_data (manakai_has_reference =>
2927     $token->{attributes}->{charset}
2928     ->{has_reference});
2929 wakaba 1.63 } elsif ($token->{attributes}->{content}) {
2930 wakaba 1.35 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2931 wakaba 1.63 if ($token->{attributes}->{content}->{value}
2932 wakaba 1.70 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
2933     [\x09-\x0D\x20]*=
2934 wakaba 1.34 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2935     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2936 wakaba 1.63 $self->{change_encoding}
2937     ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
2938 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
2939     ->set_user_data (manakai_has_reference =>
2940     $token->{attributes}->{content}
2941     ->{has_reference});
2942 wakaba 1.63 }
2943 wakaba 1.34 }
2944 wakaba 1.66 } else {
2945     if ($token->{attributes}->{charset}) {
2946     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
2947     ->set_user_data (manakai_has_reference =>
2948     $token->{attributes}->{charset}
2949     ->{has_reference});
2950     }
2951 wakaba 1.68 if ($token->{attributes}->{content}) {
2952     $meta_el->[0]->get_attribute_node_ns (undef, 'content')
2953     ->set_user_data (manakai_has_reference =>
2954     $token->{attributes}->{content}
2955     ->{has_reference});
2956     }
2957 wakaba 1.34 }
2958    
2959     pop @{$self->{open_elements}}
2960 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2961 wakaba 1.34 !!!next-token;
2962     redo B;
2963 wakaba 1.49 } elsif ($token->{tag_name} eq 'title') {
2964 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2965 wakaba 1.49 ## As if </noscript>
2966     pop @{$self->{open_elements}};
2967     !!!parse-error (type => 'in noscript:title');
2968    
2969 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
2970 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
2971 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2972 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2973     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2974     }
2975 wakaba 1.49
2976     ## NOTE: There is a "as if in head" code clone.
2977 wakaba 1.31 my $parent = defined $self->{head_element} ? $self->{head_element}
2978     : $self->{open_elements}->[-1]->[0];
2979 wakaba 1.40 $parse_rcdata->(RCDATA_CONTENT_MODEL,
2980     sub { $parent->append_child ($_[0]) });
2981 wakaba 1.25 pop @{$self->{open_elements}}
2982 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2983 wakaba 1.25 redo B;
2984     } elsif ($token->{tag_name} eq 'style') {
2985     ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
2986 wakaba 1.54 ## insertion mode IN_HEAD_IM)
2987 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
2988 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2989 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
2990     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2991     }
2992 wakaba 1.40 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
2993 wakaba 1.25 pop @{$self->{open_elements}}
2994 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
2995 wakaba 1.25 redo B;
2996     } elsif ($token->{tag_name} eq 'noscript') {
2997 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_IM) {
2998 wakaba 1.25 ## NOTE: and scripting is disabled
2999     !!!insert-element ($token->{tag_name}, $token->{attributes});
3000 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
3001 wakaba 1.1 !!!next-token;
3002 wakaba 1.25 redo B;
3003 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3004 wakaba 1.30 !!!parse-error (type => 'in noscript:noscript');
3005 wakaba 1.1 ## Ignore the token
3006 wakaba 1.41 !!!next-token;
3007 wakaba 1.25 redo B;
3008 wakaba 1.1 } else {
3009 wakaba 1.25 #
3010 wakaba 1.1 }
3011 wakaba 1.49 } elsif ($token->{tag_name} eq 'script') {
3012 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3013 wakaba 1.49 ## As if </noscript>
3014     pop @{$self->{open_elements}};
3015     !!!parse-error (type => 'in noscript:script');
3016    
3017 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3018 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
3019 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3020 wakaba 1.25 !!!parse-error (type => 'after head:'.$token->{tag_name});
3021     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3022     }
3023 wakaba 1.49
3024 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
3025     $script_start_tag->($insert_to_current);
3026     pop @{$self->{open_elements}}
3027 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
3028 wakaba 1.1 redo B;
3029 wakaba 1.49 } elsif ($token->{tag_name} eq 'body' or
3030 wakaba 1.25 $token->{tag_name} eq 'frameset') {
3031 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3032 wakaba 1.49 ## As if </noscript>
3033     pop @{$self->{open_elements}};
3034     !!!parse-error (type => 'in noscript:'.$token->{tag_name});
3035    
3036     ## Reprocess in the "in head" insertion mode...
3037     ## As if </head>
3038     pop @{$self->{open_elements}};
3039    
3040     ## Reprocess in the "after head" insertion mode...
3041 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3042 wakaba 1.49 pop @{$self->{open_elements}};
3043    
3044     ## Reprocess in the "after head" insertion mode...
3045     }
3046    
3047     ## "after head" insertion mode
3048     !!!insert-element ($token->{tag_name}, $token->{attributes});
3049 wakaba 1.54 if ($token->{tag_name} eq 'body') {
3050     $self->{insertion_mode} = IN_BODY_IM;
3051     } elsif ($token->{tag_name} eq 'frameset') {
3052     $self->{insertion_mode} = IN_FRAMESET_IM;
3053     } else {
3054     die "$0: tag name: $self->{tag_name}";
3055     }
3056 wakaba 1.1 !!!next-token;
3057     redo B;
3058     } else {
3059     #
3060     }
3061 wakaba 1.49
3062 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3063 wakaba 1.49 ## As if </noscript>
3064     pop @{$self->{open_elements}};
3065     !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3066    
3067     ## Reprocess in the "in head" insertion mode...
3068     ## As if </head>
3069 wakaba 1.25 pop @{$self->{open_elements}};
3070 wakaba 1.49
3071     ## Reprocess in the "after head" insertion mode...
3072 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3073 wakaba 1.49 ## As if </head>
3074 wakaba 1.25 pop @{$self->{open_elements}};
3075 wakaba 1.49
3076     ## Reprocess in the "after head" insertion mode...
3077     }
3078    
3079     ## "after head" insertion mode
3080     ## As if <body>
3081     !!!insert-element ('body');
3082 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
3083 wakaba 1.49 ## reprocess
3084     redo B;
3085 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
3086 wakaba 1.49 if ($token->{tag_name} eq 'head') {
3087 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3088 wakaba 1.50 ## As if <head>
3089     !!!create-element ($self->{head_element}, 'head');
3090     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3091     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3092    
3093     ## Reprocess in the "in head" insertion mode...
3094     pop @{$self->{open_elements}};
3095 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3096 wakaba 1.50 !!!next-token;
3097     redo B;
3098 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3099 wakaba 1.49 ## As if </noscript>
3100     pop @{$self->{open_elements}};
3101     !!!parse-error (type => 'in noscript:script');
3102    
3103     ## Reprocess in the "in head" insertion mode...
3104 wakaba 1.50 pop @{$self->{open_elements}};
3105 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3106 wakaba 1.50 !!!next-token;
3107     redo B;
3108 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3109 wakaba 1.49 pop @{$self->{open_elements}};
3110 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3111 wakaba 1.49 !!!next-token;
3112     redo B;
3113     } else {
3114     #
3115     }
3116     } elsif ($token->{tag_name} eq 'noscript') {
3117 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3118 wakaba 1.49 pop @{$self->{open_elements}};
3119 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3120 wakaba 1.49 !!!next-token;
3121     redo B;
3122 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3123 wakaba 1.50 !!!parse-error (type => 'unmatched end tag:noscript');
3124     ## Ignore the token ## ISSUE: An issue in the spec.
3125     !!!next-token;
3126     redo B;
3127 wakaba 1.49 } else {
3128     #
3129     }
3130     } elsif ({
3131 wakaba 1.31 body => 1, html => 1,
3132     }->{$token->{tag_name}}) {
3133 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3134 wakaba 1.50 ## As if <head>
3135     !!!create-element ($self->{head_element}, 'head');
3136     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3137     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3138    
3139 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3140 wakaba 1.50 ## Reprocess in the "in head" insertion mode...
3141 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3142 wakaba 1.49 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3143     ## Ignore the token
3144     !!!next-token;
3145     redo B;
3146     }
3147 wakaba 1.50
3148     #
3149 wakaba 1.49 } elsif ({
3150 wakaba 1.31 p => 1, br => 1,
3151     }->{$token->{tag_name}}) {
3152 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3153 wakaba 1.50 ## As if <head>
3154     !!!create-element ($self->{head_element}, 'head');
3155     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3156     push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3157    
3158 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
3159 wakaba 1.50 ## Reprocess in the "in head" insertion mode...
3160     }
3161    
3162 wakaba 1.1 #
3163 wakaba 1.25 } else {
3164 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3165     #
3166     } else {
3167 wakaba 1.49 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3168     ## Ignore the token
3169     !!!next-token;
3170     redo B;
3171     }
3172     }
3173    
3174 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3175 wakaba 1.49 ## As if </noscript>
3176     pop @{$self->{open_elements}};
3177     !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3178    
3179     ## Reprocess in the "in head" insertion mode...
3180     ## As if </head>
3181     pop @{$self->{open_elements}};
3182    
3183     ## Reprocess in the "after head" insertion mode...
3184 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3185 wakaba 1.49 ## As if </head>
3186     pop @{$self->{open_elements}};
3187    
3188     ## Reprocess in the "after head" insertion mode...
3189 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3190 wakaba 1.50 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3191     ## Ignore the token ## ISSUE: An issue in the spec.
3192     !!!next-token;
3193     redo B;
3194 wakaba 1.1 }
3195    
3196 wakaba 1.49 ## "after head" insertion mode
3197     ## As if <body>
3198 wakaba 1.52 !!!insert-element ('body');
3199 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
3200 wakaba 1.52 ## reprocess
3201     redo B;
3202     } else {
3203     die "$0: $token->{type}: Unknown token type";
3204     }
3205    
3206     ## ISSUE: An issue in the spec.
3207 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_IMS) {
3208 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
3209 wakaba 1.52 ## NOTE: There is a code clone of "character in body".
3210     $reconstruct_active_formatting_elements->($insert_to_current);
3211    
3212     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3213    
3214     !!!next-token;
3215     redo B;
3216 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
3217 wakaba 1.52 if ({
3218     caption => 1, col => 1, colgroup => 1, tbody => 1,
3219     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3220     }->{$token->{tag_name}}) {
3221 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
3222 wakaba 1.52 ## have an element in table scope
3223     my $tn;
3224     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3225     my $node = $self->{open_elements}->[$_];
3226     if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3227     $tn = $node->[1];
3228     last INSCOPE;
3229     } elsif ({
3230     table => 1, html => 1,
3231     }->{$node->[1]}) {
3232     last INSCOPE;
3233     }
3234     } # INSCOPE
3235     unless (defined $tn) {
3236     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3237     ## Ignore the token
3238     !!!next-token;
3239     redo B;
3240     }
3241    
3242     ## Close the cell
3243     !!!back-token; # <?>
3244 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3245 wakaba 1.52 redo B;
3246 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3247 wakaba 1.52 !!!parse-error (type => 'not closed:caption');
3248    
3249     ## As if </caption>
3250     ## have a table element in table scope
3251     my $i;
3252     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3253     my $node = $self->{open_elements}->[$_];
3254     if ($node->[1] eq 'caption') {
3255     $i = $_;
3256     last INSCOPE;
3257     } elsif ({
3258     table => 1, html => 1,
3259     }->{$node->[1]}) {
3260     last INSCOPE;
3261     }
3262     } # INSCOPE
3263     unless (defined $i) {
3264     !!!parse-error (type => 'unmatched end tag:caption');
3265     ## Ignore the token
3266     !!!next-token;
3267     redo B;
3268     }
3269    
3270     ## generate implied end tags
3271     if ({
3272     dd => 1, dt => 1, li => 1, p => 1,
3273     td => 1, th => 1, tr => 1,
3274     tbody => 1, tfoot=> 1, thead => 1,
3275     }->{$self->{open_elements}->[-1]->[1]}) {
3276     !!!back-token; # <?>
3277 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3278 wakaba 1.52 !!!back-token;
3279 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3280 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3281     redo B;
3282     }
3283    
3284     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3285     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3286     }
3287    
3288     splice @{$self->{open_elements}}, $i;
3289    
3290     $clear_up_to_marker->();
3291    
3292 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3293 wakaba 1.52
3294     ## reprocess
3295     redo B;
3296     } else {
3297     #
3298     }
3299     } else {
3300     #
3301     }
3302 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
3303 wakaba 1.52 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3304 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
3305 wakaba 1.43 ## have an element in table scope
3306 wakaba 1.52 my $i;
3307 wakaba 1.43 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3308     my $node = $self->{open_elements}->[$_];
3309 wakaba 1.52 if ($node->[1] eq $token->{tag_name}) {
3310     $i = $_;
3311 wakaba 1.43 last INSCOPE;
3312     } elsif ({
3313     table => 1, html => 1,
3314     }->{$node->[1]}) {
3315     last INSCOPE;
3316     }
3317     } # INSCOPE
3318 wakaba 1.52 unless (defined $i) {
3319 wakaba 1.43 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3320     ## Ignore the token
3321     !!!next-token;
3322     redo B;
3323     }
3324    
3325 wakaba 1.52 ## generate implied end tags
3326     if ({
3327     dd => 1, dt => 1, li => 1, p => 1,
3328     td => ($token->{tag_name} eq 'th'),
3329     th => ($token->{tag_name} eq 'td'),
3330     tr => 1,
3331     tbody => 1, tfoot=> 1, thead => 1,
3332     }->{$self->{open_elements}->[-1]->[1]}) {
3333     !!!back-token;
3334 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3335 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3336     redo B;
3337     }
3338    
3339     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3340     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3341     }
3342    
3343     splice @{$self->{open_elements}}, $i;
3344    
3345     $clear_up_to_marker->();
3346    
3347 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
3348 wakaba 1.52
3349     !!!next-token;
3350 wakaba 1.43 redo B;
3351 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3352 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3353     ## Ignore the token
3354     !!!next-token;
3355     redo B;
3356     } else {
3357     #
3358     }
3359     } elsif ($token->{tag_name} eq 'caption') {
3360 wakaba 1.54 if ($self->{insertion_mode} == IN_CAPTION_IM) {
3361 wakaba 1.43 ## have a table element in table scope
3362     my $i;
3363     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3364     my $node = $self->{open_elements}->[$_];
3365 wakaba 1.52 if ($node->[1] eq $token->{tag_name}) {
3366 wakaba 1.43 $i = $_;
3367     last INSCOPE;
3368     } elsif ({
3369     table => 1, html => 1,
3370     }->{$node->[1]}) {
3371     last INSCOPE;
3372     }
3373     } # INSCOPE
3374     unless (defined $i) {
3375 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3376 wakaba 1.43 ## Ignore the token
3377     !!!next-token;
3378     redo B;
3379     }
3380    
3381     ## generate implied end tags
3382     if ({
3383     dd => 1, dt => 1, li => 1, p => 1,
3384     td => 1, th => 1, tr => 1,
3385     tbody => 1, tfoot=> 1, thead => 1,
3386     }->{$self->{open_elements}->[-1]->[1]}) {
3387     !!!back-token;
3388 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3389 wakaba 1.43 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3390     redo B;
3391     }
3392 wakaba 1.52
3393     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3394     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3395     }
3396    
3397     splice @{$self->{open_elements}}, $i;
3398    
3399     $clear_up_to_marker->();
3400    
3401 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3402 wakaba 1.52
3403     !!!next-token;
3404     redo B;
3405 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
3406 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3407     ## Ignore the token
3408     !!!next-token;
3409     redo B;
3410     } else {
3411     #
3412     }
3413     } elsif ({
3414     table => 1, tbody => 1, tfoot => 1,
3415     thead => 1, tr => 1,
3416     }->{$token->{tag_name}} and
3417 wakaba 1.54 $self->{insertion_mode} == IN_CELL_IM) {
3418 wakaba 1.52 ## have an element in table scope
3419     my $i;
3420     my $tn;
3421     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3422     my $node = $self->{open_elements}->[$_];
3423     if ($node->[1] eq $token->{tag_name}) {
3424     $i = $_;
3425     last INSCOPE;
3426     } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3427     $tn = $node->[1];
3428     ## NOTE: There is exactly one |td| or |th| element
3429     ## in scope in the stack of open elements by definition.
3430     } elsif ({
3431     table => 1, html => 1,
3432     }->{$node->[1]}) {
3433     last INSCOPE;
3434     }
3435     } # INSCOPE
3436     unless (defined $i) {
3437     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3438     ## Ignore the token
3439     !!!next-token;
3440     redo B;
3441     }
3442    
3443     ## Close the cell
3444     !!!back-token; # </?>
3445 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3446 wakaba 1.52 redo B;
3447     } elsif ($token->{tag_name} eq 'table' and
3448 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
3449 wakaba 1.52 !!!parse-error (type => 'not closed:caption');
3450    
3451     ## As if </caption>
3452     ## have a table element in table scope
3453     my $i;
3454     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3455     my $node = $self->{open_elements}->[$_];
3456     if ($node->[1] eq 'caption') {
3457     $i = $_;
3458     last INSCOPE;
3459     } elsif ({
3460     table => 1, html => 1,
3461     }->{$node->[1]}) {
3462     last INSCOPE;
3463     }
3464     } # INSCOPE
3465     unless (defined $i) {
3466     !!!parse-error (type => 'unmatched end tag:caption');
3467     ## Ignore the token
3468     !!!next-token;
3469     redo B;
3470     }
3471    
3472     ## generate implied end tags
3473     if ({
3474     dd => 1, dt => 1, li => 1, p => 1,
3475     td => 1, th => 1, tr => 1,
3476     tbody => 1, tfoot=> 1, thead => 1,
3477     }->{$self->{open_elements}->[-1]->[1]}) {
3478     !!!back-token; # </table>
3479 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3480 wakaba 1.52 !!!back-token;
3481 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3482 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3483     redo B;
3484     }
3485    
3486     if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3487     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3488     }
3489    
3490     splice @{$self->{open_elements}}, $i;
3491    
3492     $clear_up_to_marker->();
3493    
3494 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3495 wakaba 1.52
3496     ## reprocess
3497     redo B;
3498     } elsif ({
3499     body => 1, col => 1, colgroup => 1, html => 1,
3500     }->{$token->{tag_name}}) {
3501 wakaba 1.56 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
3502 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3503     ## Ignore the token
3504     !!!next-token;
3505     redo B;
3506     } else {
3507     #
3508     }
3509     } elsif ({
3510     tbody => 1, tfoot => 1,
3511     thead => 1, tr => 1,
3512     }->{$token->{tag_name}} and
3513 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
3514 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3515     ## Ignore the token
3516     !!!next-token;
3517     redo B;
3518     } else {
3519     #
3520     }
3521     } else {
3522     die "$0: $token->{type}: Unknown token type";
3523     }
3524    
3525     $insert = $insert_to_current;
3526     #
3527 wakaba 1.56 } elsif ($self->{insertion_mode} & TABLE_IMS) {
3528 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
3529 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3530     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3531    
3532     unless (length $token->{data}) {
3533     !!!next-token;
3534     redo B;
3535     }
3536     }
3537    
3538     !!!parse-error (type => 'in table:#character');
3539    
3540     ## As if in body, but insert into foster parent element
3541     ## ISSUE: Spec says that "whenever a node would be inserted
3542     ## into the current node" while characters might not
3543     ## result in a new Text node.
3544     $reconstruct_active_formatting_elements->($insert_to_foster);
3545    
3546     if ({
3547     table => 1, tbody => 1, tfoot => 1,
3548     thead => 1, tr => 1,
3549     }->{$self->{open_elements}->[-1]->[1]}) {
3550     # MUST
3551     my $foster_parent_element;
3552     my $next_sibling;
3553     my $prev_sibling;
3554     OE: for (reverse 0..$#{$self->{open_elements}}) {
3555     if ($self->{open_elements}->[$_]->[1] eq 'table') {
3556     my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3557     if (defined $parent and $parent->node_type == 1) {
3558     $foster_parent_element = $parent;
3559     $next_sibling = $self->{open_elements}->[$_]->[0];
3560     $prev_sibling = $next_sibling->previous_sibling;
3561     } else {
3562     $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3563     $prev_sibling = $foster_parent_element->last_child;
3564     }
3565     last OE;
3566     }
3567     } # OE
3568     $foster_parent_element = $self->{open_elements}->[0]->[0] and
3569     $prev_sibling = $foster_parent_element->last_child
3570     unless defined $foster_parent_element;
3571     if (defined $prev_sibling and
3572     $prev_sibling->node_type == 3) {
3573     $prev_sibling->manakai_append_text ($token->{data});
3574     } else {
3575     $foster_parent_element->insert_before
3576     ($self->{document}->create_text_node ($token->{data}),
3577     $next_sibling);
3578     }
3579     } else {
3580     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3581     }
3582    
3583     !!!next-token;
3584     redo B;
3585 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
3586 wakaba 1.52 if ({
3587 wakaba 1.54 tr => ($self->{insertion_mode} != IN_ROW_IM),
3588 wakaba 1.52 th => 1, td => 1,
3589     }->{$token->{tag_name}}) {
3590 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_IM) {
3591 wakaba 1.52 ## Clear back to table context
3592     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3593     $self->{open_elements}->[-1]->[1] ne 'html') {
3594 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3595 wakaba 1.52 pop @{$self->{open_elements}};
3596 wakaba 1.43 }
3597    
3598 wakaba 1.52 !!!insert-element ('tbody');
3599 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3600 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
3601     }
3602    
3603 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3604 wakaba 1.52 unless ($token->{tag_name} eq 'tr') {
3605     !!!parse-error (type => 'missing start tag:tr');
3606     }
3607 wakaba 1.43
3608 wakaba 1.52 ## Clear back to table body context
3609     while (not {
3610     tbody => 1, tfoot => 1, thead => 1, html => 1,
3611     }->{$self->{open_elements}->[-1]->[1]}) {
3612     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3613     pop @{$self->{open_elements}};
3614     }
3615 wakaba 1.43
3616 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
3617 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
3618     !!!insert-element ($token->{tag_name}, $token->{attributes});
3619     !!!next-token;
3620     redo B;
3621     } else {
3622     !!!insert-element ('tr');
3623     ## reprocess in the "in row" insertion mode
3624     }
3625     }
3626    
3627     ## Clear back to table row context
3628     while (not {
3629     tr => 1, html => 1,
3630     }->{$self->{open_elements}->[-1]->[1]}) {
3631     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3632     pop @{$self->{open_elements}};
3633 wakaba 1.43 }
3634 wakaba 1.52
3635     !!!insert-element ($token->{tag_name}, $token->{attributes});
3636 wakaba 1.54 $self->{insertion_mode} = IN_CELL_IM;
3637 wakaba 1.52
3638     push @$active_formatting_elements, ['#marker', ''];
3639    
3640     !!!next-token;
3641     redo B;
3642     } elsif ({
3643     caption => 1, col => 1, colgroup => 1,
3644     tbody => 1, tfoot => 1, thead => 1,
3645 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3646 wakaba 1.52 }->{$token->{tag_name}}) {
3647 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3648 wakaba 1.52 ## As if </tr>
3649 wakaba 1.43 ## have an element in table scope
3650     my $i;
3651     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3652     my $node = $self->{open_elements}->[$_];
3653 wakaba 1.52 if ($node->[1] eq 'tr') {
3654 wakaba 1.43 $i = $_;
3655     last INSCOPE;
3656     } elsif ({
3657     table => 1, html => 1,
3658     }->{$node->[1]}) {
3659     last INSCOPE;
3660     }
3661     } # INSCOPE
3662 wakaba 1.52 unless (defined $i) {
3663     !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
3664     ## Ignore the token
3665     !!!next-token;
3666 wakaba 1.43 redo B;
3667     }
3668    
3669 wakaba 1.52 ## Clear back to table row context
3670     while (not {
3671     tr => 1, html => 1,
3672     }->{$self->{open_elements}->[-1]->[1]}) {
3673 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3674 wakaba 1.52 pop @{$self->{open_elements}};
3675 wakaba 1.1 }
3676 wakaba 1.43
3677 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3678 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3679 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
3680     ## reprocess
3681     redo B;
3682     } else {
3683     ## reprocess in the "in table body" insertion mode...
3684     }
3685 wakaba 1.1 }
3686 wakaba 1.52
3687 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3688 wakaba 1.52 ## have an element in table scope
3689 wakaba 1.43 my $i;
3690     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3691     my $node = $self->{open_elements}->[$_];
3692 wakaba 1.52 if ({
3693     tbody => 1, thead => 1, tfoot => 1,
3694     }->{$node->[1]}) {
3695 wakaba 1.43 $i = $_;
3696     last INSCOPE;
3697     } elsif ({
3698     table => 1, html => 1,
3699     }->{$node->[1]}) {
3700     last INSCOPE;
3701     }
3702     } # INSCOPE
3703 wakaba 1.52 unless (defined $i) {
3704     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3705     ## Ignore the token
3706     !!!next-token;
3707 wakaba 1.43 redo B;
3708     }
3709 wakaba 1.52
3710     ## Clear back to table body context
3711     while (not {
3712     tbody => 1, tfoot => 1, thead => 1, html => 1,
3713     }->{$self->{open_elements}->[-1]->[1]}) {
3714 wakaba 1.43 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3715 wakaba 1.52 pop @{$self->{open_elements}};
3716 wakaba 1.43 }
3717    
3718 wakaba 1.52 ## As if <{current node}>
3719     ## have an element in table scope
3720     ## true by definition
3721 wakaba 1.43
3722 wakaba 1.52 ## Clear back to table body context
3723     ## nop by definition
3724 wakaba 1.43
3725 wakaba 1.52 pop @{$self->{open_elements}};
3726 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3727 wakaba 1.52 ## reprocess in "in table" insertion mode...
3728     }
3729    
3730     if ($token->{tag_name} eq 'col') {
3731     ## Clear back to table context
3732     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3733     $self->{open_elements}->[-1]->[1] ne 'html') {
3734     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3735     pop @{$self->{open_elements}};
3736     }
3737 wakaba 1.43
3738 wakaba 1.52 !!!insert-element ('colgroup');
3739 wakaba 1.54 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
3740 wakaba 1.52 ## reprocess
3741 wakaba 1.43 redo B;
3742 wakaba 1.52 } elsif ({
3743     caption => 1,
3744     colgroup => 1,
3745     tbody => 1, tfoot => 1, thead => 1,
3746     }->{$token->{tag_name}}) {
3747     ## Clear back to table context
3748     while ($self->{open_elements}->[-1]->[1] ne 'table' and
3749     $self->{open_elements}->[-1]->[1] ne 'html') {
3750     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3751     pop @{$self->{open_elements}};
3752 wakaba 1.1 }
3753 wakaba 1.52
3754     push @$active_formatting_elements, ['#marker', '']
3755     if $token->{tag_name} eq 'caption';
3756    
3757     !!!insert-element ($token->{tag_name}, $token->{attributes});
3758     $self->{insertion_mode} = {
3759 wakaba 1.54 caption => IN_CAPTION_IM,
3760     colgroup => IN_COLUMN_GROUP_IM,
3761     tbody => IN_TABLE_BODY_IM,
3762     tfoot => IN_TABLE_BODY_IM,
3763     thead => IN_TABLE_BODY_IM,
3764 wakaba 1.52 }->{$token->{tag_name}};
3765 wakaba 1.1 !!!next-token;
3766     redo B;
3767 wakaba 1.52 } else {
3768     die "$0: in table: <>: $token->{tag_name}";
3769 wakaba 1.1 }
3770 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
3771     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3772 wakaba 1.1
3773 wakaba 1.52 ## As if </table>
3774 wakaba 1.1 ## have a table element in table scope
3775     my $i;
3776 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3777     my $node = $self->{open_elements}->[$_];
3778 wakaba 1.52 if ($node->[1] eq 'table') {
3779 wakaba 1.1 $i = $_;
3780     last INSCOPE;
3781     } elsif ({
3782     table => 1, html => 1,
3783     }->{$node->[1]}) {
3784     last INSCOPE;
3785     }
3786     } # INSCOPE
3787     unless (defined $i) {
3788 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:table');
3789     ## Ignore tokens </table><table>
3790 wakaba 1.1 !!!next-token;
3791     redo B;
3792     }
3793    
3794     ## generate implied end tags
3795     if ({
3796     dd => 1, dt => 1, li => 1, p => 1,
3797     td => 1, th => 1, tr => 1,
3798 wakaba 1.31 tbody => 1, tfoot=> 1, thead => 1,
3799 wakaba 1.3 }->{$self->{open_elements}->[-1]->[1]}) {
3800 wakaba 1.52 !!!back-token; # <table>
3801 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'table'};
3802 wakaba 1.1 !!!back-token;
3803 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3804 wakaba 1.3 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3805 wakaba 1.1 redo B;
3806     }
3807    
3808 wakaba 1.52 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3809 wakaba 1.3 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3810 wakaba 1.1 }
3811    
3812 wakaba 1.3 splice @{$self->{open_elements}}, $i;
3813 wakaba 1.1
3814 wakaba 1.52 $self->_reset_insertion_mode;
3815 wakaba 1.1
3816     ## reprocess
3817     redo B;
3818 wakaba 1.58 } else {
3819     !!!parse-error (type => 'in table:'.$token->{tag_name});
3820    
3821     $insert = $insert_to_foster;
3822     #
3823     }
3824     } elsif ($token->{type} == END_TAG_TOKEN) {
3825 wakaba 1.52 if ($token->{tag_name} eq 'tr' and
3826 wakaba 1.54 $self->{insertion_mode} == IN_ROW_IM) {
3827 wakaba 1.52 ## have an element in table scope
3828     my $i;
3829     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3830     my $node = $self->{open_elements}->[$_];
3831     if ($node->[1] eq $token->{tag_name}) {
3832     $i = $_;
3833     last INSCOPE;
3834     } elsif ({
3835     table => 1, html => 1,
3836     }->{$node->[1]}) {
3837     last INSCOPE;
3838     }
3839     } # INSCOPE
3840     unless (defined $i) {
3841     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3842     ## Ignore the token
3843 wakaba 1.42 !!!next-token;
3844     redo B;
3845     }
3846    
3847 wakaba 1.52 ## Clear back to table row context
3848     while (not {
3849     tr => 1, html => 1,
3850     }->{$self->{open_elements}->[-1]->[1]}) {
3851     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3852     pop @{$self->{open_elements}};
3853     }
3854 wakaba 1.42
3855 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3856 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3857 wakaba 1.52 !!!next-token;
3858     redo B;
3859     } elsif ($token->{tag_name} eq 'table') {
3860 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
3861 wakaba 1.52 ## As if </tr>
3862     ## have an element in table scope
3863     my $i;
3864     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3865     my $node = $self->{open_elements}->[$_];
3866     if ($node->[1] eq 'tr') {
3867     $i = $_;
3868     last INSCOPE;
3869     } elsif ({
3870     table => 1, html => 1,
3871     }->{$node->[1]}) {
3872     last INSCOPE;
3873 wakaba 1.42 }
3874 wakaba 1.52 } # INSCOPE
3875     unless (defined $i) {
3876     !!!parse-error (type => 'unmatched end tag:'.$token->{type});
3877     ## Ignore the token
3878     !!!next-token;
3879     redo B;
3880 wakaba 1.42 }
3881 wakaba 1.52
3882     ## Clear back to table row context
3883     while (not {
3884     tr => 1, html => 1,
3885     }->{$self->{open_elements}->[-1]->[1]}) {
3886 wakaba 1.46 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3887     pop @{$self->{open_elements}};
3888 wakaba 1.1 }
3889 wakaba 1.46
3890 wakaba 1.52 pop @{$self->{open_elements}}; # tr
3891 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3892 wakaba 1.46 ## reprocess in the "in table body" insertion mode...
3893 wakaba 1.1 }
3894    
3895 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3896 wakaba 1.52 ## have an element in table scope
3897     my $i;
3898     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3899     my $node = $self->{open_elements}->[$_];
3900     if ({
3901     tbody => 1, thead => 1, tfoot => 1,
3902     }->{$node->[1]}) {
3903     $i = $_;
3904     last INSCOPE;
3905     } elsif ({
3906     table => 1, html => 1,
3907     }->{$node->[1]}) {
3908     last INSCOPE;
3909     }
3910     } # INSCOPE
3911     unless (defined $i) {
3912     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3913     ## Ignore the token
3914     !!!next-token;
3915     redo B;
3916 wakaba 1.47 }
3917    
3918     ## Clear back to table body context
3919     while (not {
3920     tbody => 1, tfoot => 1, thead => 1, html => 1,
3921     }->{$self->{open_elements}->[-1]->[1]}) {
3922     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3923     pop @{$self->{open_elements}};
3924     }
3925    
3926 wakaba 1.52 ## As if <{current node}>
3927     ## have an element in table scope
3928     ## true by definition
3929    
3930     ## Clear back to table body context
3931     ## nop by definition
3932    
3933     pop @{$self->{open_elements}};
3934 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
3935 wakaba 1.52 ## reprocess in the "in table" insertion mode...
3936     }
3937    
3938     ## have a table element in table scope
3939     my $i;
3940     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3941     my $node = $self->{open_elements}->[$_];
3942     if ($node->[1] eq $token->{tag_name}) {
3943     $i = $_;
3944     last INSCOPE;
3945     } elsif ({
3946     table => 1, html => 1,
3947     }->{$node->[1]}) {
3948     last INSCOPE;
3949 wakaba 1.47 }
3950 wakaba 1.52 } # INSCOPE
3951     unless (defined $i) {
3952     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3953     ## Ignore the token
3954     !!!next-token;
3955     redo B;
3956 wakaba 1.3 }
3957    
3958 wakaba 1.52 ## generate implied end tags
3959     if ({
3960     dd => 1, dt => 1, li => 1, p => 1,
3961     td => 1, th => 1, tr => 1,
3962     tbody => 1, tfoot=> 1, thead => 1,
3963     }->{$self->{open_elements}->[-1]->[1]}) {
3964     !!!back-token;
3965 wakaba 1.55 $token = {type => END_TAG_TOKEN,
3966 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3967     redo B;
3968     }
3969    
3970     if ($self->{open_elements}->[-1]->[1] ne 'table') {
3971 wakaba 1.3 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3972 wakaba 1.1 }
3973 wakaba 1.52
3974     splice @{$self->{open_elements}}, $i;
3975 wakaba 1.1
3976 wakaba 1.52 $self->_reset_insertion_mode;
3977 wakaba 1.47
3978     !!!next-token;
3979     redo B;
3980     } elsif ({
3981 wakaba 1.48 tbody => 1, tfoot => 1, thead => 1,
3982 wakaba 1.52 }->{$token->{tag_name}} and
3983 wakaba 1.56 $self->{insertion_mode} & ROW_IMS) {
3984 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) { ## </tbody>, </tfoot> or </thead> seen while in a row: close the row first
3985 wakaba 1.52 ## have an element in table scope
3986     my $i;
3987     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3988     my $node = $self->{open_elements}->[$_];
3989     if ($node->[1] eq $token->{tag_name}) {
3990     $i = $_;
3991     last INSCOPE;
3992     } elsif ({
3993     table => 1, html => 1,
3994     }->{$node->[1]}) {
3995     last INSCOPE;
3996     }
3997     } # INSCOPE
3998     unless (defined $i) {
3999     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4000     ## Ignore the token
4001     !!!next-token;
4002     redo B;
4003     }
4004    
4005 wakaba 1.48 ## As if </tr>
4006     ## have an element in table scope
4007     undef $i; ## reset the $i declared above rather than masking it with a second |my| in the same scope
4008     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4009     my $node = $self->{open_elements}->[$_];
4010     if ($node->[1] eq 'tr') {
4011     $i = $_;
4012     last INSCOPE;
4013     } elsif ({
4014     table => 1, html => 1,
4015     }->{$node->[1]}) {
4016     last INSCOPE;
4017     }
4018     } # INSCOPE
4019 wakaba 1.52 unless (defined $i) {
4020     !!!parse-error (type => 'unmatched end tag:tr');
4021     ## Ignore the token
4022     !!!next-token;
4023     redo B;
4024     }
4025 wakaba 1.48
4026     ## Clear back to table row context
4027     while (not {
4028     tr => 1, html => 1,
4029     }->{$self->{open_elements}->[-1]->[1]}) {
4030     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4031     pop @{$self->{open_elements}};
4032     }
4033    
4034     pop @{$self->{open_elements}}; # tr
4035 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4036 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
4037     }
4038    
4039     ## have an element in table scope
4040     my $i;
4041     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4042     my $node = $self->{open_elements}->[$_];
4043     if ($node->[1] eq $token->{tag_name}) {
4044     $i = $_;
4045     last INSCOPE;
4046     } elsif ({
4047     table => 1, html => 1,
4048     }->{$node->[1]}) {
4049     last INSCOPE;
4050     }
4051     } # INSCOPE
4052     unless (defined $i) {
4053     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4054     ## Ignore the token
4055     !!!next-token;
4056     redo B;
4057     }
4058    
4059     ## Clear back to table body context
4060     while (not {
4061     tbody => 1, tfoot => 1, thead => 1, html => 1,
4062     }->{$self->{open_elements}->[-1]->[1]}) {
4063     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4064     pop @{$self->{open_elements}};
4065     }
4066    
4067     pop @{$self->{open_elements}};
4068 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4069 wakaba 1.52 !!!next-token;
4070     redo B;
4071     } elsif ({
4072     body => 1, caption => 1, col => 1, colgroup => 1,
4073     html => 1, td => 1, th => 1,
4074 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4075     tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
4076 wakaba 1.52 }->{$token->{tag_name}}) {
4077     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4078     ## Ignore the token
4079     !!!next-token;
4080     redo B;
4081 wakaba 1.58 } else {
4082     !!!parse-error (type => 'in table:/'.$token->{tag_name});
4083 wakaba 1.52
4084 wakaba 1.58 $insert = $insert_to_foster;
4085     #
4086     }
4087     } else {
4088     die "$0: $token->{type}: Unknown token type";
4089     }
4090 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4091 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4092 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4093     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4094     unless (length $token->{data}) {
4095     !!!next-token;
4096     redo B;
4097     }
4098     }
4099    
4100     #
4101 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4102 wakaba 1.52 if ($token->{tag_name} eq 'col') {
4103     !!!insert-element ($token->{tag_name}, $token->{attributes});
4104     pop @{$self->{open_elements}};
4105     !!!next-token;
4106     redo B;
4107     } else {
4108     #
4109     }
4110 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4111 wakaba 1.52 if ($token->{tag_name} eq 'colgroup') {
4112     if ($self->{open_elements}->[-1]->[1] eq 'html') {
4113     !!!parse-error (type => 'unmatched end tag:colgroup');
4114     ## Ignore the token
4115     !!!next-token;
4116     redo B;
4117     } else {
4118     pop @{$self->{open_elements}}; # colgroup
4119 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4120 wakaba 1.52 !!!next-token;
4121     redo B;
4122     }
4123     } elsif ($token->{tag_name} eq 'col') {
4124     !!!parse-error (type => 'unmatched end tag:col');
4125     ## Ignore the token
4126     !!!next-token;
4127     redo B;
4128     } else {
4129     #
4130     }
4131     } else {
4132     #
4133     }
4134    
4135     ## As if </colgroup>
4136     if ($self->{open_elements}->[-1]->[1] eq 'html') {
4137     !!!parse-error (type => 'unmatched end tag:colgroup');
4138     ## Ignore the token
4139     !!!next-token;
4140     redo B;
4141     } else {
4142     pop @{$self->{open_elements}}; # colgroup
4143 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4144 wakaba 1.52 ## reprocess
4145     redo B;
4146     }
4147 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
4148 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
4149     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4150     !!!next-token;
4151     redo B;
4152     } elsif ($token->{type} == START_TAG_TOKEN) {
4153 wakaba 1.52 if ($token->{tag_name} eq 'option') {
4154     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4155     ## As if </option>
4156     pop @{$self->{open_elements}};
4157     }
4158    
4159     !!!insert-element ($token->{tag_name}, $token->{attributes});
4160     !!!next-token;
4161     redo B;
4162     } elsif ($token->{tag_name} eq 'optgroup') {
4163     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4164     ## As if </option>
4165     pop @{$self->{open_elements}};
4166     }
4167    
4168     if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4169     ## As if </optgroup>
4170     pop @{$self->{open_elements}};
4171     }
4172    
4173     !!!insert-element ($token->{tag_name}, $token->{attributes});
4174     !!!next-token;
4175     redo B;
4176     } elsif ($token->{tag_name} eq 'select') {
4177     !!!parse-error (type => 'not closed:select');
4178     ## As if </select> instead
4179     ## have an element in table scope
4180     my $i;
4181     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4182     my $node = $self->{open_elements}->[$_];
4183     if ($node->[1] eq $token->{tag_name}) {
4184     $i = $_;
4185     last INSCOPE;
4186     } elsif ({
4187     table => 1, html => 1,
4188     }->{$node->[1]}) {
4189     last INSCOPE;
4190 wakaba 1.47 }
4191 wakaba 1.52 } # INSCOPE
4192     unless (defined $i) {
4193     !!!parse-error (type => 'unmatched end tag:select');
4194     ## Ignore the token
4195     !!!next-token;
4196     redo B;
4197 wakaba 1.47 }
4198 wakaba 1.52
4199     splice @{$self->{open_elements}}, $i;
4200    
4201     $self->_reset_insertion_mode;
4202 wakaba 1.47
4203 wakaba 1.52 !!!next-token;
4204     redo B;
4205 wakaba 1.58 } else {
4206     !!!parse-error (type => 'in select:'.$token->{tag_name});
4207     ## Ignore the token
4208     !!!next-token;
4209     redo B;
4210     }
4211     } elsif ($token->{type} == END_TAG_TOKEN) {
4212 wakaba 1.52 if ($token->{tag_name} eq 'optgroup') {
4213     if ($self->{open_elements}->[-1]->[1] eq 'option' and
4214     $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4215     ## As if </option>
4216     splice @{$self->{open_elements}}, -2;
4217     } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4218     pop @{$self->{open_elements}};
4219     } else {
4220     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4221     ## Ignore the token
4222     }
4223     !!!next-token;
4224     redo B;
4225     } elsif ($token->{tag_name} eq 'option') {
4226     if ($self->{open_elements}->[-1]->[1] eq 'option') {
4227 wakaba 1.47 pop @{$self->{open_elements}};
4228 wakaba 1.52 } else {
4229     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4230     ## Ignore the token
4231 wakaba 1.1 }
4232 wakaba 1.52 !!!next-token;
4233     redo B;
4234     } elsif ($token->{tag_name} eq 'select') {
4235     ## have an element in table scope
4236     my $i;
4237     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4238     my $node = $self->{open_elements}->[$_];
4239     if ($node->[1] eq $token->{tag_name}) {
4240     $i = $_;
4241     last INSCOPE;
4242     } elsif ({
4243     table => 1, html => 1,
4244     }->{$node->[1]}) {
4245     last INSCOPE;
4246 wakaba 1.48 }
4247 wakaba 1.52 } # INSCOPE
4248     unless (defined $i) {
4249     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4250     ## Ignore the token
4251     !!!next-token;
4252 wakaba 1.48 redo B;
4253 wakaba 1.52 }
4254    
4255     splice @{$self->{open_elements}}, $i;
4256    
4257     $self->_reset_insertion_mode;
4258    
4259     !!!next-token;
4260     redo B;
4261     } elsif ({
4262     caption => 1, table => 1, tbody => 1,
4263     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4264     }->{$token->{tag_name}}) {
4265     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4266    
4267     ## have an element in table scope
4268     my $i;
4269     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4270     my $node = $self->{open_elements}->[$_];
4271     if ($node->[1] eq $token->{tag_name}) {
4272     $i = $_;
4273     last INSCOPE;
4274     } elsif ({
4275     table => 1, html => 1,
4276     }->{$node->[1]}) {
4277     last INSCOPE;
4278 wakaba 1.1 }
4279 wakaba 1.52 } # INSCOPE
4280     unless (defined $i) {
4281     ## Ignore the token
4282 wakaba 1.1 !!!next-token;
4283     redo B;
4284     }
4285 wakaba 1.52
4286     ## As if </select>
4287     ## have an element in table scope
4288     undef $i;
4289 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4290     my $node = $self->{open_elements}->[$_];
4291 wakaba 1.52 if ($node->[1] eq 'select') {
4292 wakaba 1.1 $i = $_;
4293     last INSCOPE;
4294     } elsif ({
4295     table => 1, html => 1,
4296 wakaba 1.52 }->{$node->[1]}) {
4297     last INSCOPE;
4298     }
4299     } # INSCOPE
4300     unless (defined $i) {
4301     !!!parse-error (type => 'unmatched end tag:select');
4302     ## Ignore the </select> token
4303     !!!next-token; ## TODO: ok?
4304     redo B;
4305     }
4306    
4307     splice @{$self->{open_elements}}, $i;
4308    
4309     $self->_reset_insertion_mode;
4310    
4311     ## reprocess
4312     redo B;
4313 wakaba 1.58 } else {
4314     !!!parse-error (type => 'in select:/'.$token->{tag_name});
4315 wakaba 1.52 ## Ignore the token
4316     !!!next-token;
4317     redo B;
4318 wakaba 1.58 }
4319     } else {
4320     die "$0: $token->{type}: Unknown token type";
4321     }
4322 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4323 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4324 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { ## strip leading white space off the character token
4325     my $data = $1; ## copy the capture now, before any other code gets a chance to run a regexp
4326     ## As if in body
4327     $reconstruct_active_formatting_elements->($insert_to_current);
4328    
4329     $self->{open_elements}->[-1]->[0]->manakai_append_text ($data); ## append the saved white space to the current node
4330    
4331     unless (length $token->{data}) {
4332     !!!next-token;
4333     redo B;
4334     }
4335     }
4336    
4337 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4338 wakaba 1.52 !!!parse-error (type => 'after html:#character');
4339    
4340     ## Reprocess in the "main" phase, "after body" insertion mode...
4341     }
4342    
4343     ## "after body" insertion mode
4344     !!!parse-error (type => 'after body:#character');
4345    
4346 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4347 wakaba 1.52 ## reprocess
4348     redo B;
4349 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4350 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4351 wakaba 1.52 !!!parse-error (type => 'after html:'.$token->{tag_name});
4352    
4353     ## Reprocess in the "main" phase, "after body" insertion mode...
4354     }
4355    
4356     ## "after body" insertion mode
4357     !!!parse-error (type => 'after body:'.$token->{tag_name});
4358    
4359 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4360 wakaba 1.52 ## reprocess
4361     redo B;
4362 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4363 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4364 wakaba 1.52 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4365    
4366 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
4367 wakaba 1.52 ## Reprocess in the "main" phase, "after body" insertion mode...
4368     }
4369    
4370     ## "after body" insertion mode
4371     if ($token->{tag_name} eq 'html') {
4372     if (defined $self->{inner_html_node}) {
4373     !!!parse-error (type => 'unmatched end tag:html');
4374     ## Ignore the token
4375     !!!next-token;
4376     redo B;
4377     } else {
4378 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
4379 wakaba 1.52 !!!next-token;
4380     redo B;
4381     }
4382     } else {
4383     !!!parse-error (type => 'after body:/'.$token->{tag_name});
4384    
4385 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4386 wakaba 1.52 ## reprocess
4387     redo B;
4388     }
4389     } else {
4390     die "$0: $token->{type}: Unknown token type";
4391     }
4392 wakaba 1.56 } elsif ($self->{insertion_mode} & FRAME_IMS) {
4393 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4394 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4395     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4396    
4397     unless (length $token->{data}) {
4398     !!!next-token;
4399     redo B;
4400     }
4401     }
4402    
4403     if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4404 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4405 wakaba 1.52 !!!parse-error (type => 'in frameset:#character');
4406 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
4407 wakaba 1.52 !!!parse-error (type => 'after frameset:#character');
4408     } else { # "after html frameset"
4409     !!!parse-error (type => 'after html:#character');
4410    
4411 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4412 wakaba 1.52 ## Reprocess in the "main" phase, "after frameset"...
4413     !!!parse-error (type => 'after frameset:#character');
4414     }
4415    
4416     ## Ignore the token.
4417     if (length $token->{data}) {
4418     ## reprocess the rest of characters
4419     } else {
4420     !!!next-token;
4421     }
4422     redo B;
4423     }
4424    
4425     die qq[$0: Character "$token->{data}"];
4426 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
4427 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4428 wakaba 1.52 !!!parse-error (type => 'after html:'.$token->{tag_name});
4429 wakaba 1.1
4430 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4431 wakaba 1.52 ## Process in the "main" phase, "after frameset" insertion mode...
4432     }
4433 wakaba 1.1
4434 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
4435 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4436 wakaba 1.52 !!!insert-element ($token->{tag_name}, $token->{attributes});
4437     !!!next-token;
4438     redo B;
4439     } elsif ($token->{tag_name} eq 'frame' and
4440 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4441 wakaba 1.52 !!!insert-element ($token->{tag_name}, $token->{attributes});
4442     pop @{$self->{open_elements}};
4443     !!!next-token;
4444     redo B;
4445     } elsif ($token->{tag_name} eq 'noframes') {
4446     ## NOTE: As if in body.
4447     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4448     redo B;
4449     } else {
4450 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4451 wakaba 1.52 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4452     } else {
4453     !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4454     }
4455     ## Ignore the token
4456     !!!next-token;
4457     redo B;
4458     }
4459 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4460 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4461 wakaba 1.52 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4462 wakaba 1.1
4463 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4464 wakaba 1.52 ## Process in the "main" phase, "after frameset" insertion mode...
4465     }
4466 wakaba 1.1
4467 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
4468 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
4469 wakaba 1.52 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4470     @{$self->{open_elements}} == 1) {
4471     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4472     ## Ignore the token
4473     !!!next-token;
4474     } else {
4475     pop @{$self->{open_elements}};
4476     !!!next-token;
4477     }
4478 wakaba 1.47
4479 wakaba 1.52 if (not defined $self->{inner_html_node} and
4480     $self->{open_elements}->[-1]->[1] ne 'frameset') {
4481 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4482 wakaba 1.52 }
4483     redo B;
4484     } elsif ($token->{tag_name} eq 'html' and
4485 wakaba 1.54 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
4486     $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
4487 wakaba 1.52 !!!next-token;
4488     redo B;
4489     } else {
4490 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4491 wakaba 1.52 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4492     } else {
4493     !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4494     }
4495     ## Ignore the token
4496     !!!next-token;
4497     redo B;
4498     }
4499     } else {
4500     die "$0: $token->{type}: Unknown token type";
4501     }
4502 wakaba 1.47
4503 wakaba 1.52 ## ISSUE: An issue in spec here
4504     } else {
4505     die "$0: $self->{insertion_mode}: Unknown insertion mode";
4506     }
4507 wakaba 1.47
4508 wakaba 1.52 ## "in body" insertion mode
4509 wakaba 1.55 if ($token->{type} == START_TAG_TOKEN) {
4510 wakaba 1.52 if ($token->{tag_name} eq 'script') {
4511     ## NOTE: This is an "as if in head" code clone
4512     $script_start_tag->($insert);
4513 wakaba 1.53 redo B;
4514 wakaba 1.52 } elsif ($token->{tag_name} eq 'style') {
4515     ## NOTE: This is an "as if in head" code clone
4516     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4517 wakaba 1.53 redo B;
4518 wakaba 1.52 } elsif ({
4519     base => 1, link => 1,
4520     }->{$token->{tag_name}}) {
4521     ## NOTE: This is an "as if in head" code clone, only "-t" differs
4522     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4523     pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4524     !!!next-token;
4525 wakaba 1.53 redo B;
4526 wakaba 1.52 } elsif ($token->{tag_name} eq 'meta') {
4527     ## NOTE: This is an "as if in head" code clone, only "-t" differs
4528     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4529 wakaba 1.66 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4530 wakaba 1.46
4531 wakaba 1.52 unless ($self->{confident}) {
4532     if ($token->{attributes}->{charset}) { ## TODO: And if supported
4533 wakaba 1.63 $self->{change_encoding}
4534     ->($self, $token->{attributes}->{charset}->{value});
4535 wakaba 1.66
4536     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4537     ->set_user_data (manakai_has_reference =>
4538     $token->{attributes}->{charset}
4539     ->{has_reference});
4540 wakaba 1.63 } elsif ($token->{attributes}->{content}) {
4541 wakaba 1.52 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4542 wakaba 1.63 if ($token->{attributes}->{content}->{value}
4543 wakaba 1.70 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4544     [\x09-\x0D\x20]*=
4545 wakaba 1.52 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4546     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4547 wakaba 1.63 $self->{change_encoding}
4548     ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4549 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4550     ->set_user_data (manakai_has_reference =>
4551     $token->{attributes}->{content}
4552     ->{has_reference});
4553 wakaba 1.63 }
4554 wakaba 1.52 }
4555 wakaba 1.66 } else {
4556     if ($token->{attributes}->{charset}) {
4557     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4558     ->set_user_data (manakai_has_reference =>
4559     $token->{attributes}->{charset}
4560     ->{has_reference});
4561     }
4562 wakaba 1.68 if ($token->{attributes}->{content}) {
4563     $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4564     ->set_user_data (manakai_has_reference =>
4565     $token->{attributes}->{content}
4566     ->{has_reference});
4567     }
4568 wakaba 1.52 }
4569 wakaba 1.1
4570 wakaba 1.52 !!!next-token;
4571 wakaba 1.53 redo B;
4572 wakaba 1.52 } elsif ($token->{tag_name} eq 'title') {
4573     !!!parse-error (type => 'in body:title');
4574     ## NOTE: This is an "as if in head" code clone
4575     $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4576     if (defined $self->{head_element}) {
4577     $self->{head_element}->append_child ($_[0]);
4578     } else {
4579     $insert->($_[0]);
4580     }
4581     });
4582 wakaba 1.53 redo B;
4583 wakaba 1.52 } elsif ($token->{tag_name} eq 'body') {
4584     !!!parse-error (type => 'in body:body');
4585 wakaba 1.46
4586 wakaba 1.52 if (@{$self->{open_elements}} == 1 or
4587     $self->{open_elements}->[1]->[1] ne 'body') {
4588     ## Ignore the token
4589     } else {
4590     my $body_el = $self->{open_elements}->[1]->[0];
4591     for my $attr_name (keys %{$token->{attributes}}) {
4592     unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4593     $body_el->set_attribute_ns
4594     (undef, [undef, $attr_name],
4595     $token->{attributes}->{$attr_name}->{value});
4596     }
4597     }
4598     }
4599     !!!next-token;
4600 wakaba 1.53 redo B;
4601 wakaba 1.52 } elsif ({
4602     address => 1, blockquote => 1, center => 1, dir => 1,
4603     div => 1, dl => 1, fieldset => 1, listing => 1,
4604     menu => 1, ol => 1, p => 1, ul => 1,
4605     pre => 1,
4606     }->{$token->{tag_name}}) {
4607     ## has a p element in scope
4608     INSCOPE: for (reverse @{$self->{open_elements}}) {
4609     if ($_->[1] eq 'p') {
4610     !!!back-token;
4611 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4612 wakaba 1.53 redo B;
4613 wakaba 1.52 } elsif ({
4614     table => 1, caption => 1, td => 1, th => 1,
4615     button => 1, marquee => 1, object => 1, html => 1,
4616     }->{$_->[1]}) {
4617     last INSCOPE;
4618     }
4619     } # INSCOPE
4620    
4621     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4622     if ($token->{tag_name} eq 'pre') {
4623     !!!next-token;
4624 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4625 wakaba 1.52 $token->{data} =~ s/^\x0A//;
4626     unless (length $token->{data}) {
4627 wakaba 1.1 !!!next-token;
4628 wakaba 1.52 }
4629     }
4630     } else {
4631     !!!next-token;
4632     }
4633 wakaba 1.53 redo B;
4634 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
4635     if (defined $self->{form_element}) {
4636     !!!parse-error (type => 'in form:form');
4637     ## Ignore the token
4638     !!!next-token;
4639 wakaba 1.53 redo B;
4640 wakaba 1.52 } else {
4641     ## has a p element in scope
4642     INSCOPE: for (reverse @{$self->{open_elements}}) {
4643     if ($_->[1] eq 'p') {
4644     !!!back-token;
4645 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4646 wakaba 1.53 redo B;
4647 wakaba 1.46 } elsif ({
4648 wakaba 1.52 table => 1, caption => 1, td => 1, th => 1,
4649     button => 1, marquee => 1, object => 1, html => 1,
4650     }->{$_->[1]}) {
4651     last INSCOPE;
4652     }
4653     } # INSCOPE
4654    
4655     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4656     $self->{form_element} = $self->{open_elements}->[-1]->[0];
4657     !!!next-token;
4658 wakaba 1.53 redo B;
4659 wakaba 1.52 }
4660     } elsif ($token->{tag_name} eq 'li') {
4661     ## has a p element in scope
4662     INSCOPE: for (reverse @{$self->{open_elements}}) {
4663     if ($_->[1] eq 'p') {
4664     !!!back-token;
4665 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4666 wakaba 1.53 redo B;
4667 wakaba 1.52 } elsif ({
4668     table => 1, caption => 1, td => 1, th => 1,
4669     button => 1, marquee => 1, object => 1, html => 1,
4670     }->{$_->[1]}) {
4671     last INSCOPE;
4672     }
4673     } # INSCOPE
4674    
4675     ## Step 1
4676     my $i = -1;
4677     my $node = $self->{open_elements}->[$i];
4678     LI: {
4679     ## Step 2
4680     if ($node->[1] eq 'li') {
4681     if ($i != -1) {
4682     !!!parse-error (type => 'end tag missing:'.
4683     $self->{open_elements}->[-1]->[1]);
4684     }
4685     splice @{$self->{open_elements}}, $i;
4686     last LI;
4687     }
4688    
4689     ## Step 3
4690     if (not $formatting_category->{$node->[1]} and
4691     #not $phrasing_category->{$node->[1]} and
4692     ($special_category->{$node->[1]} or
4693     $scoping_category->{$node->[1]}) and
4694     $node->[1] ne 'address' and $node->[1] ne 'div') {
4695     last LI;
4696     }
4697    
4698     ## Step 4
4699     $i--;
4700     $node = $self->{open_elements}->[$i];
4701     redo LI;
4702     } # LI
4703    
4704     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4705     !!!next-token;
4706 wakaba 1.53 redo B;
4707 wakaba 1.52 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
4708     ## has a p element in scope
4709     INSCOPE: for (reverse @{$self->{open_elements}}) {
4710     if ($_->[1] eq 'p') {
4711     !!!back-token;
4712 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4713 wakaba 1.53 redo B;
4714 wakaba 1.52 } elsif ({
4715     table => 1, caption => 1, td => 1, th => 1,
4716     button => 1, marquee => 1, object => 1, html => 1,
4717     }->{$_->[1]}) {
4718     last INSCOPE;
4719     }
4720     } # INSCOPE
4721    
4722     ## Step 1
4723     my $i = -1;
4724     my $node = $self->{open_elements}->[$i];
4725     LI: {
4726     ## Step 2
4727     if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
4728     if ($i != -1) {
4729     !!!parse-error (type => 'end tag missing:'.
4730     $self->{open_elements}->[-1]->[1]);
4731 wakaba 1.1 }
4732 wakaba 1.52 splice @{$self->{open_elements}}, $i;
4733     last LI;
4734     }
4735    
4736     ## Step 3
4737     if (not $formatting_category->{$node->[1]} and
4738     #not $phrasing_category->{$node->[1]} and
4739     ($special_category->{$node->[1]} or
4740     $scoping_category->{$node->[1]}) and
4741     $node->[1] ne 'address' and $node->[1] ne 'div') {
4742     last LI;
4743 wakaba 1.1 }
4744 wakaba 1.52
4745     ## Step 4
4746     $i--;
4747     $node = $self->{open_elements}->[$i];
4748     redo LI;
4749     } # LI
4750    
4751     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4752     !!!next-token;
4753 wakaba 1.53 redo B;
4754 wakaba 1.52 } elsif ($token->{tag_name} eq 'plaintext') {
4755     ## has a p element in scope
4756     INSCOPE: for (reverse @{$self->{open_elements}}) {
4757     if ($_->[1] eq 'p') {
4758     !!!back-token;
4759 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4760 wakaba 1.53 redo B;
4761 wakaba 1.52 } elsif ({
4762     table => 1, caption => 1, td => 1, th => 1,
4763     button => 1, marquee => 1, object => 1, html => 1,
4764     }->{$_->[1]}) {
4765     last INSCOPE;
4766 wakaba 1.46 }
4767 wakaba 1.52 } # INSCOPE
4768    
4769     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4770    
4771     $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
4772    
4773     !!!next-token;
4774 wakaba 1.53 redo B;
4775 wakaba 1.52 } elsif ({
4776     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4777     }->{$token->{tag_name}}) {
4778     ## has a p element in scope
4779     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4780     my $node = $self->{open_elements}->[$_];
4781     if ($node->[1] eq 'p') {
4782     !!!back-token;
4783 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4784 wakaba 1.53 redo B;
4785 wakaba 1.52 } elsif ({
4786     table => 1, caption => 1, td => 1, th => 1,
4787     button => 1, marquee => 1, object => 1, html => 1,
4788     }->{$node->[1]}) {
4789     last INSCOPE;
4790 wakaba 1.46 }
4791 wakaba 1.52 } # INSCOPE
4792    
4793     ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
4794     ## has an element in scope
4795     #my $i;
4796     #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4797     # my $node = $self->{open_elements}->[$_];
4798     # if ({
4799     # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4800     # }->{$node->[1]}) {
4801     # $i = $_;
4802     # last INSCOPE;
4803     # } elsif ({
4804     # table => 1, caption => 1, td => 1, th => 1,
4805     # button => 1, marquee => 1, object => 1, html => 1,
4806     # }->{$node->[1]}) {
4807     # last INSCOPE;
4808     # }
4809     #} # INSCOPE
4810     #
4811     #if (defined $i) {
4812     # !!! parse-error (type => 'in hn:hn');
4813     # splice @{$self->{open_elements}}, $i;
4814     #}
4815    
4816     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4817    
4818     !!!next-token;
4819 wakaba 1.53 redo B;
4820 wakaba 1.52 } elsif ($token->{tag_name} eq 'a') {
4821     AFE: for my $i (reverse 0..$#$active_formatting_elements) {
4822     my $node = $active_formatting_elements->[$i];
4823     if ($node->[1] eq 'a') {
4824     !!!parse-error (type => 'in a:a');
4825    
4826     !!!back-token;
4827 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
4828 wakaba 1.52 $formatting_end_tag->($token->{tag_name});
4829    
4830     AFE2: for (reverse 0..$#$active_formatting_elements) {
4831     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4832     splice @$active_formatting_elements, $_, 1;
4833     last AFE2;
4834 wakaba 1.1 }
4835 wakaba 1.52 } # AFE2
4836     OE: for (reverse 0..$#{$self->{open_elements}}) {
4837     if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
4838     splice @{$self->{open_elements}}, $_, 1;
4839     last OE;
4840 wakaba 1.1 }
4841 wakaba 1.52 } # OE
4842     last AFE;
4843     } elsif ($node->[0] eq '#marker') {
4844     last AFE;
4845     }
4846     } # AFE
4847    
4848     $reconstruct_active_formatting_elements->($insert_to_current);
4849 wakaba 1.1
4850 wakaba 1.52 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4851     push @$active_formatting_elements, $self->{open_elements}->[-1];
4852 wakaba 1.1
4853 wakaba 1.52 !!!next-token;
4854 wakaba 1.53 redo B;
4855 wakaba 1.52 } elsif ({ ## start tags of formatting elements, handled in the "in body" insertion mode
4856     b => 1, big => 1, em => 1, font => 1, i => 1,
4857     s => 1, small => 1, strike => 1, ## the element name is |strike|
4858     strong => 1, tt => 1, u => 1,
4859     }->{$token->{tag_name}}) {
4860     $reconstruct_active_formatting_elements->($insert_to_current);
4861    
4862     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4863     push @$active_formatting_elements, $self->{open_elements}->[-1]; ## record the just-inserted element as an active formatting element
4864    
4865     !!!next-token;
4866 wakaba 1.53 redo B;
4867 wakaba 1.52 } elsif ($token->{tag_name} eq 'nobr') {
4868     $reconstruct_active_formatting_elements->($insert_to_current);
4869 wakaba 1.1
4870 wakaba 1.52 ## has a |nobr| element in scope
4871     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4872     my $node = $self->{open_elements}->[$_];
4873     if ($node->[1] eq 'nobr') {
4874 wakaba 1.58 !!!parse-error (type => 'in nobr:nobr');
4875 wakaba 1.52 !!!back-token;
4876 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
4877 wakaba 1.53 redo B;
4878 wakaba 1.52 } elsif ({
4879     table => 1, caption => 1, td => 1, th => 1,
4880     button => 1, marquee => 1, object => 1, html => 1,
4881     }->{$node->[1]}) {
4882     last INSCOPE;
4883     }
4884     } # INSCOPE
4885    
4886     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4887     push @$active_formatting_elements, $self->{open_elements}->[-1];
4888    
4889     !!!next-token;
4890 wakaba 1.53 redo B;
4891 wakaba 1.52 } elsif ($token->{tag_name} eq 'button') {
4892     ## has a button element in scope
4893     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4894     my $node = $self->{open_elements}->[$_];
4895     if ($node->[1] eq 'button') {
4896     !!!parse-error (type => 'in button:button');
4897     !!!back-token;
4898 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
4899 wakaba 1.53 redo B;
4900 wakaba 1.52 } elsif ({
4901     table => 1, caption => 1, td => 1, th => 1,
4902     button => 1, marquee => 1, object => 1, html => 1,
4903     }->{$node->[1]}) {
4904     last INSCOPE;
4905     }
4906     } # INSCOPE
4907    
4908     $reconstruct_active_formatting_elements->($insert_to_current);
4909    
4910     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4911     push @$active_formatting_elements, ['#marker', ''];
4912 wakaba 1.1
4913 wakaba 1.52 !!!next-token;
4914 wakaba 1.53 redo B;
4915 wakaba 1.52 } elsif ($token->{tag_name} eq 'marquee' or
4916     $token->{tag_name} eq 'object') {
4917     $reconstruct_active_formatting_elements->($insert_to_current);
4918    
4919     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4920     push @$active_formatting_elements, ['#marker', ''];
4921    
4922     !!!next-token;
4923 wakaba 1.53 redo B;
4924 wakaba 1.52 } elsif ($token->{tag_name} eq 'xmp') {
4925     $reconstruct_active_formatting_elements->($insert_to_current);
4926     $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4927 wakaba 1.53 redo B;
4928 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
4929     ## has a p element in scope
4930     INSCOPE: for (reverse @{$self->{open_elements}}) {
4931     if ($_->[1] eq 'p') {
4932     !!!back-token;
4933 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4934 wakaba 1.53 redo B;
4935 wakaba 1.52 } elsif ({
4936     table => 1, caption => 1, td => 1, th => 1,
4937     button => 1, marquee => 1, object => 1, html => 1,
4938     }->{$_->[1]}) {
4939     last INSCOPE;
4940     }
4941     } # INSCOPE
4942    
4943     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4944    
4945 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
4946 wakaba 1.52
4947     !!!next-token;
4948 wakaba 1.53 redo B;
4949 wakaba 1.52 } elsif ({
4950     area => 1, basefont => 1, bgsound => 1, br => 1,
4951     embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
4952     image => 1,
4953     }->{$token->{tag_name}}) {
4954     if ($token->{tag_name} eq 'image') {
4955     !!!parse-error (type => 'image');
4956     $token->{tag_name} = 'img';
4957     }
4958 wakaba 1.1
4959 wakaba 1.52 ## NOTE: There is an "as if <br>" code clone.
4960     $reconstruct_active_formatting_elements->($insert_to_current);
4961    
4962     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4963     pop @{$self->{open_elements}};
4964    
4965     !!!next-token;
4966 wakaba 1.53 redo B;
4967 wakaba 1.52 } elsif ($token->{tag_name} eq 'hr') {
4968     ## has a p element in scope
4969     INSCOPE: for (reverse @{$self->{open_elements}}) {
4970     if ($_->[1] eq 'p') {
4971     !!!back-token;
4972 wakaba 1.55 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4973 wakaba 1.53 redo B;
4974 wakaba 1.52 } elsif ({
4975     table => 1, caption => 1, td => 1, th => 1,
4976     button => 1, marquee => 1, object => 1, html => 1,
4977     }->{$_->[1]}) {
4978     last INSCOPE;
4979     }
4980     } # INSCOPE
4981    
4982     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4983     pop @{$self->{open_elements}};
4984    
4985     !!!next-token;
4986 wakaba 1.53 redo B;
4987 wakaba 1.52 } elsif ($token->{tag_name} eq 'input') {
4988     $reconstruct_active_formatting_elements->($insert_to_current);
4989    
4990     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4991     ## TODO: associate with $self->{form_element} if defined
4992     pop @{$self->{open_elements}};
4993    
4994     !!!next-token;
4995 wakaba 1.53 redo B;
4996 wakaba 1.52 } elsif ($token->{tag_name} eq 'isindex') {
4997     !!!parse-error (type => 'isindex');
4998    
4999     if (defined $self->{form_element}) {
5000     ## Ignore the token
5001     !!!next-token;
5002 wakaba 1.53 redo B;
5003 wakaba 1.52 } else {
5004     my $at = $token->{attributes};
5005     my $form_attrs;
5006     $form_attrs->{action} = $at->{action} if $at->{action};
5007     my $prompt_attr = $at->{prompt};
5008     $at->{name} = {name => 'name', value => 'isindex'};
5009     delete $at->{action};
5010     delete $at->{prompt};
5011     my @tokens = (
5012 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'form',
5013 wakaba 1.52 attributes => $form_attrs},
5014 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'hr'},
5015     {type => START_TAG_TOKEN, tag_name => 'p'},
5016     {type => START_TAG_TOKEN, tag_name => 'label'},
5017 wakaba 1.52 );
5018     if ($prompt_attr) {
5019 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
5020 wakaba 1.1 } else {
5021 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN,
5022 wakaba 1.52 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
5023     ## TODO: make this configurable
5024 wakaba 1.1 }
5025 wakaba 1.52 push @tokens,
5026 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
5027     #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
5028     {type => END_TAG_TOKEN, tag_name => 'label'},
5029     {type => END_TAG_TOKEN, tag_name => 'p'},
5030     {type => START_TAG_TOKEN, tag_name => 'hr'},
5031     {type => END_TAG_TOKEN, tag_name => 'form'};
5032 wakaba 1.52 $token = shift @tokens;
5033     !!!back-token (@tokens);
5034 wakaba 1.53 redo B;
5035 wakaba 1.52 }
5036     } elsif ($token->{tag_name} eq 'textarea') {
5037     my $tag_name = $token->{tag_name};
5038     my $el;
5039     !!!create-element ($el, $token->{tag_name}, $token->{attributes});
5040    
5041     ## TODO: $self->{form_element} if defined
5042     $self->{content_model} = RCDATA_CONTENT_MODEL;
5043     delete $self->{escape}; # MUST
5044    
5045     $insert->($el);
5046    
5047     my $text = '';
5048     !!!next-token;
5049 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
5050 wakaba 1.52 $token->{data} =~ s/^\x0A//;
5051 wakaba 1.51 unless (length $token->{data}) {
5052     !!!next-token;
5053     }
5054     }
5055 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
5056 wakaba 1.52 $text .= $token->{data};
5057     !!!next-token;
5058     }
5059     if (length $text) {
5060     $el->manakai_append_text ($text);
5061     }
5062    
5063     $self->{content_model} = PCDATA_CONTENT_MODEL;
5064 wakaba 1.51
5065 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
5066 wakaba 1.52 $token->{tag_name} eq $tag_name) {
5067     ## Ignore the token
5068     } else {
5069     !!!parse-error (type => 'in RCDATA:#'.$token->{type});
5070 wakaba 1.51 }
5071 wakaba 1.52 !!!next-token;
5072 wakaba 1.53 redo B;
5073 wakaba 1.52 } elsif ({
5074     iframe => 1,
5075     noembed => 1,
5076     noframes => 1,
5077     noscript => 0, ## TODO: 1 if scripting is enabled
5078     }->{$token->{tag_name}}) {
5079 wakaba 1.58 ## NOTE: There is an "as if in body" code clone.
5080 wakaba 1.52 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
5081 wakaba 1.53 redo B;
5082 wakaba 1.52 } elsif ($token->{tag_name} eq 'select') {
5083     $reconstruct_active_formatting_elements->($insert_to_current);
5084    
5085     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5086    
5087 wakaba 1.54 $self->{insertion_mode} = IN_SELECT_IM;
5088 wakaba 1.52 !!!next-token;
5089 wakaba 1.53 redo B;
5090 wakaba 1.52 } elsif ({
5091     caption => 1, col => 1, colgroup => 1, frame => 1,
5092     frameset => 1, head => 1, option => 1, optgroup => 1,
5093     tbody => 1, td => 1, tfoot => 1, th => 1,
5094     thead => 1, tr => 1,
5095     }->{$token->{tag_name}}) {
5096     !!!parse-error (type => 'in body:'.$token->{tag_name});
5097     ## Ignore the token
5098     !!!next-token;
5099 wakaba 1.53 redo B;
5100 wakaba 1.52
5101     ## ISSUE: An issue on HTML5 new elements in the spec.
5102     } else {
5103     $reconstruct_active_formatting_elements->($insert_to_current);
5104    
5105     !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5106 wakaba 1.51
5107 wakaba 1.52 !!!next-token;
5108 wakaba 1.53 redo B;
5109 wakaba 1.52 }
5110 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
5111 wakaba 1.52 if ($token->{tag_name} eq 'body') {
5112     if (@{$self->{open_elements}} > 1 and
5113     $self->{open_elements}->[1]->[1] eq 'body') {
5114     for (@{$self->{open_elements}}) {
5115     unless ({
5116     dd => 1, dt => 1, li => 1, p => 1, td => 1,
5117     th => 1, tr => 1, body => 1, html => 1,
5118     tbody => 1, tfoot => 1, thead => 1,
5119     }->{$_->[1]}) {
5120     !!!parse-error (type => 'not closed:'.$_->[1]);
5121     }
5122     }
5123 wakaba 1.51
5124 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
5125 wakaba 1.52 !!!next-token;
5126 wakaba 1.53 redo B;
5127 wakaba 1.52 } else {
5128     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5129     ## Ignore the token
5130     !!!next-token;
5131 wakaba 1.53 redo B;
5132 wakaba 1.51 }
5133 wakaba 1.52 } elsif ($token->{tag_name} eq 'html') {
5134     if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
5135     ## ISSUE: There is an issue in the spec.
5136     if ($self->{open_elements}->[-1]->[1] ne 'body') {
5137     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
5138 wakaba 1.1 }
5139 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
5140 wakaba 1.52 ## reprocess
5141 wakaba 1.53 redo B;
5142 wakaba 1.51 } else {
5143 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5144     ## Ignore the token
5145     !!!next-token;
5146 wakaba 1.53 redo B;
5147 wakaba 1.51 }
5148 wakaba 1.52 } elsif ({
5149     address => 1, blockquote => 1, center => 1, dir => 1,
5150     div => 1, dl => 1, fieldset => 1, listing => 1,
5151     menu => 1, ol => 1, pre => 1, ul => 1,
5152     p => 1,
5153     dd => 1, dt => 1, li => 1,
5154     button => 1, marquee => 1, object => 1,
5155     }->{$token->{tag_name}}) {
5156     ## has an element in scope
5157     my $i;
5158     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5159     my $node = $self->{open_elements}->[$_];
5160     if ($node->[1] eq $token->{tag_name}) {
5161     ## generate implied end tags
5162     if ({
5163     dd => ($token->{tag_name} ne 'dd'),
5164     dt => ($token->{tag_name} ne 'dt'),
5165     li => ($token->{tag_name} ne 'li'),
5166     p => ($token->{tag_name} ne 'p'),
5167     td => 1, th => 1, tr => 1,
5168     tbody => 1, tfoot=> 1, thead => 1,
5169     }->{$self->{open_elements}->[-1]->[1]}) {
5170     !!!back-token;
5171 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5172 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5173 wakaba 1.53 redo B;
5174 wakaba 1.52 }
5175     $i = $_;
5176     last INSCOPE unless $token->{tag_name} eq 'p';
5177     } elsif ({
5178     table => 1, caption => 1, td => 1, th => 1,
5179     button => 1, marquee => 1, object => 1, html => 1,
5180     }->{$node->[1]}) {
5181     last INSCOPE;
5182 wakaba 1.51 }
5183 wakaba 1.52 } # INSCOPE
5184    
5185     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5186     if (defined $i) {
5187     !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5188 wakaba 1.51 } else {
5189 wakaba 1.52 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5190 wakaba 1.51 }
5191     }
5192    
5193 wakaba 1.52 if (defined $i) {
5194     splice @{$self->{open_elements}}, $i;
5195     } elsif ($token->{tag_name} eq 'p') {
5196     ## As if <p>, then reprocess the current token
5197     my $el;
5198     !!!create-element ($el, 'p');
5199     $insert->($el);
5200 wakaba 1.51 }
5201 wakaba 1.52 $clear_up_to_marker->()
5202     if {
5203     button => 1, marquee => 1, object => 1,
5204     }->{$token->{tag_name}};
5205     !!!next-token;
5206 wakaba 1.53 redo B;
5207 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
5208     ## has an element in scope
5209     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5210     my $node = $self->{open_elements}->[$_];
5211     if ($node->[1] eq $token->{tag_name}) {
5212     ## generate implied end tags
5213     if ({
5214     dd => 1, dt => 1, li => 1, p => 1,
5215     td => 1, th => 1, tr => 1,
5216     tbody => 1, tfoot=> 1, thead => 1,
5217     }->{$self->{open_elements}->[-1]->[1]}) {
5218     !!!back-token;
5219 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5220 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5221 wakaba 1.53 redo B;
5222 wakaba 1.52 }
5223     last INSCOPE;
5224     } elsif ({
5225     table => 1, caption => 1, td => 1, th => 1,
5226     button => 1, marquee => 1, object => 1, html => 1,
5227     }->{$node->[1]}) {
5228     last INSCOPE;
5229     }
5230     } # INSCOPE
5231    
5232     if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5233 wakaba 1.36 pop @{$self->{open_elements}};
5234     } else {
5235 wakaba 1.58 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5236 wakaba 1.52 }
5237    
5238     undef $self->{form_element};
5239     !!!next-token;
5240 wakaba 1.53 redo B;
5241 wakaba 1.52 } elsif ({
5242     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5243     }->{$token->{tag_name}}) {
5244     ## has an element in scope
5245     my $i;
5246     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5247     my $node = $self->{open_elements}->[$_];
5248     if ({
5249     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5250     }->{$node->[1]}) {
5251     ## generate implied end tags
5252     if ({
5253     dd => 1, dt => 1, li => 1, p => 1,
5254     td => 1, th => 1, tr => 1,
5255     tbody => 1, tfoot=> 1, thead => 1,
5256     }->{$self->{open_elements}->[-1]->[1]}) {
5257     !!!back-token;
5258 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5259 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5260 wakaba 1.53 redo B;
5261 wakaba 1.52 }
5262     $i = $_;
5263     last INSCOPE;
5264     } elsif ({
5265     table => 1, caption => 1, td => 1, th => 1,
5266     button => 1, marquee => 1, object => 1, html => 1,
5267     }->{$node->[1]}) {
5268     last INSCOPE;
5269 wakaba 1.51 }
5270 wakaba 1.52 } # INSCOPE
5271    
5272     if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5273 wakaba 1.58 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5274 wakaba 1.36 }
5275 wakaba 1.52
5276     splice @{$self->{open_elements}}, $i if defined $i;
5277     !!!next-token;
5278 wakaba 1.53 redo B;
5279 wakaba 1.52 } elsif ({
5280     a => 1,
5281     b => 1, big => 1, em => 1, font => 1, i => 1,
5282     nobr => 1, s => 1, small => 1, strile => 1,
5283     strong => 1, tt => 1, u => 1,
5284     }->{$token->{tag_name}}) {
5285     $formatting_end_tag->($token->{tag_name});
5286 wakaba 1.53 redo B;
5287 wakaba 1.52 } elsif ($token->{tag_name} eq 'br') {
5288     !!!parse-error (type => 'unmatched end tag:br');
5289    
5290     ## As if <br>
5291     $reconstruct_active_formatting_elements->($insert_to_current);
5292    
5293     my $el;
5294     !!!create-element ($el, 'br');
5295     $insert->($el);
5296    
5297     ## Ignore the token.
5298     !!!next-token;
5299 wakaba 1.53 redo B;
5300 wakaba 1.52 } elsif ({
5301     caption => 1, col => 1, colgroup => 1, frame => 1,
5302     frameset => 1, head => 1, option => 1, optgroup => 1,
5303     tbody => 1, td => 1, tfoot => 1, th => 1,
5304     thead => 1, tr => 1,
5305     area => 1, basefont => 1, bgsound => 1,
5306     embed => 1, hr => 1, iframe => 1, image => 1,
5307     img => 1, input => 1, isindex => 1, noembed => 1,
5308     noframes => 1, param => 1, select => 1, spacer => 1,
5309     table => 1, textarea => 1, wbr => 1,
5310     noscript => 0, ## TODO: if scripting is enabled
5311     }->{$token->{tag_name}}) {
5312     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5313     ## Ignore the token
5314     !!!next-token;
5315 wakaba 1.53 redo B;
5316 wakaba 1.52
5317     ## ISSUE: Issue on HTML5 new elements in spec
5318    
5319     } else {
5320     ## Step 1
5321     my $node_i = -1;
5322     my $node = $self->{open_elements}->[$node_i];
5323 wakaba 1.51
5324 wakaba 1.52 ## Step 2
5325     S2: {
5326     if ($node->[1] eq $token->{tag_name}) {
5327     ## Step 1
5328     ## generate implied end tags
5329     if ({
5330     dd => 1, dt => 1, li => 1, p => 1,
5331     td => 1, th => 1, tr => 1,
5332 wakaba 1.55 tbody => 1, tfoot => 1, thead => 1,
5333 wakaba 1.52 }->{$self->{open_elements}->[-1]->[1]}) {
5334     !!!back-token;
5335 wakaba 1.55 $token = {type => END_TAG_TOKEN,
5336 wakaba 1.52 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5337 wakaba 1.53 redo B;
5338 wakaba 1.52 }
5339    
5340     ## Step 2
5341     if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5342 wakaba 1.58 ## NOTE: <x><y></x>
5343 wakaba 1.52 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5344     }
5345    
5346     ## Step 3
5347     splice @{$self->{open_elements}}, $node_i;
5348 wakaba 1.51
5349 wakaba 1.1 !!!next-token;
5350 wakaba 1.52 last S2;
5351 wakaba 1.1 } else {
5352 wakaba 1.52 ## Step 3
5353     if (not $formatting_category->{$node->[1]} and
5354     #not $phrasing_category->{$node->[1]} and
5355     ($special_category->{$node->[1]} or
5356     $scoping_category->{$node->[1]})) {
5357     !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5358     ## Ignore the token
5359     !!!next-token;
5360     last S2;
5361     }
5362 wakaba 1.1 }
5363 wakaba 1.52
5364     ## Step 4
5365     $node_i--;
5366     $node = $self->{open_elements}->[$node_i];
5367    
5368     ## Step 5;
5369     redo S2;
5370     } # S2
5371 wakaba 1.53 redo B;
5372 wakaba 1.1 }
5373     }
5374 wakaba 1.52 redo B;
5375 wakaba 1.1 } # B
5376    
5377 wakaba 1.51 ## NOTE: The "trailing end" phase in HTML5 is split into
5378     ## two insertion modes: "after html body" and "after html frameset".
5379     ## NOTE: States in the main stage are preserved while
5380     ## the parser stays in the trailing end phase. # MUST
5381    
5382 wakaba 1.1 ## Stop parsing # MUST
5383    
5384     ## TODO: script stuffs
5385 wakaba 1.3 } # _tree_construct_main
5386    
## Replaces the children of |$node| with the result of parsing a string
## as HTML (the |innerHTML| setter).
##
##   Whatpm::HTML->set_inner_html ($node, $string, $onerror);
##
##   $node    - A DOM node.  Node type 9 (document) and node type 1
##              (element) are handled; any other node type |die|s.
##   $string  - The markup to parse.  It is referenced, not copied.
##   $onerror - Optional code reference for parse errors.  In the element
##              case it receives the error's own arguments followed by
##              |line| and |column|; when it is false, errors are
##              reported with |warn| instead.  In the document case it is
##              handed to |parse_string| unchanged.
##
## The value of the last call made (|parse_string| for a document,
## |_terminate_tree_constructor| for an element) is what the caller gets
## back.
sub set_inner_html ($$$) {
  my ($class, $node) = @_[0, 1];
  my $input = \$_[2];
  my $onerror = $_[3];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;

  if ($nt == 9) {
    ## Document node # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## The child list is copied first so that removal does not disturb
    ## the iteration.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$input => $node, $onerror);
  }

  die "$0: |set_inner_html| is not defined for node of type $nt"
      unless $nt == 1;

  ## Element node
  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## The fragment is parsed into a separate HTML document; the resulting
  ## nodes are moved into |$node| at the end.
  my $owner_doc = $node->owner_document;
  my $scratch_doc = $owner_doc->implementation->create_document;
  $scratch_doc->manakai_is_html (1);
  my $parser = $class->new;
  $parser->{document} = $scratch_doc;

  ## Step 9 # MUST
  ## Input stream state shared by the two closures below.
  my $pos = 0;
  my $line = 1;
  my $column = 0;
  $parser->{set_next_input_character} = sub {
    my $self = shift;

    ## Rotate the three-slot history of previously read characters.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    if ($pos >= length $$input) {
      ## Input exhausted: -1 is stored as the next character.
      $self->{next_input_character} = -1;
      return;
    }

    my $c = $self->{next_input_character} = ord substr $$input, $pos++, 1;
    $column++;

    if ($c == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($c == 0x000D) { # CR
      ## CR and CRLF are both delivered as a single LF.
      $pos++ if substr ($$input, $pos, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($c > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($c == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $parser->{prev_input_character} = [-1, -1, -1];
  $parser->{next_input_character} = -1;

  ## Error reporting: fall back to |warn| when no handler was supplied,
  ## and always append the current input position.
  my $ponerror = $onerror;
  $ponerror ||= sub {
    my %opt = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $parser->{parse_error} = sub {
    $ponerror->(@_, line => $line, column => $column);
  };

  $parser->_initialize_tokenizer;
  $parser->_initialize_tree_constructor;

  ## Step 2
  ## The tokenizer's initial content model depends on the context
  ## element; anything not listed is parsed as PCDATA.
  my $node_ln = $node->local_name;
  my $initial_model = {
    title => RCDATA_CONTENT_MODEL,
    textarea => RCDATA_CONTENT_MODEL,
    style => CDATA_CONTENT_MODEL,
    script => CDATA_CONTENT_MODEL,
    xmp => CDATA_CONTENT_MODEL,
    iframe => CDATA_CONTENT_MODEL,
    noembed => CDATA_CONTENT_MODEL,
    noframes => CDATA_CONTENT_MODEL,
    noscript => CDATA_CONTENT_MODEL,
    plaintext => PLAINTEXT_CONTENT_MODEL,
  }->{$node_ln};
  $parser->{content_model} = defined $initial_model
      ? $initial_model : PCDATA_CONTENT_MODEL;
  ## ISSUE: What is "the name of the element"? local name?

  $parser->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $scratch_doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $scratch_doc->append_child ($root);

  ## Step 6 # MUST
  push @{$parser->{open_elements}}, [$root, 'html'];

  $parser->{head_element} = undef;

  ## Step 7 # MUST
  $parser->_reset_insertion_mode;

  ## Step 8 # MUST
  ## Walk from |$node| up through its ancestors and remember the nearest
  ## XHTML-namespace |form| element, if any.
  ANCESTOR: for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next ANCESTOR unless $anc->node_type == 1;
    my $nsuri = $anc->namespace_uri;
    next ANCESTOR
        unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    next ANCESTOR unless $anc->local_name eq 'form'; ## TODO: case?
    $parser->{form_element} = $anc;
    last ANCESTOR;
  } # ANCESTOR

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    ## The macro below operates on a variable named |$self|.
    my $self = $parser;
    !!!next-token;
  }
  $parser->_tree_construction_main;

  ## Step 11 # MUST
  my @old_children = @{$node->child_nodes};
  for my $child (@old_children) {
    $node->remove_child ($child);
  }
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  my @parsed_children = @{$root->child_nodes};
  for my $child (@parsed_children) {
    $owner_doc->adopt_node ($child);
    $node->append_child ($child);
  }
  ## ISSUE: mutation events?

  return $parser->_terminate_tree_constructor;
} # set_inner_html
5544    
5545     } # tree construction stage
5546 wakaba 1.1
## |Whatpm::HTML::RestartParser| is declared as a subclass of |Error|
## (the exception base class loaded at the top of this file).
## NOTE(review): Judging by its name it is thrown to request that
## parsing be restarted; confirm against the |throw| sites.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

## A module file has to end with a true value.
1;
5551 wakaba 1.70 # $Date: 2008/02/17 12:39:32 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24