/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.70 - (show annotations) (download) (as text)
Sat Mar 1 00:42:52 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.69: +10 -4 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	1 Mar 2008 00:26:59 -0000
2008-03-01  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Updated (HTML5 revision 1286).

	* content-model-2.dat: Updated (HTML5 revision 1275).

++ whatpm/Whatpm/ChangeLog	1 Mar 2008 00:19:36 -0000
2008-03-01  Wakaba  <wakaba@suika.fam.cx>

	* _NamedEntityList.pm: Updated (HTML5 revision 1286).

	* HTML.pm.src: |charset| in |content| attribute is
	case-insensitive (HTML5 revision 1270).

++ whatpm/Whatpm/HTML/ChangeLog	1 Mar 2008 00:07:44 -0000
2008-03-01  Wakaba  <wakaba@suika.fam.cx>

	* Serializer.pm (get_inner_html): Escape NBSP (HTML5 revision
	1277).

++ whatpm/Whatpm/ContentChecker/ChangeLog	29 Feb 2008 23:29:54 -0000
2008-03-01  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm: Sectioning root category added.  |blockquote|
	is no longer a sectioning content.

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.69 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
12 ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
13 ## is not yet clear.
14 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
15 ## "{U+FEFF}..." in GB18030?
16
17 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
18 ## TODO: 1252 parse error (revision 1264)
19 ## TODO: 8859-11 = 874 (revision 1271)
20
## Start tag names for which the tokenizer accepts a "/" immediately
## followed by ">" (e.g. |<br/>|) without reporting a |nestc| parse
## error.  Keys are lowercase tag names; every value is 1.
my $permitted_slash_tag_name = +{
  map { $_ => 1 } qw(
    base link meta hr br img embed param area col input
  )
};
34
## Maps each code point in the range 0x80..0x9F to a replacement code
## point.  The table below is indexed by (code point - 0x80), eight
## entries per row.
## NOTE(review): the values look like the windows-1252 assignments, with
## U+FFFD for unassigned positions, and the table is presumably consulted
## when resolving numeric character references; the code using it is
## outside this excerpt, so confirm there.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  +{ map { (0x80 + $_, $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
69
## Element names belonging to the "special" category.  Keys are
## lowercase element names; every value is 1.
my $special_category = +{
  map { $_ => 1 } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category.  Keys are
## lowercase element names; every value is 1.
my $scoping_category = +{
  map { $_ => 1 } qw(
    button caption html marquee object table td th
  )
};
## Element names belonging to the "formatting" category.  Keys are
## lowercase element names; every value is 1.
## The |strike| entry used to be misspelled as |strile|, so |strike|
## elements were never recognized as formatting elements.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
90 # $phrasing_category: all other elements
91
## Decode a byte string and parse it as an HTML document.
##
##   $return = $parser->parse_byte_string ($charset, $bytes, @rest);
##
## May be invoked on an existing parser object or on the class name, in
## which case a new parser is created.
##
##   $charset  Encoding label, or |undef| to have the encoding guessed
##             from the first 1024 bytes (falling back to
##             |windows-1252|).  An explicit label sets the |confident|
##             flag to 1, a guessed one sets it to 0.
##   $bytes    The byte string, or a reference to it.
##   @rest     Passed through unchanged to |parse_char_string| after the
##             decoded string.
##
## Returns whatever |parse_char_string| returns.  If the installed
## |change_encoding| handler throws |Whatpm::HTML::RestartParser|, the
## bytes are decoded again with the requested charset and parsed a
## second time.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## |Encode::decode| is called on every path below (explicit charset,
  ## sniffed charset, and the restart handler), so load the module
  ## unconditionally rather than only when a charset was given.
  require Encode; ## TODO: decode(utf8) don't delete BOM

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 1;
  } else {
    ## TODO: Implement HTML5 detection algorithm
    require Whatpm::Charset::UniversalCharDet;
    $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
        (substr ($$bytes_s, 0, 1024));
    $charset ||= 'windows-1252';
    $s = \ (Encode::decode ($charset, $$bytes_s));
    ## Lowercased like the explicit-charset branch: |change_encoding|
    ## below compares this value against a lowercased label, so a
    ## mixed-case detector result would otherwise never match and would
    ## force a needless restart.
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 0;
  }

  ## Handler invoked as |$handler->($parser, $label)|.
  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    if (defined $self->{input_encoding} and
        $self->{input_encoding} eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
        ':'.$charset, level => 'w');

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    throw Whatpm::HTML::RestartParser (charset => $charset);
  }; # $self->{change_encoding}

  ## |@_| still starts with the byte string; drop it so that only the
  ## remaining arguments are forwarded after the decoded string.
  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Restart: decode the original bytes with the requested charset.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: normalize
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
161
## |parse_char_string| is an alias of |parse_string|.
*parse_char_string = \&parse_string;

## Parse a character string into a document.
##
##   $doc = $parser->parse_string ($string, $doc, $onerror);
##
## May be invoked on an existing parser object or on the class name, in
## which case a new parser is created.
##
##   $string   The character string, or a reference to it.
##   $doc      Document object; its child node list is emptied first.
##   $onerror  Optional code reference called with the error's name/value
##             pairs followed by |line| and |column|.  The default
##             handler |warn|s.
##
## Returns the document object.
##
## NOTE: the names |$self|, |$s|, |$i|, |$line| and |$column| are kept
## as they are because the expansion of the |!!!| macros is not visible
## here and might refer to them.
sub parse_string ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  my $onerror = $_[2];

  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  unless (exists $self->{confident}) {
    $self->{confident} = 1;
  }
  if (defined $self->{input_encoding}) {
    $self->{document}->input_encoding ($self->{input_encoding});
  }

  ## Read position within |$$s| and the position reported with errors.
  my ($i, $line, $column) = (0, 1, 0);

  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Shift the history of previous characters by one, newest first.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## End of input is signalled by -1.
    if ($i >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    $self->{next_input_character} = ord substr $$s, $i++, 1;
    $column++;

    if ($self->{next_input_character} == 0x000A or # LF
        $self->{next_input_character} == 0x000D) { # CR
      if ($self->{next_input_character} == 0x000D) {
        ## CR and CR LF are both reported as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_input_character} = 0x000A; # LF # MUST
      }
      $line++;
      $column = 0;
    } elsif ($self->{next_input_character} > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_input_character} == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  $onerror ||= sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Append the current position to every reported error.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
222
## Create a new parser object.
##
##   $parser = Whatpm::HTML->new;
##
## The object starts with placeholder handlers:
##   set_next_input_character     sets |next_input_character| to -1;
##   parse_error                  does nothing;
##   change_encoding              does nothing;
##   application_cache_selection  does nothing.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The handler below is stored inside the object it refers to.  A
  ## strong capture of |$self| would form a reference cycle and keep the
  ## parser alive until the handler is replaced, so capture a weakened
  ## copy instead.
  require Scalar::Util;
  my $parser = $self;
  Scalar::Util::weaken ($parser);
  $self->{set_next_input_character} = sub {
    $parser->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
243
## Content model feature bits, tested against |$self->{content_model}|.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the feature bits.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_ENTITY | CM_LIMITED_MARKUP }
sub PCDATA_CONTENT_MODEL    () { CM_ENTITY | CM_FULL_MARKUP }
252
## Tokenizer states; the current one is kept in |$self->{state}|.

## Data and tags
sub DATA_STATE                          () { 0 }
sub ENTITY_DATA_STATE                   () { 1 }
sub TAG_OPEN_STATE                      () { 2 }
sub CLOSE_TAG_OPEN_STATE                () { 3 }
sub TAG_NAME_STATE                      () { 4 }

## Attributes
sub BEFORE_ATTRIBUTE_NAME_STATE         () { 5 }
sub ATTRIBUTE_NAME_STATE                () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE          () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE        () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE      () { 11 }
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE     () { 12 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE       () { 13 }
sub COMMENT_START_STATE                 () { 14 }
sub COMMENT_START_DASH_STATE            () { 15 }
sub COMMENT_STATE                       () { 16 }
sub COMMENT_END_STATE                   () { 17 }
sub COMMENT_END_DASH_STATE              () { 18 }
sub BOGUS_COMMENT_STATE                 () { 19 }

## DOCTYPE
sub DOCTYPE_STATE                                 () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE                     () { 21 }
sub DOCTYPE_NAME_STATE                            () { 22 }
sub AFTER_DOCTYPE_NAME_STATE                      () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         () { 31 }
sub BOGUS_DOCTYPE_STATE                           () { 32 }
286
## Token types, stored in a token's |type| member.
sub DOCTYPE_TOKEN     () { 1 }
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }
293
## Insertion mode group bits (|*_IMS|): one bit per group of modes.
sub AFTER_HTML_IMS () { 1 << 2 }
sub HEAD_IMS       () { 1 << 3 }
sub BODY_IMS       () { 1 << 4 }
sub BODY_TABLE_IMS () { 1 << 5 }
sub TABLE_IMS      () { 1 << 6 }
sub ROW_IMS        () { 1 << 7 }
sub BODY_AFTER_IMS () { 1 << 8 }
sub FRAME_IMS      () { 1 << 9 }

## Insertion modes (|*_IM|): group bits, optionally combined with a
## value in the two lowest bits that tells modes of one group apart.
sub AFTER_HTML_BODY_IM     () { AFTER_HTML_IMS | BODY_AFTER_IMS }
sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
sub IN_HEAD_IM             () { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM    () { HEAD_IMS | 1 }
sub AFTER_HEAD_IM          () { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM         () { HEAD_IMS | 3 }
sub IN_BODY_IM             () { BODY_IMS }
sub IN_CELL_IM             () { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM          () { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM              () { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM       () { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM            () { TABLE_IMS }
sub AFTER_BODY_IM          () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM         () { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM      () { FRAME_IMS | 2 }

## Modes that carry no group bit.
sub IN_SELECT_IM           () { 1 }
sub IN_COLUMN_GROUP_IM     () { 2 }
320
321 ## Implementations MUST act as if they used the state machine in the spec
322
## Reset the tokenizer before a parse: data state, PCDATA content
## model, no pending token or attribute, empty character and token
## queues.  The first input character is fetched here as well.
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be

  ## Leave these members present but undefined.
  @$self{qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  )} = ();
  ## |current_token| holds a start tag, end tag, comment, or DOCTYPE.

  ## The character queue is emptied before the first character is read.
  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;

  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
337
338 ## A token has:
339 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
340 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
341 ## ->{name} (DOCTYPE_TOKEN)
342 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
343 ## ->{public_identifier} (DOCTYPE_TOKEN)
344 ## ->{system_identifier} (DOCTYPE_TOKEN)
345 ## ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
346 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
347 ## ->{name}
348 ## ->{value}
349 ## ->{has_reference} == 1 or 0
350 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
351
352 ## Emitted token MUST immediately be handled by the tree construction state.
353
354 ## Before each step, UA MAY check to see if either one of the scripts in
355 ## "list of scripts that will execute as soon as possible" or the first
356 ## script in the "list of scripts that will execute asynchronously",
357 ## has completed loading. If one has, then it MUST be executed
358 ## and removed from the list.
359
360 ## NOTE: HTML5 "Writing HTML documents" section, applied to
361 ## documents and not to user agents and conformance checkers,
362 ## contains some requirements that are not detected by the
363 ## parsing algorithm:
364 ## - Some requirements on character encoding declarations. ## TODO
365 ## - "Elements MUST NOT contain content that their content model disallows."
366 ## ... Some are parse error, some are not (will be reported by c.c.).
367 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
368 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
369 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
370
371 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
372 ## be detected by the HTML5 parsing algorithm:
373 ## - Text,
374
375 sub _get_next_token ($) {
376 my $self = shift;
377 if (@{$self->{token}}) {
378 return shift @{$self->{token}};
379 }
380
381 A: {
382 if ($self->{state} == DATA_STATE) {
383 if ($self->{next_input_character} == 0x0026) { # &
384 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
385 $self->{state} = ENTITY_DATA_STATE;
386 !!!next-input-character;
387 redo A;
388 } else {
389 #
390 }
391 } elsif ($self->{next_input_character} == 0x002D) { # -
392 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
393 unless ($self->{escape}) {
394 if ($self->{prev_input_character}->[0] == 0x002D and # -
395 $self->{prev_input_character}->[1] == 0x0021 and # !
396 $self->{prev_input_character}->[2] == 0x003C) { # <
397 $self->{escape} = 1;
398 }
399 }
400 }
401
402 #
403 } elsif ($self->{next_input_character} == 0x003C) { # <
404 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
405 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
406 not $self->{escape})) {
407 $self->{state} = TAG_OPEN_STATE;
408 !!!next-input-character;
409 redo A;
410 } else {
411 #
412 }
413 } elsif ($self->{next_input_character} == 0x003E) { # >
414 if ($self->{escape} and
415 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
416 if ($self->{prev_input_character}->[0] == 0x002D and # -
417 $self->{prev_input_character}->[1] == 0x002D) { # -
418 delete $self->{escape};
419 }
420 }
421
422 #
423 } elsif ($self->{next_input_character} == -1) {
424 !!!emit ({type => END_OF_FILE_TOKEN});
425 last A; ## TODO: ok?
426 }
427 # Anything else
428 my $token = {type => CHARACTER_TOKEN,
429 data => chr $self->{next_input_character}};
430 ## Stay in the data state
431 !!!next-input-character;
432
433 !!!emit ($token);
434
435 redo A;
436 } elsif ($self->{state} == ENTITY_DATA_STATE) {
437 ## (cannot happen in CDATA state)
438
439 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
440
441 $self->{state} = DATA_STATE;
442 # next-input-character is already done
443
444 unless (defined $token) {
445 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
446 } else {
447 !!!emit ($token);
448 }
449
450 redo A;
451 } elsif ($self->{state} == TAG_OPEN_STATE) {
452 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 if ($self->{next_input_character} == 0x002F) { # /
454 !!!next-input-character;
455 $self->{state} = CLOSE_TAG_OPEN_STATE;
456 redo A;
457 } else {
458 ## reconsume
459 $self->{state} = DATA_STATE;
460
461 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
462
463 redo A;
464 }
465 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
466 if ($self->{next_input_character} == 0x0021) { # !
467 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
468 !!!next-input-character;
469 redo A;
470 } elsif ($self->{next_input_character} == 0x002F) { # /
471 $self->{state} = CLOSE_TAG_OPEN_STATE;
472 !!!next-input-character;
473 redo A;
474 } elsif (0x0041 <= $self->{next_input_character} and
475 $self->{next_input_character} <= 0x005A) { # A..Z
476 $self->{current_token}
477 = {type => START_TAG_TOKEN,
478 tag_name => chr ($self->{next_input_character} + 0x0020)};
479 $self->{state} = TAG_NAME_STATE;
480 !!!next-input-character;
481 redo A;
482 } elsif (0x0061 <= $self->{next_input_character} and
483 $self->{next_input_character} <= 0x007A) { # a..z
484 $self->{current_token} = {type => START_TAG_TOKEN,
485 tag_name => chr ($self->{next_input_character})};
486 $self->{state} = TAG_NAME_STATE;
487 !!!next-input-character;
488 redo A;
489 } elsif ($self->{next_input_character} == 0x003E) { # >
490 !!!parse-error (type => 'empty start tag');
491 $self->{state} = DATA_STATE;
492 !!!next-input-character;
493
494 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
495
496 redo A;
497 } elsif ($self->{next_input_character} == 0x003F) { # ?
498 !!!parse-error (type => 'pio');
499 $self->{state} = BOGUS_COMMENT_STATE;
500 ## $self->{next_input_character} is intentionally left as is
501 redo A;
502 } else {
503 !!!parse-error (type => 'bare stago');
504 $self->{state} = DATA_STATE;
505 ## reconsume
506
507 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
508
509 redo A;
510 }
511 } else {
512 die "$0: $self->{content_model} in tag open";
513 }
514 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
515 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
516 if (defined $self->{last_emitted_start_tag_name}) {
517 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
518 my @next_char;
519 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
520 push @next_char, $self->{next_input_character};
521 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
522 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
523 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
524 !!!next-input-character;
525 next TAGNAME;
526 } else {
527 $self->{next_input_character} = shift @next_char; # reconsume
528 !!!back-next-input-character (@next_char);
529 $self->{state} = DATA_STATE;
530
531 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
532
533 redo A;
534 }
535 }
536 push @next_char, $self->{next_input_character};
537
538 unless ($self->{next_input_character} == 0x0009 or # HT
539 $self->{next_input_character} == 0x000A or # LF
540 $self->{next_input_character} == 0x000B or # VT
541 $self->{next_input_character} == 0x000C or # FF
542 $self->{next_input_character} == 0x0020 or # SP
543 $self->{next_input_character} == 0x003E or # >
544 $self->{next_input_character} == 0x002F or # /
545 $self->{next_input_character} == -1) {
546 $self->{next_input_character} = shift @next_char; # reconsume
547 !!!back-next-input-character (@next_char);
548 $self->{state} = DATA_STATE;
549 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
550 redo A;
551 } else {
552 $self->{next_input_character} = shift @next_char;
553 !!!back-next-input-character (@next_char);
554 # and consume...
555 }
556 } else {
557 ## No start tag token has ever been emitted
558 # next-input-character is already done
559 $self->{state} = DATA_STATE;
560 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
561 redo A;
562 }
563 }
564
565 if (0x0041 <= $self->{next_input_character} and
566 $self->{next_input_character} <= 0x005A) { # A..Z
567 $self->{current_token} = {type => END_TAG_TOKEN,
568 tag_name => chr ($self->{next_input_character} + 0x0020)};
569 $self->{state} = TAG_NAME_STATE;
570 !!!next-input-character;
571 redo A;
572 } elsif (0x0061 <= $self->{next_input_character} and
573 $self->{next_input_character} <= 0x007A) { # a..z
574 $self->{current_token} = {type => END_TAG_TOKEN,
575 tag_name => chr ($self->{next_input_character})};
576 $self->{state} = TAG_NAME_STATE;
577 !!!next-input-character;
578 redo A;
579 } elsif ($self->{next_input_character} == 0x003E) { # >
580 !!!parse-error (type => 'empty end tag');
581 $self->{state} = DATA_STATE;
582 !!!next-input-character;
583 redo A;
584 } elsif ($self->{next_input_character} == -1) {
585 !!!parse-error (type => 'bare etago');
586 $self->{state} = DATA_STATE;
587 # reconsume
588
589 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
590
591 redo A;
592 } else {
593 !!!parse-error (type => 'bogus end tag');
594 $self->{state} = BOGUS_COMMENT_STATE;
595 ## $self->{next_input_character} is intentionally left as is
596 redo A;
597 }
598 } elsif ($self->{state} == TAG_NAME_STATE) {
599 if ($self->{next_input_character} == 0x0009 or # HT
600 $self->{next_input_character} == 0x000A or # LF
601 $self->{next_input_character} == 0x000B or # VT
602 $self->{next_input_character} == 0x000C or # FF
603 $self->{next_input_character} == 0x0020) { # SP
604 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
605 !!!next-input-character;
606 redo A;
607 } elsif ($self->{next_input_character} == 0x003E) { # >
608 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
609 $self->{current_token}->{first_start_tag}
610 = not defined $self->{last_emitted_start_tag_name};
611 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
612 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
613 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
614 if ($self->{current_token}->{attributes}) {
615 !!!parse-error (type => 'end tag attribute');
616 }
617 } else {
618 die "$0: $self->{current_token}->{type}: Unknown token type";
619 }
620 $self->{state} = DATA_STATE;
621 !!!next-input-character;
622
623 !!!emit ($self->{current_token}); # start tag or end tag
624
625 redo A;
626 } elsif (0x0041 <= $self->{next_input_character} and
627 $self->{next_input_character} <= 0x005A) { # A..Z
628 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
629 # start tag or end tag
630 ## Stay in this state
631 !!!next-input-character;
632 redo A;
633 } elsif ($self->{next_input_character} == -1) {
634 !!!parse-error (type => 'unclosed tag');
635 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
636 $self->{current_token}->{first_start_tag}
637 = not defined $self->{last_emitted_start_tag_name};
638 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
639 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
640 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
641 if ($self->{current_token}->{attributes}) {
642 !!!parse-error (type => 'end tag attribute');
643 }
644 } else {
645 die "$0: $self->{current_token}->{type}: Unknown token type";
646 }
647 $self->{state} = DATA_STATE;
648 # reconsume
649
650 !!!emit ($self->{current_token}); # start tag or end tag
651
652 redo A;
653 } elsif ($self->{next_input_character} == 0x002F) { # /
654 !!!next-input-character;
655 if ($self->{next_input_character} == 0x003E and # >
656 $self->{current_token}->{type} == START_TAG_TOKEN and
657 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
658 # permitted slash
659 #
660 } else {
661 !!!parse-error (type => 'nestc');
662 }
663 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
664 # next-input-character is already done
665 redo A;
666 } else {
667 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
668 # start tag or end tag
669 ## Stay in the state
670 !!!next-input-character;
671 redo A;
672 }
673 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
674 if ($self->{next_input_character} == 0x0009 or # HT
675 $self->{next_input_character} == 0x000A or # LF
676 $self->{next_input_character} == 0x000B or # VT
677 $self->{next_input_character} == 0x000C or # FF
678 $self->{next_input_character} == 0x0020) { # SP
679 ## Stay in the state
680 !!!next-input-character;
681 redo A;
682 } elsif ($self->{next_input_character} == 0x003E) { # >
683 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
684 $self->{current_token}->{first_start_tag}
685 = not defined $self->{last_emitted_start_tag_name};
686 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
687 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
688 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
689 if ($self->{current_token}->{attributes}) {
690 !!!parse-error (type => 'end tag attribute');
691 }
692 } else {
693 die "$0: $self->{current_token}->{type}: Unknown token type";
694 }
695 $self->{state} = DATA_STATE;
696 !!!next-input-character;
697
698 !!!emit ($self->{current_token}); # start tag or end tag
699
700 redo A;
701 } elsif (0x0041 <= $self->{next_input_character} and
702 $self->{next_input_character} <= 0x005A) { # A..Z
703 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
704 value => ''};
705 $self->{state} = ATTRIBUTE_NAME_STATE;
706 !!!next-input-character;
707 redo A;
708 } elsif ($self->{next_input_character} == 0x002F) { # /
709 !!!next-input-character;
710 if ($self->{next_input_character} == 0x003E and # >
711 $self->{current_token}->{type} == START_TAG_TOKEN and
712 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
713 # permitted slash
714 #
715 } else {
716 !!!parse-error (type => 'nestc');
717 }
718 ## Stay in the state
719 # next-input-character is already done
720 redo A;
721 } elsif ($self->{next_input_character} == -1) {
722 !!!parse-error (type => 'unclosed tag');
723 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
724 $self->{current_token}->{first_start_tag}
725 = not defined $self->{last_emitted_start_tag_name};
726 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
727 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
728 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
729 if ($self->{current_token}->{attributes}) {
730 !!!parse-error (type => 'end tag attribute');
731 }
732 } else {
733 die "$0: $self->{current_token}->{type}: Unknown token type";
734 }
735 $self->{state} = DATA_STATE;
736 # reconsume
737
738 !!!emit ($self->{current_token}); # start tag or end tag
739
740 redo A;
741 } else {
742 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
743 value => ''};
744 $self->{state} = ATTRIBUTE_NAME_STATE;
745 !!!next-input-character;
746 redo A;
747 }
748 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
749 my $before_leave = sub {
750 if (exists $self->{current_token}->{attributes} # start tag or end tag
751 ->{$self->{current_attribute}->{name}}) { # MUST
752 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
753 ## Discard $self->{current_attribute} # MUST
754 } else {
755 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
756 = $self->{current_attribute};
757 }
758 }; # $before_leave
759
760 if ($self->{next_input_character} == 0x0009 or # HT
761 $self->{next_input_character} == 0x000A or # LF
762 $self->{next_input_character} == 0x000B or # VT
763 $self->{next_input_character} == 0x000C or # FF
764 $self->{next_input_character} == 0x0020) { # SP
765 $before_leave->();
766 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
767 !!!next-input-character;
768 redo A;
769 } elsif ($self->{next_input_character} == 0x003D) { # =
770 $before_leave->();
771 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
772 !!!next-input-character;
773 redo A;
774 } elsif ($self->{next_input_character} == 0x003E) { # >
775 $before_leave->();
776 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
777 $self->{current_token}->{first_start_tag}
778 = not defined $self->{last_emitted_start_tag_name};
779 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
780 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
781 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
782 if ($self->{current_token}->{attributes}) {
783 !!!parse-error (type => 'end tag attribute');
784 }
785 } else {
786 die "$0: $self->{current_token}->{type}: Unknown token type";
787 }
788 $self->{state} = DATA_STATE;
789 !!!next-input-character;
790
791 !!!emit ($self->{current_token}); # start tag or end tag
792
793 redo A;
794 } elsif (0x0041 <= $self->{next_input_character} and
795 $self->{next_input_character} <= 0x005A) { # A..Z
796 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
797 ## Stay in the state
798 !!!next-input-character;
799 redo A;
800 } elsif ($self->{next_input_character} == 0x002F) { # /
801 $before_leave->();
802 !!!next-input-character;
803 if ($self->{next_input_character} == 0x003E and # >
804 $self->{current_token}->{type} == START_TAG_TOKEN and
805 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
806 # permitted slash
807 #
808 } else {
809 !!!parse-error (type => 'nestc');
810 }
811 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
812 # next-input-character is already done
813 redo A;
814 } elsif ($self->{next_input_character} == -1) {
815 !!!parse-error (type => 'unclosed tag');
816 $before_leave->();
817 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
818 $self->{current_token}->{first_start_tag}
819 = not defined $self->{last_emitted_start_tag_name};
820 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
821 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
822 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
823 if ($self->{current_token}->{attributes}) {
824 !!!parse-error (type => 'end tag attribute');
825 }
826 } else {
827 die "$0: $self->{current_token}->{type}: Unknown token type";
828 }
829 $self->{state} = DATA_STATE;
830 # reconsume
831
832 !!!emit ($self->{current_token}); # start tag or end tag
833
834 redo A;
835 } else {
836 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
837 ## Stay in the state
838 !!!next-input-character;
839 redo A;
840 }
841 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
842 if ($self->{next_input_character} == 0x0009 or # HT
843 $self->{next_input_character} == 0x000A or # LF
844 $self->{next_input_character} == 0x000B or # VT
845 $self->{next_input_character} == 0x000C or # FF
846 $self->{next_input_character} == 0x0020) { # SP
847 ## Stay in the state
848 !!!next-input-character;
849 redo A;
850 } elsif ($self->{next_input_character} == 0x003D) { # =
851 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
852 !!!next-input-character;
853 redo A;
854 } elsif ($self->{next_input_character} == 0x003E) { # >
855 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
856 $self->{current_token}->{first_start_tag}
857 = not defined $self->{last_emitted_start_tag_name};
858 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
859 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
860 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
861 if ($self->{current_token}->{attributes}) {
862 !!!parse-error (type => 'end tag attribute');
863 }
864 } else {
865 die "$0: $self->{current_token}->{type}: Unknown token type";
866 }
867 $self->{state} = DATA_STATE;
868 !!!next-input-character;
869
870 !!!emit ($self->{current_token}); # start tag or end tag
871
872 redo A;
873 } elsif (0x0041 <= $self->{next_input_character} and
874 $self->{next_input_character} <= 0x005A) { # A..Z
875 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
876 value => ''};
877 $self->{state} = ATTRIBUTE_NAME_STATE;
878 !!!next-input-character;
879 redo A;
880 } elsif ($self->{next_input_character} == 0x002F) { # /
881 !!!next-input-character;
882 if ($self->{next_input_character} == 0x003E and # >
883 $self->{current_token}->{type} == START_TAG_TOKEN and
884 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
885 # permitted slash
886 #
887 } else {
888 !!!parse-error (type => 'nestc');
889 ## TODO: Different error type for <aa / bb> than <aa/>
890 }
891 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
892 # next-input-character is already done
893 redo A;
894 } elsif ($self->{next_input_character} == -1) {
895 !!!parse-error (type => 'unclosed tag');
896 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
897 $self->{current_token}->{first_start_tag}
898 = not defined $self->{last_emitted_start_tag_name};
899 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
900 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
901 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
902 if ($self->{current_token}->{attributes}) {
903 !!!parse-error (type => 'end tag attribute');
904 }
905 } else {
906 die "$0: $self->{current_token}->{type}: Unknown token type";
907 }
908 $self->{state} = DATA_STATE;
909 # reconsume
910
911 !!!emit ($self->{current_token}); # start tag or end tag
912
913 redo A;
914 } else {
915 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
916 value => ''};
917 $self->{state} = ATTRIBUTE_NAME_STATE;
918 !!!next-input-character;
919 redo A;
920 }
921 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
922 if ($self->{next_input_character} == 0x0009 or # HT
923 $self->{next_input_character} == 0x000A or # LF
924 $self->{next_input_character} == 0x000B or # VT
925 $self->{next_input_character} == 0x000C or # FF
926 $self->{next_input_character} == 0x0020) { # SP
927 ## Stay in the state
928 !!!next-input-character;
929 redo A;
930 } elsif ($self->{next_input_character} == 0x0022) { # "
931 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
932 !!!next-input-character;
933 redo A;
934 } elsif ($self->{next_input_character} == 0x0026) { # &
935 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
936 ## reconsume
937 redo A;
938 } elsif ($self->{next_input_character} == 0x0027) { # '
939 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
940 !!!next-input-character;
941 redo A;
942 } elsif ($self->{next_input_character} == 0x003E) { # >
943 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
944 $self->{current_token}->{first_start_tag}
945 = not defined $self->{last_emitted_start_tag_name};
946 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
947 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
948 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
949 if ($self->{current_token}->{attributes}) {
950 !!!parse-error (type => 'end tag attribute');
951 }
952 } else {
953 die "$0: $self->{current_token}->{type}: Unknown token type";
954 }
955 $self->{state} = DATA_STATE;
956 !!!next-input-character;
957
958 !!!emit ($self->{current_token}); # start tag or end tag
959
960 redo A;
961 } elsif ($self->{next_input_character} == -1) {
962 !!!parse-error (type => 'unclosed tag');
963 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
964 $self->{current_token}->{first_start_tag}
965 = not defined $self->{last_emitted_start_tag_name};
966 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
967 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
968 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
969 if ($self->{current_token}->{attributes}) {
970 !!!parse-error (type => 'end tag attribute');
971 }
972 } else {
973 die "$0: $self->{current_token}->{type}: Unknown token type";
974 }
975 $self->{state} = DATA_STATE;
976 ## reconsume
977
978 !!!emit ($self->{current_token}); # start tag or end tag
979
980 redo A;
981 } else {
982 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
983 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
984 !!!next-input-character;
985 redo A;
986 }
987 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
988 if ($self->{next_input_character} == 0x0022) { # "
989 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
990 !!!next-input-character;
991 redo A;
992 } elsif ($self->{next_input_character} == 0x0026) { # &
993 $self->{last_attribute_value_state} = $self->{state};
994 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{next_input_character} == -1) {
998 !!!parse-error (type => 'unclosed attribute value');
999 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1000 $self->{current_token}->{first_start_tag}
1001 = not defined $self->{last_emitted_start_tag_name};
1002 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1003 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1004 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1005 if ($self->{current_token}->{attributes}) {
1006 !!!parse-error (type => 'end tag attribute');
1007 }
1008 } else {
1009 die "$0: $self->{current_token}->{type}: Unknown token type";
1010 }
1011 $self->{state} = DATA_STATE;
1012 ## reconsume
1013
1014 !!!emit ($self->{current_token}); # start tag or end tag
1015
1016 redo A;
1017 } else {
1018 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1019 ## Stay in the state
1020 !!!next-input-character;
1021 redo A;
1022 }
1023 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1024 if ($self->{next_input_character} == 0x0027) { # '
1025 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1026 !!!next-input-character;
1027 redo A;
1028 } elsif ($self->{next_input_character} == 0x0026) { # &
1029 $self->{last_attribute_value_state} = $self->{state};
1030 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1031 !!!next-input-character;
1032 redo A;
1033 } elsif ($self->{next_input_character} == -1) {
1034 !!!parse-error (type => 'unclosed attribute value');
1035 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1036 $self->{current_token}->{first_start_tag}
1037 = not defined $self->{last_emitted_start_tag_name};
1038 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1039 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1040 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1041 if ($self->{current_token}->{attributes}) {
1042 !!!parse-error (type => 'end tag attribute');
1043 }
1044 } else {
1045 die "$0: $self->{current_token}->{type}: Unknown token type";
1046 }
1047 $self->{state} = DATA_STATE;
1048 ## reconsume
1049
1050 !!!emit ($self->{current_token}); # start tag or end tag
1051
1052 redo A;
1053 } else {
1054 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1055 ## Stay in the state
1056 !!!next-input-character;
1057 redo A;
1058 }
1059 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1060 if ($self->{next_input_character} == 0x0009 or # HT
1061 $self->{next_input_character} == 0x000A or # LF
1062 $self->{next_input_character} == 0x000B or # VT
1063 $self->{next_input_character} == 0x000C or # FF
1064 $self->{next_input_character} == 0x0020) { # SP
1065 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1066 !!!next-input-character;
1067 redo A;
1068 } elsif ($self->{next_input_character} == 0x0026) { # &
1069 $self->{last_attribute_value_state} = $self->{state};
1070 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1071 !!!next-input-character;
1072 redo A;
1073 } elsif ($self->{next_input_character} == 0x003E) { # >
1074 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1075 $self->{current_token}->{first_start_tag}
1076 = not defined $self->{last_emitted_start_tag_name};
1077 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1078 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1079 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1080 if ($self->{current_token}->{attributes}) {
1081 !!!parse-error (type => 'end tag attribute');
1082 }
1083 } else {
1084 die "$0: $self->{current_token}->{type}: Unknown token type";
1085 }
1086 $self->{state} = DATA_STATE;
1087 !!!next-input-character;
1088
1089 !!!emit ($self->{current_token}); # start tag or end tag
1090
1091 redo A;
1092 } elsif ($self->{next_input_character} == -1) {
1093 !!!parse-error (type => 'unclosed tag');
1094 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1095 $self->{current_token}->{first_start_tag}
1096 = not defined $self->{last_emitted_start_tag_name};
1097 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1098 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1099 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1100 if ($self->{current_token}->{attributes}) {
1101 !!!parse-error (type => 'end tag attribute');
1102 }
1103 } else {
1104 die "$0: $self->{current_token}->{type}: Unknown token type";
1105 }
1106 $self->{state} = DATA_STATE;
1107 ## reconsume
1108
1109 !!!emit ($self->{current_token}); # start tag or end tag
1110
1111 redo A;
1112 } else {
1113 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1114 ## Stay in the state
1115 !!!next-input-character;
1116 redo A;
1117 }
1118 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1119 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
1120
1121 unless (defined $token) {
1122 $self->{current_attribute}->{value} .= '&';
1123 } else {
1124 $self->{current_attribute}->{value} .= $token->{data};
1125 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1126 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1127 }
1128
1129 $self->{state} = $self->{last_attribute_value_state};
1130 # next-input-character is already done
1131 redo A;
1132 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1133 ## (only happen if PCDATA state)
1134
1135 my $token = {type => COMMENT_TOKEN, data => ''};
1136
1137 BC: {
1138 if ($self->{next_input_character} == 0x003E) { # >
1139 $self->{state} = DATA_STATE;
1140 !!!next-input-character;
1141
1142 !!!emit ($token);
1143
1144 redo A;
1145 } elsif ($self->{next_input_character} == -1) {
1146 $self->{state} = DATA_STATE;
1147 ## reconsume
1148
1149 !!!emit ($token);
1150
1151 redo A;
1152 } else {
1153 $token->{data} .= chr ($self->{next_input_character});
1154 !!!next-input-character;
1155 redo BC;
1156 }
1157 } # BC
1158 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1159 ## (only happen if PCDATA state)
1160
1161 my @next_char;
1162 push @next_char, $self->{next_input_character};
1163
1164 if ($self->{next_input_character} == 0x002D) { # -
1165 !!!next-input-character;
1166 push @next_char, $self->{next_input_character};
1167 if ($self->{next_input_character} == 0x002D) { # -
1168 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1169 $self->{state} = COMMENT_START_STATE;
1170 !!!next-input-character;
1171 redo A;
1172 }
1173 } elsif ($self->{next_input_character} == 0x0044 or # D
1174 $self->{next_input_character} == 0x0064) { # d
1175 !!!next-input-character;
1176 push @next_char, $self->{next_input_character};
1177 if ($self->{next_input_character} == 0x004F or # O
1178 $self->{next_input_character} == 0x006F) { # o
1179 !!!next-input-character;
1180 push @next_char, $self->{next_input_character};
1181 if ($self->{next_input_character} == 0x0043 or # C
1182 $self->{next_input_character} == 0x0063) { # c
1183 !!!next-input-character;
1184 push @next_char, $self->{next_input_character};
1185 if ($self->{next_input_character} == 0x0054 or # T
1186 $self->{next_input_character} == 0x0074) { # t
1187 !!!next-input-character;
1188 push @next_char, $self->{next_input_character};
1189 if ($self->{next_input_character} == 0x0059 or # Y
1190 $self->{next_input_character} == 0x0079) { # y
1191 !!!next-input-character;
1192 push @next_char, $self->{next_input_character};
1193 if ($self->{next_input_character} == 0x0050 or # P
1194 $self->{next_input_character} == 0x0070) { # p
1195 !!!next-input-character;
1196 push @next_char, $self->{next_input_character};
1197 if ($self->{next_input_character} == 0x0045 or # E
1198 $self->{next_input_character} == 0x0065) { # e
1199 ## ISSUE: What a stupid code this is!
1200 $self->{state} = DOCTYPE_STATE;
1201 !!!next-input-character;
1202 redo A;
1203 }
1204 }
1205 }
1206 }
1207 }
1208 }
1209 }
1210
1211 !!!parse-error (type => 'bogus comment');
1212 $self->{next_input_character} = shift @next_char;
1213 !!!back-next-input-character (@next_char);
1214 $self->{state} = BOGUS_COMMENT_STATE;
1215 redo A;
1216
1217 ## ISSUE: typos in spec: chacacters, is is a parse error
1218 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1219 } elsif ($self->{state} == COMMENT_START_STATE) {
1220 if ($self->{next_input_character} == 0x002D) { # -
1221 $self->{state} = COMMENT_START_DASH_STATE;
1222 !!!next-input-character;
1223 redo A;
1224 } elsif ($self->{next_input_character} == 0x003E) { # >
1225 !!!parse-error (type => 'bogus comment');
1226 $self->{state} = DATA_STATE;
1227 !!!next-input-character;
1228
1229 !!!emit ($self->{current_token}); # comment
1230
1231 redo A;
1232 } elsif ($self->{next_input_character} == -1) {
1233 !!!parse-error (type => 'unclosed comment');
1234 $self->{state} = DATA_STATE;
1235 ## reconsume
1236
1237 !!!emit ($self->{current_token}); # comment
1238
1239 redo A;
1240 } else {
1241 $self->{current_token}->{data} # comment
1242 .= chr ($self->{next_input_character});
1243 $self->{state} = COMMENT_STATE;
1244 !!!next-input-character;
1245 redo A;
1246 }
1247 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1248 if ($self->{next_input_character} == 0x002D) { # -
1249 $self->{state} = COMMENT_END_STATE;
1250 !!!next-input-character;
1251 redo A;
1252 } elsif ($self->{next_input_character} == 0x003E) { # >
1253 !!!parse-error (type => 'bogus comment');
1254 $self->{state} = DATA_STATE;
1255 !!!next-input-character;
1256
1257 !!!emit ($self->{current_token}); # comment
1258
1259 redo A;
1260 } elsif ($self->{next_input_character} == -1) {
1261 !!!parse-error (type => 'unclosed comment');
1262 $self->{state} = DATA_STATE;
1263 ## reconsume
1264
1265 !!!emit ($self->{current_token}); # comment
1266
1267 redo A;
1268 } else {
1269 $self->{current_token}->{data} # comment
1270 .= '-' . chr ($self->{next_input_character});
1271 $self->{state} = COMMENT_STATE;
1272 !!!next-input-character;
1273 redo A;
1274 }
1275 } elsif ($self->{state} == COMMENT_STATE) {
1276 if ($self->{next_input_character} == 0x002D) { # -
1277 $self->{state} = COMMENT_END_DASH_STATE;
1278 !!!next-input-character;
1279 redo A;
1280 } elsif ($self->{next_input_character} == -1) {
1281 !!!parse-error (type => 'unclosed comment');
1282 $self->{state} = DATA_STATE;
1283 ## reconsume
1284
1285 !!!emit ($self->{current_token}); # comment
1286
1287 redo A;
1288 } else {
1289 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1290 ## Stay in the state
1291 !!!next-input-character;
1292 redo A;
1293 }
1294 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1295 if ($self->{next_input_character} == 0x002D) { # -
1296 $self->{state} = COMMENT_END_STATE;
1297 !!!next-input-character;
1298 redo A;
1299 } elsif ($self->{next_input_character} == -1) {
1300 !!!parse-error (type => 'unclosed comment');
1301 $self->{state} = DATA_STATE;
1302 ## reconsume
1303
1304 !!!emit ($self->{current_token}); # comment
1305
1306 redo A;
1307 } else {
1308 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1309 $self->{state} = COMMENT_STATE;
1310 !!!next-input-character;
1311 redo A;
1312 }
1313 } elsif ($self->{state} == COMMENT_END_STATE) {
1314 if ($self->{next_input_character} == 0x003E) { # >
1315 $self->{state} = DATA_STATE;
1316 !!!next-input-character;
1317
1318 !!!emit ($self->{current_token}); # comment
1319
1320 redo A;
1321 } elsif ($self->{next_input_character} == 0x002D) { # -
1322 !!!parse-error (type => 'dash in comment');
1323 $self->{current_token}->{data} .= '-'; # comment
1324 ## Stay in the state
1325 !!!next-input-character;
1326 redo A;
1327 } elsif ($self->{next_input_character} == -1) {
1328 !!!parse-error (type => 'unclosed comment');
1329 $self->{state} = DATA_STATE;
1330 ## reconsume
1331
1332 !!!emit ($self->{current_token}); # comment
1333
1334 redo A;
1335 } else {
1336 !!!parse-error (type => 'dash in comment');
1337 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1338 $self->{state} = COMMENT_STATE;
1339 !!!next-input-character;
1340 redo A;
1341 }
1342 } elsif ($self->{state} == DOCTYPE_STATE) {
1343 if ($self->{next_input_character} == 0x0009 or # HT
1344 $self->{next_input_character} == 0x000A or # LF
1345 $self->{next_input_character} == 0x000B or # VT
1346 $self->{next_input_character} == 0x000C or # FF
1347 $self->{next_input_character} == 0x0020) { # SP
1348 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1349 !!!next-input-character;
1350 redo A;
1351 } else {
1352 !!!parse-error (type => 'no space before DOCTYPE name');
1353 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1354 ## reconsume
1355 redo A;
1356 }
1357 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1358 if ($self->{next_input_character} == 0x0009 or # HT
1359 $self->{next_input_character} == 0x000A or # LF
1360 $self->{next_input_character} == 0x000B or # VT
1361 $self->{next_input_character} == 0x000C or # FF
1362 $self->{next_input_character} == 0x0020) { # SP
1363 ## Stay in the state
1364 !!!next-input-character;
1365 redo A;
1366 } elsif ($self->{next_input_character} == 0x003E) { # >
1367 !!!parse-error (type => 'no DOCTYPE name');
1368 $self->{state} = DATA_STATE;
1369 !!!next-input-character;
1370
1371 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1372
1373 redo A;
1374 } elsif ($self->{next_input_character} == -1) {
1375 !!!parse-error (type => 'no DOCTYPE name');
1376 $self->{state} = DATA_STATE;
1377 ## reconsume
1378
1379 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1380
1381 redo A;
1382 } else {
1383 $self->{current_token}
1384 = {type => DOCTYPE_TOKEN,
1385 name => chr ($self->{next_input_character}),
1386 correct => 1};
1387 ## ISSUE: "Set the token's name name to the" in the spec
1388 $self->{state} = DOCTYPE_NAME_STATE;
1389 !!!next-input-character;
1390 redo A;
1391 }
1392 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1393 ## ISSUE: Redundant "First," in the spec.
1394 if ($self->{next_input_character} == 0x0009 or # HT
1395 $self->{next_input_character} == 0x000A or # LF
1396 $self->{next_input_character} == 0x000B or # VT
1397 $self->{next_input_character} == 0x000C or # FF
1398 $self->{next_input_character} == 0x0020) { # SP
1399 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1400 !!!next-input-character;
1401 redo A;
1402 } elsif ($self->{next_input_character} == 0x003E) { # >
1403 $self->{state} = DATA_STATE;
1404 !!!next-input-character;
1405
1406 !!!emit ($self->{current_token}); # DOCTYPE
1407
1408 redo A;
1409 } elsif ($self->{next_input_character} == -1) {
1410 !!!parse-error (type => 'unclosed DOCTYPE');
1411 $self->{state} = DATA_STATE;
1412 ## reconsume
1413
1414 delete $self->{current_token}->{correct};
1415 !!!emit ($self->{current_token}); # DOCTYPE
1416
1417 redo A;
1418 } else {
1419 $self->{current_token}->{name}
1420 .= chr ($self->{next_input_character}); # DOCTYPE
1421 ## Stay in the state
1422 !!!next-input-character;
1423 redo A;
1424 }
1425 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1426 if ($self->{next_input_character} == 0x0009 or # HT
1427 $self->{next_input_character} == 0x000A or # LF
1428 $self->{next_input_character} == 0x000B or # VT
1429 $self->{next_input_character} == 0x000C or # FF
1430 $self->{next_input_character} == 0x0020) { # SP
1431 ## Stay in the state
1432 !!!next-input-character;
1433 redo A;
1434 } elsif ($self->{next_input_character} == 0x003E) { # >
1435 $self->{state} = DATA_STATE;
1436 !!!next-input-character;
1437
1438 !!!emit ($self->{current_token}); # DOCTYPE
1439
1440 redo A;
1441 } elsif ($self->{next_input_character} == -1) {
1442 !!!parse-error (type => 'unclosed DOCTYPE');
1443 $self->{state} = DATA_STATE;
1444 ## reconsume
1445
1446 delete $self->{current_token}->{correct};
1447 !!!emit ($self->{current_token}); # DOCTYPE
1448
1449 redo A;
1450 } elsif ($self->{next_input_character} == 0x0050 or # P
1451 $self->{next_input_character} == 0x0070) { # p
1452 !!!next-input-character;
1453 if ($self->{next_input_character} == 0x0055 or # U
1454 $self->{next_input_character} == 0x0075) { # u
1455 !!!next-input-character;
1456 if ($self->{next_input_character} == 0x0042 or # B
1457 $self->{next_input_character} == 0x0062) { # b
1458 !!!next-input-character;
1459 if ($self->{next_input_character} == 0x004C or # L
1460 $self->{next_input_character} == 0x006C) { # l
1461 !!!next-input-character;
1462 if ($self->{next_input_character} == 0x0049 or # I
1463 $self->{next_input_character} == 0x0069) { # i
1464 !!!next-input-character;
1465 if ($self->{next_input_character} == 0x0043 or # C
1466 $self->{next_input_character} == 0x0063) { # c
1467 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1468 !!!next-input-character;
1469 redo A;
1470 }
1471 }
1472 }
1473 }
1474 }
1475
1476 #
1477 } elsif ($self->{next_input_character} == 0x0053 or # S
1478 $self->{next_input_character} == 0x0073) { # s
1479 !!!next-input-character;
1480 if ($self->{next_input_character} == 0x0059 or # Y
1481 $self->{next_input_character} == 0x0079) { # y
1482 !!!next-input-character;
1483 if ($self->{next_input_character} == 0x0053 or # S
1484 $self->{next_input_character} == 0x0073) { # s
1485 !!!next-input-character;
1486 if ($self->{next_input_character} == 0x0054 or # T
1487 $self->{next_input_character} == 0x0074) { # t
1488 !!!next-input-character;
1489 if ($self->{next_input_character} == 0x0045 or # E
1490 $self->{next_input_character} == 0x0065) { # e
1491 !!!next-input-character;
1492 if ($self->{next_input_character} == 0x004D or # M
1493 $self->{next_input_character} == 0x006D) { # m
1494 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1495 !!!next-input-character;
1496 redo A;
1497 }
1498 }
1499 }
1500 }
1501 }
1502
1503 #
1504 } else {
1505 !!!next-input-character;
1506 #
1507 }
1508
1509 !!!parse-error (type => 'string after DOCTYPE name');
1510 $self->{state} = BOGUS_DOCTYPE_STATE;
1511 # next-input-character is already done
1512 redo A;
1513 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1514 if ({
1515 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1516 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1517 }->{$self->{next_input_character}}) {
1518 ## Stay in the state
1519 !!!next-input-character;
1520 redo A;
1521 } elsif ($self->{next_input_character} eq 0x0022) { # "
1522 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1523 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1524 !!!next-input-character;
1525 redo A;
1526 } elsif ($self->{next_input_character} eq 0x0027) { # '
1527 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1528 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1529 !!!next-input-character;
1530 redo A;
1531 } elsif ($self->{next_input_character} eq 0x003E) { # >
1532 !!!parse-error (type => 'no PUBLIC literal');
1533
1534 $self->{state} = DATA_STATE;
1535 !!!next-input-character;
1536
1537 delete $self->{current_token}->{correct};
1538 !!!emit ($self->{current_token}); # DOCTYPE
1539
1540 redo A;
1541 } elsif ($self->{next_input_character} == -1) {
1542 !!!parse-error (type => 'unclosed DOCTYPE');
1543
1544 $self->{state} = DATA_STATE;
1545 ## reconsume
1546
1547 delete $self->{current_token}->{correct};
1548 !!!emit ($self->{current_token}); # DOCTYPE
1549
1550 redo A;
1551 } else {
1552 !!!parse-error (type => 'string after PUBLIC');
1553 $self->{state} = BOGUS_DOCTYPE_STATE;
1554 !!!next-input-character;
1555 redo A;
1556 }
1557 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1558 if ($self->{next_input_character} == 0x0022) { # "
1559 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1560 !!!next-input-character;
1561 redo A;
1562 } elsif ($self->{next_input_character} == 0x003E) { # >
1563 !!!parse-error (type => 'unclosed PUBLIC literal');
1564
1565 $self->{state} = DATA_STATE;
1566 !!!next-input-character;
1567
1568 delete $self->{current_token}->{correct};
1569 !!!emit ($self->{current_token}); # DOCTYPE
1570
1571 redo A;
1572 } elsif ($self->{next_input_character} == -1) {
1573 !!!parse-error (type => 'unclosed PUBLIC literal');
1574
1575 $self->{state} = DATA_STATE;
1576 ## reconsume
1577
1578 delete $self->{current_token}->{correct};
1579 !!!emit ($self->{current_token}); # DOCTYPE
1580
1581 redo A;
1582 } else {
1583 $self->{current_token}->{public_identifier} # DOCTYPE
1584 .= chr $self->{next_input_character};
1585 ## Stay in the state
1586 !!!next-input-character;
1587 redo A;
1588 }
1589 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1590 if ($self->{next_input_character} == 0x0027) { # '
1591 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1592 !!!next-input-character;
1593 redo A;
1594 } elsif ($self->{next_input_character} == 0x003E) { # >
1595 !!!parse-error (type => 'unclosed PUBLIC literal');
1596
1597 $self->{state} = DATA_STATE;
1598 !!!next-input-character;
1599
1600 delete $self->{current_token}->{correct};
1601 !!!emit ($self->{current_token}); # DOCTYPE
1602
1603 redo A;
1604 } elsif ($self->{next_input_character} == -1) {
1605 !!!parse-error (type => 'unclosed PUBLIC literal');
1606
1607 $self->{state} = DATA_STATE;
1608 ## reconsume
1609
1610 delete $self->{current_token}->{correct};
1611 !!!emit ($self->{current_token}); # DOCTYPE
1612
1613 redo A;
1614 } else {
1615 $self->{current_token}->{public_identifier} # DOCTYPE
1616 .= chr $self->{next_input_character};
1617 ## Stay in the state
1618 !!!next-input-character;
1619 redo A;
1620 }
1621 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1622 if ({
1623 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1624 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1625 }->{$self->{next_input_character}}) {
1626 ## Stay in the state
1627 !!!next-input-character;
1628 redo A;
1629 } elsif ($self->{next_input_character} == 0x0022) { # "
1630 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1631 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1632 !!!next-input-character;
1633 redo A;
1634 } elsif ($self->{next_input_character} == 0x0027) { # '
1635 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1636 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1637 !!!next-input-character;
1638 redo A;
1639 } elsif ($self->{next_input_character} == 0x003E) { # >
1640 $self->{state} = DATA_STATE;
1641 !!!next-input-character;
1642
1643 !!!emit ($self->{current_token}); # DOCTYPE
1644
1645 redo A;
1646 } elsif ($self->{next_input_character} == -1) {
1647 !!!parse-error (type => 'unclosed DOCTYPE');
1648
1649 $self->{state} = DATA_STATE;
1650 ## reconsume
1651
1652 delete $self->{current_token}->{correct};
1653 !!!emit ($self->{current_token}); # DOCTYPE
1654
1655 redo A;
1656 } else {
1657 !!!parse-error (type => 'string after PUBLIC literal');
1658 $self->{state} = BOGUS_DOCTYPE_STATE;
1659 !!!next-input-character;
1660 redo A;
1661 }
1662 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1663 if ({
1664 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1665 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1666 }->{$self->{next_input_character}}) {
1667 ## Stay in the state
1668 !!!next-input-character;
1669 redo A;
1670 } elsif ($self->{next_input_character} == 0x0022) { # "
1671 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1672 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1673 !!!next-input-character;
1674 redo A;
1675 } elsif ($self->{next_input_character} == 0x0027) { # '
1676 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1677 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1678 !!!next-input-character;
1679 redo A;
1680 } elsif ($self->{next_input_character} == 0x003E) { # >
1681 !!!parse-error (type => 'no SYSTEM literal');
1682 $self->{state} = DATA_STATE;
1683 !!!next-input-character;
1684
1685 delete $self->{current_token}->{correct};
1686 !!!emit ($self->{current_token}); # DOCTYPE
1687
1688 redo A;
1689 } elsif ($self->{next_input_character} == -1) {
1690 !!!parse-error (type => 'unclosed DOCTYPE');
1691
1692 $self->{state} = DATA_STATE;
1693 ## reconsume
1694
1695 delete $self->{current_token}->{correct};
1696 !!!emit ($self->{current_token}); # DOCTYPE
1697
1698 redo A;
1699 } else {
1700 !!!parse-error (type => 'string after SYSTEM');
1701 $self->{state} = BOGUS_DOCTYPE_STATE;
1702 !!!next-input-character;
1703 redo A;
1704 }
1705 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1706 if ($self->{next_input_character} == 0x0022) { # "
1707 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1708 !!!next-input-character;
1709 redo A;
1710 } elsif ($self->{next_input_character} == 0x003E) { # >
1711 !!!parse-error (type => 'unclosed PUBLIC literal');
1712
1713 $self->{state} = DATA_STATE;
1714 !!!next-input-character;
1715
1716 delete $self->{current_token}->{correct};
1717 !!!emit ($self->{current_token}); # DOCTYPE
1718
1719 redo A;
1720 } elsif ($self->{next_input_character} == -1) {
1721 !!!parse-error (type => 'unclosed SYSTEM literal');
1722
1723 $self->{state} = DATA_STATE;
1724 ## reconsume
1725
1726 delete $self->{current_token}->{correct};
1727 !!!emit ($self->{current_token}); # DOCTYPE
1728
1729 redo A;
1730 } else {
1731 $self->{current_token}->{system_identifier} # DOCTYPE
1732 .= chr $self->{next_input_character};
1733 ## Stay in the state
1734 !!!next-input-character;
1735 redo A;
1736 }
1737 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
1738 if ($self->{next_input_character} == 0x0027) { # '
1739 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1740 !!!next-input-character;
1741 redo A;
1742 } elsif ($self->{next_input_character} == 0x003E) { # >
1743 !!!parse-error (type => 'unclosed PUBLIC literal');
1744
1745 $self->{state} = DATA_STATE;
1746 !!!next-input-character;
1747
1748 delete $self->{current_token}->{correct};
1749 !!!emit ($self->{current_token}); # DOCTYPE
1750
1751 redo A;
1752 } elsif ($self->{next_input_character} == -1) {
1753 !!!parse-error (type => 'unclosed SYSTEM literal');
1754
1755 $self->{state} = DATA_STATE;
1756 ## reconsume
1757
1758 delete $self->{current_token}->{correct};
1759 !!!emit ($self->{current_token}); # DOCTYPE
1760
1761 redo A;
1762 } else {
1763 $self->{current_token}->{system_identifier} # DOCTYPE
1764 .= chr $self->{next_input_character};
1765 ## Stay in the state
1766 !!!next-input-character;
1767 redo A;
1768 }
1769 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1770 if ({
1771 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1772 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1773 }->{$self->{next_input_character}}) {
1774 ## Stay in the state
1775 !!!next-input-character;
1776 redo A;
1777 } elsif ($self->{next_input_character} == 0x003E) { # >
1778 $self->{state} = DATA_STATE;
1779 !!!next-input-character;
1780
1781 !!!emit ($self->{current_token}); # DOCTYPE
1782
1783 redo A;
1784 } elsif ($self->{next_input_character} == -1) {
1785 !!!parse-error (type => 'unclosed DOCTYPE');
1786
1787 $self->{state} = DATA_STATE;
1788 ## reconsume
1789
1790 delete $self->{current_token}->{correct};
1791 !!!emit ($self->{current_token}); # DOCTYPE
1792
1793 redo A;
1794 } else {
1795 !!!parse-error (type => 'string after SYSTEM literal');
1796 $self->{state} = BOGUS_DOCTYPE_STATE;
1797 !!!next-input-character;
1798 redo A;
1799 }
1800 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
1801 if ($self->{next_input_character} == 0x003E) { # >
1802 $self->{state} = DATA_STATE;
1803 !!!next-input-character;
1804
1805 delete $self->{current_token}->{correct};
1806 !!!emit ($self->{current_token}); # DOCTYPE
1807
1808 redo A;
1809 } elsif ($self->{next_input_character} == -1) {
1810 !!!parse-error (type => 'unclosed DOCTYPE');
1811 $self->{state} = DATA_STATE;
1812 ## reconsume
1813
1814 delete $self->{current_token}->{correct};
1815 !!!emit ($self->{current_token}); # DOCTYPE
1816
1817 redo A;
1818 } else {
1819 ## Stay in the state
1820 !!!next-input-character;
1821 redo A;
1822 }
1823 } else {
1824 die "$0: $self->{state}: Unknown state";
1825 }
1826 } # A
1827
1828 die "$0: _get_next_token: unexpected case";
1829 } # _get_next_token
1830
## Attempts to consume a character reference beginning at
## |$self->{next_input_character}| (presumably the character following
## an already consumed "&" -- confirm against the callers).
##
##   $in_attr - true if the reference occurs in an attribute value.  In
##              that case a named reference that lacks ";" and is
##              followed by further name characters is returned as
##              literal text rather than expanded.
##
## Returns |undef| if no reference is recognised, otherwise a hash
## reference describing a CHARACTER_TOKEN (|has_reference| is set when
## the data results from expanding a reference).
sub _tokenize_attempt_to_consume_an_entity ($$) {
  my ($self, $in_attr) = @_;

  ## Characters that can never start a reference.
  my $never_a_reference = {
    0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
    0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &, EOF # 0x000D # CR
  };
  if ($never_a_reference->{$self->{next_input_character}}) {
    ## Don't consume
    ## No error
    return undef;
  }

  if ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;

    ## Code point denoted by the numeric reference.
    my $code;

    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      ## Remember which of "x" / "X" was seen, in case it has to be
      ## pushed back.
      my $x_char = $self->{next_input_character};

      HEXDIGIT: while (1) {
        !!!next-input-character;
        my $c = $self->{next_input_character};
        my $digit;
        if (0x0030 <= $c and $c <= 0x0039) { # 0..9
          $digit = $c - 0x0030;
        } elsif (0x0061 <= $c and $c <= 0x0066) { # a..f
          $digit = $c - 0x0060 + 9;
        } elsif (0x0041 <= $c and $c <= 0x0046) { # A..F
          $digit = $c - 0x0040 + 9;
        } else {
          last HEXDIGIT;
        }
        $code = ($code || 0) * 0x10 + $digit;
      } # HEXDIGIT

      unless (defined $code) { # no hexadecimal digit
        !!!parse-error (type => 'bare hcro');
        !!!back-next-input-character ($x_char, $self->{next_input_character});
        $self->{next_input_character} = 0x0023; # #
        return undef;
      }
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code = $code * 10 + ($self->{next_input_character} - 0x0030);
        !!!next-input-character;
      }
    } else {
      !!!parse-error (type => 'bare nero');
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }

    ## The remainder is common to hexadecimal and decimal references.

    if ($self->{next_input_character} == 0x003B) { # ;
      !!!next-input-character;
    } else {
      !!!parse-error (type => 'no refc');
    }

    ## Replace code points that must not be produced by a reference.
    if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
      !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
      $code = 0xFFFD;
    } elsif ($code > 0x10FFFF) {
      !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
      $code = 0xFFFD;
    } elsif ($code == 0x000D) {
      !!!parse-error (type => 'CR character reference');
      $code = 0x000A;
    } elsif (0x80 <= $code and $code <= 0x9F) {
      !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
      $code = $c1_entity_char->{$code};
    }

    return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
  }

  my $first = $self->{next_input_character};
  unless ((0x0041 <= $first and $first <= 0x005A) or    # A..Z
          (0x0061 <= $first and $first <= 0x007A)) {    # a..z
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }

  ## Named reference.
  my $entity_name = chr $self->{next_input_character};
  !!!next-input-character;

  ## |$value| is the text to emit; |$match| is 1 for a name terminated
  ## by ";", negative if a known name without ";" has been seen (its
  ## magnitude doubles for every character consumed after it) and 0 if
  ## no known name has been seen.
  my $value = $entity_name;
  my $match = 0;
  require Whatpm::_NamedEntityList;
  our $EntityChar;

  ## NOTE: Some number greater than the maximum length of entity name
  NAMECHAR: while (length $entity_name < 10) {
    my $c = $self->{next_input_character};
    last NAMECHAR unless ((0x0041 <= $c and $c <= 0x005A) or # A..Z
                          (0x0061 <= $c and $c <= 0x007A) or # a..z
                          (0x0030 <= $c and $c <= 0x0039) or # 0..9
                          $c == 0x003B);                     # ;

    $entity_name .= chr $c;
    my $expansion = $EntityChar->{$entity_name};
    if (defined $expansion) {
      $value = $expansion;
      $match = ($c == 0x003B) ? 1 : -1; # ;
    } else {
      $value .= chr $c;
      $match *= 2;
    }
    !!!next-input-character;

    ## A name terminated by ";" ends the reference.
    last NAMECHAR if $match > 0;
  } # NAMECHAR

  if ($match > 0) {
    return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
  }

  if ($match == 0) {
    !!!parse-error (type => 'bare ero');
    ## NOTE: "No characters are consumed" in the spec.
    return {type => CHARACTER_TOKEN, data => '&'.$value};
  }

  !!!parse-error (type => 'no refc');
  if ($in_attr and $match < -1) {
    return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
  } else {
    return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
  }
} # _tokenize_attempt_to_consume_an_entity
1995
## Prepares |$self->{document}| for tree construction: strict error
## checking is switched off and the document is flagged as an HTML
## document.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  ## NOTE: $self->{document} MUST be specified before this method is called
  my $document = $self->{document};
  $document->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $document->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2004
## Switches strict error checking of |$self->{document}| back on once
## tree construction has finished.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};
  $document->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
2010
2011 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2012
2013 { # tree construction stage
2014 my $token;
2015
## Entry point of the tree construction stage: fetches the first token,
## resets the per-parse state kept in |$self| and then runs the initial,
## root element and main phases in that order.
sub _construct_tree ($) {
  my $self = shift;

  ## When an interactive UA render the $self->{document} available
  ## to the user, or when it begin accepting user input, are
  ## not defined.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is concatenation
  ## of all those characters. # MUST

  !!!next-token;

  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->{open_elements} = [];
  $self->{$_} = undef for qw(form_element head_element inner_html_node);

  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
2039
## The "initial" phase of tree construction.
##
## Consumes tokens until the DOCTYPE (or its absence) has been dealt
## with: leading white space is dropped, comments are appended to the
## document, and a DOCTYPE token is turned into a document type node
## and used to select the compatibility mode of the document.  Any
## other token means that the DOCTYPE is missing; the document is then
## put into quirks mode and the token is left for the next phase.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared ASCII case-insensitively:
        ## |$pubid| is upper-cased, so every literal it is compared
        ## with below has to be written in upper case as well.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## HTML 4.01 Frameset / Transitional: quirks mode if the
          ## system identifier is missing, limited quirks mode if it
          ## is present.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## This system identifier forces quirks mode whatever the
        ## public identifier says.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL
} # _tree_construction_initial
2207
## The "root element" phase of tree construction.
##
## Skips DOCTYPE tokens (with a parse error), appends comments to the
## document and drops leading white space.  The first other token
## triggers application cache selection and the creation of the |html|
## root element; that token is left for the main phase.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  ## Value handed to the application cache selection callback; it stays
  ## undefined unless an |html| start tag with a |manifest| attribute
  ## is seen.
  my $manifest;

  TOKEN: while (1) {
    my $type = $token->{type};

    if ($type == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token
      ## Stay in the phase
      !!!next-token;
      next TOKEN;
    }

    if ($type == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the phase
      !!!next-token;
      next TOKEN;
    }

    if ($type == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)// # \x0D
          and not length $token->{data}) {
        ## Ignore the token.
        ## Stay in the phase
        !!!next-token;
        next TOKEN;
      }
    } elsif ($type == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html' and
          $token->{attributes}->{manifest}) {
        $manifest = $token->{attributes}->{manifest}->{value};
        ## ISSUE: No relative reference resolution?
      }
      ## ISSUE: There is an issue in the spec
    } elsif ($type == END_TAG_TOKEN or $type == END_OF_FILE_TOKEN) {
      ## ISSUE: There is an issue in the spec
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    last TOKEN;
  } # TOKEN

  $self->{application_cache_selection}->($manifest);

  my $root_element;
  !!!create-element ($root_element, 'html');
  $self->{document}->append_child ($root_element);
  push @{$self->{open_elements}}, [$root_element, 'html'];
  ## reprocess
  return; ## Go to the main phase.
} # _tree_construction_root_element
2270
## Resets |$self->{insertion_mode}| appropriately.
##
## Walks the stack of open elements (|$self->{open_elements}|, entries
## of the form |[$element, $local_name]|) from the current node upwards
## and selects the insertion mode implied by the first element whose
## name has one.  In the fragment case (|$self->{inner_html_node}|
## defined) the context element is examined in place of the topmost
## element, unless it is a |td| or |th| element.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Insertion mode implied by an element name (steps 4..13).  Built
  ## once rather than on every iteration of the loop below.
  my $mode_for = {
    select => IN_SELECT_IM,
    td => IN_CELL_IM,
    th => IN_CELL_IM,
    tr => IN_ROW_IM,
    tbody => IN_TABLE_BODY_IM,
    thead => IN_TABLE_BODY_IM,
    tfoot => IN_TABLE_BODY_IM,
    caption => IN_CAPTION_IM,
    colgroup => IN_COLUMN_GROUP_IM,
    table => IN_TABLE_IM,
    head => IN_BODY_IM, # not in head!
    body => IN_BODY_IM,
    frameset => IN_FRAMESET_IM,
  };

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          #
        } else {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    ## NOTE: The assignment and the |return| are separate statements
    ## so that the outcome does not depend on the truth value of the
    ## insertion mode constant.
    my $new_mode = $mode_for->{$node->[1]};
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
2339
2340 sub _tree_construction_main ($) {
2341 my $self = shift;
2342
2343 my $active_formatting_elements = [];
2344
## Reconstructs the active formatting elements: every entry at the end
## of the list that is neither a marker nor in the stack of open
## elements is replaced by a fresh shallow clone, which is inserted by
## means of the |$insert| code reference (the only argument) and pushed
## onto the stack of open elements.
my $reconstruct_active_formatting_elements = sub { # MUST
  my $insert = shift;

  ## True if the element is in the stack of open elements.
  my $is_open = sub {
    my $el = $_[0];
    for my $open (@{$self->{open_elements}}) {
      return 1 if $el eq $open->[0];
    }
    return 0;
  };

  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3
  my $i = -1;
  my $entry = $active_formatting_elements->[$i];

  ## Step 2
  return if $entry->[0] eq '#marker' or $is_open->($entry->[0]);

  ## Walk towards the start of the list until an entry that needs no
  ## reconstruction (or the start of the list) is reached.
  REWIND: while (1) {
    ## Step 4
    last REWIND if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5
    $i--;
    $entry = $active_formatting_elements->[$i];

    ## Step 6
    next REWIND
        unless $entry->[0] eq '#marker' or $is_open->($entry->[0]);

    ## Step 7
    $i++;
    $entry = $active_formatting_elements->[$i];
    last REWIND;
  } # REWIND

  ## Walk back towards the end of the list, recreating every entry.
  CREATE: while (1) {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $active_formatting_elements->[$i] = $self->{open_elements}->[-1];

    ## Step 11
    last CREATE if $clone->[0] eq $active_formatting_elements->[-1]->[0];

    ## Step 7'
    $i++;
    $entry = $active_formatting_elements->[$i];
  } # CREATE
}; # $reconstruct_active_formatting_elements
2415
## Removes the last marker of the list of active formatting elements
## together with every entry that follows it.  The list is left alone
## if it contains no marker.
my $clear_up_to_marker = sub {
  my $i = $#$active_formatting_elements;
  while ($i >= 0) {
    if ($active_formatting_elements->[$i]->[0] eq '#marker') {
      splice @$active_formatting_elements, $i;
      return;
    }
    $i--;
  }
}; # $clear_up_to_marker
2424
## Generic CDATA / RCDATA element parsing for the current start tag
## token.
##
##   $content_model_flag - CDATA_CONTENT_MODEL or RCDATA_CONTENT_MODEL
##   $insert             - code reference that inserts the new element
##
## The element is created and inserted, the following character tokens
## are collected into a single Text node, and the token that ends the
## run is consumed (with a parse error unless it is the matching end
## tag).
my $parse_rcdata = sub ($$) {
  my ($content_model_flag, $insert) = @_;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $start_tag_name, $token->{attributes});

  ## Step 2
  $insert->($el); # /context node/->append_child ($el)

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4
  my $text = '';
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) { # or until stop tokenizing
    $text .= $token->{data};
    !!!next-token;
  }

  ## Step 5
  if (length $text) {
    my $text_node = $self->{document}->create_text_node ($text);
    $el->append_child ($text_node);
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  unless ($token->{type} == END_TAG_TOKEN and
          $token->{tag_name} eq $start_tag_name) {
    if ($content_model_flag == CDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in CDATA:#'.$token->{type});
    } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in RCDATA:#'.$token->{type});
    } else {
      die "$0: $content_model_flag in parse_rcdata";
    }
  } # otherwise: ignore the token
  !!!next-token;
}; # $parse_rcdata
2469
## Handles a |script| start tag token.  The element is created, the
## following character tokens become its text content, and the token
## that ends the run is consumed (with a parse error unless it is the
## |script| end tag).  The element is inserted by means of the code
## reference given as the only argument, except in the fragment case
## (|$self->{inner_html_node}| defined), where it is not inserted.
my $script_start_tag = sub ($) {
  my ($insert) = @_;

  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Collect the content of the element.
  my $text = '';
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) {
    $text .= $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  $script_el->manakai_append_text ($text) if length $text;

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  unless ($token->{type} == END_TAG_TOKEN and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  } # otherwise: ignore the token

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
2515
## Handles the end tag of a formatting element (the "adoption agency
## algorithm").  The only argument is the tag name of the end tag.
##
## Entries of the list of active formatting elements and of the stack
## of open elements are array references |[$element, $local_name]|;
## markers in the former have the string |#marker| as first item.
##
## The closure consumes the current token before returning.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Remove the formatting element itself from the list.  It is
      ## not necessarily the last entry, so |pop| would be wrong.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## The loop runs from the current node upwards, so the value that
    ## remains is the candidate nearest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is represented by the element (or marker) of the
    ## entry that precedes it.  It is left undefined if the formatting
    ## element is the first entry, i.e. the bookmark is at the start of
    ## the list; index -1 would wrongly denote the last entry.
    my $bookmark_prev_el;
    if ($formatting_element_i_in_active > 0) {
      $bookmark_prev_el
          = $active_formatting_elements->[$formatting_element_i_in_active - 1]
              ->[0];
    }

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 5
      ## NOTE: Performed before step 4, such that the bookmark recorded
      ## there refers to the entry that is actually in the list (the
      ## clone, if the node has been replaced).
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## |$i| becomes the index of the entry preceding the bookmark,
    ## adjusted for the removal of the formatting element.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        if (defined $i) {
          $i--;
          last AFE;
        }
      } elsif (defined $bookmark_prev_el and
               $active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, (defined $i ? $i + 1 : 0), 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        if (defined $i) {
          $i--;
          last OE;
        }
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## Insert the clone immediately after the furthest block.  The
    ## entry that follows the furthest block must not be replaced.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2698
## Appends the given node to the current node, i.e. the element of the
## last entry in the stack of open elements.
my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
2702
## Inserts the given node, foster parenting it if the current node is
## a |table|, |tbody|, |tfoot|, |thead| or |tr| element; otherwise the
## node is simply appended to the current node.
my $insert_to_foster = sub {
  my $child = shift;

  my $table_context = {
    table => 1, tbody => 1, tfoot => 1,
    thead => 1, tr => 1,
  };
  unless ($table_context->{$self->{open_elements}->[-1]->[1]}) {
    return $self->{open_elements}->[-1]->[0]->append_child ($child);
  }

  # MUST
  my $foster_parent_element;
  my $next_sibling;

  ## Find the last |table| element in the stack of open elements.
  my $i = $#{$self->{open_elements}};
  $i-- while $i >= 0 and $self->{open_elements}->[$i]->[1] ne 'table';

  if ($i >= 0) {
    my $table = $self->{open_elements}->[$i]->[0];
    my $parent = $table->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      ## The table has a parent element: insert just before the table.
      $foster_parent_element = $parent;
      $next_sibling = $table;
    } else {
      ## Otherwise use the element above the table in the stack.
      $foster_parent_element = $self->{open_elements}->[$i - 1]->[0];
    }
  }

  ## No |table| in the stack: fall back to the first element in the
  ## stack of open elements.
  $foster_parent_element = $self->{open_elements}->[0]->[0]
      unless defined $foster_parent_element;

  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2733
2734 my $insert;
2735
2736 B: {
2737 if ($token->{type} == DOCTYPE_TOKEN) {
2738 !!!parse-error (type => 'DOCTYPE in the middle');
2739 ## Ignore the token
2740 ## Stay in the phase
2741 !!!next-token;
2742 redo B;
2743 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
2744 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2745 #
2746 } else {
2747 ## Generate implied end tags
2748 if ({
2749 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2750 tbody => 1, tfoot=> 1, thead => 1,
2751 }->{$self->{open_elements}->[-1]->[1]}) {
2752 !!!back-token;
2753 $token = {type => END_TAG_TOKEN, tag_name => $self->{open_elements}->[-1]->[1]};
2754 redo B;
2755 }
2756
2757 if (@{$self->{open_elements}} > 2 or
2758 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
2759 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2760 } elsif (defined $self->{inner_html_node} and
2761 @{$self->{open_elements}} > 1 and
2762 $self->{open_elements}->[1]->[1] ne 'body') {
2763 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2764 }
2765
2766 ## ISSUE: There is an issue in the spec.
2767 }
2768
2769 ## Stop parsing
2770 last B;
2771 } elsif ($token->{type} == START_TAG_TOKEN and
2772 $token->{tag_name} eq 'html') {
2773 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
2774 ## Turn into the main phase
2775 !!!parse-error (type => 'after html:html');
2776 $self->{insertion_mode} = AFTER_BODY_IM;
2777 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
2778 ## Turn into the main phase
2779 !!!parse-error (type => 'after html:html');
2780 $self->{insertion_mode} = AFTER_FRAMESET_IM;
2781 }
2782
2783 ## ISSUE: "aa<html>" is not a parse error.
2784 ## ISSUE: "<html>" in fragment is not a parse error.
2785 unless ($token->{first_start_tag}) {
2786 !!!parse-error (type => 'not first start tag');
2787 }
2788 my $top_el = $self->{open_elements}->[0]->[0];
2789 for my $attr_name (keys %{$token->{attributes}}) {
2790 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2791 $top_el->set_attribute_ns
2792 (undef, [undef, $attr_name],
2793 $token->{attributes}->{$attr_name}->{value});
2794 }
2795 }
2796 !!!next-token;
2797 redo B;
2798 } elsif ($token->{type} == COMMENT_TOKEN) {
2799 my $comment = $self->{document}->create_comment ($token->{data});
2800 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2801 $self->{document}->append_child ($comment);
2802 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
2803 $self->{open_elements}->[0]->[0]->append_child ($comment);
2804 } else {
2805 $self->{open_elements}->[-1]->[0]->append_child ($comment);
2806 }
2807 !!!next-token;
2808 redo B;
2809 } elsif ($self->{insertion_mode} & HEAD_IMS) {
2810 if ($token->{type} == CHARACTER_TOKEN) {
2811 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2812 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
2813 unless (length $token->{data}) {
2814 !!!next-token;
2815 redo B;
2816 }
2817 }
2818
2819 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2820 ## As if <head>
2821 !!!create-element ($self->{head_element}, 'head');
2822 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2823 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2824
2825 ## Reprocess in the "in head" insertion mode...
2826 pop @{$self->{open_elements}};
2827
2828 ## Reprocess in the "after head" insertion mode...
2829 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2830 ## As if </noscript>
2831 pop @{$self->{open_elements}};
2832 !!!parse-error (type => 'in noscript:#character');
2833
2834 ## Reprocess in the "in head" insertion mode...
2835 ## As if </head>
2836 pop @{$self->{open_elements}};
2837
2838 ## Reprocess in the "after head" insertion mode...
2839 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2840 pop @{$self->{open_elements}};
2841
2842 ## Reprocess in the "after head" insertion mode...
2843 }
2844
2845 ## "after head" insertion mode
2846 ## As if <body>
2847 !!!insert-element ('body');
2848 $self->{insertion_mode} = IN_BODY_IM;
2849 ## reprocess
2850 redo B;
2851 } elsif ($token->{type} == START_TAG_TOKEN) {
2852 if ($token->{tag_name} eq 'head') {
2853 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2854 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
2855 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2856 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
2857 $self->{insertion_mode} = IN_HEAD_IM;
2858 !!!next-token;
2859 redo B;
2860 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2861 #
2862 } else {
2863 !!!parse-error (type => 'in head:head'); # or in head noscript
2864 ## Ignore the token
2865 !!!next-token;
2866 redo B;
2867 }
2868 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2869 ## As if <head>
2870 !!!create-element ($self->{head_element}, 'head');
2871 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2872 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2873
2874 $self->{insertion_mode} = IN_HEAD_IM;
2875 ## Reprocess in the "in head" insertion mode...
2876 }
2877
2878 if ($token->{tag_name} eq 'base') {
2879 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2880 ## As if </noscript>
2881 pop @{$self->{open_elements}};
2882 !!!parse-error (type => 'in noscript:base');
2883
2884 $self->{insertion_mode} = IN_HEAD_IM;
2885 ## Reprocess in the "in head" insertion mode...
2886 }
2887
2888 ## NOTE: There is a "as if in head" code clone.
2889 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2890 !!!parse-error (type => 'after head:'.$token->{tag_name});
2891 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2892 }
2893 !!!insert-element ($token->{tag_name}, $token->{attributes});
2894 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2895 pop @{$self->{open_elements}}
2896 if $self->{insertion_mode} == AFTER_HEAD_IM;
2897 !!!next-token;
2898 redo B;
2899 } elsif ($token->{tag_name} eq 'link') {
2900 ## NOTE: There is a "as if in head" code clone.
2901 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2902 !!!parse-error (type => 'after head:'.$token->{tag_name});
2903 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2904 }
2905 !!!insert-element ($token->{tag_name}, $token->{attributes});
2906 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2907 pop @{$self->{open_elements}}
2908 if $self->{insertion_mode} == AFTER_HEAD_IM;
2909 !!!next-token;
2910 redo B;
2911 } elsif ($token->{tag_name} eq 'meta') {
2912 ## NOTE: There is a "as if in head" code clone.
2913 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2914 !!!parse-error (type => 'after head:'.$token->{tag_name});
2915 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2916 }
2917 !!!insert-element ($token->{tag_name}, $token->{attributes});
2918 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2919
2920 unless ($self->{confident}) {
2921 if ($token->{attributes}->{charset}) { ## TODO: And if supported
2922 $self->{change_encoding}
2923 ->($self, $token->{attributes}->{charset}->{value});
2924
2925 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
2926 ->set_user_data (manakai_has_reference =>
2927 $token->{attributes}->{charset}
2928 ->{has_reference});
2929 } elsif ($token->{attributes}->{content}) {
2930 ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to the definition.
2931 if ($token->{attributes}->{content}->{value}
2932 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
2933 [\x09-\x0D\x20]*=
2934 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2935 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2936 $self->{change_encoding}
2937 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
2938 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
2939 ->set_user_data (manakai_has_reference =>
2940 $token->{attributes}->{content}
2941 ->{has_reference});
2942 }
2943 }
2944 } else {
2945 if ($token->{attributes}->{charset}) {
2946 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
2947 ->set_user_data (manakai_has_reference =>
2948 $token->{attributes}->{charset}
2949 ->{has_reference});
2950 }
2951 if ($token->{attributes}->{content}) {
2952 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
2953 ->set_user_data (manakai_has_reference =>
2954 $token->{attributes}->{content}
2955 ->{has_reference});
2956 }
2957 }
2958
2959 pop @{$self->{open_elements}}
2960 if $self->{insertion_mode} == AFTER_HEAD_IM;
2961 !!!next-token;
2962 redo B;
2963 } elsif ($token->{tag_name} eq 'title') {
2964 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2965 ## As if </noscript>
2966 pop @{$self->{open_elements}};
2967 !!!parse-error (type => 'in noscript:title');
2968
2969 $self->{insertion_mode} = IN_HEAD_IM;
2970 ## Reprocess in the "in head" insertion mode...
2971 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2972 !!!parse-error (type => 'after head:'.$token->{tag_name});
2973 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2974 }
2975
2976 ## NOTE: There is a "as if in head" code clone.
2977 my $parent = defined $self->{head_element} ? $self->{head_element}
2978 : $self->{open_elements}->[-1]->[0];
2979 $parse_rcdata->(RCDATA_CONTENT_MODEL,
2980 sub { $parent->append_child ($_[0]) });
2981 pop @{$self->{open_elements}}
2982 if $self->{insertion_mode} == AFTER_HEAD_IM;
2983 redo B;
2984 } elsif ($token->{tag_name} eq 'style') {
2985 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
2986 ## insertion mode IN_HEAD_IM)
2987 ## NOTE: There is a "as if in head" code clone.
2988 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2989 !!!parse-error (type => 'after head:'.$token->{tag_name});
2990 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2991 }
2992 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
2993 pop @{$self->{open_elements}}
2994 if $self->{insertion_mode} == AFTER_HEAD_IM;
2995 redo B;
2996 } elsif ($token->{tag_name} eq 'noscript') {
2997 if ($self->{insertion_mode} == IN_HEAD_IM) {
2998 ## NOTE: and scripting is disabled
2999 !!!insert-element ($token->{tag_name}, $token->{attributes});
3000 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
3001 !!!next-token;
3002 redo B;
3003 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3004 !!!parse-error (type => 'in noscript:noscript');
3005 ## Ignore the token
3006 !!!next-token;
3007 redo B;
3008 } else {
3009 #
3010 }
3011 } elsif ($token->{tag_name} eq 'script') {
3012 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3013 ## As if </noscript>
3014 pop @{$self->{open_elements}};
3015 !!!parse-error (type => 'in noscript:script');
3016
3017 $self->{insertion_mode} = IN_HEAD_IM;
3018 ## Reprocess in the "in head" insertion mode...
3019 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3020 !!!parse-error (type => 'after head:'.$token->{tag_name});
3021 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3022 }
3023
3024 ## NOTE: There is a "as if in head" code clone.
3025 $script_start_tag->($insert_to_current);
3026 pop @{$self->{open_elements}}
3027 if $self->{insertion_mode} == AFTER_HEAD_IM;
3028 redo B;
3029 } elsif ($token->{tag_name} eq 'body' or
3030 $token->{tag_name} eq 'frameset') {
3031 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3032 ## As if </noscript>
3033 pop @{$self->{open_elements}};
3034 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
3035
3036 ## Reprocess in the "in head" insertion mode...
3037 ## As if </head>
3038 pop @{$self->{open_elements}};
3039
3040 ## Reprocess in the "after head" insertion mode...
3041 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3042 pop @{$self->{open_elements}};
3043
3044 ## Reprocess in the "after head" insertion mode...
3045 }
3046
3047 ## "after head" insertion mode
3048 !!!insert-element ($token->{tag_name}, $token->{attributes});
3049 if ($token->{tag_name} eq 'body') {
3050 $self->{insertion_mode} = IN_BODY_IM;
3051 } elsif ($token->{tag_name} eq 'frameset') {
3052 $self->{insertion_mode} = IN_FRAMESET_IM;
3053 } else {
3054 die "$0: tag name: $self->{tag_name}";
3055 }
3056 !!!next-token;
3057 redo B;
3058 } else {
3059 #
3060 }
3061
3062 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3063 ## As if </noscript>
3064 pop @{$self->{open_elements}};
3065 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3066
3067 ## Reprocess in the "in head" insertion mode...
3068 ## As if </head>
3069 pop @{$self->{open_elements}};
3070
3071 ## Reprocess in the "after head" insertion mode...
3072 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3073 ## As if </head>
3074 pop @{$self->{open_elements}};
3075
3076 ## Reprocess in the "after head" insertion mode...
3077 }
3078
3079 ## "after head" insertion mode
3080 ## As if <body>
3081 !!!insert-element ('body');
3082 $self->{insertion_mode} = IN_BODY_IM;
3083 ## reprocess
3084 redo B;
3085 } elsif ($token->{type} == END_TAG_TOKEN) {
3086 if ($token->{tag_name} eq 'head') {
3087 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3088 ## As if <head>
3089 !!!create-element ($self->{head_element}, 'head');
3090 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3091 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3092
3093 ## Reprocess in the "in head" insertion mode...
3094 pop @{$self->{open_elements}};
3095 $self->{insertion_mode} = AFTER_HEAD_IM;
3096 !!!next-token;
3097 redo B;
3098 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3099 ## As if </noscript>
3100 pop @{$self->{open_elements}};
3101 !!!parse-error (type => 'in noscript:script');
3102
3103 ## Reprocess in the "in head" insertion mode...
3104 pop @{$self->{open_elements}};
3105 $self->{insertion_mode} = AFTER_HEAD_IM;
3106 !!!next-token;
3107 redo B;
3108 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3109 pop @{$self->{open_elements}};
3110 $self->{insertion_mode} = AFTER_HEAD_IM;
3111 !!!next-token;
3112 redo B;
3113 } else {
3114 #
3115 }
3116 } elsif ($token->{tag_name} eq 'noscript') {
3117 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3118 pop @{$self->{open_elements}};
3119 $self->{insertion_mode} = IN_HEAD_IM;
3120 !!!next-token;
3121 redo B;
3122 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3123 !!!parse-error (type => 'unmatched end tag:noscript');
3124 ## Ignore the token ## ISSUE: An issue in the spec.
3125 !!!next-token;
3126 redo B;
3127 } else {
3128 #
3129 }
3130 } elsif ({
3131 body => 1, html => 1,
3132 }->{$token->{tag_name}}) {
3133 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3134 ## As if <head>
3135 !!!create-element ($self->{head_element}, 'head');
3136 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3137 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3138
3139 $self->{insertion_mode} = IN_HEAD_IM;
3140 ## Reprocess in the "in head" insertion mode...
3141 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3142 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3143 ## Ignore the token
3144 !!!next-token;
3145 redo B;
3146 }
3147
3148 #
3149 } elsif ({
3150 p => 1, br => 1,
3151 }->{$token->{tag_name}}) {
3152 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3153 ## As if <head>
3154 !!!create-element ($self->{head_element}, 'head');
3155 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3156 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3157
3158 $self->{insertion_mode} = IN_HEAD_IM;
3159 ## Reprocess in the "in head" insertion mode...
3160 }
3161
3162 #
3163 } else {
3164 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3165 #
3166 } else {
3167 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3168 ## Ignore the token
3169 !!!next-token;
3170 redo B;
3171 }
3172 }
3173
3174 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3175 ## As if </noscript>
3176 pop @{$self->{open_elements}};
3177 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3178
3179 ## Reprocess in the "in head" insertion mode...
3180 ## As if </head>
3181 pop @{$self->{open_elements}};
3182
3183 ## Reprocess in the "after head" insertion mode...
3184 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3185 ## As if </head>
3186 pop @{$self->{open_elements}};
3187
3188 ## Reprocess in the "after head" insertion mode...
3189 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3190 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3191 ## Ignore the token ## ISSUE: An issue in the spec.
3192 !!!next-token;
3193 redo B;
3194 }
3195
3196 ## "after head" insertion mode
3197 ## As if <body>
3198 !!!insert-element ('body');
3199 $self->{insertion_mode} = IN_BODY_IM;
3200 ## reprocess
3201 redo B;
3202 } else {
3203 die "$0: $token->{type}: Unknown token type";
3204 }
3205
3206 ## ISSUE: An issue in the spec.
3207 } elsif ($self->{insertion_mode} & BODY_IMS) {
3208 if ($token->{type} == CHARACTER_TOKEN) {
3209 ## NOTE: There is a code clone of "character in body".
3210 $reconstruct_active_formatting_elements->($insert_to_current);
3211
3212 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3213
3214 !!!next-token;
3215 redo B;
3216 } elsif ($token->{type} == START_TAG_TOKEN) {
3217 if ({
3218 caption => 1, col => 1, colgroup => 1, tbody => 1,
3219 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3220 }->{$token->{tag_name}}) {
3221 if ($self->{insertion_mode} == IN_CELL_IM) {
3222 ## have an element in table scope
3223 my $tn;
3224 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3225 my $node = $self->{open_elements}->[$_];
3226 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3227 $tn = $node->[1];
3228 last INSCOPE;
3229 } elsif ({
3230 table => 1, html => 1,
3231 }->{$node->[1]}) {
3232 last INSCOPE;
3233 }
3234 } # INSCOPE
3235 unless (defined $tn) {
3236 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3237 ## Ignore the token
3238 !!!next-token;
3239 redo B;
3240 }
3241
3242 ## Close the cell
3243 !!!back-token; # <?>
3244 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3245 redo B;
3246 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3247 !!!parse-error (type => 'not closed:caption');
3248
3249 ## As if </caption>
3250 ## have a table element in table scope
3251 my $i;
3252 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3253 my $node = $self->{open_elements}->[$_];
3254 if ($node->[1] eq 'caption') {
3255 $i = $_;
3256 last INSCOPE;
3257 } elsif ({
3258 table => 1, html => 1,
3259 }->{$node->[1]}) {
3260 last INSCOPE;
3261 }
3262 } # INSCOPE
3263 unless (defined $i) {
3264 !!!parse-error (type => 'unmatched end tag:caption');
3265 ## Ignore the token
3266 !!!next-token;
3267 redo B;
3268 }
3269
3270 ## generate implied end tags
3271 if ({
3272 dd => 1, dt => 1, li => 1, p => 1,
3273 td => 1, th => 1, tr => 1,
3274 tbody => 1, tfoot=> 1, thead => 1,
3275 }->{$self->{open_elements}->[-1]->[1]}) {
3276 !!!back-token; # <?>
3277 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3278 !!!back-token;
3279 $token = {type => END_TAG_TOKEN,
3280 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3281 redo B;
3282 }
3283
3284 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3285 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3286 }
3287
3288 splice @{$self->{open_elements}}, $i;
3289
3290 $clear_up_to_marker->();
3291
3292 $self->{insertion_mode} = IN_TABLE_IM;
3293
3294 ## reprocess
3295 redo B;
3296 } else {
3297 #
3298 }
3299 } else {
3300 #
3301 }
3302 } elsif ($token->{type} == END_TAG_TOKEN) {
3303 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3304 if ($self->{insertion_mode} == IN_CELL_IM) {
3305 ## have an element in table scope
3306 my $i;
3307 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3308 my $node = $self->{open_elements}->[$_];
3309 if ($node->[1] eq $token->{tag_name}) {
3310 $i = $_;
3311 last INSCOPE;
3312 } elsif ({
3313 table => 1, html => 1,
3314 }->{$node->[1]}) {
3315 last INSCOPE;
3316 }
3317 } # INSCOPE
3318 unless (defined $i) {
3319 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3320 ## Ignore the token
3321 !!!next-token;
3322 redo B;
3323 }
3324
3325 ## generate implied end tags
3326 if ({
3327 dd => 1, dt => 1, li => 1, p => 1,
3328 td => ($token->{tag_name} eq 'th'),
3329 th => ($token->{tag_name} eq 'td'),
3330 tr => 1,
3331 tbody => 1, tfoot=> 1, thead => 1,
3332 }->{$self->{open_elements}->[-1]->[1]}) {
3333 !!!back-token;
3334 $token = {type => END_TAG_TOKEN,
3335 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3336 redo B;
3337 }
3338
3339 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3340 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3341 }
3342
3343 splice @{$self->{open_elements}}, $i;
3344
3345 $clear_up_to_marker->();
3346
3347 $self->{insertion_mode} = IN_ROW_IM;
3348
3349 !!!next-token;
3350 redo B;
3351 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3352 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3353 ## Ignore the token
3354 !!!next-token;
3355 redo B;
3356 } else {
3357 #
3358 }
3359 } elsif ($token->{tag_name} eq 'caption') {
3360 if ($self->{insertion_mode} == IN_CAPTION_IM) {
3361 ## have a table element in table scope
3362 my $i;
3363 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3364 my $node = $self->{open_elements}->[$_];
3365 if ($node->[1] eq $token->{tag_name}) {
3366 $i = $_;
3367 last INSCOPE;
3368 } elsif ({
3369 table => 1, html => 1,
3370 }->{$node->[1]}) {
3371 last INSCOPE;
3372 }
3373 } # INSCOPE
3374 unless (defined $i) {
3375 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3376 ## Ignore the token
3377 !!!next-token;
3378 redo B;
3379 }
3380
3381 ## generate implied end tags
3382 if ({
3383 dd => 1, dt => 1, li => 1, p => 1,
3384 td => 1, th => 1, tr => 1,
3385 tbody => 1, tfoot=> 1, thead => 1,
3386 }->{$self->{open_elements}->[-1]->[1]}) {
3387 !!!back-token;
3388 $token = {type => END_TAG_TOKEN,
3389 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3390 redo B;
3391 }
3392
3393 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3394 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3395 }
3396
3397 splice @{$self->{open_elements}}, $i;
3398
3399 $clear_up_to_marker->();
3400
3401 $self->{insertion_mode} = IN_TABLE_IM;
3402
3403 !!!next-token;
3404 redo B;
3405 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
3406 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3407 ## Ignore the token
3408 !!!next-token;
3409 redo B;
3410 } else {
3411 #
3412 }
3413 } elsif ({
3414 table => 1, tbody => 1, tfoot => 1,
3415 thead => 1, tr => 1,
3416 }->{$token->{tag_name}} and
3417 $self->{insertion_mode} == IN_CELL_IM) {
3418 ## have an element in table scope
3419 my $i;
3420 my $tn;
3421 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3422 my $node = $self->{open_elements}->[$_];
3423 if ($node->[1] eq $token->{tag_name}) {
3424 $i = $_;
3425 last INSCOPE;
3426 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3427 $tn = $node->[1];
3428 ## NOTE: There is exactly one |td| or |th| element
3429 ## in scope in the stack of open elements by definition.
3430 } elsif ({
3431 table => 1, html => 1,
3432 }->{$node->[1]}) {
3433 last INSCOPE;
3434 }
3435 } # INSCOPE
3436 unless (defined $i) {
3437 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3438 ## Ignore the token
3439 !!!next-token;
3440 redo B;
3441 }
3442
3443 ## Close the cell
3444 !!!back-token; # </?>
3445 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3446 redo B;
3447 } elsif ($token->{tag_name} eq 'table' and
3448 $self->{insertion_mode} == IN_CAPTION_IM) {
3449 !!!parse-error (type => 'not closed:caption');
3450
3451 ## As if </caption>
3452 ## have a table element in table scope
3453 my $i;
3454 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3455 my $node = $self->{open_elements}->[$_];
3456 if ($node->[1] eq 'caption') {
3457 $i = $_;
3458 last INSCOPE;
3459 } elsif ({
3460 table => 1, html => 1,
3461 }->{$node->[1]}) {
3462 last INSCOPE;
3463 }
3464 } # INSCOPE
3465 unless (defined $i) {
3466 !!!parse-error (type => 'unmatched end tag:caption');
3467 ## Ignore the token
3468 !!!next-token;
3469 redo B;
3470 }
3471
3472 ## generate implied end tags
3473 if ({
3474 dd => 1, dt => 1, li => 1, p => 1,
3475 td => 1, th => 1, tr => 1,
3476 tbody => 1, tfoot=> 1, thead => 1,
3477 }->{$self->{open_elements}->[-1]->[1]}) {
3478 !!!back-token; # </table>
3479 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3480 !!!back-token;
3481 $token = {type => END_TAG_TOKEN,
3482 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3483 redo B;
3484 }
3485
3486 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3487 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3488 }
3489
3490 splice @{$self->{open_elements}}, $i;
3491
3492 $clear_up_to_marker->();
3493
3494 $self->{insertion_mode} = IN_TABLE_IM;
3495
3496 ## reprocess
3497 redo B;
3498 } elsif ({
3499 body => 1, col => 1, colgroup => 1, html => 1,
3500 }->{$token->{tag_name}}) {
3501 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
3502 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3503 ## Ignore the token
3504 !!!next-token;
3505 redo B;
3506 } else {
3507 #
3508 }
3509 } elsif ({
3510 tbody => 1, tfoot => 1,
3511 thead => 1, tr => 1,
3512 }->{$token->{tag_name}} and
3513 $self->{insertion_mode} == IN_CAPTION_IM) {
3514 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3515 ## Ignore the token
3516 !!!next-token;
3517 redo B;
3518 } else {
3519 #
3520 }
3521 } else {
3522 die "$0: $token->{type}: Unknown token type";
3523 }
3524
3525 $insert = $insert_to_current;
3526 #
3527 } elsif ($self->{insertion_mode} & TABLE_IMS) {
3528 if ($token->{type} == CHARACTER_TOKEN) {
3529 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3530 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3531
3532 unless (length $token->{data}) {
3533 !!!next-token;
3534 redo B;
3535 }
3536 }
3537
3538 !!!parse-error (type => 'in table:#character');
3539
3540 ## As if in body, but insert into foster parent element
3541 ## ISSUE: Spec says that "whenever a node would be inserted
3542 ## into the current node" while characters might not
3543 ## result in a new Text node.
3544 $reconstruct_active_formatting_elements->($insert_to_foster);
3545
3546 if ({
3547 table => 1, tbody => 1, tfoot => 1,
3548 thead => 1, tr => 1,
3549 }->{$self->{open_elements}->[-1]->[1]}) {
3550 # MUST
3551 my $foster_parent_element;
3552 my $next_sibling;
3553 my $prev_sibling;
3554 OE: for (reverse 0..$#{$self->{open_elements}}) {
3555 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3556 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3557 if (defined $parent and $parent->node_type == 1) {
3558 $foster_parent_element = $parent;
3559 $next_sibling = $self->{open_elements}->[$_]->[0];
3560 $prev_sibling = $next_sibling->previous_sibling;
3561 } else {
3562 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3563 $prev_sibling = $foster_parent_element->last_child;
3564 }
3565 last OE;
3566 }
3567 } # OE
3568 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3569 $prev_sibling = $foster_parent_element->last_child
3570 unless defined $foster_parent_element;
3571 if (defined $prev_sibling and
3572 $prev_sibling->node_type == 3) {
3573 $prev_sibling->manakai_append_text ($token->{data});
3574 } else {
3575 $foster_parent_element->insert_before
3576 ($self->{document}->create_text_node ($token->{data}),
3577 $next_sibling);
3578 }
3579 } else {
3580 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3581 }
3582
3583 !!!next-token;
3584 redo B;
3585 } elsif ($token->{type} == START_TAG_TOKEN) {
3586 if ({
3587 tr => ($self->{insertion_mode} != IN_ROW_IM),
3588 th => 1, td => 1,
3589 }->{$token->{tag_name}}) {
3590 if ($self->{insertion_mode} == IN_TABLE_IM) {
3591 ## Clear back to table context
3592 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3593 $self->{open_elements}->[-1]->[1] ne 'html') {
3594 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3595 pop @{$self->{open_elements}};
3596 }
3597
3598 !!!insert-element ('tbody');
3599 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3600 ## reprocess in the "in table body" insertion mode...
3601 }
3602
3603 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3604 unless ($token->{tag_name} eq 'tr') {
3605 !!!parse-error (type => 'missing start tag:tr');
3606 }
3607
3608 ## Clear back to table body context
3609 while (not {
3610 tbody => 1, tfoot => 1, thead => 1, html => 1,
3611 }->{$self->{open_elements}->[-1]->[1]}) {
3612 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3613 pop @{$self->{open_elements}};
3614 }
3615
3616 $self->{insertion_mode} = IN_ROW_IM;
3617 if ($token->{tag_name} eq 'tr') {
3618 !!!insert-element ($token->{tag_name}, $token->{attributes});
3619 !!!next-token;
3620 redo B;
3621 } else {
3622 !!!insert-element ('tr');
3623 ## reprocess in the "in row" insertion mode
3624 }
3625 }
3626
3627 ## Clear back to table row context
3628 while (not {
3629 tr => 1, html => 1,
3630 }->{$self->{open_elements}->[-1]->[1]}) {
3631 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3632 pop @{$self->{open_elements}};
3633 }
3634
3635 !!!insert-element ($token->{tag_name}, $token->{attributes});
3636 $self->{insertion_mode} = IN_CELL_IM;
3637
3638 push @$active_formatting_elements, ['#marker', ''];
3639
3640 !!!next-token;
3641 redo B;
3642 } elsif ({
3643 caption => 1, col => 1, colgroup => 1,
3644 tbody => 1, tfoot => 1, thead => 1,
3645 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3646 }->{$token->{tag_name}}) {
3647 if ($self->{insertion_mode} == IN_ROW_IM) {
3648 ## As if </tr>
3649 ## have an element in table scope
3650 my $i;
3651 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3652 my $node = $self->{open_elements}->[$_];
3653 if ($node->[1] eq 'tr') {
3654 $i = $_;
3655 last INSCOPE;
3656 } elsif ({
3657 table => 1, html => 1,
3658 }->{$node->[1]}) {
3659 last INSCOPE;
3660 }
3661 } # INSCOPE
3662 unless (defined $i) {
3663 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
3664 ## Ignore the token
3665 !!!next-token;
3666 redo B;
3667 }
3668
3669 ## Clear back to table row context
3670 while (not {
3671 tr => 1, html => 1,
3672 }->{$self->{open_elements}->[-1]->[1]}) {
3673 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3674 pop @{$self->{open_elements}};
3675 }
3676
3677 pop @{$self->{open_elements}}; # tr
3678 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3679 if ($token->{tag_name} eq 'tr') {
3680 ## reprocess
3681 redo B;
3682 } else {
3683 ## reprocess in the "in table body" insertion mode...
3684 }
3685 }
3686
3687 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3688 ## have an element in table scope
3689 my $i;
3690 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3691 my $node = $self->{open_elements}->[$_];
3692 if ({
3693 tbody => 1, thead => 1, tfoot => 1,
3694 }->{$node->[1]}) {
3695 $i = $_;
3696 last INSCOPE;
3697 } elsif ({
3698 table => 1, html => 1,
3699 }->{$node->[1]}) {
3700 last INSCOPE;
3701 }
3702 } # INSCOPE
3703 unless (defined $i) {
3704 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3705 ## Ignore the token
3706 !!!next-token;
3707 redo B;
3708 }
3709
3710 ## Clear back to table body context
3711 while (not {
3712 tbody => 1, tfoot => 1, thead => 1, html => 1,
3713 }->{$self->{open_elements}->[-1]->[1]}) {
3714 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3715 pop @{$self->{open_elements}};
3716 }
3717
3718 ## As if <{current node}>
3719 ## have an element in table scope
3720 ## true by definition
3721
3722 ## Clear back to table body context
3723 ## nop by definition
3724
3725 pop @{$self->{open_elements}};
3726 $self->{insertion_mode} = IN_TABLE_IM;
3727 ## reprocess in "in table" insertion mode...
3728 }
3729
3730 if ($token->{tag_name} eq 'col') {
3731 ## Clear back to table context
3732 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3733 $self->{open_elements}->[-1]->[1] ne 'html') {
3734 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3735 pop @{$self->{open_elements}};
3736 }
3737
3738 !!!insert-element ('colgroup');
3739 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
3740 ## reprocess
3741 redo B;
3742 } elsif ({
3743 caption => 1,
3744 colgroup => 1,
3745 tbody => 1, tfoot => 1, thead => 1,
3746 }->{$token->{tag_name}}) {
3747 ## Clear back to table context
3748 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3749 $self->{open_elements}->[-1]->[1] ne 'html') {
3750 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3751 pop @{$self->{open_elements}};
3752 }
3753
3754 push @$active_formatting_elements, ['#marker', '']
3755 if $token->{tag_name} eq 'caption';
3756
3757 !!!insert-element ($token->{tag_name}, $token->{attributes});
3758 $self->{insertion_mode} = {
3759 caption => IN_CAPTION_IM,
3760 colgroup => IN_COLUMN_GROUP_IM,
3761 tbody => IN_TABLE_BODY_IM,
3762 tfoot => IN_TABLE_BODY_IM,
3763 thead => IN_TABLE_BODY_IM,
3764 }->{$token->{tag_name}};
3765 !!!next-token;
3766 redo B;
3767 } else {
3768 die "$0: in table: <>: $token->{tag_name}";
3769 }
3770 } elsif ($token->{tag_name} eq 'table') {
3771 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3772
3773 ## As if </table>
3774 ## have a table element in table scope
3775 my $i;
3776 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3777 my $node = $self->{open_elements}->[$_];
3778 if ($node->[1] eq 'table') {
3779 $i = $_;
3780 last INSCOPE;
3781 } elsif ({
3782 table => 1, html => 1,
3783 }->{$node->[1]}) {
3784 last INSCOPE;
3785 }
3786 } # INSCOPE
3787 unless (defined $i) {
3788 !!!parse-error (type => 'unmatched end tag:table');
3789 ## Ignore tokens </table><table>
3790 !!!next-token;
3791 redo B;
3792 }
3793
3794 ## generate implied end tags
3795 if ({
3796 dd => 1, dt => 1, li => 1, p => 1,
3797 td => 1, th => 1, tr => 1,
3798 tbody => 1, tfoot=> 1, thead => 1,
3799 }->{$self->{open_elements}->[-1]->[1]}) {
3800 !!!back-token; # <table>
3801 $token = {type => END_TAG_TOKEN, tag_name => 'table'};
3802 !!!back-token;
3803 $token = {type => END_TAG_TOKEN,
3804 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3805 redo B;
3806 }
3807
3808 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3809 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3810 }
3811
3812 splice @{$self->{open_elements}}, $i;
3813
3814 $self->_reset_insertion_mode;
3815
3816 ## reprocess
3817 redo B;
3818 } else {
3819 !!!parse-error (type => 'in table:'.$token->{tag_name});
3820
3821 $insert = $insert_to_foster;
3822 #
3823 }
3824 } elsif ($token->{type} == END_TAG_TOKEN) {
3825 if ($token->{tag_name} eq 'tr' and
3826 $self->{insertion_mode} == IN_ROW_IM) {
3827 ## have an element in table scope
3828 my $i;
3829 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3830 my $node = $self->{open_elements}->[$_];
3831 if ($node->[1] eq $token->{tag_name}) {
3832 $i = $_;
3833 last INSCOPE;
3834 } elsif ({
3835 table => 1, html => 1,
3836 }->{$node->[1]}) {
3837 last INSCOPE;
3838 }
3839 } # INSCOPE
3840 unless (defined $i) {
3841 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3842 ## Ignore the token
3843 !!!next-token;
3844 redo B;
3845 }
3846
3847 ## Clear back to table row context
3848 while (not {
3849 tr => 1, html => 1,
3850 }->{$self->{open_elements}->[-1]->[1]}) {
3851 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3852 pop @{$self->{open_elements}};
3853 }
3854
3855 pop @{$self->{open_elements}}; # tr
3856 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3857 !!!next-token;
3858 redo B;
3859 } elsif ($token->{tag_name} eq 'table') {
3860 if ($self->{insertion_mode} == IN_ROW_IM) {
3861 ## As if </tr>
3862 ## have an element in table scope
3863 my $i;
3864 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3865 my $node = $self->{open_elements}->[$_];
3866 if ($node->[1] eq 'tr') {
3867 $i = $_;
3868 last INSCOPE;
3869 } elsif ({
3870 table => 1, html => 1,
3871 }->{$node->[1]}) {
3872 last INSCOPE;
3873 }
3874 } # INSCOPE
3875 unless (defined $i) { ## no |tr| element in table scope
3876 !!!parse-error (type => 'unmatched end tag:tr');
3877 ## Ignore the token (the implied </tr> cannot be processed)
3878 !!!next-token;
3879 redo B;
3880 }
3881
3882 ## Clear back to table row context
3883 while (not {
3884 tr => 1, html => 1,
3885 }->{$self->{open_elements}->[-1]->[1]}) {
3886 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3887 pop @{$self->{open_elements}};
3888 }
3889
3890 pop @{$self->{open_elements}}; # tr
3891 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3892 ## reprocess in the "in table body" insertion mode...
3893 }
3894
3895 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3896 ## have an element in table scope
3897 my $i;
3898 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3899 my $node = $self->{open_elements}->[$_];
3900 if ({
3901 tbody => 1, thead => 1, tfoot => 1,
3902 }->{$node->[1]}) {
3903 $i = $_;
3904 last INSCOPE;
3905 } elsif ({
3906 table => 1, html => 1,
3907 }->{$node->[1]}) {
3908 last INSCOPE;
3909 }
3910 } # INSCOPE
3911 unless (defined $i) {
3912 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3913 ## Ignore the token
3914 !!!next-token;
3915 redo B;
3916 }
3917
3918 ## Clear back to table body context
3919 while (not {
3920 tbody => 1, tfoot => 1, thead => 1, html => 1,
3921 }->{$self->{open_elements}->[-1]->[1]}) {
3922 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3923 pop @{$self->{open_elements}};
3924 }
3925
3926 ## As if <{current node}>
3927 ## have an element in table scope
3928 ## true by definition
3929
3930 ## Clear back to table body context
3931 ## nop by definition
3932
3933 pop @{$self->{open_elements}};
3934 $self->{insertion_mode} = IN_TABLE_IM;
3935 ## reprocess in the "in table" insertion mode...
3936 }
3937
3938 ## have a table element in table scope
3939 my $i;
3940 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3941 my $node = $self->{open_elements}->[$_];
3942 if ($node->[1] eq $token->{tag_name}) {
3943 $i = $_;
3944 last INSCOPE;
3945 } elsif ({
3946 table => 1, html => 1,
3947 }->{$node->[1]}) {
3948 last INSCOPE;
3949 }
3950 } # INSCOPE
3951 unless (defined $i) {
3952 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3953 ## Ignore the token
3954 !!!next-token;
3955 redo B;
3956 }
3957
3958 ## generate implied end tags
3959 if ({
3960 dd => 1, dt => 1, li => 1, p => 1,
3961 td => 1, th => 1, tr => 1,
3962 tbody => 1, tfoot=> 1, thead => 1,
3963 }->{$self->{open_elements}->[-1]->[1]}) {
3964 !!!back-token;
3965 $token = {type => END_TAG_TOKEN,
3966 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3967 redo B;
3968 }
3969
3970 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3971 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3972 }
3973
3974 splice @{$self->{open_elements}}, $i;
3975
3976 $self->_reset_insertion_mode;
3977
3978 !!!next-token;
3979 redo B;
3980 } elsif ({
3981 tbody => 1, tfoot => 1, thead => 1,
3982 }->{$token->{tag_name}} and
3983 $self->{insertion_mode} & ROW_IMS) {
3984 if ($self->{insertion_mode} == IN_ROW_IM) {
3985 ## have an element in table scope
3986 my $i;
3987 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3988 my $node = $self->{open_elements}->[$_];
3989 if ($node->[1] eq $token->{tag_name}) {
3990 $i = $_;
3991 last INSCOPE;
3992 } elsif ({
3993 table => 1, html => 1,
3994 }->{$node->[1]}) {
3995 last INSCOPE;
3996 }
3997 } # INSCOPE
3998 unless (defined $i) {
3999 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4000 ## Ignore the token
4001 !!!next-token;
4002 redo B;
4003 }
4004
4005 ## As if </tr>
4006 ## have an element in table scope (|$i| is reused, so reset it first)
4007 undef $i;
4008 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4009 my $node = $self->{open_elements}->[$_];
4010 if ($node->[1] eq 'tr') {
4011 $i = $_;
4012 last INSCOPE;
4013 } elsif ({
4014 table => 1, html => 1,
4015 }->{$node->[1]}) {
4016 last INSCOPE;
4017 }
4018 } # INSCOPE
4019 unless (defined $i) {
4020 !!!parse-error (type => 'unmatched end tag:tr');
4021 ## Ignore the token
4022 !!!next-token;
4023 redo B;
4024 }
4025
4026 ## Clear back to table row context
4027 while (not {
4028 tr => 1, html => 1,
4029 }->{$self->{open_elements}->[-1]->[1]}) {
4030 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4031 pop @{$self->{open_elements}};
4032 }
4033
4034 pop @{$self->{open_elements}}; # tr
4035 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4036 ## reprocess in the "in table body" insertion mode...
4037 }
4038
4039 ## have an element in table scope
4040 my $i;
4041 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4042 my $node = $self->{open_elements}->[$_];
4043 if ($node->[1] eq $token->{tag_name}) {
4044 $i = $_;
4045 last INSCOPE;
4046 } elsif ({
4047 table => 1, html => 1,
4048 }->{$node->[1]}) {
4049 last INSCOPE;
4050 }
4051 } # INSCOPE
4052 unless (defined $i) {
4053 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4054 ## Ignore the token
4055 !!!next-token;
4056 redo B;
4057 }
4058
4059 ## Clear back to table body context
4060 while (not {
4061 tbody => 1, tfoot => 1, thead => 1, html => 1,
4062 }->{$self->{open_elements}->[-1]->[1]}) {
4063 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4064 pop @{$self->{open_elements}};
4065 }
4066
4067 pop @{$self->{open_elements}};
4068 $self->{insertion_mode} = IN_TABLE_IM;
4069 !!!next-token;
4070 redo B;
4071 } elsif ({
4072 body => 1, caption => 1, col => 1, colgroup => 1,
4073 html => 1, td => 1, th => 1,
4074 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4075 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
4076 }->{$token->{tag_name}}) {
4077 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4078 ## Ignore the token
4079 !!!next-token;
4080 redo B;
4081 } else {
4082 !!!parse-error (type => 'in table:/'.$token->{tag_name});
4083
4084 $insert = $insert_to_foster;
4085 #
4086 }
4087 } else {
4088 die "$0: $token->{type}: Unknown token type";
4089 }
4090 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4091 if ($token->{type} == CHARACTER_TOKEN) {
4092 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4093 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4094 unless (length $token->{data}) {
4095 !!!next-token;
4096 redo B;
4097 }
4098 }
4099
4100 #
4101 } elsif ($token->{type} == START_TAG_TOKEN) {
4102 if ($token->{tag_name} eq 'col') {
4103 !!!insert-element ($token->{tag_name}, $token->{attributes});
4104 pop @{$self->{open_elements}};
4105 !!!next-token;
4106 redo B;
4107 } else {
4108 #
4109 }
4110 } elsif ($token->{type} == END_TAG_TOKEN) {
4111 if ($token->{tag_name} eq 'colgroup') {
4112 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4113 !!!parse-error (type => 'unmatched end tag:colgroup');
4114 ## Ignore the token
4115 !!!next-token;
4116 redo B;
4117 } else {
4118 pop @{$self->{open_elements}}; # colgroup
4119 $self->{insertion_mode} = IN_TABLE_IM;
4120 !!!next-token;
4121 redo B;
4122 }
4123 } elsif ($token->{tag_name} eq 'col') {
4124 !!!parse-error (type => 'unmatched end tag:col');
4125 ## Ignore the token
4126 !!!next-token;
4127 redo B;
4128 } else {
4129 #
4130 }
4131 } else {
4132 #
4133 }
4134
4135 ## As if </colgroup>
4136 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4137 !!!parse-error (type => 'unmatched end tag:colgroup');
4138 ## Ignore the token
4139 !!!next-token;
4140 redo B;
4141 } else {
4142 pop @{$self->{open_elements}}; # colgroup
4143 $self->{insertion_mode} = IN_TABLE_IM;
4144 ## reprocess
4145 redo B;
4146 }
4147 } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
4148 if ($token->{type} == CHARACTER_TOKEN) {
4149 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4150 !!!next-token;
4151 redo B;
4152 } elsif ($token->{type} == START_TAG_TOKEN) {
4153 if ($token->{tag_name} eq 'option') {
4154 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4155 ## As if </option>
4156 pop @{$self->{open_elements}};
4157 }
4158
4159 !!!insert-element ($token->{tag_name}, $token->{attributes});
4160 !!!next-token;
4161 redo B;
4162 } elsif ($token->{tag_name} eq 'optgroup') {
4163 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4164 ## As if </option>
4165 pop @{$self->{open_elements}};
4166 }
4167
4168 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4169 ## As if </optgroup>
4170 pop @{$self->{open_elements}};
4171 }
4172
4173 !!!insert-element ($token->{tag_name}, $token->{attributes});
4174 !!!next-token;
4175 redo B;
4176 } elsif ($token->{tag_name} eq 'select') {
4177 !!!parse-error (type => 'not closed:select');
4178 ## As if </select> instead
4179 ## have an element in table scope
4180 my $i;
4181 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4182 my $node = $self->{open_elements}->[$_];
4183 if ($node->[1] eq $token->{tag_name}) {
4184 $i = $_;
4185 last INSCOPE;
4186 } elsif ({
4187 table => 1, html => 1,
4188 }->{$node->[1]}) {
4189 last INSCOPE;
4190 }
4191 } # INSCOPE
4192 unless (defined $i) {
4193 !!!parse-error (type => 'unmatched end tag:select');
4194 ## Ignore the token
4195 !!!next-token;
4196 redo B;
4197 }
4198
4199 splice @{$self->{open_elements}}, $i;
4200
4201 $self->_reset_insertion_mode;
4202
4203 !!!next-token;
4204 redo B;
4205 } else {
4206 !!!parse-error (type => 'in select:'.$token->{tag_name});
4207 ## Ignore the token
4208 !!!next-token;
4209 redo B;
4210 }
4211 } elsif ($token->{type} == END_TAG_TOKEN) {
4212 if ($token->{tag_name} eq 'optgroup') {
4213 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4214 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4215 ## As if </option>
4216 splice @{$self->{open_elements}}, -2;
4217 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4218 pop @{$self->{open_elements}};
4219 } else {
4220 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4221 ## Ignore the token
4222 }
4223 !!!next-token;
4224 redo B;
4225 } elsif ($token->{tag_name} eq 'option') {
4226 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4227 pop @{$self->{open_elements}};
4228 } else {
4229 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4230 ## Ignore the token
4231 }
4232 !!!next-token;
4233 redo B;
4234 } elsif ($token->{tag_name} eq 'select') {
4235 ## have an element in table scope
4236 my $i;
4237 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4238 my $node = $self->{open_elements}->[$_];
4239 if ($node->[1] eq $token->{tag_name}) {
4240 $i = $_;
4241 last INSCOPE;
4242 } elsif ({
4243 table => 1, html => 1,
4244 }->{$node->[1]}) {
4245 last INSCOPE;
4246 }
4247 } # INSCOPE
4248 unless (defined $i) {
4249 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4250 ## Ignore the token
4251 !!!next-token;
4252 redo B;
4253 }
4254
4255 splice @{$self->{open_elements}}, $i;
4256
4257 $self->_reset_insertion_mode;
4258
4259 !!!next-token;
4260 redo B;
4261 } elsif ({
4262 caption => 1, table => 1, tbody => 1,
4263 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4264 }->{$token->{tag_name}}) {
4265 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4266
4267 ## have an element in table scope
4268 my $i;
4269 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4270 my $node = $self->{open_elements}->[$_];
4271 if ($node->[1] eq $token->{tag_name}) {
4272 $i = $_;
4273 last INSCOPE;
4274 } elsif ({
4275 table => 1, html => 1,
4276 }->{$node->[1]}) {
4277 last INSCOPE;
4278 }
4279 } # INSCOPE
4280 unless (defined $i) {
4281 ## Ignore the token
4282 !!!next-token;
4283 redo B;
4284 }
4285
4286 ## As if </select>
4287 ## have an element in table scope
4288 undef $i;
4289 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4290 my $node = $self->{open_elements}->[$_];
4291 if ($node->[1] eq 'select') {
4292 $i = $_;
4293 last INSCOPE;
4294 } elsif ({
4295 table => 1, html => 1,
4296 }->{$node->[1]}) {
4297 last INSCOPE;
4298 }
4299 } # INSCOPE
4300 unless (defined $i) {
4301 !!!parse-error (type => 'unmatched end tag:select');
4302 ## Ignore the </select> token
4303 !!!next-token; ## TODO: ok?
4304 redo B;
4305 }
4306
4307 splice @{$self->{open_elements}}, $i;
4308
4309 $self->_reset_insertion_mode;
4310
4311 ## reprocess
4312 redo B;
4313 } else {
4314 !!!parse-error (type => 'in select:/'.$token->{tag_name});
4315 ## Ignore the token
4316 !!!next-token;
4317 redo B;
4318 }
4319 } else {
4320 die "$0: $token->{type}: Unknown token type";
4321 }
4322 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4323 if ($token->{type} == CHARACTER_TOKEN) {
4324 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4325 my $data = $1; ## save the leading white space before calling other code
4326 ## As if in body
4327 $reconstruct_active_formatting_elements->($insert_to_current);
4328
4329 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
4330
4331 unless (length $token->{data}) {
4332 !!!next-token;
4333 redo B;
4334 }
4335 }
4336
4337 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4338 !!!parse-error (type => 'after html:#character');
4339
4340 ## Reprocess in the "main" phase, "after body" insertion mode...
4341 }
4342
4343 ## "after body" insertion mode
4344 !!!parse-error (type => 'after body:#character');
4345
4346 $self->{insertion_mode} = IN_BODY_IM;
4347 ## reprocess
4348 redo B;
4349 } elsif ($token->{type} == START_TAG_TOKEN) {
4350 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4351 !!!parse-error (type => 'after html:'.$token->{tag_name});
4352
4353 ## Reprocess in the "main" phase, "after body" insertion mode...
4354 }
4355
4356 ## "after body" insertion mode
4357 !!!parse-error (type => 'after body:'.$token->{tag_name});
4358
4359 $self->{insertion_mode} = IN_BODY_IM;
4360 ## reprocess
4361 redo B;
4362 } elsif ($token->{type} == END_TAG_TOKEN) {
4363 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4364 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4365
4366 $self->{insertion_mode} = AFTER_BODY_IM;
4367 ## Reprocess in the "main" phase, "after body" insertion mode...
4368 }
4369
4370 ## "after body" insertion mode
4371 if ($token->{tag_name} eq 'html') {
4372 if (defined $self->{inner_html_node}) {
4373 !!!parse-error (type => 'unmatched end tag:html');
4374 ## Ignore the token
4375 !!!next-token;
4376 redo B;
4377 } else {
4378 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
4379 !!!next-token;
4380 redo B;
4381 }
4382 } else {
4383 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4384
4385 $self->{insertion_mode} = IN_BODY_IM;
4386 ## reprocess
4387 redo B;
4388 }
4389 } else {
4390 die "$0: $token->{type}: Unknown token type";
4391 }
4392 } elsif ($self->{insertion_mode} & FRAME_IMS) {
4393 if ($token->{type} == CHARACTER_TOKEN) {
4394 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4395 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4396
4397 unless (length $token->{data}) {
4398 !!!next-token;
4399 redo B;
4400 }
4401 }
4402
4403 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4404 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4405 !!!parse-error (type => 'in frameset:#character');
4406 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
4407 !!!parse-error (type => 'after frameset:#character');
4408 } else { # "after html frameset"
4409 !!!parse-error (type => 'after html:#character');
4410
4411 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4412 ## Reprocess in the "main" phase, "after frameset"...
4413 !!!parse-error (type => 'after frameset:#character');
4414 }
4415
4416 ## Ignore the token.
4417 if (length $token->{data}) {
4418 ## reprocess the rest of characters
4419 } else {
4420 !!!next-token;
4421 }
4422 redo B;
4423 }
4424
4425 die qq[$0: Character "$token->{data}"];
4426 } elsif ($token->{type} == START_TAG_TOKEN) {
4427 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4428 !!!parse-error (type => 'after html:'.$token->{tag_name});
4429
4430 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4431 ## Process in the "main" phase, "after frameset" insertion mode...
4432 }
4433
4434 if ($token->{tag_name} eq 'frameset' and
4435 $self->{insertion_mode} == IN_FRAMESET_IM) {
4436 !!!insert-element ($token->{tag_name}, $token->{attributes});
4437 !!!next-token;
4438 redo B;
4439 } elsif ($token->{tag_name} eq 'frame' and
4440 $self->{insertion_mode} == IN_FRAMESET_IM) {
4441 !!!insert-element ($token->{tag_name}, $token->{attributes});
4442 pop @{$self->{open_elements}};
4443 !!!next-token;
4444 redo B;
4445 } elsif ($token->{tag_name} eq 'noframes') {
4446 ## NOTE: As if in body.
4447 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4448 redo B;
4449 } else {
4450 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4451 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4452 } else {
4453 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4454 }
4455 ## Ignore the token
4456 !!!next-token;
4457 redo B;
4458 }
4459 } elsif ($token->{type} == END_TAG_TOKEN) {
4460 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4461 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4462
4463 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4464 ## Process in the "main" phase, "after frameset" insertion mode...
4465 }
4466
4467 if ($token->{tag_name} eq 'frameset' and
4468 $self->{insertion_mode} == IN_FRAMESET_IM) {
4469 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4470 @{$self->{open_elements}} == 1) {
4471 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4472 ## Ignore the token
4473 !!!next-token;
4474 } else {
4475 pop @{$self->{open_elements}};
4476 !!!next-token;
4477 }
4478
4479 if (not defined $self->{inner_html_node} and
4480 $self->{open_elements}->[-1]->[1] ne 'frameset') {
4481 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4482 }
4483 redo B;
4484 } elsif ($token->{tag_name} eq 'html' and
4485 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
4486 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
4487 !!!next-token;
4488 redo B;
4489 } else {
4490 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4491 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4492 } else {
4493 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4494 }
4495 ## Ignore the token
4496 !!!next-token;
4497 redo B;
4498 }
4499 } else {
4500 die "$0: $token->{type}: Unknown token type";
4501 }
4502
4503 ## ISSUE: An issue in spec here
4504 } else {
4505 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4506 }
4507
4508 ## "in body" insertion mode
4509 if ($token->{type} == START_TAG_TOKEN) {
4510 if ($token->{tag_name} eq 'script') {
4511 ## NOTE: This is an "as if in head" code clone
4512 $script_start_tag->($insert);
4513 redo B;
4514 } elsif ($token->{tag_name} eq 'style') {
4515 ## NOTE: This is an "as if in head" code clone
4516 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4517 redo B;
4518 } elsif ({
4519 base => 1, link => 1,
4520 }->{$token->{tag_name}}) {
4521 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4522 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4523 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4524 !!!next-token;
4525 redo B;
4526 } elsif ($token->{tag_name} eq 'meta') {
4527 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4528 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4529 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4530
4531 unless ($self->{confident}) {
4532 if ($token->{attributes}->{charset}) { ## TODO: And if supported
4533 $self->{change_encoding}
4534 ->($self, $token->{attributes}->{charset}->{value});
4535
4536 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4537 ->set_user_data (manakai_has_reference =>
4538 $token->{attributes}->{charset}
4539 ->{has_reference});
4540 } elsif ($token->{attributes}->{content}) {
4541 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4542 if ($token->{attributes}->{content}->{value}
4543 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4544 [\x09-\x0D\x20]*=
4545 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4546 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4547 $self->{change_encoding}
4548 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4549 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4550 ->set_user_data (manakai_has_reference =>
4551 $token->{attributes}->{content}
4552 ->{has_reference});
4553 }
4554 }
4555 } else {
4556 if ($token->{attributes}->{charset}) {
4557 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4558 ->set_user_data (manakai_has_reference =>
4559 $token->{attributes}->{charset}
4560 ->{has_reference});
4561 }
4562 if ($token->{attributes}->{content}) {
4563 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4564 ->set_user_data (manakai_has_reference =>
4565 $token->{attributes}->{content}
4566 ->{has_reference});
4567 }
4568 }
4569
4570 !!!next-token;
4571 redo B;
4572 } elsif ($token->{tag_name} eq 'title') {
4573 !!!parse-error (type => 'in body:title');
4574 ## NOTE: This is an "as if in head" code clone
4575 $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4576 if (defined $self->{head_element}) {
4577 $self->{head_element}->append_child ($_[0]);
4578 } else {
4579 $insert->($_[0]);
4580 }
4581 });
4582 redo B;
4583 } elsif ($token->{tag_name} eq 'body') {
4584 !!!parse-error (type => 'in body:body');
4585
4586 if (@{$self->{open_elements}} == 1 or
4587 $self->{open_elements}->[1]->[1] ne 'body') {
4588 ## Ignore the token
4589 } else {
4590 my $body_el = $self->{open_elements}->[1]->[0];
4591 for my $attr_name (keys %{$token->{attributes}}) {
4592 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4593 $body_el->set_attribute_ns
4594 (undef, [undef, $attr_name],
4595 $token->{attributes}->{$attr_name}->{value});
4596 }
4597 }
4598 }
4599 !!!next-token;
4600 redo B;
4601 } elsif ({
4602 address => 1, blockquote => 1, center => 1, dir => 1,
4603 div => 1, dl => 1, fieldset => 1, listing => 1,
4604 menu => 1, ol => 1, p => 1, ul => 1,
4605 pre => 1,
4606 }->{$token->{tag_name}}) {
4607 ## has a p element in scope
4608 INSCOPE: for (reverse @{$self->{open_elements}}) {
4609 if ($_->[1] eq 'p') {
4610 !!!back-token;
4611 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4612 redo B;
4613 } elsif ({
4614 table => 1, caption => 1, td => 1, th => 1,
4615 button => 1, marquee => 1, object => 1, html => 1,
4616 }->{$_->[1]}) {
4617 last INSCOPE;
4618 }
4619 } # INSCOPE
4620
4621 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4622 if ($token->{tag_name} eq 'pre') {
4623 !!!next-token;
4624 if ($token->{type} == CHARACTER_TOKEN) {
4625 $token->{data} =~ s/^\x0A//;
4626 unless (length $token->{data}) {
4627 !!!next-token;
4628 }
4629 }
4630 } else {
4631 !!!next-token;
4632 }
4633 redo B;
4634 } elsif ($token->{tag_name} eq 'form') {
4635 if (defined $self->{form_element}) {
4636 !!!parse-error (type => 'in form:form');
4637 ## Ignore the token
4638 !!!next-token;
4639 redo B;
4640 } else {
4641 ## has a p element in scope
4642 INSCOPE: for (reverse @{$self->{open_elements}}) {
4643 if ($_->[1] eq 'p') {
4644 !!!back-token;
4645 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4646 redo B;
4647 } elsif ({
4648 table => 1, caption => 1, td => 1, th => 1,
4649 button => 1, marquee => 1, object => 1, html => 1,
4650 }->{$_->[1]}) {
4651 last INSCOPE;
4652 }
4653 } # INSCOPE
4654
4655 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4656 $self->{form_element} = $self->{open_elements}->[-1]->[0];
4657 !!!next-token;
4658 redo B;
4659 }
4660 } elsif ($token->{tag_name} eq 'li') {
4661 ## has a p element in scope
4662 INSCOPE: for (reverse @{$self->{open_elements}}) {
4663 if ($_->[1] eq 'p') {
4664 !!!back-token;
4665 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4666 redo B;
4667 } elsif ({
4668 table => 1, caption => 1, td => 1, th => 1,
4669 button => 1, marquee => 1, object => 1, html => 1,
4670 }->{$_->[1]}) {
4671 last INSCOPE;
4672 }
4673 } # INSCOPE
4674
4675 ## Step 1
4676 my $i = -1;
4677 my $node = $self->{open_elements}->[$i];
4678 LI: {
4679 ## Step 2
4680 if ($node->[1] eq 'li') {
4681 if ($i != -1) {
4682 !!!parse-error (type => 'end tag missing:'.
4683 $self->{open_elements}->[-1]->[1]);
4684 }
4685 splice @{$self->{open_elements}}, $i;
4686 last LI;
4687 }
4688
4689 ## Step 3
4690 if (not $formatting_category->{$node->[1]} and
4691 #not $phrasing_category->{$node->[1]} and
4692 ($special_category->{$node->[1]} or
4693 $scoping_category->{$node->[1]}) and
4694 $node->[1] ne 'address' and $node->[1] ne 'div') {
4695 last LI;
4696 }
4697
4698 ## Step 4
4699 $i--;
4700 $node = $self->{open_elements}->[$i];
4701 redo LI;
4702 } # LI
4703
4704 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4705 !!!next-token;
4706 redo B;
4707 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
4708 ## has a p element in scope
4709 INSCOPE: for (reverse @{$self->{open_elements}}) {
4710 if ($_->[1] eq 'p') {
4711 !!!back-token;
4712 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4713 redo B;
4714 } elsif ({
4715 table => 1, caption => 1, td => 1, th => 1,
4716 button => 1, marquee => 1, object => 1, html => 1,
4717 }->{$_->[1]}) {
4718 last INSCOPE;
4719 }
4720 } # INSCOPE
4721
4722 ## Step 1
4723 my $i = -1;
4724 my $node = $self->{open_elements}->[$i];
4725 LI: {
4726 ## Step 2
4727 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
4728 if ($i != -1) {
4729 !!!parse-error (type => 'end tag missing:'.
4730 $self->{open_elements}->[-1]->[1]);
4731 }
4732 splice @{$self->{open_elements}}, $i;
4733 last LI;
4734 }
4735
4736 ## Step 3
4737 if (not $formatting_category->{$node->[1]} and
4738 #not $phrasing_category->{$node->[1]} and
4739 ($special_category->{$node->[1]} or
4740 $scoping_category->{$node->[1]}) and
4741 $node->[1] ne 'address' and $node->[1] ne 'div') {
4742 last LI;
4743 }
4744
4745 ## Step 4
4746 $i--;
4747 $node = $self->{open_elements}->[$i];
4748 redo LI;
4749 } # LI
4750
4751 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4752 !!!next-token;
4753 redo B;
4754 } elsif ($token->{tag_name} eq 'plaintext') {
4755 ## has a p element in scope
4756 INSCOPE: for (reverse @{$self->{open_elements}}) {
4757 if ($_->[1] eq 'p') {
4758 !!!back-token;
4759 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4760 redo B;
4761 } elsif ({
4762 table => 1, caption => 1, td => 1, th => 1,
4763 button => 1, marquee => 1, object => 1, html => 1,
4764 }->{$_->[1]}) {
4765 last INSCOPE;
4766 }
4767 } # INSCOPE
4768
4769 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4770
4771 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
4772
4773 !!!next-token;
4774 redo B;
4775 } elsif ({
4776 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4777 }->{$token->{tag_name}}) {
4778 ## has a p element in scope
4779 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4780 my $node = $self->{open_elements}->[$_];
4781 if ($node->[1] eq 'p') {
4782 !!!back-token;
4783 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4784 redo B;
4785 } elsif ({
4786 table => 1, caption => 1, td => 1, th => 1,
4787 button => 1, marquee => 1, object => 1, html => 1,
4788 }->{$node->[1]}) {
4789 last INSCOPE;
4790 }
4791 } # INSCOPE
4792
4793 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
4794 ## has an element in scope
4795 #my $i;
4796 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4797 # my $node = $self->{open_elements}->[$_];
4798 # if ({
4799 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4800 # }->{$node->[1]}) {
4801 # $i = $_;
4802 # last INSCOPE;
4803 # } elsif ({
4804 # table => 1, caption => 1, td => 1, th => 1,
4805 # button => 1, marquee => 1, object => 1, html => 1,
4806 # }->{$node->[1]}) {
4807 # last INSCOPE;
4808 # }
4809 #} # INSCOPE
4810 #
4811 #if (defined $i) {
4812 # !!! parse-error (type => 'in hn:hn');
4813 # splice @{$self->{open_elements}}, $i;
4814 #}
4815
4816 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4817
4818 !!!next-token;
4819 redo B;
4820 } elsif ($token->{tag_name} eq 'a') {
4821 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
4822 my $node = $active_formatting_elements->[$i];
4823 if ($node->[1] eq 'a') {
4824 !!!parse-error (type => 'in a:a');
4825
4826 !!!back-token;
4827 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
4828 $formatting_end_tag->($token->{tag_name});
4829
4830 AFE2: for (reverse 0..$#$active_formatting_elements) {
4831 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4832 splice @$active_formatting_elements, $_, 1;
4833 last AFE2;
4834 }
4835 } # AFE2
4836 OE: for (reverse 0..$#{$self->{open_elements}}) {
4837 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
4838 splice @{$self->{open_elements}}, $_, 1;
4839 last OE;
4840 }
4841 } # OE
4842 last AFE;
4843 } elsif ($node->[0] eq '#marker') {
4844 last AFE;
4845 }
4846 } # AFE
4847
4848 $reconstruct_active_formatting_elements->($insert_to_current);
4849
4850 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4851 push @$active_formatting_elements, $self->{open_elements}->[-1];
4852
4853 !!!next-token;
4854 redo B;
4855 } elsif ({
4856 b => 1, big => 1, em => 1, font => 1, i => 1,
4857 s => 1, small => 1, strile => 1,
4858 strong => 1, tt => 1, u => 1,
4859 }->{$token->{tag_name}}) {
4860 $reconstruct_active_formatting_elements->($insert_to_current);
4861
4862 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4863 push @$active_formatting_elements, $self->{open_elements}->[-1];
4864
4865 !!!next-token;
4866 redo B;
4867 } elsif ($token->{tag_name} eq 'nobr') {
4868 $reconstruct_active_formatting_elements->($insert_to_current);
4869
4870 ## has a |nobr| element in scope
4871 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4872 my $node = $self->{open_elements}->[$_];
4873 if ($node->[1] eq 'nobr') {
4874 !!!parse-error (type => 'in nobr:nobr');
4875 !!!back-token;
4876 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
4877 redo B;
4878 } elsif ({
4879 table => 1, caption => 1, td => 1, th => 1,
4880 button => 1, marquee => 1, object => 1, html => 1,
4881 }->{$node->[1]}) {
4882 last INSCOPE;
4883 }
4884 } # INSCOPE
4885
4886 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4887 push @$active_formatting_elements, $self->{open_elements}->[-1];
4888
4889 !!!next-token;
4890 redo B;
4891 } elsif ($token->{tag_name} eq 'button') {
4892 ## has a button element in scope
4893 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4894 my $node = $self->{open_elements}->[$_];
4895 if ($node->[1] eq 'button') {
4896 !!!parse-error (type => 'in button:button');
4897 !!!back-token;
4898 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
4899 redo B;
4900 } elsif ({
4901 table => 1, caption => 1, td => 1, th => 1,
4902 button => 1, marquee => 1, object => 1, html => 1,
4903 }->{$node->[1]}) {
4904 last INSCOPE;
4905 }
4906 } # INSCOPE
4907
4908 $reconstruct_active_formatting_elements->($insert_to_current);
4909
4910 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4911 push @$active_formatting_elements, ['#marker', ''];
4912
4913 !!!next-token;
4914 redo B;
4915 } elsif ($token->{tag_name} eq 'marquee' or
4916 $token->{tag_name} eq 'object') {
4917 $reconstruct_active_formatting_elements->($insert_to_current);
4918
4919 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4920 push @$active_formatting_elements, ['#marker', ''];
4921
4922 !!!next-token;
4923 redo B;
4924 } elsif ($token->{tag_name} eq 'xmp') {
4925 $reconstruct_active_formatting_elements->($insert_to_current);
4926 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4927 redo B;
4928 } elsif ($token->{tag_name} eq 'table') {
4929 ## has a p element in scope
4930 INSCOPE: for (reverse @{$self->{open_elements}}) {
4931 if ($_->[1] eq 'p') {
4932 !!!back-token;
4933 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4934 redo B;
4935 } elsif ({
4936 table => 1, caption => 1, td => 1, th => 1,
4937 button => 1, marquee => 1, object => 1, html => 1,
4938 }->{$_->[1]}) {
4939 last INSCOPE;
4940 }
4941 } # INSCOPE
4942
4943 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4944
4945 $self->{insertion_mode} = IN_TABLE_IM;
4946
4947 !!!next-token;
4948 redo B;
4949 } elsif ({
4950 area => 1, basefont => 1, bgsound => 1, br => 1,
4951 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
4952 image => 1,
4953 }->{$token->{tag_name}}) {
4954 if ($token->{tag_name} eq 'image') {
4955 !!!parse-error (type => 'image');
4956 $token->{tag_name} = 'img';
4957 }
4958
4959 ## NOTE: There is an "as if <br>" code clone.
4960 $reconstruct_active_formatting_elements->($insert_to_current);
4961
4962 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4963 pop @{$self->{open_elements}};
4964
4965 !!!next-token;
4966 redo B;
4967 } elsif ($token->{tag_name} eq 'hr') {
4968 ## has a p element in scope
4969 INSCOPE: for (reverse @{$self->{open_elements}}) {
4970 if ($_->[1] eq 'p') {
4971 !!!back-token;
4972 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4973 redo B;
4974 } elsif ({
4975 table => 1, caption => 1, td => 1, th => 1,
4976 button => 1, marquee => 1, object => 1, html => 1,
4977 }->{$_->[1]}) {
4978 last INSCOPE;
4979 }
4980 } # INSCOPE
4981
4982 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4983 pop @{$self->{open_elements}};
4984
4985 !!!next-token;
4986 redo B;
4987 } elsif ($token->{tag_name} eq 'input') {
4988 $reconstruct_active_formatting_elements->($insert_to_current);
4989
4990 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4991 ## TODO: associate with $self->{form_element} if defined
4992 pop @{$self->{open_elements}};
4993
4994 !!!next-token;
4995 redo B;
4996 } elsif ($token->{tag_name} eq 'isindex') {
4997 !!!parse-error (type => 'isindex');
4998
4999 if (defined $self->{form_element}) {
5000 ## Ignore the token
5001 !!!next-token;
5002 redo B;
5003 } else {
5004 my $at = $token->{attributes};
5005 my $form_attrs;
5006 $form_attrs->{action} = $at->{action} if $at->{action};
5007 my $prompt_attr = $at->{prompt};
5008 $at->{name} = {name => 'name', value => 'isindex'};
5009 delete $at->{action};
5010 delete $at->{prompt};
5011 my @tokens = (
5012 {type => START_TAG_TOKEN, tag_name => 'form',
5013 attributes => $form_attrs},
5014 {type => START_TAG_TOKEN, tag_name => 'hr'},
5015 {type => START_TAG_TOKEN, tag_name => 'p'},
5016 {type => START_TAG_TOKEN, tag_name => 'label'},
5017 );
5018 if ($prompt_attr) {
5019 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
5020 } else {
5021 push @tokens, {type => CHARACTER_TOKEN,
5022 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
5023 ## TODO: make this configurable
5024 }
5025 push @tokens,
5026 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
5027 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
5028 {type => END_TAG_TOKEN, tag_name => 'label'},
5029 {type => END_TAG_TOKEN, tag_name => 'p'},
5030 {type => START_TAG_TOKEN, tag_name => 'hr'},
5031 {type => END_TAG_TOKEN, tag_name => 'form'};
5032 $token = shift @tokens;
5033 !!!back-token (@tokens);
5034 redo B;
5035 }
5036 } elsif ($token->{tag_name} eq 'textarea') {
5037 my $tag_name = $token->{tag_name};
5038 my $el;
5039 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
5040
5041 ## TODO: $self->{form_element} if defined
5042 $self->{content_model} = RCDATA_CONTENT_MODEL;
5043 delete $self->{escape}; # MUST
5044
5045 $insert->($el);
5046
5047 my $text = '';
5048 !!!next-token;
5049 if ($token->{type} == CHARACTER_TOKEN) {
5050 $token->{data} =~ s/^\x0A//;
5051 unless (length $token->{data}) {
5052 !!!next-token;
5053 }
5054 }
5055 while ($token->{type} == CHARACTER_TOKEN) {
5056 $text .= $token->{data};
5057 !!!next-token;
5058 }
5059 if (length $text) {
5060 $el->manakai_append_text ($text);
5061 }
5062
5063 $self->{content_model} = PCDATA_CONTENT_MODEL;
5064
5065 if ($token->{type} == END_TAG_TOKEN and
5066 $token->{tag_name} eq $tag_name) {
5067 ## Ignore the token
5068 } else {
5069 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
5070 }
5071 !!!next-token;
5072 redo B;
5073 } elsif ({
5074 iframe => 1,
5075 noembed => 1,
5076 noframes => 1,
5077 noscript => 0, ## TODO: 1 if scripting is enabled
5078 }->{$token->{tag_name}}) {
5079 ## NOTE: There is an "as if in body" code clone.
5080 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
5081 redo B;
5082 } elsif ($token->{tag_name} eq 'select') {
5083 $reconstruct_active_formatting_elements->($insert_to_current);
5084
5085 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5086
5087 $self->{insertion_mode} = IN_SELECT_IM;
5088 !!!next-token;
5089 redo B;
5090 } elsif ({
5091 caption => 1, col => 1, colgroup => 1, frame => 1,
5092 frameset => 1, head => 1, option => 1, optgroup => 1,
5093 tbody => 1, td => 1, tfoot => 1, th => 1,
5094 thead => 1, tr => 1,
5095 }->{$token->{tag_name}}) {
5096 !!!parse-error (type => 'in body:'.$token->{tag_name});
5097 ## Ignore the token
5098 !!!next-token;
5099 redo B;
5100
5101 ## ISSUE: An issue on HTML5 new elements in the spec.
5102 } else {
5103 $reconstruct_active_formatting_elements->($insert_to_current);
5104
5105 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5106
5107 !!!next-token;
5108 redo B;
5109 }
5110 } elsif ($token->{type} == END_TAG_TOKEN) {
5111 if ($token->{tag_name} eq 'body') {
5112 if (@{$self->{open_elements}} > 1 and
5113 $self->{open_elements}->[1]->[1] eq 'body') {
5114 for (@{$self->{open_elements}}) {
5115 unless ({
5116 dd => 1, dt => 1, li => 1, p => 1, td => 1,
5117 th => 1, tr => 1, body => 1, html => 1,
5118 tbody => 1, tfoot => 1, thead => 1,
5119 }->{$_->[1]}) {
5120 !!!parse-error (type => 'not closed:'.$_->[1]);
5121 }
5122 }
5123
5124 $self->{insertion_mode} = AFTER_BODY_IM;
5125 !!!next-token;
5126 redo B;
5127 } else {
5128 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5129 ## Ignore the token
5130 !!!next-token;
5131 redo B;
5132 }
5133 } elsif ($token->{tag_name} eq 'html') {
5134 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
5135 ## ISSUE: There is an issue in the spec.
5136 if ($self->{open_elements}->[-1]->[1] ne 'body') {
5137 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
5138 }
5139 $self->{insertion_mode} = AFTER_BODY_IM;
5140 ## reprocess
5141 redo B;
5142 } else {
5143 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5144 ## Ignore the token
5145 !!!next-token;
5146 redo B;
5147 }
5148 } elsif ({
5149 address => 1, blockquote => 1, center => 1, dir => 1,
5150 div => 1, dl => 1, fieldset => 1, listing => 1,
5151 menu => 1, ol => 1, pre => 1, ul => 1,
5152 p => 1,
5153 dd => 1, dt => 1, li => 1,
5154 button => 1, marquee => 1, object => 1,
5155 }->{$token->{tag_name}}) {
5156 ## has an element in scope
5157 my $i;
5158 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5159 my $node = $self->{open_elements}->[$_];
5160 if ($node->[1] eq $token->{tag_name}) {
5161 ## generate implied end tags
5162 if ({
5163 dd => ($token->{tag_name} ne 'dd'),
5164 dt => ($token->{tag_name} ne 'dt'),
5165 li => ($token->{tag_name} ne 'li'),
5166 p => ($token->{tag_name} ne 'p'),
5167 td => 1, th => 1, tr => 1,
5168 tbody => 1, tfoot=> 1, thead => 1,
5169 }->{$self->{open_elements}->[-1]->[1]}) {
5170 !!!back-token;
5171 $token = {type => END_TAG_TOKEN,
5172 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5173 redo B;
5174 }
5175 $i = $_;
5176 last INSCOPE unless $token->{tag_name} eq 'p';
5177 } elsif ({
5178 table => 1, caption => 1, td => 1, th => 1,
5179 button => 1, marquee => 1, object => 1, html => 1,
5180 }->{$node->[1]}) {
5181 last INSCOPE;
5182 }
5183 } # INSCOPE
5184
5185 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5186 if (defined $i) {
5187 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5188 } else {
5189 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5190 }
5191 }
5192
5193 if (defined $i) {
5194 splice @{$self->{open_elements}}, $i;
5195 } elsif ($token->{tag_name} eq 'p') {
5196 ## As if <p>, then reprocess the current token
5197 my $el;
5198 !!!create-element ($el, 'p');
5199 $insert->($el);
5200 }
5201 $clear_up_to_marker->()
5202 if {
5203 button => 1, marquee => 1, object => 1,
5204 }->{$token->{tag_name}};
5205 !!!next-token;
5206 redo B;
5207 } elsif ($token->{tag_name} eq 'form') {
5208 ## has an element in scope
5209 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5210 my $node = $self->{open_elements}->[$_];
5211 if ($node->[1] eq $token->{tag_name}) {
5212 ## generate implied end tags
5213 if ({
5214 dd => 1, dt => 1, li => 1, p => 1,
5215 td => 1, th => 1, tr => 1,
5216 tbody => 1, tfoot=> 1, thead => 1,
5217 }->{$self->{open_elements}->[-1]->[1]}) {
5218 !!!back-token;
5219 $token = {type => END_TAG_TOKEN,
5220 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5221 redo B;
5222 }
5223 last INSCOPE;
5224 } elsif ({
5225 table => 1, caption => 1, td => 1, th => 1,
5226 button => 1, marquee => 1, object => 1, html => 1,
5227 }->{$node->[1]}) {
5228 last INSCOPE;
5229 }
5230 } # INSCOPE
5231
5232 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5233 pop @{$self->{open_elements}};
5234 } else {
5235 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5236 }
5237
5238 undef $self->{form_element};
5239 !!!next-token;
5240 redo B;
5241 } elsif ({
5242 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5243 }->{$token->{tag_name}}) {
5244 ## has an element in scope
5245 my $i;
5246 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5247 my $node = $self->{open_elements}->[$_];
5248 if ({
5249 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5250 }->{$node->[1]}) {
5251 ## generate implied end tags
5252 if ({
5253 dd => 1, dt => 1, li => 1, p => 1,
5254 td => 1, th => 1, tr => 1,
5255 tbody => 1, tfoot=> 1, thead => 1,
5256 }->{$self->{open_elements}->[-1]->[1]}) {
5257 !!!back-token;
5258 $token = {type => END_TAG_TOKEN,
5259 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5260 redo B;
5261 }
5262 $i = $_;
5263 last INSCOPE;
5264 } elsif ({
5265 table => 1, caption => 1, td => 1, th => 1,
5266 button => 1, marquee => 1, object => 1, html => 1,
5267 }->{$node->[1]}) {
5268 last INSCOPE;
5269 }
5270 } # INSCOPE
5271
5272 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5273 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5274 }
5275
5276 splice @{$self->{open_elements}}, $i if defined $i;
5277 !!!next-token;
5278 redo B;
5279 } elsif ({
5280 a => 1,
5281 b => 1, big => 1, em => 1, font => 1, i => 1,
5282 nobr => 1, s => 1, small => 1, strile => 1,
5283 strong => 1, tt => 1, u => 1,
5284 }->{$token->{tag_name}}) {
5285 $formatting_end_tag->($token->{tag_name});
5286 redo B;
5287 } elsif ($token->{tag_name} eq 'br') {
5288 !!!parse-error (type => 'unmatched end tag:br');
5289
5290 ## As if <br>
5291 $reconstruct_active_formatting_elements->($insert_to_current);
5292
5293 my $el;
5294 !!!create-element ($el, 'br');
5295 $insert->($el);
5296
5297 ## Ignore the token.
5298 !!!next-token;
5299 redo B;
5300 } elsif ({
5301 caption => 1, col => 1, colgroup => 1, frame => 1,
5302 frameset => 1, head => 1, option => 1, optgroup => 1,
5303 tbody => 1, td => 1, tfoot => 1, th => 1,
5304 thead => 1, tr => 1,
5305 area => 1, basefont => 1, bgsound => 1,
5306 embed => 1, hr => 1, iframe => 1, image => 1,
5307 img => 1, input => 1, isindex => 1, noembed => 1,
5308 noframes => 1, param => 1, select => 1, spacer => 1,
5309 table => 1, textarea => 1, wbr => 1,
5310 noscript => 0, ## TODO: if scripting is enabled
5311 }->{$token->{tag_name}}) {
5312 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5313 ## Ignore the token
5314 !!!next-token;
5315 redo B;
5316
5317 ## ISSUE: Issue on HTML5 new elements in spec
5318
5319 } else {
5320 ## Step 1
5321 my $node_i = -1;
5322 my $node = $self->{open_elements}->[$node_i];
5323
5324 ## Step 2
5325 S2: {
5326 if ($node->[1] eq $token->{tag_name}) {
5327 ## Step 1
5328 ## generate implied end tags
5329 if ({
5330 dd => 1, dt => 1, li => 1, p => 1,
5331 td => 1, th => 1, tr => 1,
5332 tbody => 1, tfoot => 1, thead => 1,
5333 }->{$self->{open_elements}->[-1]->[1]}) {
5334 !!!back-token;
5335 $token = {type => END_TAG_TOKEN,
5336 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5337 redo B;
5338 }
5339
5340 ## Step 2
5341 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5342 ## NOTE: <x><y></x>
5343 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5344 }
5345
5346 ## Step 3
5347 splice @{$self->{open_elements}}, $node_i;
5348
5349 !!!next-token;
5350 last S2;
5351 } else {
5352 ## Step 3
5353 if (not $formatting_category->{$node->[1]} and
5354 #not $phrasing_category->{$node->[1]} and
5355 ($special_category->{$node->[1]} or
5356 $scoping_category->{$node->[1]})) {
5357 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5358 ## Ignore the token
5359 !!!next-token;
5360 last S2;
5361 }
5362 }
5363
5364 ## Step 4
5365 $node_i--;
5366 $node = $self->{open_elements}->[$node_i];
5367
5368 ## Step 5;
5369 redo S2;
5370 } # S2
5371 redo B;
5372 }
5373 }
5374 redo B;
5375 } # B
5376
5377 ## NOTE: The "trailing end" phase in HTML5 is split into
5378 ## two insertion modes: "after html body" and "after html frameset".
5379 ## NOTE: States in the main stage are preserved while
5380 ## the parser stays in the trailing end phase. # MUST
5381
5382 ## Stop parsing # MUST
5383
5384 ## TODO: script stuffs
5385 } # _tree_construct_main
5386
## set_inner_html ($class, $node, $string, $onerror)
##
## Replaces the children of |$node| with the result of parsing
## |$string| as HTML.
##
##   $class   - Class name; used to call |parse_string| and |new|.
##   $node    - Target node.  Must be a document (node type 9) or an
##              element (node type 1); otherwise the method dies.
##   $string  - The HTML source text.
##   $onerror - Optional error handler code reference.  In the element
##              case, if it is false, parse errors are reported with
##              |warn|; in the document case it is passed through to
##              |parse_string| as is.
##
## Document case: all children are removed and |$string| is parsed
## directly into the document.
## Element case: |$string| is parsed by a new parser object into a
## temporary document whose root is an |html| element; then the
## element's children are removed and the children of that root are
## adopted by the element's owner document and appended to |$node|.
##
## NOTE: The "Step n" comments below are not in numerical order in the
## code.  NOTE(review): they presumably refer to the steps of the HTML5
## |innerHTML| algorithm -- confirm against the spec revision in use.
5387 sub set_inner_html ($$$) {
5388 my $class = shift;
5389 my $node = shift;
## After the two |shift|s, |$_[0]| is the string and |$_[1]| is the
## error handler.  A reference is taken so that the string is not copied.
5390 my $s = \$_[0];
5391 my $onerror = $_[1];
5392
5393 ## ISSUE: Should {confident} be true?
5394
5395 my $nt = $node->node_type;
5396 if ($nt == 9) {
5397 # MUST
5398
5399 ## Step 1 # MUST
5400 ## TODO: If the document has an active parser, ...
5401 ## ISSUE: There is an issue in the spec.
5402
5403 ## Step 2 # MUST
## Remove all children.  The loop runs over a copy of the child list
## (presumably because |child_nodes| reflects removals -- TODO confirm).
5404 my @cn = @{$node->child_nodes};
5405 for (@cn) {
5406 $node->remove_child ($_);
5407 }
5408
5409 ## Step 3, 4, 5 # MUST
5410 $class->parse_string ($$s => $node, $onerror);
5411 } elsif ($nt == 1) {
5412 ## TODO: If non-html element
5413
5414 ## NOTE: Most of this code is copied from |parse_string|
5415
5416 ## Step 1 # MUST
## Create a temporary HTML document and a new parser that builds
## its tree in that document.
5417 my $this_doc = $node->owner_document;
5418 my $doc = $this_doc->implementation->create_document;
5419 $doc->manakai_is_html (1);
5420 my $p = $class->new;
5421 $p->{document} = $doc;
5422
5423 ## Step 9 # MUST
## Input stream state: |$i| is the offset of the next character in
## |$$s|; |$line| and |$column| are attached to reported parse errors.
5424 my $i = 0;
5425 my $line = 1;
5426 my $column = 0;
## Character reader.  Each call moves the current character into the
## three-entry |prev_input_character| history and stores the code point
## of the next character in |next_input_character| (-1 at end of input).
5427 $p->{set_next_input_character} = sub {
5428 my $self = shift;
5429
5430 pop @{$self->{prev_input_character}};
5431 unshift @{$self->{prev_input_character}}, $self->{next_input_character};
5432
## End of input: since -1 is a true value, the |and return| part is
## always executed when the condition holds.
5433 $self->{next_input_character} = -1 and return if $i >= length $$s;
5434 $self->{next_input_character} = ord substr $$s, $i++, 1;
5435 $column++;
5436
5437 if ($self->{next_input_character} == 0x000A) { # LF
5438 $line++;
5439 $column = 0;
5440 } elsif ($self->{next_input_character} == 0x000D) { # CR
## CR and CR LF are both reported as a single LF; the LF of a CR LF
## pair is skipped here.
5441 $i++ if substr ($$s, $i, 1) eq "\x0A";
5442 $self->{next_input_character} = 0x000A; # LF # MUST
5443 $line++;
5444 $column = 0;
5445 } elsif ($self->{next_input_character} > 0x10FFFF) {
5446 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5447 } elsif ($self->{next_input_character} == 0x0000) { # NULL
## NOTE(review): |!!!parse-error| looks like a preprocessor macro whose
## expansion is not visible in this part of the file.
5448 !!!parse-error (type => 'NULL');
5449 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5450 }
5451 };
5452 $p->{prev_input_character} = [-1, -1, -1];
5453 $p->{next_input_character} = -1;
5454
## Error handler: the caller's handler if given, otherwise a default
## that |warn|s.  The wrapper appends the current line and column.
5455 my $ponerror = $onerror || sub {
5456 my (%opt) = @_;
5457 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
5458 };
5459 $p->{parse_error} = sub {
5460 $ponerror->(@_, line => $line, column => $column);
5461 };
5462
5463 $p->_initialize_tokenizer;
5464 $p->_initialize_tree_constructor;
5465
5466 ## Step 2
## Choose the initial content model from the local name of the
## context element; any name not listed gets PCDATA.
5467 my $node_ln = $node->local_name;
5468 $p->{content_model} = {
5469 title => RCDATA_CONTENT_MODEL,
5470 textarea => RCDATA_CONTENT_MODEL,
5471 style => CDATA_CONTENT_MODEL,
5472 script => CDATA_CONTENT_MODEL,
5473 xmp => CDATA_CONTENT_MODEL,
5474 iframe => CDATA_CONTENT_MODEL,
5475 noembed => CDATA_CONTENT_MODEL,
5476 noframes => CDATA_CONTENT_MODEL,
5477 noscript => CDATA_CONTENT_MODEL,
5478 plaintext => PLAINTEXT_CONTENT_MODEL,
5479 }->{$node_ln};
5480 $p->{content_model} = PCDATA_CONTENT_MODEL
5481 unless defined $p->{content_model};
5482 ## ISSUE: What is "the name of the element"? local name?
5483
## Record the context element and its local name on the parser.
5484 $p->{inner_html_node} = [$node, $node_ln];
5485
5486 ## Step 4
5487 my $root = $doc->create_element_ns
5488 ('http://www.w3.org/1999/xhtml', [undef, 'html']);
5489
5490 ## Step 5 # MUST
5491 $doc->append_child ($root);
5492
5493 ## Step 6 # MUST
5494 push @{$p->{open_elements}}, [$root, 'html'];
5495
5496 undef $p->{head_element};
5497
5498 ## Step 7 # MUST
5499 $p->_reset_insertion_mode;
5500
5501 ## Step 8 # MUST
## Starting at |$node| itself and walking up through |parent_node|,
## take the first element in the XHTML namespace whose local name is
## |form| as the parser's |form_element|.
5502 my $anode = $node;
5503 AN: while (defined $anode) {
5504 if ($anode->node_type == 1) {
5505 my $nsuri = $anode->namespace_uri;
5506 if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
5507 if ($anode->local_name eq 'form') { ## TODO: case?
5508 $p->{form_element} = $anode;
5509 last AN;
5510 }
5511 }
5512 }
5513 $anode = $anode->parent_node;
5514 } # AN
5515
5516 ## Step 3 # MUST
5517 ## Step 10 # MUST
## NOTE(review): |$self| is aliased to |$p| only for the
## |!!!next-token| line; the macro presumably expands to code that
## refers to |$self| -- confirm against the macro definition.
5518 {
5519 my $self = $p;
5520 !!!next-token;
5521 }
5522 $p->_tree_construction_main;
5523
5524 ## Step 11 # MUST
## Remove the existing children of the context element (iterating
## over a copy of the child list).
5525 my @cn = @{$node->child_nodes};
5526 for (@cn) {
5527 $node->remove_child ($_);
5528 }
5529 ## ISSUE: mutation events? read-only?
5530
5531 ## Step 12 # MUST
## Move the parsed children of the temporary root element into the
## context element, adopting each into its owner document first.
5532 @cn = @{$root->child_nodes};
5533 for (@cn) {
5534 $this_doc->adopt_node ($_);
5535 $node->append_child ($_);
5536 }
5537 ## ISSUE: mutation events?
5538
5539 $p->_terminate_tree_constructor;
5540 } else {
## Any node type other than document (9) or element (1) is rejected.
5541 die "$0: |set_inner_html| is not defined for node of type $nt";
5542 }
5543 } # set_inner_html
5544
5545 } # tree construction stage
5546
## Exception class of the parser.  It inherits from |Error|, which the
## main package loads with |use Error qw(:try)|.
## NOTE(review): judging by the name only, this is presumably thrown to
## make the caller restart parsing -- confirm against the code that
## throws and catches it.
5547 package Whatpm::HTML::RestartParser;
5548 push our @ISA, 'Error';
5549
## True value returned to |require| / |use| at the end of the module.
5550 1;
5551 # $Date: 2008/02/17 12:39:32 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24