/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.103 - (show annotations) (download) (as text)
Sun Mar 9 07:57:29 2008 UTC (17 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.102: +24 -23 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	9 Mar 2008 07:53:51 -0000
	* tree-test-1.dat: Test data on |applet| are added (HTML5 revision
	1347).

2008-03-09  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	9 Mar 2008 07:54:08 -0000
	* HTML.pm.src: |applet| support (HTML5 revision 1347).

2008-03-09  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.102 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12 ## TODO: 1252 parse error (revision 1264)
13 ## TODO: 8859-11 = 874 (revision 1271)
14
## Start tags on which a "/" directly before ">" is accepted by the
## tokenizer without a 'nestc' parse error.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
28
## Maps every code point in 0x80..0x9F to a replacement code point
## (U+FFFD where the table has no better assignment).
## NOTE(review): presumably applied to numeric character references in
## the C1 range; the code using this table is outside this section.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  my %table;
  @table{0x80 .. 0x9F} = @replacement;
  \%table;
}; # $c1_entity_char
63
## Element names belonging to the "special" category.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category.
my $scoping_category = {
  map { ($_ => 1) } qw(
    applet button caption html marquee object
    table td th
  )
};
## Element names belonging to the "formatting" category.
## NOTE: The key used to be misspelled |strile|, so |strike| elements
## were never recognized as formatting elements.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
84 # $phrasing_category: all other elements
85
## Parse a byte string as an HTML document.
##
##   $parser_or_class->parse_byte_string ($charset, $bytes, @rest)
##
## $charset: Encoding label used to decode $bytes.  If it is undef, the
##   encoding is guessed from the first 1024 bytes by
##   Whatpm::Charset::UniversalCharDet, falling back to windows-1252,
##   and the parser is marked as not confident.
## $bytes: The byte string, or a reference to it.
## @rest: Passed through unchanged to |parse_char_string| after the
##   decoded string.
##
## If Whatpm::HTML::RestartParser is thrown while parsing (the
## |change_encoding| callback installed here throws it), the input is
## decoded again with the reported charset and parsed once more.
## Returns the value returned by |parse_char_string|.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## Loaded up front: |Encode::decode| is called on every path below
  ## (explicit charset, detected charset and parser restart), not only
  ## when the caller supplies a charset.
  require Encode; ## TODO: decode(utf8) don't delete BOM

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 1;
  } else {
    ## TODO: Implement HTML5 detection algorithm
    require Whatpm::Charset::UniversalCharDet;
    $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
        (substr ($$bytes_s, 0, 1024));
    $charset ||= 'windows-1252';
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset;
    $self->{confident} = 0;
  }

  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    if (defined $self->{input_encoding} and
        $self->{input_encoding} eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
        ':'.$charset, level => 'w');

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    throw Whatpm::HTML::RestartParser (charset => $charset);
  }; # $self->{change_encoding}

  ## @_ now starts with the byte string; drop it so that only the
  ## trailing arguments are forwarded.
  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Decode again from the original bytes with the new charset.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: normalize
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
155
156 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
157 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
158 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
159 ## because the core part of our HTML parser expects a string of characters,
160 ## not a string of bytes or code units or anything which might contain a BOM.
161 ## Therefore, any parser interface that accepts a string of bytes,
162 ## such as |parse_byte_string| in this module, must ensure that it does
163 ## strip the BOM and never strip any ZWNBSP.
164
## |parse_char_string| is an alias for |parse_string|: both names refer
## to the same subroutine.
165 *parse_char_string = \&parse_string;
166
## Parse a character string into a document.
##
##   $parser_or_class->parse_string ($string, $document[, $onerror])
##
## $string: The input string, or a reference to it.
## $document: The document object to fill; its child node list is
##   emptied first.
## $onerror: Optional error callback; it is called with the error's
##   name/value pairs followed by |line| and |column|.  The default
##   callback emits a warning.
##
## Returns the document.
167 sub parse_string ($$$;$) {
168 my $self = ref $_[0] ? shift : shift->new;
169 my $s = ref $_[0] ? $_[0] : \($_[0]);
170 $self->{document} = $_[1];
171 @{$self->{document}->child_nodes} = ();
172
173 ## NOTE: |set_inner_html| copies most of this method's code
174
## A caller such as |parse_byte_string| may already have set
## |confident|; it defaults to 1 otherwise.
175 $self->{confident} = 1 unless exists $self->{confident};
176 $self->{document}->input_encoding ($self->{input_encoding})
177 if defined $self->{input_encoding};
178
## Reader state shared by the closures below: offset into $$s and the
## current line / column reported with parse errors.
179 my $i = 0;
180 my $line = 1;
181 my $column = 0;
## Input reader: stores the next character's code point in
## |next_char| (-1 at end of input) and keeps the three most recent
## previous values in |prev_char|, newest first.
182 $self->{set_next_char} = sub {
183 my $self = shift;
184
185 pop @{$self->{prev_char}};
186 unshift @{$self->{prev_char}}, $self->{next_char};
187
## The assigned value -1 is true, so |and return| always runs here.
188 $self->{next_char} = -1 and return if $i >= length $$s;
189 $self->{next_char} = ord substr $$s, $i++, 1;
190 $column++;
191
192 if ($self->{next_char} == 0x000A) { # LF
193 $line++;
194 $column = 0;
195 } elsif ($self->{next_char} == 0x000D) { # CR
## A CR LF pair is consumed as a whole; CR alone and CR LF are both
## reported as a single LF.
196 $i++ if substr ($$s, $i, 1) eq "\x0A";
197 $self->{next_char} = 0x000A; # LF # MUST
198 $line++;
199 $column = 0;
200 } elsif ($self->{next_char} > 0x10FFFF) {
201 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
202 } elsif ($self->{next_char} == 0x0000) { # NULL
203 !!!parse-error (type => 'NULL');
204 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
205 }
206 };
207 $self->{prev_char} = [-1, -1, -1];
208 $self->{next_char} = -1;
209
210 my $onerror = $_[2] || sub {
211 my (%opt) = @_;
212 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
213 };
## Wrap the callback so that every report carries the reader's current
## line and column.
214 $self->{parse_error} = sub {
215 $onerror->(@_, line => $line, column => $column);
216 };
217
218 $self->_initialize_tokenizer;
219 $self->_initialize_tree_constructor;
220 $self->_construct_tree;
221 $self->_terminate_tree_constructor;
222
223 return $self->{document};
224 } # parse_string
225
## Create a parser object with inert default callbacks:
##   set_next_char - reports end of input (next_char = -1);
##   parse_error, change_encoding, application_cache_selection - no-ops.
## Returns the new object, blessed into the invocant class.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  ## The reader receives the parser as its first argument (the reader
  ## installed by |parse_string| reads it the same way).  It must not
  ## close over $self: a closure stored in $self that refers to $self
  ## is a reference cycle, and the parser would never be freed.
  $self->{set_next_char} = sub {
    $_[0]->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
246
## Content-model flag bits.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, each a combination of the flag bits above.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
255
## Tokenizer states (values of $self->{state}).
use constant {
  DATA_STATE                                    => 0,
  ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
};
290
## Token types (values of a token's |type| member).
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
297
## Insertion-mode group bits; the concrete insertion modes below are
## built from them.
use constant {
  AFTER_HTML_IMS => 0b100,
  HEAD_IMS       => 0b1000,
  BODY_IMS       => 0b10000,
  BODY_TABLE_IMS => 0b100000,
  TABLE_IMS      => 0b1000000,
  ROW_IMS        => 0b10000000,
  BODY_AFTER_IMS => 0b100000000,
  FRAME_IMS      => 0b1000000000,
  SELECT_IMS     => 0b10000000000,
};

## NOTE: "initial" and "before html" insertion modes have no constants.

use constant {
  ## NOTE: "after after body" insertion mode.
  AFTER_HTML_BODY_IM     => AFTER_HTML_IMS | BODY_AFTER_IMS,

  ## NOTE: "after after frameset" insertion mode.
  AFTER_HTML_FRAMESET_IM => AFTER_HTML_IMS | FRAME_IMS,

  IN_HEAD_IM             => HEAD_IMS | 0b00,
  IN_HEAD_NOSCRIPT_IM    => HEAD_IMS | 0b01,
  AFTER_HEAD_IM          => HEAD_IMS | 0b10,
  BEFORE_HEAD_IM         => HEAD_IMS | 0b11,
  IN_BODY_IM             => BODY_IMS,
  IN_CELL_IM             => BODY_IMS | BODY_TABLE_IMS | 0b01,
  IN_CAPTION_IM          => BODY_IMS | BODY_TABLE_IMS | 0b10,
  IN_ROW_IM              => TABLE_IMS | ROW_IMS | 0b01,
  IN_TABLE_BODY_IM       => TABLE_IMS | ROW_IMS | 0b10,
  IN_TABLE_IM            => TABLE_IMS,
  AFTER_BODY_IM          => BODY_AFTER_IMS,
  IN_FRAMESET_IM         => FRAME_IMS | 0b01,
  AFTER_FRAMESET_IM      => FRAME_IMS | 0b10,
  IN_SELECT_IM           => SELECT_IMS | 0b01,
  IN_SELECT_IN_TABLE_IM  => SELECT_IMS | 0b10,
  IN_COLUMN_GROUP_IM     => 0b10,
};
332
333 ## Implementations MUST act as if state machine in the spec
334
## Put the tokenizer into its start-of-input condition: data state,
## PCDATA content model, no token or attribute under construction,
## empty |char| and |token| lists, and the first input character
## requested.
sub _initialize_tokenizer ($) {
  my $self = shift;
  @$self{qw(state content_model)}
      = (DATA_STATE, PCDATA_CONTENT_MODEL); # MUST be
  ## |current_token| is a start tag, end tag, comment, or DOCTYPE.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);
  $self->{char} = [];
  # $self->{next_char}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
349
350 ## A token has:
351 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
352 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
353 ## ->{name} (DOCTYPE_TOKEN)
354 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
355 ## ->{public_identifier} (DOCTYPE_TOKEN)
356 ## ->{system_identifier} (DOCTYPE_TOKEN)
357 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
358 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
359 ## ->{name}
360 ## ->{value}
361 ## ->{has_reference} == 1 or 0
362 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
363
364 ## Emitted token MUST immediately be handled by the tree construction state.
365
366 ## Before each step, UA MAY check to see if either one of the scripts in
367 ## "list of scripts that will execute as soon as possible" or the first
368 ## script in the "list of scripts that will execute asynchronously",
369 ## has completed loading. If one has, then it MUST be executed
370 ## and removed from the list.
371
372 ## NOTE: HTML5 "Writing HTML documents" section, applied to
373 ## documents and not to user agents and conformance checkers,
374 ## contains some requirements that are not detected by the
375 ## parsing algorithm:
376 ## - Some requirements on character encoding declarations. ## TODO
377 ## - "Elements MUST NOT contain content that their content model disallows."
378 ## ... Some are parse error, some are not (will be reported by c.c.).
379 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
380 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
381 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
382
383 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
384 ## be detected by the HTML5 parsing algorithm:
385 ## - Text,
386
387 sub _get_next_token ($) {
388 my $self = shift;
389 if (@{$self->{token}}) {
390 return shift @{$self->{token}};
391 }
392
393 A: {
394 if ($self->{state} == DATA_STATE) {
395 if ($self->{next_char} == 0x0026) { # &
396 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
397 not $self->{escape}) {
398 !!!cp (1);
399 $self->{state} = ENTITY_DATA_STATE;
400 !!!next-input-character;
401 redo A;
402 } else {
403 !!!cp (2);
404 #
405 }
406 } elsif ($self->{next_char} == 0x002D) { # -
407 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
408 unless ($self->{escape}) {
409 if ($self->{prev_char}->[0] == 0x002D and # -
410 $self->{prev_char}->[1] == 0x0021 and # !
411 $self->{prev_char}->[2] == 0x003C) { # <
412 !!!cp (3);
413 $self->{escape} = 1;
414 } else {
415 !!!cp (4);
416 }
417 } else {
418 !!!cp (5);
419 }
420 }
421
422 #
423 } elsif ($self->{next_char} == 0x003C) { # <
424 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
425 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
426 not $self->{escape})) {
427 !!!cp (6);
428 $self->{state} = TAG_OPEN_STATE;
429 !!!next-input-character;
430 redo A;
431 } else {
432 !!!cp (7);
433 #
434 }
435 } elsif ($self->{next_char} == 0x003E) { # >
436 if ($self->{escape} and
437 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
438 if ($self->{prev_char}->[0] == 0x002D and # -
439 $self->{prev_char}->[1] == 0x002D) { # -
440 !!!cp (8);
441 delete $self->{escape};
442 } else {
443 !!!cp (9);
444 }
445 } else {
446 !!!cp (10);
447 }
448
449 #
450 } elsif ($self->{next_char} == -1) {
451 !!!cp (11);
452 !!!emit ({type => END_OF_FILE_TOKEN});
453 last A; ## TODO: ok?
454 } else {
455 !!!cp (12);
456 }
457 # Anything else
458 my $token = {type => CHARACTER_TOKEN,
459 data => chr $self->{next_char}};
460 ## Stay in the data state
461 !!!next-input-character;
462
463 !!!emit ($token);
464
465 redo A;
466 } elsif ($self->{state} == ENTITY_DATA_STATE) {
467 ## (cannot happen in CDATA state)
468
469 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
470
471 $self->{state} = DATA_STATE;
472 # next-input-character is already done
473
474 unless (defined $token) {
475 !!!cp (13);
476 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
477 } else {
478 !!!cp (14);
479 !!!emit ($token);
480 }
481
482 redo A;
483 } elsif ($self->{state} == TAG_OPEN_STATE) {
484 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
485 if ($self->{next_char} == 0x002F) { # /
486 !!!cp (15);
487 !!!next-input-character;
488 $self->{state} = CLOSE_TAG_OPEN_STATE;
489 redo A;
490 } else {
491 !!!cp (16);
492 ## reconsume
493 $self->{state} = DATA_STATE;
494
495 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
496
497 redo A;
498 }
499 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
500 if ($self->{next_char} == 0x0021) { # !
501 !!!cp (17);
502 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
503 !!!next-input-character;
504 redo A;
505 } elsif ($self->{next_char} == 0x002F) { # /
506 !!!cp (18);
507 $self->{state} = CLOSE_TAG_OPEN_STATE;
508 !!!next-input-character;
509 redo A;
510 } elsif (0x0041 <= $self->{next_char} and
511 $self->{next_char} <= 0x005A) { # A..Z
512 !!!cp (19);
513 $self->{current_token}
514 = {type => START_TAG_TOKEN,
515 tag_name => chr ($self->{next_char} + 0x0020)};
516 $self->{state} = TAG_NAME_STATE;
517 !!!next-input-character;
518 redo A;
519 } elsif (0x0061 <= $self->{next_char} and
520 $self->{next_char} <= 0x007A) { # a..z
521 !!!cp (20);
522 $self->{current_token} = {type => START_TAG_TOKEN,
523 tag_name => chr ($self->{next_char})};
524 $self->{state} = TAG_NAME_STATE;
525 !!!next-input-character;
526 redo A;
527 } elsif ($self->{next_char} == 0x003E) { # >
528 !!!cp (21);
529 !!!parse-error (type => 'empty start tag');
530 $self->{state} = DATA_STATE;
531 !!!next-input-character;
532
533 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
534
535 redo A;
536 } elsif ($self->{next_char} == 0x003F) { # ?
537 !!!cp (22);
538 !!!parse-error (type => 'pio');
539 $self->{state} = BOGUS_COMMENT_STATE;
540 ## $self->{next_char} is intentionally left as is
541 redo A;
542 } else {
543 !!!cp (23);
544 !!!parse-error (type => 'bare stago');
545 $self->{state} = DATA_STATE;
546 ## reconsume
547
548 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
549
550 redo A;
551 }
552 } else {
553 die "$0: $self->{content_model} in tag open";
554 }
555 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
556 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
557 if (defined $self->{last_emitted_start_tag_name}) {
558 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
559 my @next_char;
560 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
561 push @next_char, $self->{next_char};
562 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
563 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
564 if ($self->{next_char} == $c or $self->{next_char} == $C) {
565 !!!cp (24);
566 !!!next-input-character;
567 next TAGNAME;
568 } else {
569 !!!cp (25);
570 $self->{next_char} = shift @next_char; # reconsume
571 !!!back-next-input-character (@next_char);
572 $self->{state} = DATA_STATE;
573
574 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
575
576 redo A;
577 }
578 }
579 push @next_char, $self->{next_char};
580
581 unless ($self->{next_char} == 0x0009 or # HT
582 $self->{next_char} == 0x000A or # LF
583 $self->{next_char} == 0x000B or # VT
584 $self->{next_char} == 0x000C or # FF
585 $self->{next_char} == 0x0020 or # SP
586 $self->{next_char} == 0x003E or # >
587 $self->{next_char} == 0x002F or # /
588 $self->{next_char} == -1) {
589 !!!cp (26);
590 $self->{next_char} = shift @next_char; # reconsume
591 !!!back-next-input-character (@next_char);
592 $self->{state} = DATA_STATE;
593 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
594 redo A;
595 } else {
596 !!!cp (27);
597 $self->{next_char} = shift @next_char;
598 !!!back-next-input-character (@next_char);
599 # and consume...
600 }
601 } else {
602 ## No start tag token has ever been emitted
603 !!!cp (28);
604 # next-input-character is already done
605 $self->{state} = DATA_STATE;
606 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
607 redo A;
608 }
609 }
610
611 if (0x0041 <= $self->{next_char} and
612 $self->{next_char} <= 0x005A) { # A..Z
613 !!!cp (29);
614 $self->{current_token} = {type => END_TAG_TOKEN,
615 tag_name => chr ($self->{next_char} + 0x0020)};
616 $self->{state} = TAG_NAME_STATE;
617 !!!next-input-character;
618 redo A;
619 } elsif (0x0061 <= $self->{next_char} and
620 $self->{next_char} <= 0x007A) { # a..z
621 !!!cp (30);
622 $self->{current_token} = {type => END_TAG_TOKEN,
623 tag_name => chr ($self->{next_char})};
624 $self->{state} = TAG_NAME_STATE;
625 !!!next-input-character;
626 redo A;
627 } elsif ($self->{next_char} == 0x003E) { # >
628 !!!cp (31);
629 !!!parse-error (type => 'empty end tag');
630 $self->{state} = DATA_STATE;
631 !!!next-input-character;
632 redo A;
633 } elsif ($self->{next_char} == -1) {
634 !!!cp (32);
635 !!!parse-error (type => 'bare etago');
636 $self->{state} = DATA_STATE;
637 # reconsume
638
639 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
640
641 redo A;
642 } else {
643 !!!cp (33);
644 !!!parse-error (type => 'bogus end tag');
645 $self->{state} = BOGUS_COMMENT_STATE;
646 ## $self->{next_char} is intentionally left as is
647 redo A;
648 }
649 } elsif ($self->{state} == TAG_NAME_STATE) {
650 if ($self->{next_char} == 0x0009 or # HT
651 $self->{next_char} == 0x000A or # LF
652 $self->{next_char} == 0x000B or # VT
653 $self->{next_char} == 0x000C or # FF
654 $self->{next_char} == 0x0020) { # SP
655 !!!cp (34);
656 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
657 !!!next-input-character;
658 redo A;
659 } elsif ($self->{next_char} == 0x003E) { # >
660 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
661 !!!cp (35);
662 $self->{current_token}->{first_start_tag}
663 = not defined $self->{last_emitted_start_tag_name};
664 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
665 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
666 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
667 #if ($self->{current_token}->{attributes}) {
668 # ## NOTE: This should never be reached.
669 # !!! cp (36);
670 # !!! parse-error (type => 'end tag attribute');
671 #} else {
672 !!!cp (37);
673 #}
674 } else {
675 die "$0: $self->{current_token}->{type}: Unknown token type";
676 }
677 $self->{state} = DATA_STATE;
678 !!!next-input-character;
679
680 !!!emit ($self->{current_token}); # start tag or end tag
681
682 redo A;
683 } elsif (0x0041 <= $self->{next_char} and
684 $self->{next_char} <= 0x005A) { # A..Z
685 !!!cp (38);
686 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
687 # start tag or end tag
688 ## Stay in this state
689 !!!next-input-character;
690 redo A;
691 } elsif ($self->{next_char} == -1) {
692 !!!parse-error (type => 'unclosed tag');
693 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
694 !!!cp (39);
695 $self->{current_token}->{first_start_tag}
696 = not defined $self->{last_emitted_start_tag_name};
697 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
698 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
699 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
700 #if ($self->{current_token}->{attributes}) {
701 # ## NOTE: This state should never be reached.
702 # !!! cp (40);
703 # !!! parse-error (type => 'end tag attribute');
704 #} else {
705 !!!cp (41);
706 #}
707 } else {
708 die "$0: $self->{current_token}->{type}: Unknown token type";
709 }
710 $self->{state} = DATA_STATE;
711 # reconsume
712
713 !!!emit ($self->{current_token}); # start tag or end tag
714
715 redo A;
716 } elsif ($self->{next_char} == 0x002F) { # /
717 !!!next-input-character;
718 if ($self->{next_char} == 0x003E and # >
719 $self->{current_token}->{type} == START_TAG_TOKEN and
720 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
721 # permitted slash
722 !!!cp (42);
723 #
724 } else {
725 !!!cp (43);
726 !!!parse-error (type => 'nestc');
727 }
728 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
729 # next-input-character is already done
730 redo A;
731 } else {
732 !!!cp (44);
733 $self->{current_token}->{tag_name} .= chr $self->{next_char};
734 # start tag or end tag
735 ## Stay in the state
736 !!!next-input-character;
737 redo A;
738 }
739 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
740 if ($self->{next_char} == 0x0009 or # HT
741 $self->{next_char} == 0x000A or # LF
742 $self->{next_char} == 0x000B or # VT
743 $self->{next_char} == 0x000C or # FF
744 $self->{next_char} == 0x0020) { # SP
745 !!!cp (45);
746 ## Stay in the state
747 !!!next-input-character;
748 redo A;
749 } elsif ($self->{next_char} == 0x003E) { # >
750 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
751 !!!cp (46);
752 $self->{current_token}->{first_start_tag}
753 = not defined $self->{last_emitted_start_tag_name};
754 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
755 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
756 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
757 if ($self->{current_token}->{attributes}) {
758 !!!cp (47);
759 !!!parse-error (type => 'end tag attribute');
760 } else {
761 !!!cp (48);
762 }
763 } else {
764 die "$0: $self->{current_token}->{type}: Unknown token type";
765 }
766 $self->{state} = DATA_STATE;
767 !!!next-input-character;
768
769 !!!emit ($self->{current_token}); # start tag or end tag
770
771 redo A;
772 } elsif (0x0041 <= $self->{next_char} and
773 $self->{next_char} <= 0x005A) { # A..Z
774 !!!cp (49);
775 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
776 value => ''};
777 $self->{state} = ATTRIBUTE_NAME_STATE;
778 !!!next-input-character;
779 redo A;
780 } elsif ($self->{next_char} == 0x002F) { # /
781 !!!next-input-character;
782 if ($self->{next_char} == 0x003E and # >
783 $self->{current_token}->{type} == START_TAG_TOKEN and
784 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
785 # permitted slash
786 !!!cp (50);
787 #
788 } else {
789 !!!cp (51);
790 !!!parse-error (type => 'nestc');
791 }
792 ## Stay in the state
793 # next-input-character is already done
794 redo A;
795 } elsif ($self->{next_char} == -1) {
796 !!!parse-error (type => 'unclosed tag');
797 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
798 !!!cp (52);
799 $self->{current_token}->{first_start_tag}
800 = not defined $self->{last_emitted_start_tag_name};
801 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
802 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
803 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
804 if ($self->{current_token}->{attributes}) {
805 !!!cp (53);
806 !!!parse-error (type => 'end tag attribute');
807 } else {
808 !!!cp (54);
809 }
810 } else {
811 die "$0: $self->{current_token}->{type}: Unknown token type";
812 }
813 $self->{state} = DATA_STATE;
814 # reconsume
815
816 !!!emit ($self->{current_token}); # start tag or end tag
817
818 redo A;
819 } else {
820 if ({
821 0x0022 => 1, # "
822 0x0027 => 1, # '
823 0x003D => 1, # =
824 }->{$self->{next_char}}) {
825 !!!cp (55);
826 !!!parse-error (type => 'bad attribute name');
827 } else {
828 !!!cp (56);
829 }
830 $self->{current_attribute} = {name => chr ($self->{next_char}),
831 value => ''};
832 $self->{state} = ATTRIBUTE_NAME_STATE;
833 !!!next-input-character;
834 redo A;
835 }
836 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
837 my $before_leave = sub {
838 if (exists $self->{current_token}->{attributes} # start tag or end tag
839 ->{$self->{current_attribute}->{name}}) { # MUST
840 !!!cp (57);
841 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
842 ## Discard $self->{current_attribute} # MUST
843 } else {
844 !!!cp (58);
845 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
846 = $self->{current_attribute};
847 }
848 }; # $before_leave
849
850 if ($self->{next_char} == 0x0009 or # HT
851 $self->{next_char} == 0x000A or # LF
852 $self->{next_char} == 0x000B or # VT
853 $self->{next_char} == 0x000C or # FF
854 $self->{next_char} == 0x0020) { # SP
855 !!!cp (59);
856 $before_leave->();
857 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
858 !!!next-input-character;
859 redo A;
860 } elsif ($self->{next_char} == 0x003D) { # =
861 !!!cp (60);
862 $before_leave->();
863 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
864 !!!next-input-character;
865 redo A;
866 } elsif ($self->{next_char} == 0x003E) { # >
867 $before_leave->();
868 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
869 !!!cp (61);
870 $self->{current_token}->{first_start_tag}
871 = not defined $self->{last_emitted_start_tag_name};
872 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
873 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
874 !!!cp (62);
875 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
876 if ($self->{current_token}->{attributes}) {
877 !!!parse-error (type => 'end tag attribute');
878 }
879 } else {
880 die "$0: $self->{current_token}->{type}: Unknown token type";
881 }
882 $self->{state} = DATA_STATE;
883 !!!next-input-character;
884
885 !!!emit ($self->{current_token}); # start tag or end tag
886
887 redo A;
888 } elsif (0x0041 <= $self->{next_char} and
889 $self->{next_char} <= 0x005A) { # A..Z
890 !!!cp (63);
891 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
892 ## Stay in the state
893 !!!next-input-character;
894 redo A;
895 } elsif ($self->{next_char} == 0x002F) { # /
896 $before_leave->();
897 !!!next-input-character;
898 if ($self->{next_char} == 0x003E and # >
899 $self->{current_token}->{type} == START_TAG_TOKEN and
900 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
901 # permitted slash
902 !!!cp (64);
903 #
904 } else {
905 !!!cp (65);
906 !!!parse-error (type => 'nestc');
907 }
908 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
909 # next-input-character is already done
910 redo A;
911 } elsif ($self->{next_char} == -1) {
912 !!!parse-error (type => 'unclosed tag');
913 $before_leave->();
914 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
915 !!!cp (66);
916 $self->{current_token}->{first_start_tag}
917 = not defined $self->{last_emitted_start_tag_name};
918 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
919 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
920 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
921 if ($self->{current_token}->{attributes}) {
922 !!!cp (67);
923 !!!parse-error (type => 'end tag attribute');
924 } else {
925 ## NOTE: This state should never be reached.
926 !!!cp (68);
927 }
928 } else {
929 die "$0: $self->{current_token}->{type}: Unknown token type";
930 }
931 $self->{state} = DATA_STATE;
932 # reconsume
933
934 !!!emit ($self->{current_token}); # start tag or end tag
935
936 redo A;
937 } else {
938 if ($self->{next_char} == 0x0022 or # "
939 $self->{next_char} == 0x0027) { # '
940 !!!cp (69);
941 !!!parse-error (type => 'bad attribute name');
942 } else {
943 !!!cp (70);
944 }
945 $self->{current_attribute}->{name} .= chr ($self->{next_char});
946 ## Stay in the state
947 !!!next-input-character;
948 redo A;
949 }
950 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
951 if ($self->{next_char} == 0x0009 or # HT
952 $self->{next_char} == 0x000A or # LF
953 $self->{next_char} == 0x000B or # VT
954 $self->{next_char} == 0x000C or # FF
955 $self->{next_char} == 0x0020) { # SP
956 !!!cp (71);
957 ## Stay in the state
958 !!!next-input-character;
959 redo A;
960 } elsif ($self->{next_char} == 0x003D) { # =
961 !!!cp (72);
962 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
963 !!!next-input-character;
964 redo A;
965 } elsif ($self->{next_char} == 0x003E) { # >
966 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
967 !!!cp (73);
968 $self->{current_token}->{first_start_tag}
969 = not defined $self->{last_emitted_start_tag_name};
970 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
971 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
972 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
973 if ($self->{current_token}->{attributes}) {
974 !!!cp (74);
975 !!!parse-error (type => 'end tag attribute');
976 } else {
977 ## NOTE: This state should never be reached.
978 !!!cp (75);
979 }
980 } else {
981 die "$0: $self->{current_token}->{type}: Unknown token type";
982 }
983 $self->{state} = DATA_STATE;
984 !!!next-input-character;
985
986 !!!emit ($self->{current_token}); # start tag or end tag
987
988 redo A;
989 } elsif (0x0041 <= $self->{next_char} and
990 $self->{next_char} <= 0x005A) { # A..Z
991 !!!cp (76);
992 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
993 value => ''};
994 $self->{state} = ATTRIBUTE_NAME_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{next_char} == 0x002F) { # /
998 !!!next-input-character;
999 if ($self->{next_char} == 0x003E and # >
1000 $self->{current_token}->{type} == START_TAG_TOKEN and
1001 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1002 # permitted slash
1003 !!!cp (77);
1004 #
1005 } else {
1006 !!!cp (78);
1007 !!!parse-error (type => 'nestc');
1008 ## TODO: Different error type for <aa / bb> than <aa/>
1009 }
1010 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1011 # next-input-character is already done
1012 redo A;
1013 } elsif ($self->{next_char} == -1) {
1014 !!!parse-error (type => 'unclosed tag');
1015 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1016 !!!cp (79);
1017 $self->{current_token}->{first_start_tag}
1018 = not defined $self->{last_emitted_start_tag_name};
1019 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1020 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1021 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1022 if ($self->{current_token}->{attributes}) {
1023 !!!cp (80);
1024 !!!parse-error (type => 'end tag attribute');
1025 } else {
1026 ## NOTE: This state should never be reached.
1027 !!!cp (81);
1028 }
1029 } else {
1030 die "$0: $self->{current_token}->{type}: Unknown token type";
1031 }
1032 $self->{state} = DATA_STATE;
1033 # reconsume
1034
1035 !!!emit ($self->{current_token}); # start tag or end tag
1036
1037 redo A;
1038 } else {
1039 !!!cp (82);
1040 $self->{current_attribute} = {name => chr ($self->{next_char}),
1041 value => ''};
1042 $self->{state} = ATTRIBUTE_NAME_STATE;
1043 !!!next-input-character;
1044 redo A;
1045 }
1046 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1047 if ($self->{next_char} == 0x0009 or # HT
1048 $self->{next_char} == 0x000A or # LF
1049 $self->{next_char} == 0x000B or # VT
1050 $self->{next_char} == 0x000C or # FF
1051 $self->{next_char} == 0x0020) { # SP
1052 !!!cp (83);
1053 ## Stay in the state
1054 !!!next-input-character;
1055 redo A;
1056 } elsif ($self->{next_char} == 0x0022) { # "
1057 !!!cp (84);
1058 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1059 !!!next-input-character;
1060 redo A;
1061 } elsif ($self->{next_char} == 0x0026) { # &
1062 !!!cp (85);
1063 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1064 ## reconsume
1065 redo A;
1066 } elsif ($self->{next_char} == 0x0027) { # '
1067 !!!cp (86);
1068 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1069 !!!next-input-character;
1070 redo A;
1071 } elsif ($self->{next_char} == 0x003E) { # >
1072 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1073 !!!cp (87);
1074 $self->{current_token}->{first_start_tag}
1075 = not defined $self->{last_emitted_start_tag_name};
1076 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1077 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1078 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1079 if ($self->{current_token}->{attributes}) {
1080 !!!cp (88);
1081 !!!parse-error (type => 'end tag attribute');
1082 } else {
1083 ## NOTE: This state should never be reached.
1084 !!!cp (89);
1085 }
1086 } else {
1087 die "$0: $self->{current_token}->{type}: Unknown token type";
1088 }
1089 $self->{state} = DATA_STATE;
1090 !!!next-input-character;
1091
1092 !!!emit ($self->{current_token}); # start tag or end tag
1093
1094 redo A;
1095 } elsif ($self->{next_char} == -1) {
1096 !!!parse-error (type => 'unclosed tag');
1097 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1098 !!!cp (90);
1099 $self->{current_token}->{first_start_tag}
1100 = not defined $self->{last_emitted_start_tag_name};
1101 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1102 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1103 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1104 if ($self->{current_token}->{attributes}) {
1105 !!!cp (91);
1106 !!!parse-error (type => 'end tag attribute');
1107 } else {
1108 ## NOTE: This state should never be reached.
1109 !!!cp (92);
1110 }
1111 } else {
1112 die "$0: $self->{current_token}->{type}: Unknown token type";
1113 }
1114 $self->{state} = DATA_STATE;
1115 ## reconsume
1116
1117 !!!emit ($self->{current_token}); # start tag or end tag
1118
1119 redo A;
1120 } else {
1121 if ($self->{next_char} == 0x003D) { # =
1122 !!!cp (93);
1123 !!!parse-error (type => 'bad attribute value');
1124 } else {
1125 !!!cp (94);
1126 }
1127 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1128 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1129 !!!next-input-character;
1130 redo A;
1131 }
1132 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1133 if ($self->{next_char} == 0x0022) { # "
1134 !!!cp (95);
1135 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1136 !!!next-input-character;
1137 redo A;
1138 } elsif ($self->{next_char} == 0x0026) { # &
1139 !!!cp (96);
1140 $self->{last_attribute_value_state} = $self->{state};
1141 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1142 !!!next-input-character;
1143 redo A;
1144 } elsif ($self->{next_char} == -1) {
1145 !!!parse-error (type => 'unclosed attribute value');
1146 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1147 !!!cp (97);
1148 $self->{current_token}->{first_start_tag}
1149 = not defined $self->{last_emitted_start_tag_name};
1150 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1151 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1152 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1153 if ($self->{current_token}->{attributes}) {
1154 !!!cp (98);
1155 !!!parse-error (type => 'end tag attribute');
1156 } else {
1157 ## NOTE: This state should never be reached.
1158 !!!cp (99);
1159 }
1160 } else {
1161 die "$0: $self->{current_token}->{type}: Unknown token type";
1162 }
1163 $self->{state} = DATA_STATE;
1164 ## reconsume
1165
1166 !!!emit ($self->{current_token}); # start tag or end tag
1167
1168 redo A;
1169 } else {
1170 !!!cp (100);
1171 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1172 ## Stay in the state
1173 !!!next-input-character;
1174 redo A;
1175 }
1176 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1177 if ($self->{next_char} == 0x0027) { # '
1178 !!!cp (101);
1179 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1180 !!!next-input-character;
1181 redo A;
1182 } elsif ($self->{next_char} == 0x0026) { # &
1183 !!!cp (102);
1184 $self->{last_attribute_value_state} = $self->{state};
1185 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1186 !!!next-input-character;
1187 redo A;
1188 } elsif ($self->{next_char} == -1) {
1189 !!!parse-error (type => 'unclosed attribute value');
1190 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1191 !!!cp (103);
1192 $self->{current_token}->{first_start_tag}
1193 = not defined $self->{last_emitted_start_tag_name};
1194 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1195 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1196 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1197 if ($self->{current_token}->{attributes}) {
1198 !!!cp (104);
1199 !!!parse-error (type => 'end tag attribute');
1200 } else {
1201 ## NOTE: This state should never be reached.
1202 !!!cp (105);
1203 }
1204 } else {
1205 die "$0: $self->{current_token}->{type}: Unknown token type";
1206 }
1207 $self->{state} = DATA_STATE;
1208 ## reconsume
1209
1210 !!!emit ($self->{current_token}); # start tag or end tag
1211
1212 redo A;
1213 } else {
1214 !!!cp (106);
1215 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1216 ## Stay in the state
1217 !!!next-input-character;
1218 redo A;
1219 }
1220 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
## Unquoted attribute value: each pass looks at |next_char| and either
## ends the value, starts an entity, closes the tag, or appends the
## character to the current attribute's value.
1221 if ($self->{next_char} == 0x0009 or # HT
1222 $self->{next_char} == 0x000A or # LF
1223 $self->{next_char} == 0x000B or # VT
1224 $self->{next_char} == 0x000C or # FF
1225 $self->{next_char} == 0x0020) { # SP
## White space ends the value; go back to looking for an attribute name.
1226 !!!cp (107);
1227 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1228 !!!next-input-character;
1229 redo A;
1230 } elsif ($self->{next_char} == 0x0026) { # &
## Remember this state so that the entity state can switch back to it.
1231 !!!cp (108);
1232 $self->{last_attribute_value_state} = $self->{state};
1233 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1234 !!!next-input-character;
1235 redo A;
1236 } elsif ($self->{next_char} == 0x003E) { # >
## End of the tag: finish bookkeeping for the token type, then emit it.
1237 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1238 !!!cp (109);
## |first_start_tag| is true iff no start tag name has been recorded yet.
1239 $self->{current_token}->{first_start_tag}
1240 = not defined $self->{last_emitted_start_tag_name};
1241 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1242 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1243 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1244 if ($self->{current_token}->{attributes}) {
1245 !!!cp (110);
1246 !!!parse-error (type => 'end tag attribute');
1247 } else {
1248 ## NOTE: This state should never be reached.
1249 !!!cp (111);
1250 }
1251 } else {
1252 die "$0: $self->{current_token}->{type}: Unknown token type";
1253 }
1254 $self->{state} = DATA_STATE;
1255 !!!next-input-character;
1256
1257 !!!emit ($self->{current_token}); # start tag or end tag
1258
1259 redo A;
1260 } elsif ($self->{next_char} == -1) {
## |next_char| of -1 (presumably end of input -- confirm against the
## input reader): same bookkeeping as for ">", but the -1 is left
## unconsumed so that the data state sees it again.
1261 !!!parse-error (type => 'unclosed tag');
1262 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1263 !!!cp (112);
1264 $self->{current_token}->{first_start_tag}
1265 = not defined $self->{last_emitted_start_tag_name};
1266 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1267 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1268 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1269 if ($self->{current_token}->{attributes}) {
1270 !!!cp (113);
1271 !!!parse-error (type => 'end tag attribute');
1272 } else {
1273 ## NOTE: This state should never be reached.
1274 !!!cp (114);
1275 }
1276 } else {
1277 die "$0: $self->{current_token}->{type}: Unknown token type";
1278 }
1279 $self->{state} = DATA_STATE;
1280 ## reconsume
1281
1282 !!!emit ($self->{current_token}); # start tag or end tag
1283
1284 redo A;
1285 } else {
## Any other character is appended to the value; quotes and "=" are
## still appended but are reported as a parse error first.
1286 if ({
1287 0x0022 => 1, # "
1288 0x0027 => 1, # '
1289 0x003D => 1, # =
1290 }->{$self->{next_char}}) {
1291 !!!cp (115);
1292 !!!parse-error (type => 'bad attribute value');
1293 } else {
1294 !!!cp (116);
1295 }
1296 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1297 ## Stay in the state
1298 !!!next-input-character;
1299 redo A;
1300 }
1301 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
## Entered after "&" inside an attribute value.  Tries to consume an
## entity, appends the result (or a literal "&") to the current
## attribute's value, and returns to the attribute value state that was
## saved in |last_attribute_value_state|.
##
## The second argument is the quote character of the saved state (0x0022
## or 0x0027), or -1 when the value is unquoted.
## NOTE(review): the meaning of both arguments is defined by
## |_tokenize_attempt_to_consume_an_entity|, which is not visible here;
## presumably 1 means "in attribute" and the second is an additional
## character that stops entity parsing -- confirm.
1302 my $token = $self->_tokenize_attempt_to_consume_an_entity
1303 (1,
1304 $self->{last_attribute_value_state}
1305 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1306 $self->{last_attribute_value_state}
1307 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1308 -1);
1309
1310 unless (defined $token) {
## No entity was consumed; keep the "&" as literal text.
1311 !!!cp (117);
1312 $self->{current_attribute}->{value} .= '&';
1313 } else {
## NOTE(review): checkpoint id 118 is also used in
## AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; confirm whether ids must be unique.
1314 !!!cp (118);
1315 $self->{current_attribute}->{value} .= $token->{data};
## NOTE(review): plain assignment, so the flag reflects only the most
## recent entity in this value -- confirm that this is intended.
1316 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1317 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1318 }
1319
1320 $self->{state} = $self->{last_attribute_value_state};
1321 # next-input-character is already done
1322 redo A;
1323 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
## Entered right after the closing quote of a quoted attribute value.
## Accepts white space, ">" or "/"; anything else is reported as
## 'no space between attributes' and reprocessed as the start of a new
## attribute.
1324 if ($self->{next_char} == 0x0009 or # HT
1325 $self->{next_char} == 0x000A or # LF
1326 $self->{next_char} == 0x000B or # VT
1327 $self->{next_char} == 0x000C or # FF
1328 $self->{next_char} == 0x0020) { # SP
## NOTE(review): checkpoint id 118 is also used in
## ENTITY_IN_ATTRIBUTE_VALUE_STATE; confirm whether ids must be unique.
1329 !!!cp (118);
1330 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1331 !!!next-input-character;
1332 redo A;
1333 } elsif ($self->{next_char} == 0x003E) { # >
## End of the tag: finish bookkeeping for the token type, then emit it.
1334 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1335 !!!cp (119);
## |first_start_tag| is true iff no start tag name has been recorded yet.
1336 $self->{current_token}->{first_start_tag}
1337 = not defined $self->{last_emitted_start_tag_name};
1338 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1339 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1340 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1341 if ($self->{current_token}->{attributes}) {
1342 !!!cp (120);
1343 !!!parse-error (type => 'end tag attribute');
1344 } else {
1345 ## NOTE: This state should never be reached.
1346 !!!cp (121);
1347 }
1348 } else {
1349 die "$0: $self->{current_token}->{type}: Unknown token type";
1350 }
1351 $self->{state} = DATA_STATE;
1352 !!!next-input-character;
1353
1354 !!!emit ($self->{current_token}); # start tag or end tag
1355
1356 redo A;
1357 } elsif ($self->{next_char} == 0x002F) { # /
## "/" is silently accepted only when it is immediately followed by ">"
## on a start tag whose name is listed in |$permitted_slash_tag_name|;
## every other "/" is a 'nestc' parse error.  Either way the character
## after the "/" has already been read when the state changes.
1358 !!!next-input-character;
1359 if ($self->{next_char} == 0x003E and # >
1360 $self->{current_token}->{type} == START_TAG_TOKEN and
1361 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1362 # permitted slash
1363 !!!cp (122);
1364 #
1365 } else {
1366 !!!cp (123);
1367 !!!parse-error (type => 'nestc');
1368 }
1369 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1370 # next-input-character is already done
1371 redo A;
1372 } else {
## NOTE(review): checkpoint id 124 is also used in BOGUS_COMMENT_STATE;
## confirm whether ids must be unique.
1373 !!!cp (124);
1374 !!!parse-error (type => 'no space between attributes');
1375 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1376 ## reconsume
1377 redo A;
1378 }
1379 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1380 ## (only happens in PCDATA state)
1381
## Collects everything up to the next ">" (or up to -1) into a comment
## token and emits it.  The token is a local variable here, not
## |$self->{current_token}|, and the whole run is consumed by the inner
## |BC| loop rather than by one pass of |A| per character.
1382 my $token = {type => COMMENT_TOKEN, data => ''};
1383
1384 BC: {
1385 if ($self->{next_char} == 0x003E) { # >
## NOTE(review): checkpoint id 124 is also used in
## AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; confirm whether ids must be unique.
1386 !!!cp (124);
1387 $self->{state} = DATA_STATE;
1388 !!!next-input-character;
1389
1390 !!!emit ($token);
1391
1392 redo A;
1393 } elsif ($self->{next_char} == -1) {
## The -1 is not consumed, so the data state sees it again.
1394 !!!cp (125);
1395 $self->{state} = DATA_STATE;
1396 ## reconsume
1397
1398 !!!emit ($token);
1399
1400 redo A;
1401 } else {
1402 !!!cp (126);
1403 $token->{data} .= chr ($self->{next_char});
1404 !!!next-input-character;
1405 redo BC;
1406 }
1407 } # BC
1408
## Every branch of |BC| ends in |redo|, so falling out of the block
## indicates an internal error.
1409 die "$0: _get_next_token: unexpected case [BC]";
1410 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1411 ## (only happens in PCDATA state)
1412
## Looks ahead for "--" (start of a comment) or for the letters
## "DOCTYPE" in either case.  Every character examined is recorded in
## |@next_char| so that, when neither matches, the look-ahead can be
## undone and the input handled as a bogus comment.
1413 my @next_char;
1414 push @next_char, $self->{next_char};
1415
1416 if ($self->{next_char} == 0x002D) { # -
1417 !!!next-input-character;
1418 push @next_char, $self->{next_char};
1419 if ($self->{next_char} == 0x002D) { # -
## "--" matched: start a new, empty comment token.
1420 !!!cp (127);
1421 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1422 $self->{state} = COMMENT_START_STATE;
1423 !!!next-input-character;
1424 redo A;
1425 } else {
1426 !!!cp (128);
1427 }
1428 } elsif ($self->{next_char} == 0x0044 or # D
1429 $self->{next_char} == 0x0064) { # d
1430 !!!next-input-character;
1431 push @next_char, $self->{next_char};
1432 if ($self->{next_char} == 0x004F or # O
1433 $self->{next_char} == 0x006F) { # o
1434 !!!next-input-character;
1435 push @next_char, $self->{next_char};
1436 if ($self->{next_char} == 0x0043 or # C
1437 $self->{next_char} == 0x0063) { # c
1438 !!!next-input-character;
1439 push @next_char, $self->{next_char};
1440 if ($self->{next_char} == 0x0054 or # T
1441 $self->{next_char} == 0x0074) { # t
1442 !!!next-input-character;
1443 push @next_char, $self->{next_char};
1444 if ($self->{next_char} == 0x0059 or # Y
1445 $self->{next_char} == 0x0079) { # y
1446 !!!next-input-character;
1447 push @next_char, $self->{next_char};
1448 if ($self->{next_char} == 0x0050 or # P
1449 $self->{next_char} == 0x0070) { # p
1450 !!!next-input-character;
1451 push @next_char, $self->{next_char};
1452 if ($self->{next_char} == 0x0045 or # E
1453 $self->{next_char} == 0x0065) { # e
1454 !!!cp (129);
1455 ## TODO: Replace this hand-unrolled, letter-by-letter keyword match with something less repetitive.
1456 $self->{state} = DOCTYPE_STATE;
1457 !!!next-input-character;
1458 redo A;
1459 } else {
1460 !!!cp (130);
1461 }
1462 } else {
1463 !!!cp (131);
1464 }
1465 } else {
1466 !!!cp (132);
1467 }
1468 } else {
1469 !!!cp (133);
1470 }
1471 } else {
1472 !!!cp (134);
1473 }
1474 } else {
1475 !!!cp (135);
1476 }
1477 } else {
1478 !!!cp (136);
1479 }
1480
## Reached only when neither keyword matched (both successful matches
## leave through |redo A| above).  The first recorded character becomes
## the current character again; the rest are handed to
## |back-next-input-character| (presumably pushed back onto the input
## -- the macro is defined outside this section, confirm there).
1481 !!!parse-error (type => 'bogus comment');
1482 $self->{next_char} = shift @next_char;
1483 !!!back-next-input-character (@next_char);
1484 $self->{state} = BOGUS_COMMENT_STATE;
1485 redo A;
1486
1487 ## ISSUE: typos in spec: chacacters, is is a parse error
1488 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1489 } elsif ($self->{state} == COMMENT_START_STATE) {
1490 if ($self->{next_char} == 0x002D) { # -
1491 !!!cp (137);
1492 $self->{state} = COMMENT_START_DASH_STATE;
1493 !!!next-input-character;
1494 redo A;
1495 } elsif ($self->{next_char} == 0x003E) { # >
1496 !!!cp (138);
1497 !!!parse-error (type => 'bogus comment');
1498 $self->{state} = DATA_STATE;
1499 !!!next-input-character;
1500
1501 !!!emit ($self->{current_token}); # comment
1502
1503 redo A;
1504 } elsif ($self->{next_char} == -1) {
1505 !!!cp (139);
1506 !!!parse-error (type => 'unclosed comment');
1507 $self->{state} = DATA_STATE;
1508 ## reconsume
1509
1510 !!!emit ($self->{current_token}); # comment
1511
1512 redo A;
1513 } else {
1514 !!!cp (140);
1515 $self->{current_token}->{data} # comment
1516 .= chr ($self->{next_char});
1517 $self->{state} = COMMENT_STATE;
1518 !!!next-input-character;
1519 redo A;
1520 }
1521 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1522 if ($self->{next_char} == 0x002D) { # -
1523 !!!cp (141);
1524 $self->{state} = COMMENT_END_STATE;
1525 !!!next-input-character;
1526 redo A;
1527 } elsif ($self->{next_char} == 0x003E) { # >
1528 !!!cp (142);
1529 !!!parse-error (type => 'bogus comment');
1530 $self->{state} = DATA_STATE;
1531 !!!next-input-character;
1532
1533 !!!emit ($self->{current_token}); # comment
1534
1535 redo A;
1536 } elsif ($self->{next_char} == -1) {
1537 !!!cp (143);
1538 !!!parse-error (type => 'unclosed comment');
1539 $self->{state} = DATA_STATE;
1540 ## reconsume
1541
1542 !!!emit ($self->{current_token}); # comment
1543
1544 redo A;
1545 } else {
1546 !!!cp (144);
1547 $self->{current_token}->{data} # comment
1548 .= '-' . chr ($self->{next_char});
1549 $self->{state} = COMMENT_STATE;
1550 !!!next-input-character;
1551 redo A;
1552 }
1553 } elsif ($self->{state} == COMMENT_STATE) {
1554 if ($self->{next_char} == 0x002D) { # -
1555 !!!cp (145);
1556 $self->{state} = COMMENT_END_DASH_STATE;
1557 !!!next-input-character;
1558 redo A;
1559 } elsif ($self->{next_char} == -1) {
1560 !!!cp (146);
1561 !!!parse-error (type => 'unclosed comment');
1562 $self->{state} = DATA_STATE;
1563 ## reconsume
1564
1565 !!!emit ($self->{current_token}); # comment
1566
1567 redo A;
1568 } else {
1569 !!!cp (147);
1570 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1571 ## Stay in the state
1572 !!!next-input-character;
1573 redo A;
1574 }
1575 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1576 if ($self->{next_char} == 0x002D) { # -
1577 !!!cp (148);
1578 $self->{state} = COMMENT_END_STATE;
1579 !!!next-input-character;
1580 redo A;
1581 } elsif ($self->{next_char} == -1) {
1582 !!!cp (149);
1583 !!!parse-error (type => 'unclosed comment');
1584 $self->{state} = DATA_STATE;
1585 ## reconsume
1586
1587 !!!emit ($self->{current_token}); # comment
1588
1589 redo A;
1590 } else {
1591 !!!cp (150);
1592 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
1593 $self->{state} = COMMENT_STATE;
1594 !!!next-input-character;
1595 redo A;
1596 }
1597 } elsif ($self->{state} == COMMENT_END_STATE) {
1598 if ($self->{next_char} == 0x003E) { # >
1599 !!!cp (151);
1600 $self->{state} = DATA_STATE;
1601 !!!next-input-character;
1602
1603 !!!emit ($self->{current_token}); # comment
1604
1605 redo A;
1606 } elsif ($self->{next_char} == 0x002D) { # -
1607 !!!cp (152);
1608 !!!parse-error (type => 'dash in comment');
1609 $self->{current_token}->{data} .= '-'; # comment
1610 ## Stay in the state
1611 !!!next-input-character;
1612 redo A;
1613 } elsif ($self->{next_char} == -1) {
1614 !!!cp (153);
1615 !!!parse-error (type => 'unclosed comment');
1616 $self->{state} = DATA_STATE;
1617 ## reconsume
1618
1619 !!!emit ($self->{current_token}); # comment
1620
1621 redo A;
1622 } else {
1623 !!!cp (154);
1624 !!!parse-error (type => 'dash in comment');
1625 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
1626 $self->{state} = COMMENT_STATE;
1627 !!!next-input-character;
1628 redo A;
1629 }
1630 } elsif ($self->{state} == DOCTYPE_STATE) {
1631 if ($self->{next_char} == 0x0009 or # HT
1632 $self->{next_char} == 0x000A or # LF
1633 $self->{next_char} == 0x000B or # VT
1634 $self->{next_char} == 0x000C or # FF
1635 $self->{next_char} == 0x0020) { # SP
1636 !!!cp (155);
1637 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1638 !!!next-input-character;
1639 redo A;
1640 } else {
1641 !!!cp (156);
1642 !!!parse-error (type => 'no space before DOCTYPE name');
1643 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1644 ## reconsume
1645 redo A;
1646 }
1647 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1648 if ($self->{next_char} == 0x0009 or # HT
1649 $self->{next_char} == 0x000A or # LF
1650 $self->{next_char} == 0x000B or # VT
1651 $self->{next_char} == 0x000C or # FF
1652 $self->{next_char} == 0x0020) { # SP
1653 !!!cp (157);
1654 ## Stay in the state
1655 !!!next-input-character;
1656 redo A;
1657 } elsif ($self->{next_char} == 0x003E) { # >
1658 !!!cp (158);
1659 !!!parse-error (type => 'no DOCTYPE name');
1660 $self->{state} = DATA_STATE;
1661 !!!next-input-character;
1662
1663 !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
1664
1665 redo A;
1666 } elsif ($self->{next_char} == -1) {
1667 !!!cp (159);
1668 !!!parse-error (type => 'no DOCTYPE name');
1669 $self->{state} = DATA_STATE;
1670 ## reconsume
1671
1672 !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
1673
1674 redo A;
1675 } else {
1676 !!!cp (160);
1677 $self->{current_token}
1678 = {type => DOCTYPE_TOKEN,
1679 name => chr ($self->{next_char}),
1680 #quirks => 0,
1681 };
1682 ## ISSUE: "Set the token's name name to the" in the spec
1683 $self->{state} = DOCTYPE_NAME_STATE;
1684 !!!next-input-character;
1685 redo A;
1686 }
1687 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1688 ## ISSUE: Redundant "First," in the spec.
1689 if ($self->{next_char} == 0x0009 or # HT
1690 $self->{next_char} == 0x000A or # LF
1691 $self->{next_char} == 0x000B or # VT
1692 $self->{next_char} == 0x000C or # FF
1693 $self->{next_char} == 0x0020) { # SP
1694 !!!cp (161);
1695 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1696 !!!next-input-character;
1697 redo A;
1698 } elsif ($self->{next_char} == 0x003E) { # >
1699 !!!cp (162);
1700 $self->{state} = DATA_STATE;
1701 !!!next-input-character;
1702
1703 !!!emit ($self->{current_token}); # DOCTYPE
1704
1705 redo A;
1706 } elsif ($self->{next_char} == -1) {
1707 !!!cp (163);
1708 !!!parse-error (type => 'unclosed DOCTYPE');
1709 $self->{state} = DATA_STATE;
1710 ## reconsume
1711
1712 $self->{current_token}->{quirks} = 1;
1713 !!!emit ($self->{current_token}); # DOCTYPE
1714
1715 redo A;
1716 } else {
1717 !!!cp (164);
1718 $self->{current_token}->{name}
1719 .= chr ($self->{next_char}); # DOCTYPE
1720 ## Stay in the state
1721 !!!next-input-character;
1722 redo A;
1723 }
1724 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1725 if ($self->{next_char} == 0x0009 or # HT
1726 $self->{next_char} == 0x000A or # LF
1727 $self->{next_char} == 0x000B or # VT
1728 $self->{next_char} == 0x000C or # FF
1729 $self->{next_char} == 0x0020) { # SP
1730 !!!cp (165);
1731 ## Stay in the state
1732 !!!next-input-character;
1733 redo A;
1734 } elsif ($self->{next_char} == 0x003E) { # >
1735 !!!cp (166);
1736 $self->{state} = DATA_STATE;
1737 !!!next-input-character;
1738
1739 !!!emit ($self->{current_token}); # DOCTYPE
1740
1741 redo A;
1742 } elsif ($self->{next_char} == -1) {
1743 !!!cp (167);
1744 !!!parse-error (type => 'unclosed DOCTYPE');
1745 $self->{state} = DATA_STATE;
1746 ## reconsume
1747
1748 $self->{current_token}->{quirks} = 1;
1749 !!!emit ($self->{current_token}); # DOCTYPE
1750
1751 redo A;
1752 } elsif ($self->{next_char} == 0x0050 or # P
1753 $self->{next_char} == 0x0070) { # p
1754 !!!next-input-character;
1755 if ($self->{next_char} == 0x0055 or # U
1756 $self->{next_char} == 0x0075) { # u
1757 !!!next-input-character;
1758 if ($self->{next_char} == 0x0042 or # B
1759 $self->{next_char} == 0x0062) { # b
1760 !!!next-input-character;
1761 if ($self->{next_char} == 0x004C or # L
1762 $self->{next_char} == 0x006C) { # l
1763 !!!next-input-character;
1764 if ($self->{next_char} == 0x0049 or # I
1765 $self->{next_char} == 0x0069) { # i
1766 !!!next-input-character;
1767 if ($self->{next_char} == 0x0043 or # C
1768 $self->{next_char} == 0x0063) { # c
1769 !!!cp (168);
1770 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1771 !!!next-input-character;
1772 redo A;
1773 } else {
1774 !!!cp (169);
1775 }
1776 } else {
1777 !!!cp (170);
1778 }
1779 } else {
1780 !!!cp (171);
1781 }
1782 } else {
1783 !!!cp (172);
1784 }
1785 } else {
1786 !!!cp (173);
1787 }
1788
1789 #
1790 } elsif ($self->{next_char} == 0x0053 or # S
1791 $self->{next_char} == 0x0073) { # s
1792 !!!next-input-character;
1793 if ($self->{next_char} == 0x0059 or # Y
1794 $self->{next_char} == 0x0079) { # y
1795 !!!next-input-character;
1796 if ($self->{next_char} == 0x0053 or # S
1797 $self->{next_char} == 0x0073) { # s
1798 !!!next-input-character;
1799 if ($self->{next_char} == 0x0054 or # T
1800 $self->{next_char} == 0x0074) { # t
1801 !!!next-input-character;
1802 if ($self->{next_char} == 0x0045 or # E
1803 $self->{next_char} == 0x0065) { # e
1804 !!!next-input-character;
1805 if ($self->{next_char} == 0x004D or # M
1806 $self->{next_char} == 0x006D) { # m
1807 !!!cp (174);
1808 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1809 !!!next-input-character;
1810 redo A;
1811 } else {
1812 !!!cp (175);
1813 }
1814 } else {
1815 !!!cp (176);
1816 }
1817 } else {
1818 !!!cp (177);
1819 }
1820 } else {
1821 !!!cp (178);
1822 }
1823 } else {
1824 !!!cp (179);
1825 }
1826
1827 #
1828 } else {
1829 !!!cp (180);
1830 !!!next-input-character;
1831 #
1832 }
1833
1834 !!!parse-error (type => 'string after DOCTYPE name');
1835 $self->{current_token}->{quirks} = 1;
1836
1837 $self->{state} = BOGUS_DOCTYPE_STATE;
1838 # next-input-character is already done
1839 redo A;
1840 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
## Entered after the keyword "PUBLIC" in a DOCTYPE.  Skips white space,
## then expects a quote that opens the public identifier.  ">" and -1
## end the DOCTYPE early with the quirks flag set; any other character
## sets the quirks flag and switches to the bogus DOCTYPE state.
##
## |next_char| holds a numeric code point (or -1), so every test below
## uses numeric |==|, like the rest of the tokenizer.
1841 if ({
1842 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1843 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1844 }->{$self->{next_char}}) {
1845 !!!cp (181);
1846 ## Stay in the state
1847 !!!next-input-character;
1848 redo A;
1849 } elsif ($self->{next_char} == 0x0022) { # "
## Start with an empty identifier so that it is defined even for "".
1850 !!!cp (182);
1851 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1852 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1853 !!!next-input-character;
1854 redo A;
1855 } elsif ($self->{next_char} == 0x0027) { # '
1856 !!!cp (183);
1857 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1858 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1859 !!!next-input-character;
1860 redo A;
1861 } elsif ($self->{next_char} == 0x003E) { # >
## "PUBLIC" without a literal: report it and emit a quirks DOCTYPE.
1862 !!!cp (184);
1863 !!!parse-error (type => 'no PUBLIC literal');
1864
1865 $self->{state} = DATA_STATE;
1866 !!!next-input-character;
1867
1868 $self->{current_token}->{quirks} = 1;
1869 !!!emit ($self->{current_token}); # DOCTYPE
1870
1871 redo A;
1872 } elsif ($self->{next_char} == -1) {
## The -1 is not consumed, so the data state sees it again.
1873 !!!cp (185);
1874 !!!parse-error (type => 'unclosed DOCTYPE');
1875
1876 $self->{state} = DATA_STATE;
1877 ## reconsume
1878
1879 $self->{current_token}->{quirks} = 1;
1880 !!!emit ($self->{current_token}); # DOCTYPE
1881
1882 redo A;
1883 } else {
1884 !!!cp (186);
1885 !!!parse-error (type => 'string after PUBLIC');
1886 $self->{current_token}->{quirks} = 1;
1887
1888 $self->{state} = BOGUS_DOCTYPE_STATE;
1889 !!!next-input-character;
1890 redo A;
1891 }
1892 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1893 if ($self->{next_char} == 0x0022) { # "
1894 !!!cp (187);
1895 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1896 !!!next-input-character;
1897 redo A;
1898 } elsif ($self->{next_char} == 0x003E) { # >
1899 !!!cp (188);
1900 !!!parse-error (type => 'unclosed PUBLIC literal');
1901
1902 $self->{state} = DATA_STATE;
1903 !!!next-input-character;
1904
1905 $self->{current_token}->{quirks} = 1;
1906 !!!emit ($self->{current_token}); # DOCTYPE
1907
1908 redo A;
1909 } elsif ($self->{next_char} == -1) {
1910 !!!cp (189);
1911 !!!parse-error (type => 'unclosed PUBLIC literal');
1912
1913 $self->{state} = DATA_STATE;
1914 ## reconsume
1915
1916 $self->{current_token}->{quirks} = 1;
1917 !!!emit ($self->{current_token}); # DOCTYPE
1918
1919 redo A;
1920 } else {
1921 !!!cp (190);
1922 $self->{current_token}->{public_identifier} # DOCTYPE
1923 .= chr $self->{next_char};
1924 ## Stay in the state
1925 !!!next-input-character;
1926 redo A;
1927 }
1928 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1929 if ($self->{next_char} == 0x0027) { # '
1930 !!!cp (191);
1931 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1932 !!!next-input-character;
1933 redo A;
1934 } elsif ($self->{next_char} == 0x003E) { # >
1935 !!!cp (192);
1936 !!!parse-error (type => 'unclosed PUBLIC literal');
1937
1938 $self->{state} = DATA_STATE;
1939 !!!next-input-character;
1940
1941 $self->{current_token}->{quirks} = 1;
1942 !!!emit ($self->{current_token}); # DOCTYPE
1943
1944 redo A;
1945 } elsif ($self->{next_char} == -1) {
1946 !!!cp (193);
1947 !!!parse-error (type => 'unclosed PUBLIC literal');
1948
1949 $self->{state} = DATA_STATE;
1950 ## reconsume
1951
1952 $self->{current_token}->{quirks} = 1;
1953 !!!emit ($self->{current_token}); # DOCTYPE
1954
1955 redo A;
1956 } else {
1957 !!!cp (194);
1958 $self->{current_token}->{public_identifier} # DOCTYPE
1959 .= chr $self->{next_char};
1960 ## Stay in the state
1961 !!!next-input-character;
1962 redo A;
1963 }
1964 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1965 if ({
1966 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1967 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1968 }->{$self->{next_char}}) {
1969 !!!cp (195);
1970 ## Stay in the state
1971 !!!next-input-character;
1972 redo A;
1973 } elsif ($self->{next_char} == 0x0022) { # "
1974 !!!cp (196);
1975 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1976 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1977 !!!next-input-character;
1978 redo A;
1979 } elsif ($self->{next_char} == 0x0027) { # '
1980 !!!cp (197);
1981 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1982 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1983 !!!next-input-character;
1984 redo A;
1985 } elsif ($self->{next_char} == 0x003E) { # >
1986 !!!cp (198);
1987 $self->{state} = DATA_STATE;
1988 !!!next-input-character;
1989
1990 !!!emit ($self->{current_token}); # DOCTYPE
1991
1992 redo A;
1993 } elsif ($self->{next_char} == -1) {
1994 !!!cp (199);
1995 !!!parse-error (type => 'unclosed DOCTYPE');
1996
1997 $self->{state} = DATA_STATE;
1998 ## reconsume
1999
2000 $self->{current_token}->{quirks} = 1;
2001 !!!emit ($self->{current_token}); # DOCTYPE
2002
2003 redo A;
2004 } else {
2005 !!!cp (200);
2006 !!!parse-error (type => 'string after PUBLIC literal');
2007 $self->{current_token}->{quirks} = 1;
2008
2009 $self->{state} = BOGUS_DOCTYPE_STATE;
2010 !!!next-input-character;
2011 redo A;
2012 }
2013 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2014 if ({
2015 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2016 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2017 }->{$self->{next_char}}) {
2018 !!!cp (201);
2019 ## Stay in the state
2020 !!!next-input-character;
2021 redo A;
2022 } elsif ($self->{next_char} == 0x0022) { # "
2023 !!!cp (202);
2024 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2025 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2026 !!!next-input-character;
2027 redo A;
2028 } elsif ($self->{next_char} == 0x0027) { # '
2029 !!!cp (203);
2030 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2031 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2032 !!!next-input-character;
2033 redo A;
2034 } elsif ($self->{next_char} == 0x003E) { # >
2035 !!!cp (204);
2036 !!!parse-error (type => 'no SYSTEM literal');
2037 $self->{state} = DATA_STATE;
2038 !!!next-input-character;
2039
2040 $self->{current_token}->{quirks} = 1;
2041 !!!emit ($self->{current_token}); # DOCTYPE
2042
2043 redo A;
2044 } elsif ($self->{next_char} == -1) {
2045 !!!cp (205);
2046 !!!parse-error (type => 'unclosed DOCTYPE');
2047
2048 $self->{state} = DATA_STATE;
2049 ## reconsume
2050
2051 $self->{current_token}->{quirks} = 1;
2052 !!!emit ($self->{current_token}); # DOCTYPE
2053
2054 redo A;
2055 } else {
2056 !!!cp (206);
2057 !!!parse-error (type => 'string after SYSTEM');
2058 $self->{current_token}->{quirks} = 1;
2059
2060 $self->{state} = BOGUS_DOCTYPE_STATE;
2061 !!!next-input-character;
2062 redo A;
2063 }
2064 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2065 if ($self->{next_char} == 0x0022) { # "
2066 !!!cp (207);
2067 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2068 !!!next-input-character;
2069 redo A;
2070 } elsif ($self->{next_char} == 0x003E) { # >
2071 !!!cp (208);
2072 !!!parse-error (type => 'unclosed PUBLIC literal');
2073
2074 $self->{state} = DATA_STATE;
2075 !!!next-input-character;
2076
2077 $self->{current_token}->{quirks} = 1;
2078 !!!emit ($self->{current_token}); # DOCTYPE
2079
2080 redo A;
2081 } elsif ($self->{next_char} == -1) {
2082 !!!cp (209);
2083 !!!parse-error (type => 'unclosed SYSTEM literal');
2084
2085 $self->{state} = DATA_STATE;
2086 ## reconsume
2087
2088 $self->{current_token}->{quirks} = 1;
2089 !!!emit ($self->{current_token}); # DOCTYPE
2090
2091 redo A;
2092 } else {
2093 !!!cp (210);
2094 $self->{current_token}->{system_identifier} # DOCTYPE
2095 .= chr $self->{next_char};
2096 ## Stay in the state
2097 !!!next-input-character;
2098 redo A;
2099 }
2100 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2101 if ($self->{next_char} == 0x0027) { # '
2102 !!!cp (211);
2103 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2104 !!!next-input-character;
2105 redo A;
2106 } elsif ($self->{next_char} == 0x003E) { # >
2107 !!!cp (212);
2108 !!!parse-error (type => 'unclosed PUBLIC literal');
2109
2110 $self->{state} = DATA_STATE;
2111 !!!next-input-character;
2112
2113 $self->{current_token}->{quirks} = 1;
2114 !!!emit ($self->{current_token}); # DOCTYPE
2115
2116 redo A;
2117 } elsif ($self->{next_char} == -1) {
2118 !!!cp (213);
2119 !!!parse-error (type => 'unclosed SYSTEM literal');
2120
2121 $self->{state} = DATA_STATE;
2122 ## reconsume
2123
2124 $self->{current_token}->{quirks} = 1;
2125 !!!emit ($self->{current_token}); # DOCTYPE
2126
2127 redo A;
2128 } else {
2129 !!!cp (214);
2130 $self->{current_token}->{system_identifier} # DOCTYPE
2131 .= chr $self->{next_char};
2132 ## Stay in the state
2133 !!!next-input-character;
2134 redo A;
2135 }
2136 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2137 if ({
2138 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2139 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2140 }->{$self->{next_char}}) {
2141 !!!cp (215);
2142 ## Stay in the state
2143 !!!next-input-character;
2144 redo A;
2145 } elsif ($self->{next_char} == 0x003E) { # >
2146 !!!cp (216);
2147 $self->{state} = DATA_STATE;
2148 !!!next-input-character;
2149
2150 !!!emit ($self->{current_token}); # DOCTYPE
2151
2152 redo A;
2153 } elsif ($self->{next_char} == -1) {
2154 !!!cp (217);
2155 !!!parse-error (type => 'unclosed DOCTYPE');
2156
2157 $self->{state} = DATA_STATE;
2158 ## reconsume
2159
2160 $self->{current_token}->{quirks} = 1;
2161 !!!emit ($self->{current_token}); # DOCTYPE
2162
2163 redo A;
2164 } else {
2165 !!!cp (218);
2166 !!!parse-error (type => 'string after SYSTEM literal');
2167 #$self->{current_token}->{quirks} = 1;
2168
2169 $self->{state} = BOGUS_DOCTYPE_STATE;
2170 !!!next-input-character;
2171 redo A;
2172 }
2173 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2174 if ($self->{next_char} == 0x003E) { # >
2175 !!!cp (219);
2176 $self->{state} = DATA_STATE;
2177 !!!next-input-character;
2178
2179 !!!emit ($self->{current_token}); # DOCTYPE
2180
2181 redo A;
2182 } elsif ($self->{next_char} == -1) {
2183 !!!cp (220);
2184 !!!parse-error (type => 'unclosed DOCTYPE');
2185 $self->{state} = DATA_STATE;
2186 ## reconsume
2187
2188 !!!emit ($self->{current_token}); # DOCTYPE
2189
2190 redo A;
2191 } else {
2192 !!!cp (221);
2193 ## Stay in the state
2194 !!!next-input-character;
2195 redo A;
2196 }
2197 } else {
2198 die "$0: $self->{state}: Unknown state";
2199 }
2200 } # A
2201
2202 die "$0: _get_next_token: unexpected case";
2203 } # _get_next_token
2204
## Tries to consume a character reference.  On entry
## |$self->{next_char}| holds the character that is tested for "#" or
## an ASCII letter, i.e. the first character of the reference body.
##
## Arguments:
##   $in_attr     True if the reference is being read inside an
##                attribute value (only affects named references that
##                lack the closing ";").
##   $additional  Code point of one more character that, like white
##                space, "<", "&" and EOF (-1), means "not a reference".
##
## Returns a CHARACTER_TOKEN hash reference (carrying |has_reference|
## when a reference has really been expanded), or |undef| when no
## token is produced.
sub _tokenize_attempt_to_consume_an_entity ($$$) {
  my $self = shift;
  my ($in_attr, $additional) = @_;

  ## Characters that can never start a reference.
  my $not_a_reference = {
    0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
    0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
    $additional => 1,
  };
  if ($not_a_reference->{$self->{next_char}}) {
    !!!cp (1001);
    ## Don't consume
    ## No error
    return undef;
  }

  if ($self->{next_char} == 0x0023) { # #
    !!!next-input-character;

    if ($self->{next_char} == 0x0078 or # x
        $self->{next_char} == 0x0058) { # X
      ## Hexadecimal character reference.
      my $x_letter = $self->{next_char}; # kept so that it can be pushed back
      my $code;
      !!!next-input-character;

      HEX: while (1) {
        my $c = $self->{next_char};
        my $digit;
        if (0x0030 <= $c and $c <= 0x0039) { # 0..9
          !!!cp (1002);
          $digit = $c - 0x0030;
        } elsif (0x0061 <= $c and $c <= 0x0066) { # a..f
          !!!cp (1003);
          $digit = $c - 0x0060 + 9;
        } elsif (0x0041 <= $c and $c <= 0x0046) { # A..F
          !!!cp (1004);
          $digit = $c - 0x0040 + 9;
        } else {
          last HEX;
        }
        $code = ($code || 0) * 0x10 + $digit;
        !!!next-input-character;
      } # HEX

      unless (defined $code) { # no hexadecimal digit
        !!!cp (1005);
        !!!parse-error (type => 'bare hcro');
        !!!back-next-input-character ($x_letter, $self->{next_char});
        $self->{next_char} = 0x0023; # #
        return undef;
      }

      if ($self->{next_char} == 0x003B) { # ;
        !!!cp (1006);
        !!!next-input-character;
      } else {
        !!!cp (1007);
        !!!parse-error (type => 'no refc');
      }

      ## Replace code points that must not be produced by a reference.
      if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
        !!!cp (1008);
        !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
        $code = 0xFFFD;
      } elsif ($code > 0x10FFFF) {
        !!!cp (1009);
        !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
        $code = 0xFFFD;
      } elsif ($code == 0x000D) {
        !!!cp (1010);
        !!!parse-error (type => 'CR character reference');
        $code = 0x000A;
      } elsif (0x80 <= $code and $code <= 0x9F) {
        !!!cp (1011);
        !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
        $code = $c1_entity_char->{$code};
      }

      return {type => CHARACTER_TOKEN, data => chr $code,
              has_reference => 1};
    }

    unless (0x0030 <= $self->{next_char} and
            $self->{next_char} <= 0x0039) { # not 0..9
      !!!cp (1019);
      !!!parse-error (type => 'bare nero');
      !!!back-next-input-character ($self->{next_char});
      $self->{next_char} = 0x0023; # #
      return undef;
    }

    ## Decimal character reference.
    my $code = $self->{next_char} - 0x0030;
    !!!next-input-character;

    while (0x0030 <= $self->{next_char} and
           $self->{next_char} <= 0x0039) { # 0..9
      !!!cp (1012);
      $code = $code * 10 + ($self->{next_char} - 0x0030);

      !!!next-input-character;
    }

    if ($self->{next_char} == 0x003B) { # ;
      !!!cp (1013);
      !!!next-input-character;
    } else {
      !!!cp (1014);
      !!!parse-error (type => 'no refc');
    }

    ## Replace code points that must not be produced by a reference.
    if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
      !!!cp (1015);
      !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
      $code = 0xFFFD;
    } elsif ($code > 0x10FFFF) {
      !!!cp (1016);
      !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
      $code = 0xFFFD;
    } elsif ($code == 0x000D) {
      !!!cp (1017);
      !!!parse-error (type => 'CR character reference');
      $code = 0x000A;
    } elsif (0x80 <= $code and $code <= 0x9F) {
      !!!cp (1018);
      !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
      $code = $c1_entity_char->{$code};
    }

    return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
  }

  unless ((0x0041 <= $self->{next_char} and
           $self->{next_char} <= 0x005A) or
          (0x0061 <= $self->{next_char} and
           $self->{next_char} <= 0x007A)) { # not an ASCII letter
    !!!cp (1027);
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }

  ## Named character reference.
  my $entity_name = chr $self->{next_char};
  !!!next-input-character;

  ## $value: text to be returned for what has been consumed so far.
  ## $match:  0 = no name matched,
  ##          1 = a name terminated by ";" matched,
  ##         -1 = a name without ";" matched,
  ##       < -1 = a name without ";" matched and further name
  ##              characters followed it.
  my $value = $entity_name;
  my $match = 0;
  require Whatpm::_NamedEntityList;
  our $EntityChar;

  NAME: while (length $entity_name < 10) {
    ## NOTE: Some number greater than the maximum length of entity name
    my $c = $self->{next_char};
    last NAME unless ((0x0041 <= $c and $c <= 0x005A) or # A..Z
                      (0x0061 <= $c and $c <= 0x007A) or # a..z
                      (0x0030 <= $c and $c <= 0x0039) or # 0..9
                      $c == 0x003B); # ;

    $entity_name .= chr $c;
    if (not defined $EntityChar->{$entity_name}) {
      !!!cp (1022);
      $value .= chr $c;
      $match *= 2;
      !!!next-input-character;
    } elsif ($c == 0x003B) { # ;
      !!!cp (1020);
      $value = $EntityChar->{$entity_name};
      $match = 1;
      !!!next-input-character;
      last NAME;
    } else {
      !!!cp (1021);
      $value = $EntityChar->{$entity_name};
      $match = -1;
      !!!next-input-character;
    }
  } # NAME

  if ($match > 0) {
    !!!cp (1023);
    return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
  }

  if ($match == 0) {
    !!!cp (1026);
    !!!parse-error (type => 'bare ero');
    ## NOTE: "No characters are consumed" in the spec.
    return {type => CHARACTER_TOKEN, data => '&'.$value};
  }

  ## A name matched, but without the closing ";".
  !!!parse-error (type => 'no refc');
  if ($in_attr and $match < -1) {
    !!!cp (1024);
    return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
  }

  !!!cp (1025);
  return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
} # _tokenize_attempt_to_consume_an_entity
2397
## Prepares |$self->{document}| for tree construction: strict error
## checking is switched off and the document is flagged as HTML.
## NOTE: $self->{document} MUST be specified before this method is called
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  $document->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $document->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2406
## Counterpart of |_initialize_tree_constructor|: switches strict
## error checking of |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events on
  $document->strict_error_checking (1);
} # _terminate_tree_constructor
2412
2413 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2414
2415 { # tree construction stage
2416 my $token;
2417
## Entry point of the tree construction stage: resets the per-parse
## state kept in |$self| and then runs the handlers for the "initial"
## and "before html" insertion modes followed by the main handler.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA makes
  ## $self->{document} available to the user, or when it begins
  ## accepting user input.

  ## NOTE: "Append a character" means: collect it and all subsequent
  ## consecutive characters and insert one Text node whose data is the
  ## concatenation of all those characters. # MUST

  !!!next-token;

  ## Forget whatever an earlier run might have left behind.
  $self->{open_elements} = [];
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## NOTE: The "before head" insertion mode and so on.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->_tree_construction_main;
} # _construct_tree
2446
## Handles the "initial" insertion mode.
##
## Tokens are taken from the shared lexical |$token|.  White space
## characters are dropped and comments are appended to the document
## until either
##   - a DOCTYPE token is seen: a document type node is appended to
##     |$self->{document}|, the compatibility mode ("quirks" or
##     "limited quirks") is derived from the DOCTYPE name and the
##     public / system identifiers, and the next token is fetched; or
##   - any other token is seen: a 'no DOCTYPE' parse error is reported,
##     the document is put into quirks mode and the token is left in
##     |$token| for reprocessing by the caller.
##
## Dies on a token whose type is not recognized.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared ASCII case-insensitively:
        ## it is uppercased and looked up in a table of uppercase keys.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## NOTE(review): "EXPERIMETNAL" looks like a misspelling of
          ## "EXPERIMENTAL"; kept as is -- confirm against the spec
          ## revision this list was taken from before changing it.
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }

      ## The system identifier check runs after (and can override) the
      ## mode chosen from the public identifier above.
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is stripped from the token in place.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
2648
## Handles the "before html" insertion mode.
##
## Tokens are taken from the shared lexical |$token|.  DOCTYPE tokens
## are reported and ignored, comments are appended to the document and
## leading white space is dropped.  The first other token creates the
## root |html| element, appends it to |$self->{document}| and pushes
## it onto |$self->{open_elements}| as a |[node, 'html']| pair:
##   - for an |html| start tag the element gets the attributes of the
##     tag, and the next token is fetched;
##   - for anything else an attribute-less |html| element is created
##     and the token is left in |$token| for reprocessing.
##
## The |$self->{application_cache_selection}| callback is invoked
## exactly once, after the root element has been created: with the
## value of the |manifest| attribute of the |html| start tag if there
## is one, with |undef| otherwise.
##
## Dies on a token whose type is not recognized.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is stripped from the token in place.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is not invoked
      ## here; the common code below invokes it once the root element
      ## has been created.  (It used to be invoked here as well, so
      ## that it ran twice for a character token.)

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $token->{tag_name}, $token->{attributes});
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}}, [$root_element, 'html'];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: No relative reference resolution?
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Anything else: create the root element implicitly.
    my $root_element;
    !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
2735
## Resets |$self->{insertion_mode}| according to the contents of
## |$self->{open_elements}|.
##
## The stack is walked from its last entry towards its first one;
## each entry is used as a |[node, tag_name]| pair.  The first entry
## whose tag name has a dedicated insertion mode decides the result.
## When the first (bottommost) entry is reached and
## |$self->{inner_html_node}| is defined, that pair is examined in its
## place unless its tag name is |td| or |th|.  If nothing matches, the
## insertion mode becomes "in body".
##
## Returns nothing.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          !!!cp ('t27');
          #
        } else {
          !!!cp ('t28');
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    my $new_mode = {
      select => IN_SELECT_IM,
      ## NOTE: |option| and |optgroup| do not set
      ## insertion mode to "in select" by themselves.
      td => IN_CELL_IM,
      th => IN_CELL_IM,
      tr => IN_ROW_IM,
      tbody => IN_TABLE_BODY_IM,
      thead => IN_TABLE_BODY_IM,
      tfoot => IN_TABLE_BODY_IM,
      caption => IN_CAPTION_IM,
      colgroup => IN_COLUMN_GROUP_IM,
      table => IN_TABLE_IM,
      head => IN_BODY_IM, # not in head!
      body => IN_BODY_IM,
      frameset => IN_FRAMESET_IM,
    }->{$node->[1]};
    ## NOTE: Written as an explicit block (rather than
    ## |... = $new_mode and return|) so that the return does not
    ## depend on the insertion mode constant being a true value.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
2810
2811 sub _tree_construction_main ($) {
2812 my $self = shift;
2813
2814 my $active_formatting_elements = [];
2815
2816 my $reconstruct_active_formatting_elements = sub { # MUST
2817 my $insert = shift;
2818
2819 ## Step 1
2820 return unless @$active_formatting_elements;
2821
2822 ## Step 3
2823 my $i = -1;
2824 my $entry = $active_formatting_elements->[$i];
2825
2826 ## Step 2
2827 return if $entry->[0] eq '#marker';
2828 for (@{$self->{open_elements}}) {
2829 if ($entry->[0] eq $_->[0]) {
2830 !!!cp ('t32');
2831 return;
2832 }
2833 }
2834
2835 S4: {
2836 ## Step 4
2837 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2838
2839 ## Step 5
2840 $i--;
2841 $entry = $active_formatting_elements->[$i];
2842
2843 ## Step 6
2844 if ($entry->[0] eq '#marker') {
2845 !!!cp ('t33_1');
2846 #
2847 } else {
2848 my $in_open_elements;
2849 OE: for (@{$self->{open_elements}}) {
2850 if ($entry->[0] eq $_->[0]) {
2851 !!!cp ('t33');
2852 $in_open_elements = 1;
2853 last OE;
2854 }
2855 }
2856 if ($in_open_elements) {
2857 !!!cp ('t34');
2858 #
2859 } else {
2860 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
2861 !!!cp ('t35');
2862 redo S4;
2863 }
2864 }
2865
2866 ## Step 7
2867 $i++;
2868 $entry = $active_formatting_elements->[$i];
2869 } # S4
2870
2871 S7: {
2872 ## Step 8
2873 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2874
2875 ## Step 9
2876 $insert->($clone->[0]);
2877 push @{$self->{open_elements}}, $clone;
2878
2879 ## Step 10
2880 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2881
2882 ## Step 11
2883 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2884 !!!cp ('t36');
2885 ## Step 7'
2886 $i++;
2887 $entry = $active_formatting_elements->[$i];
2888
2889 redo S7;
2890 }
2891
2892 !!!cp ('t37');
2893 } # S7
2894 }; # $reconstruct_active_formatting_elements
2895
2896 my $clear_up_to_marker = sub {
2897 for (reverse 0..$#$active_formatting_elements) {
2898 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
2899 !!!cp ('t38');
2900 splice @$active_formatting_elements, $_;
2901 return;
2902 }
2903 }
2904
2905 !!!cp ('t39');
2906 }; # $clear_up_to_marker
2907
2908 my $insert;
2909
2910 my $parse_rcdata = sub ($) {
2911 my ($content_model_flag) = @_;
2912
2913 ## Step 1
2914 my $start_tag_name = $token->{tag_name};
2915 my $el;
2916 !!!create-element ($el, $start_tag_name, $token->{attributes});
2917
2918 ## Step 2
2919 $insert->($el);
2920
2921 ## Step 3
2922 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2923 delete $self->{escape}; # MUST
2924
2925 ## Step 4
2926 my $text = '';
2927 !!!next-token;
2928 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2929 !!!cp ('t40');
2930 $text .= $token->{data};
2931 !!!next-token;
2932 }
2933
2934 ## Step 5
2935 if (length $text) {
2936 !!!cp ('t41');
2937 my $text = $self->{document}->create_text_node ($text);
2938 $el->append_child ($text);
2939 }
2940
2941 ## Step 6
2942 $self->{content_model} = PCDATA_CONTENT_MODEL;
2943
2944 ## Step 7
2945 if ($token->{type} == END_TAG_TOKEN and
2946 $token->{tag_name} eq $start_tag_name) {
2947 !!!cp ('t42');
2948 ## Ignore the token
2949 } else {
2950 ## NOTE: An end-of-file token.
2951 if ($content_model_flag == CDATA_CONTENT_MODEL) {
2952 !!!cp ('t43');
2953 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2954 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2955 !!!cp ('t44');
2956 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2957 } else {
2958 die "$0: $content_model_flag in parse_rcdata";
2959 }
2960 }
2961 !!!next-token;
2962 }; # $parse_rcdata
2963
2964 my $script_start_tag = sub () {
2965 my $script_el;
2966 !!!create-element ($script_el, 'script', $token->{attributes});
2967 ## TODO: mark as "parser-inserted"
2968
2969 $self->{content_model} = CDATA_CONTENT_MODEL;
2970 delete $self->{escape}; # MUST
2971
2972 my $text = '';
2973 !!!next-token;
2974 while ($token->{type} == CHARACTER_TOKEN) {
2975 !!!cp ('t45');
2976 $text .= $token->{data};
2977 !!!next-token;
2978 } # stop if non-character token or tokenizer stops tokenising
2979 if (length $text) {
2980 !!!cp ('t46');
2981 $script_el->manakai_append_text ($text);
2982 }
2983
2984 $self->{content_model} = PCDATA_CONTENT_MODEL;
2985
2986 if ($token->{type} == END_TAG_TOKEN and
2987 $token->{tag_name} eq 'script') {
2988 !!!cp ('t47');
2989 ## Ignore the token
2990 } else {
2991 !!!cp ('t48');
2992 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2993 ## ISSUE: And ignore?
2994 ## TODO: mark as "already executed"
2995 }
2996
2997 if (defined $self->{inner_html_node}) {
2998 !!!cp ('t49');
2999 ## TODO: mark as "already executed"
3000 } else {
3001 !!!cp ('t50');
3002 ## TODO: $old_insertion_point = current insertion point
3003 ## TODO: insertion point = just before the next input character
3004
3005 $insert->($script_el);
3006
3007 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3008
3009 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3010 }
3011
3012 !!!next-token;
3013 }; # $script_start_tag
3014
3015 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3016 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3017 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3018
## Closes a formatting element by running the adoption agency algorithm
## (AAA) of the HTML5 tree construction stage.
##
## Argument:
##   $tag_name - The tag name of the formatting element to be closed.
##
## The closure works on |$active_formatting_elements| and
## |$self->{open_elements}|, whose entries are |[node, tag name]| pairs
## (a marker in the list of active formatting elements has |'#marker'|
## as its first item).  The algorithm is repeated (Step 14) until one of
## its terminating branches is reached; each of those branches invokes
## |!!!next-token| before returning, so the token being processed is
## always consumed.  No meaningful value is returned.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    ## Find the last entry for |$tag_name| after the last marker in the
    ## list of active formatting elements.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          !!!parse-error (type => 'unmatched end tag:'.$tag_name);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                applet => 1, table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Remove the formatting element's own entry from the list.  It is
      ## not necessarily the last entry, so it is removed by the index
      ## recorded in Step 1 rather than by |pop|.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## The stack is scanned from the current node towards the formatting
    ## element, so the last candidate seen (the one nearest to the
    ## formatting element) is the one that is kept.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    ## No furthest block: pop the stack up to and including the
    ## formatting element and drop it from the active list.
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is the index at which the clone will be inserted into
    ## the list of active formatting elements, counted in the list as it
    ## is *before* the formatting element is removed in Step 12.  The
    ## list is only modified in place (Step 7.5) until then, so indices
    ## into it stay valid.  Using an index (rather than the identity of
    ## the neighbouring entry) keeps the bookmark well-defined when the
    ## formatting element is the first entry, when its neighbour is a
    ## marker, and when the neighbour is replaced by a clone.
    my $bookmark = $formatting_element_i_in_active;

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements is
      ## dropped from the stack and the next node is examined.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      ## Move the bookmark to immediately after |$node| in the list.
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark = $node_i_in_active + 1;
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    ## If the common ancestor is a table-structure element, |$last_node|
    ## is foster parented; otherwise it is appended to the common
    ## ancestor.
    if ({
         table => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1,
        }->{$common_ancestor_node->[1]}) {
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] eq 'table') {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first, since the children are moved while
    ## iterating.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list of active formatting
    ## elements and insert the clone at the bookmark.
    !!!cp ('t66');
    splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
    if ($bookmark > $formatting_element_i_in_active) {
      !!!cp ('t67');
      $bookmark--; # the removal above shifted the later entries down
    }
    splice @$active_formatting_elements, $bookmark, 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack of open elements and
    ## insert the clone immediately after the furthest block.
    my $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        ## |$i| (the furthest block's index) moves down by one because
        ## an entry before it has just been removed.
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## Insert (length 0), not replace: any entry that follows the
    ## furthest block is still open and has to stay in the stack.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3249
## Default insertion routine: appends the given node as the last child of
## the current node, i.e. the node of the last entry in the stack of
## open elements.  The same code reference is stored in both |$insert|
## and |$insert_to_current|.
$insert = my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
3253
## Insertion routine that applies foster parenting.
##
## Argument: the node to be inserted.
##
## If the current node is not a |table|, |tbody|, |tfoot|, |thead| or
## |tr| element, the node is simply appended to the current node.
## Otherwise it is inserted at the foster parent position and the
## current table is flagged as "tainted".
my $insert_to_foster = sub {
  my $child = shift;
  my $stack = $self->{open_elements};

  my %table_context = map { $_ => 1 } qw(table tbody tfoot thead tr);
  if (not $table_context{$stack->[-1]->[1]}) {
    !!!cp ('t72');
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  ## Look for the last |table| in the stack of open elements.
  my ($foster_parent_element, $next_sibling);
  OE: for my $idx (reverse 0 .. $#$stack) {
    next OE unless $stack->[$idx]->[1] eq 'table';

    my $table_el = $stack->[$idx]->[0];
    my $parent = $table_el->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      ## The table has a parent of node type 1: insert into that parent,
      ## just before the table.
      !!!cp ('t70');
      $foster_parent_element = $parent;
      $next_sibling = $table_el;
    } else {
      ## Otherwise use the element just before the table in the stack.
      !!!cp ('t71');
      $foster_parent_element = $stack->[$idx - 1]->[0];
    }
    last OE;
  } # OE

  ## No |table| in the stack: fall back to the first element of the
  ## stack.
  $foster_parent_element = $stack->[0]->[0]
      unless defined $foster_parent_element;

  ## |$next_sibling| is left undefined unless the t70 branch was taken.
  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
3287
3288 B: {
3289 if ($token->{type} == DOCTYPE_TOKEN) {
3290 !!!cp ('t73');
3291 !!!parse-error (type => 'DOCTYPE in the middle');
3292 ## Ignore the token
3293 ## Stay in the phase
3294 !!!next-token;
3295 redo B;
3296 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
3297 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3298 !!!cp ('t74');
3299 #
3300 } else {
3301 ## Generate implied end tags
3302 while ({
3303 dd => 1, dt => 1, li => 1, p => 1,
3304 }->{$self->{open_elements}->[-1]->[1]}) {
3305 !!!cp ('t75');
3306 pop @{$self->{open_elements}};
3307 }
3308
3309 if (@{$self->{open_elements}} > 2 or
3310 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
3311 !!!cp ('t76');
3312 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3313 } elsif (defined $self->{inner_html_node} and
3314 @{$self->{open_elements}} > 1 and
3315 $self->{open_elements}->[1]->[1] ne 'body') {
3316 ## ISSUE: This case is never reached.
3317 !!!cp ('t77');
3318 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3319 } else {
3320 !!!cp ('t78');
3321 }
3322
3323 ## ISSUE: There is an issue in the spec.
3324 }
3325
3326 ## Stop parsing
3327 last B;
3328 } elsif ($token->{type} == START_TAG_TOKEN and
3329 $token->{tag_name} eq 'html') {
3330 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3331 !!!cp ('t79');
3332 !!!parse-error (type => 'after html:html');
3333 $self->{insertion_mode} = AFTER_BODY_IM;
3334 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3335 !!!cp ('t80');
3336 !!!parse-error (type => 'after html:html');
3337 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3338 } else {
3339 !!!cp ('t81');
3340 }
3341
3342 !!!cp ('t82');
3343 !!!parse-error (type => 'not first start tag');
3344 my $top_el = $self->{open_elements}->[0]->[0];
3345 for my $attr_name (keys %{$token->{attributes}}) {
3346 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3347 !!!cp ('t84');
3348 $top_el->set_attribute_ns
3349 (undef, [undef, $attr_name],
3350 $token->{attributes}->{$attr_name}->{value});
3351 }
3352 }
3353 !!!next-token;
3354 redo B;
3355 } elsif ($token->{type} == COMMENT_TOKEN) {
3356 my $comment = $self->{document}->create_comment ($token->{data});
3357 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3358 !!!cp ('t85');
3359 $self->{document}->append_child ($comment);
3360 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3361 !!!cp ('t86');
3362 $self->{open_elements}->[0]->[0]->append_child ($comment);
3363 } else {
3364 !!!cp ('t87');
3365 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3366 }
3367 !!!next-token;
3368 redo B;
3369 } elsif ($self->{insertion_mode} & HEAD_IMS) {
3370 if ($token->{type} == CHARACTER_TOKEN) {
3371 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3372 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3373 !!!cp ('t88.2');
3374 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3375 } else {
3376 !!!cp ('t88.1');
3377 ## Ignore the token.
3378 !!!next-token;
3379 redo B;
3380 }
3381 unless (length $token->{data}) {
3382 !!!cp ('t88');
3383 !!!next-token;
3384 redo B;
3385 }
3386 }
3387
3388 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3389 !!!cp ('t89');
3390 ## As if <head>
3391 !!!create-element ($self->{head_element}, 'head');
3392 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3393 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3394
3395 ## Reprocess in the "in head" insertion mode...
3396 pop @{$self->{open_elements}};
3397
3398 ## Reprocess in the "after head" insertion mode...
3399 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3400 !!!cp ('t90');
3401 ## As if </noscript>
3402 pop @{$self->{open_elements}};
3403 !!!parse-error (type => 'in noscript:#character');
3404
3405 ## Reprocess in the "in head" insertion mode...
3406 ## As if </head>
3407 pop @{$self->{open_elements}};
3408
3409 ## Reprocess in the "after head" insertion mode...
3410 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3411 !!!cp ('t91');
3412 pop @{$self->{open_elements}};
3413
3414 ## Reprocess in the "after head" insertion mode...
3415 } else {
3416 !!!cp ('t92');
3417 }
3418
3419 ## "after head" insertion mode
3420 ## As if <body>
3421 !!!insert-element ('body');
3422 $self->{insertion_mode} = IN_BODY_IM;
3423 ## reprocess
3424 redo B;
3425 } elsif ($token->{type} == START_TAG_TOKEN) {
3426 if ($token->{tag_name} eq 'head') {
3427 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3428 !!!cp ('t93');
3429 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
3430 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3431 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
3432 $self->{insertion_mode} = IN_HEAD_IM;
3433 !!!next-token;
3434 redo B;
3435 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3436 !!!cp ('t94');
3437 #
3438 } else {
3439 !!!cp ('t95');
3440 !!!parse-error (type => 'in head:head'); # or in head noscript
3441 ## Ignore the token
3442 !!!next-token;
3443 redo B;
3444 }
3445 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3446 !!!cp ('t96');
3447 ## As if <head>
3448 !!!create-element ($self->{head_element}, 'head');
3449 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3450 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3451
3452 $self->{insertion_mode} = IN_HEAD_IM;
3453 ## Reprocess in the "in head" insertion mode...
3454 } else {
3455 !!!cp ('t97');
3456 }
3457
3458 if ($token->{tag_name} eq 'base') {
3459 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3460 !!!cp ('t98');
3461 ## As if </noscript>
3462 pop @{$self->{open_elements}};
3463 !!!parse-error (type => 'in noscript:base');
3464
3465 $self->{insertion_mode} = IN_HEAD_IM;
3466 ## Reprocess in the "in head" insertion mode...
3467 } else {
3468 !!!cp ('t99');
3469 }
3470
3471 ## NOTE: There is a "as if in head" code clone.
3472 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3473 !!!cp ('t100');
3474 !!!parse-error (type => 'after head:'.$token->{tag_name});
3475 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3476 } else {
3477 !!!cp ('t101');
3478 }
3479 !!!insert-element ($token->{tag_name}, $token->{attributes});
3480 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3481 pop @{$self->{open_elements}} # <head>
3482 if $self->{insertion_mode} == AFTER_HEAD_IM;
3483 !!!next-token;
3484 redo B;
3485 } elsif ($token->{tag_name} eq 'link') {
3486 ## NOTE: There is a "as if in head" code clone.
3487 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3488 !!!cp ('t102');
3489 !!!parse-error (type => 'after head:'.$token->{tag_name});
3490 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3491 } else {
3492 !!!cp ('t103');
3493 }
3494 !!!insert-element ($token->{tag_name}, $token->{attributes});
3495 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3496 pop @{$self->{open_elements}} # <head>
3497 if $self->{insertion_mode} == AFTER_HEAD_IM;
3498 !!!next-token;
3499 redo B;
3500 } elsif ($token->{tag_name} eq 'meta') {
3501 ## NOTE: There is a "as if in head" code clone.
3502 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3503 !!!cp ('t104');
3504 !!!parse-error (type => 'after head:'.$token->{tag_name});
3505 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3506 } else {
3507 !!!cp ('t105');
3508 }
3509 !!!insert-element ($token->{tag_name}, $token->{attributes});
3510 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3511
3512 unless ($self->{confident}) {
3513 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3514 !!!cp ('t106');
3515 $self->{change_encoding}
3516 ->($self, $token->{attributes}->{charset}->{value});
3517
3518 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3519 ->set_user_data (manakai_has_reference =>
3520 $token->{attributes}->{charset}
3521 ->{has_reference});
3522 } elsif ($token->{attributes}->{content}) {
3523 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3524 if ($token->{attributes}->{content}->{value}
3525 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3526 [\x09-\x0D\x20]*=
3527 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3528 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3529 !!!cp ('t107');
3530 $self->{change_encoding}
3531 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
3532 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3533 ->set_user_data (manakai_has_reference =>
3534 $token->{attributes}->{content}
3535 ->{has_reference});
3536 } else {
3537 !!!cp ('t108');
3538 }
3539 }
3540 } else {
3541 if ($token->{attributes}->{charset}) {
3542 !!!cp ('t109');
3543 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3544 ->set_user_data (manakai_has_reference =>
3545 $token->{attributes}->{charset}
3546 ->{has_reference});
3547 }
3548 if ($token->{attributes}->{content}) {
3549 !!!cp ('t110');
3550 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3551 ->set_user_data (manakai_has_reference =>
3552 $token->{attributes}->{content}
3553 ->{has_reference});
3554 }
3555 }
3556
3557 pop @{$self->{open_elements}} # <head>
3558 if $self->{insertion_mode} == AFTER_HEAD_IM;
3559 !!!next-token;
3560 redo B;
3561 } elsif ($token->{tag_name} eq 'title') {
3562 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3563 !!!cp ('t111');
3564 ## As if </noscript>
3565 pop @{$self->{open_elements}};
3566 !!!parse-error (type => 'in noscript:title');
3567
3568 $self->{insertion_mode} = IN_HEAD_IM;
3569 ## Reprocess in the "in head" insertion mode...
3570 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3571 !!!cp ('t112');
3572 !!!parse-error (type => 'after head:'.$token->{tag_name});
3573 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3574 } else {
3575 !!!cp ('t113');
3576 }
3577
3578 ## NOTE: There is a "as if in head" code clone.
3579 my $parent = defined $self->{head_element} ? $self->{head_element}
3580 : $self->{open_elements}->[-1]->[0];
3581 $parse_rcdata->(RCDATA_CONTENT_MODEL);
3582 pop @{$self->{open_elements}} # <head>
3583 if $self->{insertion_mode} == AFTER_HEAD_IM;
3584 redo B;
3585 } elsif ($token->{tag_name} eq 'style') {
3586 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3587 ## insertion mode IN_HEAD_IM)
3588 ## NOTE: There is a "as if in head" code clone.
3589 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3590 !!!cp ('t114');
3591 !!!parse-error (type => 'after head:'.$token->{tag_name});
3592 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3593 } else {
3594 !!!cp ('t115');
3595 }
3596 $parse_rcdata->(CDATA_CONTENT_MODEL);
3597 pop @{$self->{open_elements}} # <head>
3598 if $self->{insertion_mode} == AFTER_HEAD_IM;
3599 redo B;
3600 } elsif ($token->{tag_name} eq 'noscript') {
3601 if ($self->{insertion_mode} == IN_HEAD_IM) {
3602 !!!cp ('t116');
3603 ## NOTE: and scripting is disalbed
3604 !!!insert-element ($token->{tag_name}, $token->{attributes});
3605 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
3606 !!!next-token;
3607 redo B;
3608 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3609 !!!cp ('t117');
3610 !!!parse-error (type => 'in noscript:noscript');
3611 ## Ignore the token
3612 !!!next-token;
3613 redo B;
3614 } else {
3615 !!!cp ('t118');
3616 #
3617 }
3618 } elsif ($token->{tag_name} eq 'script') {
3619 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3620 !!!cp ('t119');
3621 ## As if </noscript>
3622 pop @{$self->{open_elements}};
3623 !!!parse-error (type => 'in noscript:script');
3624
3625 $self->{insertion_mode} = IN_HEAD_IM;
3626 ## Reprocess in the "in head" insertion mode...
3627 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3628 !!!cp ('t120');
3629 !!!parse-error (type => 'after head:'.$token->{tag_name});
3630 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3631 } else {
3632 !!!cp ('t121');
3633 }
3634
3635 ## NOTE: There is a "as if in head" code clone.
3636 $script_start_tag->();
3637 pop @{$self->{open_elements}} # <head>
3638 if $self->{insertion_mode} == AFTER_HEAD_IM;
3639 redo B;
3640 } elsif ($token->{tag_name} eq 'body' or
3641 $token->{tag_name} eq 'frameset') {
3642 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3643 !!!cp ('t122');
3644 ## As if </noscript>
3645 pop @{$self->{open_elements}};
3646 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
3647
3648 ## Reprocess in the "in head" insertion mode...
3649 ## As if </head>
3650 pop @{$self->{open_elements}};
3651
3652 ## Reprocess in the "after head" insertion mode...
3653 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3654 !!!cp ('t124');
3655 pop @{$self->{open_elements}};
3656
3657 ## Reprocess in the "after head" insertion mode...
3658 } else {
3659 !!!cp ('t125');
3660 }
3661
3662 ## "after head" insertion mode
3663 !!!insert-element ($token->{tag_name}, $token->{attributes});
3664 if ($token->{tag_name} eq 'body') {
3665 !!!cp ('t126');
3666 $self->{insertion_mode} = IN_BODY_IM;
3667 } elsif ($token->{tag_name} eq 'frameset') {
3668 !!!cp ('t127');
3669 $self->{insertion_mode} = IN_FRAMESET_IM;
3670 } else {
3671 die "$0: tag name: $self->{tag_name}";
3672 }
3673 !!!next-token;
3674 redo B;
3675 } else {
3676 !!!cp ('t128');
3677 #
3678 }
3679
3680 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3681 !!!cp ('t129');
3682 ## As if </noscript>
3683 pop @{$self->{open_elements}};
3684 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3685
3686 ## Reprocess in the "in head" insertion mode...
3687 ## As if </head>
3688 pop @{$self->{open_elements}};
3689
3690 ## Reprocess in the "after head" insertion mode...
3691 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3692 !!!cp ('t130');
3693 ## As if </head>
3694 pop @{$self->{open_elements}};
3695
3696 ## Reprocess in the "after head" insertion mode...
3697 } else {
3698 !!!cp ('t131');
3699 }
3700
3701 ## "after head" insertion mode
3702 ## As if <body>
3703 !!!insert-element ('body');
3704 $self->{insertion_mode} = IN_BODY_IM;
3705 ## reprocess
3706 redo B;
3707 } elsif ($token->{type} == END_TAG_TOKEN) {
3708 if ($token->{tag_name} eq 'head') {
3709 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3710 !!!cp ('t132');
3711 ## As if <head>
3712 !!!create-element ($self->{head_element}, 'head');
3713 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3714 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3715
3716 ## Reprocess in the "in head" insertion mode...
3717 pop @{$self->{open_elements}};
3718 $self->{insertion_mode} = AFTER_HEAD_IM;
3719 !!!next-token;
3720 redo B;
3721 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3722 !!!cp ('t133');
3723 ## As if </noscript>
3724 pop @{$self->{open_elements}};
3725 !!!parse-error (type => 'in noscript:/head');
3726
3727 ## Reprocess in the "in head" insertion mode...
3728 pop @{$self->{open_elements}};
3729 $self->{insertion_mode} = AFTER_HEAD_IM;
3730 !!!next-token;
3731 redo B;
3732 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3733 !!!cp ('t134');
3734 pop @{$self->{open_elements}};
3735 $self->{insertion_mode} = AFTER_HEAD_IM;
3736 !!!next-token;
3737 redo B;
3738 } else {
3739 !!!cp ('t135');
3740 #
3741 }
3742 } elsif ($token->{tag_name} eq 'noscript') {
3743 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3744 !!!cp ('t136');
3745 pop @{$self->{open_elements}};
3746 $self->{insertion_mode} = IN_HEAD_IM;
3747 !!!next-token;
3748 redo B;
3749 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3750 !!!cp ('t137');
3751 !!!parse-error (type => 'unmatched end tag:noscript');
3752 ## Ignore the token ## ISSUE: An issue in the spec.
3753 !!!next-token;
3754 redo B;
3755 } else {
3756 !!!cp ('t138');
3757 #
3758 }
3759 } elsif ({
3760 body => 1, html => 1,
3761 }->{$token->{tag_name}}) {
3762 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3763 !!!cp ('t139');
3764 ## As if <head>
3765 !!!create-element ($self->{head_element}, 'head');
3766 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3767 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3768
3769 $self->{insertion_mode} = IN_HEAD_IM;
3770 ## Reprocess in the "in head" insertion mode...
3771 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3772 !!!cp ('t140');
3773 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3774 ## Ignore the token
3775 !!!next-token;
3776 redo B;
3777 } else {
3778 !!!cp ('t141');
3779 }
3780
3781 #
3782 } elsif ({
3783 p => 1, br => 1,
3784 }->{$token->{tag_name}}) {
3785 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3786 !!!cp ('t142');
3787 ## As if <head>
3788 !!!create-element ($self->{head_element}, 'head');
3789 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3790 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3791
3792 $self->{insertion_mode} = IN_HEAD_IM;
3793 ## Reprocess in the "in head" insertion mode...
3794 } else {
3795 !!!cp ('t143');
3796 }
3797
3798 #
3799 } else {
3800 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3801 !!!cp ('t144');
3802 #
3803 } else {
3804 !!!cp ('t145');
3805 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3806 ## Ignore the token
3807 !!!next-token;
3808 redo B;
3809 }
3810 }
3811
3812 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3813 !!!cp ('t146');
3814 ## As if </noscript>
3815 pop @{$self->{open_elements}};
3816 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3817
3818 ## Reprocess in the "in head" insertion mode...
3819 ## As if </head>
3820 pop @{$self->{open_elements}};
3821
3822 ## Reprocess in the "after head" insertion mode...
3823 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3824 !!!cp ('t147');
3825 ## As if </head>
3826 pop @{$self->{open_elements}};
3827
3828 ## Reprocess in the "after head" insertion mode...
3829 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3830 ## ISSUE: This case cannot be reached?
3831 !!!cp ('t148');
3832 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3833 ## Ignore the token ## ISSUE: An issue in the spec.
3834 !!!next-token;
3835 redo B;
3836 } else {
3837 !!!cp ('t149');
3838 }
3839
3840 ## "after head" insertion mode
3841 ## As if <body>
3842 !!!insert-element ('body');
3843 $self->{insertion_mode} = IN_BODY_IM;
3844 ## reprocess
3845 redo B;
3846 } else {
3847 die "$0: $token->{type}: Unknown token type";
3848 }
3849
3850 ## ISSUE: An issue in the spec.
3851 } elsif ($self->{insertion_mode} & BODY_IMS) {
3852 if ($token->{type} == CHARACTER_TOKEN) {
3853 !!!cp ('t150');
3854 ## NOTE: There is a code clone of "character in body".
3855 $reconstruct_active_formatting_elements->($insert_to_current);
3856
3857 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3858
3859 !!!next-token;
3860 redo B;
3861 } elsif ($token->{type} == START_TAG_TOKEN) {
3862 if ({
3863 caption => 1, col => 1, colgroup => 1, tbody => 1,
3864 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3865 }->{$token->{tag_name}}) {
3866 if ($self->{insertion_mode} == IN_CELL_IM) {
3867 ## have an element in table scope
3868 my $tn;
3869 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3870 my $node = $self->{open_elements}->[$_];
3871 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3872 !!!cp ('t151');
3873 $tn = $node->[1];
3874 last INSCOPE;
3875 } elsif ({
3876 table => 1, html => 1,
3877 }->{$node->[1]}) {
3878 !!!cp ('t152');
3879 last INSCOPE;
3880 }
3881 } # INSCOPE
3882 unless (defined $tn) {
3883 !!!cp ('t153');
3884 ## TODO: This error type is wrong.
3885 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3886 ## Ignore the token
3887 !!!next-token;
3888 redo B;
3889 }
3890
3891 !!!cp ('t154');
3892 ## Close the cell
3893 !!!back-token; # <?>
3894 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3895 redo B;
3896 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3897 !!!parse-error (type => 'not closed:caption');
3898
3899 ## As if </caption>
3900 ## have a table element in table scope
3901 my $i;
3902 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3903 my $node = $self->{open_elements}->[$_];
3904 if ($node->[1] eq 'caption') {
3905 !!!cp ('t155');
3906 $i = $_;
3907 last INSCOPE;
3908 } elsif ({
3909 table => 1, html => 1,
3910 }->{$node->[1]}) {
3911 !!!cp ('t156');
3912 last INSCOPE;
3913 }
3914 } # INSCOPE
3915 unless (defined $i) {
3916 !!!cp ('t157');
3917 ## TODO: this type is wrong.
3918 !!!parse-error (type => 'unmatched end tag:caption');
3919 ## Ignore the token
3920 !!!next-token;
3921 redo B;
3922 }
3923
3924 ## generate implied end tags
3925 while ({
3926 dd => 1, dt => 1, li => 1, p => 1,
3927 }->{$self->{open_elements}->[-1]->[1]}) {
3928 !!!cp ('t158');
3929 pop @{$self->{open_elements}};
3930 }
3931
3932 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3933 !!!cp ('t159');
3934 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3935 } else {
3936 !!!cp ('t160');
3937 }
3938
3939 splice @{$self->{open_elements}}, $i;
3940
3941 $clear_up_to_marker->();
3942
3943 $self->{insertion_mode} = IN_TABLE_IM;
3944
3945 ## reprocess
3946 redo B;
3947 } else {
3948 !!!cp ('t161');
3949 #
3950 }
3951 } else {
3952 !!!cp ('t162');
3953 #
3954 }
3955 } elsif ($token->{type} == END_TAG_TOKEN) {
3956 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3957 if ($self->{insertion_mode} == IN_CELL_IM) {
3958 ## have an element in table scope
3959 my $i;
3960 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3961 my $node = $self->{open_elements}->[$_];
3962 if ($node->[1] eq $token->{tag_name}) {
3963 !!!cp ('t163');
3964 $i = $_;
3965 last INSCOPE;
3966 } elsif ({
3967 table => 1, html => 1,
3968 }->{$node->[1]}) {
3969 !!!cp ('t164');
3970 last INSCOPE;
3971 }
3972 } # INSCOPE
3973 unless (defined $i) {
3974 !!!cp ('t165');
3975 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3976 ## Ignore the token
3977 !!!next-token;
3978 redo B;
3979 }
3980
3981 ## generate implied end tags
3982 while ({
3983 dd => 1, dt => 1, li => 1, p => 1,
3984 }->{$self->{open_elements}->[-1]->[1]}) {
3985 !!!cp ('t166');
3986 pop @{$self->{open_elements}};
3987 }
3988
3989 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3990 !!!cp ('t167');
3991 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3992 } else {
3993 !!!cp ('t168');
3994 }
3995
3996 splice @{$self->{open_elements}}, $i;
3997
3998 $clear_up_to_marker->();
3999
4000 $self->{insertion_mode} = IN_ROW_IM;
4001
4002 !!!next-token;
4003 redo B;
4004 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4005 !!!cp ('t169');
4006 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4007 ## Ignore the token
4008 !!!next-token;
4009 redo B;
4010 } else {
4011 !!!cp ('t170');
4012 #
4013 }
4014 } elsif ($token->{tag_name} eq 'caption') {
4015 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4016 ## have a table element in table scope
4017 my $i;
4018 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4019 my $node = $self->{open_elements}->[$_];
4020 if ($node->[1] eq $token->{tag_name}) {
4021 !!!cp ('t171');
4022 $i = $_;
4023 last INSCOPE;
4024 } elsif ({
4025 table => 1, html => 1,
4026 }->{$node->[1]}) {
4027 !!!cp ('t172');
4028 last INSCOPE;
4029 }
4030 } # INSCOPE
4031 unless (defined $i) {
4032 !!!cp ('t173');
4033 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4034 ## Ignore the token
4035 !!!next-token;
4036 redo B;
4037 }
4038
4039 ## generate implied end tags
4040 while ({
4041 dd => 1, dt => 1, li => 1, p => 1,
4042 }->{$self->{open_elements}->[-1]->[1]}) {
4043 !!!cp ('t174');
4044 pop @{$self->{open_elements}};
4045 }
4046
4047 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4048 !!!cp ('t175');
4049 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4050 } else {
4051 !!!cp ('t176');
4052 }
4053
4054 splice @{$self->{open_elements}}, $i;
4055
4056 $clear_up_to_marker->();
4057
4058 $self->{insertion_mode} = IN_TABLE_IM;
4059
4060 !!!next-token;
4061 redo B;
4062 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4063 !!!cp ('t177');
4064 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4065 ## Ignore the token
4066 !!!next-token;
4067 redo B;
4068 } else {
4069 !!!cp ('t178');
4070 #
4071 }
4072 } elsif ({
4073 table => 1, tbody => 1, tfoot => 1,
4074 thead => 1, tr => 1,
4075 }->{$token->{tag_name}} and
4076 $self->{insertion_mode} == IN_CELL_IM) {
4077 ## have an element in table scope
4078 my $i;
4079 my $tn;
4080 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4081 my $node = $self->{open_elements}->[$_];
4082 if ($node->[1] eq $token->{tag_name}) {
4083 !!!cp ('t179');
4084 $i = $_;
4085 last INSCOPE;
4086 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4087 !!!cp ('t180');
4088 $tn = $node->[1];
4089 ## NOTE: There is exactly one |td| or |th| element
4090 ## in scope in the stack of open elements by definition.
4091 } elsif ({
4092 table => 1, html => 1,
4093 }->{$node->[1]}) {
4094 !!!cp ('t181');
4095 last INSCOPE;
4096 }
4097 } # INSCOPE
4098 unless (defined $i) {
4099 !!!cp ('t182');
4100 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4101 ## Ignore the token
4102 !!!next-token;
4103 redo B;
4104 } else {
4105 !!!cp ('t183');
4106 }
4107
4108 ## Close the cell
4109 !!!back-token; # </?>
4110 $token = {type => END_TAG_TOKEN, tag_name => $tn};
4111 redo B;
4112 } elsif ($token->{tag_name} eq 'table' and
4113 $self->{insertion_mode} == IN_CAPTION_IM) {
4114 !!!parse-error (type => 'not closed:caption');
4115
4116 ## As if </caption>
4117 ## have a table element in table scope
4118 my $i;
4119 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4120 my $node = $self->{open_elements}->[$_];
4121 if ($node->[1] eq 'caption') {
4122 !!!cp ('t184');
4123 $i = $_;
4124 last INSCOPE;
4125 } elsif ({
4126 table => 1, html => 1,
4127 }->{$node->[1]}) {
4128 !!!cp ('t185');
4129 last INSCOPE;
4130 }
4131 } # INSCOPE
4132 unless (defined $i) {
4133 !!!cp ('t186');
4134 !!!parse-error (type => 'unmatched end tag:caption');
4135 ## Ignore the token
4136 !!!next-token;
4137 redo B;
4138 }
4139
4140 ## generate implied end tags
4141 while ({
4142 dd => 1, dt => 1, li => 1, p => 1,
4143 }->{$self->{open_elements}->[-1]->[1]}) {
4144 !!!cp ('t187');
4145 pop @{$self->{open_elements}};
4146 }
4147
4148 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4149 !!!cp ('t188');
4150 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4151 } else {
4152 !!!cp ('t189');
4153 }
4154
4155 splice @{$self->{open_elements}}, $i;
4156
4157 $clear_up_to_marker->();
4158
4159 $self->{insertion_mode} = IN_TABLE_IM;
4160
4161 ## reprocess
4162 redo B;
4163 } elsif ({
4164 body => 1, col => 1, colgroup => 1, html => 1,
4165 }->{$token->{tag_name}}) {
4166 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
4167 !!!cp ('t190');
4168 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4169 ## Ignore the token
4170 !!!next-token;
4171 redo B;
4172 } else {
4173 !!!cp ('t191');
4174 #
4175 }
4176 } elsif ({
4177 tbody => 1, tfoot => 1,
4178 thead => 1, tr => 1,
4179 }->{$token->{tag_name}} and
4180 $self->{insertion_mode} == IN_CAPTION_IM) {
4181 !!!cp ('t192');
4182 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4183 ## Ignore the token
4184 !!!next-token;
4185 redo B;
4186 } else {
4187 !!!cp ('t193');
4188 #
4189 }
4190 } else {
4191 die "$0: $token->{type}: Unknown token type";
4192 }
4193
4194 $insert = $insert_to_current;
4195 #
4196 } elsif ($self->{insertion_mode} & TABLE_IMS) {
4197 if ($token->{type} == CHARACTER_TOKEN) {
4198 if (not $open_tables->[-1]->[1] and # tainted
4199 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4200 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4201
4202 unless (length $token->{data}) {
4203 !!!cp ('t194');
4204 !!!next-token;
4205 redo B;
4206 } else {
4207 !!!cp ('t195');
4208 }
4209 }
4210
4211 !!!parse-error (type => 'in table:#character');
4212
4213 ## As if in body, but insert into foster parent element
4214 ## ISSUE: Spec says that "whenever a node would be inserted
4215 ## into the current node" while characters might not
4216 ## result in a new Text node.
4217 $reconstruct_active_formatting_elements->($insert_to_foster);
4218
4219 if ({
4220 table => 1, tbody => 1, tfoot => 1,
4221 thead => 1, tr => 1,
4222 }->{$self->{open_elements}->[-1]->[1]}) {
4223 # MUST
4224 my $foster_parent_element;
4225 my $next_sibling;
4226 my $prev_sibling;
4227 OE: for (reverse 0..$#{$self->{open_elements}}) {
4228 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4229 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4230 if (defined $parent and $parent->node_type == 1) {
4231 !!!cp ('t196');
4232 $foster_parent_element = $parent;
4233 $next_sibling = $self->{open_elements}->[$_]->[0];
4234 $prev_sibling = $next_sibling->previous_sibling;
4235 } else {
4236 !!!cp ('t197');
4237 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4238 $prev_sibling = $foster_parent_element->last_child;
4239 }
4240 last OE;
4241 }
4242 } # OE
4243 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4244 $prev_sibling = $foster_parent_element->last_child
4245 unless defined $foster_parent_element;
4246 if (defined $prev_sibling and
4247 $prev_sibling->node_type == 3) {
4248 !!!cp ('t198');
4249 $prev_sibling->manakai_append_text ($token->{data});
4250 } else {
4251 !!!cp ('t199');
4252 $foster_parent_element->insert_before
4253 ($self->{document}->create_text_node ($token->{data}),
4254 $next_sibling);
4255 }
4256 $open_tables->[-1]->[1] = 1; # tainted
4257 } else {
4258 !!!cp ('t200');
4259 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4260 }
4261
4262 !!!next-token;
4263 redo B;
4264 } elsif ($token->{type} == START_TAG_TOKEN) {
4265 if ({
4266 tr => ($self->{insertion_mode} != IN_ROW_IM),
4267 th => 1, td => 1,
4268 }->{$token->{tag_name}}) {
4269 if ($self->{insertion_mode} == IN_TABLE_IM) {
4270 ## Clear back to table context
4271 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4272 $self->{open_elements}->[-1]->[1] ne 'html') {
4273 !!!cp ('t201');
4274 pop @{$self->{open_elements}};
4275 }
4276
4277 !!!insert-element ('tbody');
4278 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4279 ## reprocess in the "in table body" insertion mode...
4280 }
4281
4282 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4283 unless ($token->{tag_name} eq 'tr') {
4284 !!!cp ('t202');
4285 !!!parse-error (type => 'missing start tag:tr');
4286 }
4287
4288 ## Clear back to table body context
4289 while (not {
4290 tbody => 1, tfoot => 1, thead => 1, html => 1,
4291 }->{$self->{open_elements}->[-1]->[1]}) {
4292 !!!cp ('t203');
4293 ## ISSUE: Can this case be reached?
4294 pop @{$self->{open_elements}};
4295 }
4296
4297 $self->{insertion_mode} = IN_ROW_IM;
4298 if ($token->{tag_name} eq 'tr') {
4299 !!!cp ('t204');
4300 !!!insert-element ($token->{tag_name}, $token->{attributes});
4301 !!!next-token;
4302 redo B;
4303 } else {
4304 !!!cp ('t205');
4305 !!!insert-element ('tr');
4306 ## reprocess in the "in row" insertion mode
4307 }
4308 } else {
4309 !!!cp ('t206');
4310 }
4311
4312 ## Clear back to table row context
4313 while (not {
4314 tr => 1, html => 1,
4315 }->{$self->{open_elements}->[-1]->[1]}) {
4316 !!!cp ('t207');
4317 pop @{$self->{open_elements}};
4318 }
4319
4320 !!!insert-element ($token->{tag_name}, $token->{attributes});
4321 $self->{insertion_mode} = IN_CELL_IM;
4322
4323 push @$active_formatting_elements, ['#marker', ''];
4324
4325 !!!next-token;
4326 redo B;
4327 } elsif ({
4328 caption => 1, col => 1, colgroup => 1,
4329 tbody => 1, tfoot => 1, thead => 1,
4330 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4331 }->{$token->{tag_name}}) {
4332 if ($self->{insertion_mode} == IN_ROW_IM) {
4333 ## As if </tr>
4334 ## have an element in table scope
4335 my $i;
4336 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4337 my $node = $self->{open_elements}->[$_];
4338 if ($node->[1] eq 'tr') {
4339 !!!cp ('t208');
4340 $i = $_;
4341 last INSCOPE;
4342 } elsif ({
4343 html => 1,
4344
4345 ## NOTE: This element does not appear here, maybe.
4346 table => 1,
4347 }->{$node->[1]}) {
4348 !!!cp ('t209');
4349 last INSCOPE;
4350 }
4351 } # INSCOPE
4352 unless (defined $i) { ## the lookup above found no |tr| element in table scope
4353 !!!cp ('t210');
4354 ## TODO: This type is wrong (presumably because the token is a start tag, not an end tag).
4355 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4356 ## Ignore the token
4357 !!!next-token;
4358 redo B;
4359 }
4360
4361 ## Clear back to table row context
4362 while (not {
4363 tr => 1, html => 1,
4364 }->{$self->{open_elements}->[-1]->[1]}) {
4365 !!!cp ('t211');
4366 ## ISSUE: Can this case be reached?
4367 pop @{$self->{open_elements}};
4368 }
4369
4370 pop @{$self->{open_elements}}; # tr
4371 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4372 if ($token->{tag_name} eq 'tr') {
4373 !!!cp ('t212');
4374 ## reprocess
4375 redo B;
4376 } else {
4377 !!!cp ('t213');
4378 ## reprocess in the "in table body" insertion mode...
4379 }
4380 }
4381
4382 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4383 ## have an element in table scope
4384 my $i;
4385 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4386 my $node = $self->{open_elements}->[$_];
4387 if ({
4388 tbody => 1, thead => 1, tfoot => 1,
4389 }->{$node->[1]}) {
4390 !!!cp ('t214');
4391 $i = $_;
4392 last INSCOPE;
4393 } elsif ({
4394 table => 1, html => 1,
4395 }->{$node->[1]}) {
4396 !!!cp ('t215');
4397 last INSCOPE;
4398 }
4399 } # INSCOPE
4400 unless (defined $i) {
4401 !!!cp ('t216');
4402 ## TODO: This error type is wrong.
4403 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4404 ## Ignore the token
4405 !!!next-token;
4406 redo B;
4407 }
4408
4409 ## Clear back to table body context
4410 while (not {
4411 tbody => 1, tfoot => 1, thead => 1, html => 1,
4412 }->{$self->{open_elements}->[-1]->[1]}) {
4413 !!!cp ('t217');
4414 ## ISSUE: Can this state be reached?
4415 pop @{$self->{open_elements}};
4416 }
4417
4418 ## As if </{current node}>
4419 ## have an element in table scope
4420 ## true by definition
4421
4422 ## Clear back to table body context
4423 ## nop by definition
4424
4425 pop @{$self->{open_elements}};
4426 $self->{insertion_mode} = IN_TABLE_IM;
4427 ## reprocess in "in table" insertion mode...
4428 } else {
4429 !!!cp ('t218');
4430 }
4431
4432 if ($token->{tag_name} eq 'col') {
4433 ## Clear back to table context
4434 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4435 $self->{open_elements}->[-1]->[1] ne 'html') {
4436 !!!cp ('t219');
4437 ## ISSUE: Can this state be reached?
4438 pop @{$self->{open_elements}};
4439 }
4440
4441 !!!insert-element ('colgroup');
4442 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
4443 ## reprocess
4444 redo B;
4445 } elsif ({
4446 caption => 1,
4447 colgroup => 1,
4448 tbody => 1, tfoot => 1, thead => 1,
4449 }->{$token->{tag_name}}) {
4450 ## Clear back to table context
4451 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4452 $self->{open_elements}->[-1]->[1] ne 'html') {
4453 !!!cp ('t220');
4454 ## ISSUE: Can this state be reached?
4455 pop @{$self->{open_elements}};
4456 }
4457
4458 push @$active_formatting_elements, ['#marker', '']
4459 if $token->{tag_name} eq 'caption';
4460
4461 !!!insert-element ($token->{tag_name}, $token->{attributes});
4462 $self->{insertion_mode} = {
4463 caption => IN_CAPTION_IM,
4464 colgroup => IN_COLUMN_GROUP_IM,
4465 tbody => IN_TABLE_BODY_IM,
4466 tfoot => IN_TABLE_BODY_IM,
4467 thead => IN_TABLE_BODY_IM,
4468 }->{$token->{tag_name}};
4469 !!!next-token;
4470 redo B;
4471 } else {
4472 die "$0: in table: <>: $token->{tag_name}";
4473 }
4474 } elsif ($token->{tag_name} eq 'table') {
4475 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4476
4477 ## As if </table>
4478 ## have a table element in table scope
4479 my $i;
4480 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4481 my $node = $self->{open_elements}->[$_];
4482 if ($node->[1] eq 'table') {
4483 !!!cp ('t221');
4484 $i = $_;
4485 last INSCOPE;
4486 } elsif ({
4487 #table => 1,
4488 html => 1,
4489 }->{$node->[1]}) {
4490 !!!cp ('t222');
4491 last INSCOPE;
4492 }
4493 } # INSCOPE
4494 unless (defined $i) {
4495 !!!cp ('t223');
4496 ## TODO: The following is wrong, maybe.
4497 !!!parse-error (type => 'unmatched end tag:table');
4498 ## Ignore tokens </table><table>
4499 !!!next-token;
4500 redo B;
4501 }
4502
4503 ## generate implied end tags
4504 while ({
4505 dd => 1, dt => 1, li => 1, p => 1,
4506 }->{$self->{open_elements}->[-1]->[1]}) {
4507 !!!cp ('t224');
4508 pop @{$self->{open_elements}};
4509 }
4510
4511 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4512 !!!cp ('t225');
4513 ## ISSUE: Can this case be reached?
4514 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4515 } else {
4516 !!!cp ('t226');
4517 }
4518
4519 splice @{$self->{open_elements}}, $i;
4520 pop @{$open_tables};
4521
4522 $self->_reset_insertion_mode;
4523
4524 ## reprocess
4525 redo B;
4526 } elsif ($token->{tag_name} eq 'style') {
4527 if (not $open_tables->[-1]->[1]) { # tainted
4528 !!!cp ('t227.8');
4529 ## NOTE: This is a "as if in head" code clone.
4530 $parse_rcdata->(CDATA_CONTENT_MODEL);
4531 redo B;
4532 } else {
4533 !!!cp ('t227.7');
4534 #
4535 }
4536 } elsif ($token->{tag_name} eq 'script') {
4537 if (not $open_tables->[-1]->[1]) { # tainted
4538 !!!cp ('t227.6');
4539 ## NOTE: This is a "as if in head" code clone.
4540 $script_start_tag->();
4541 redo B;
4542 } else {
4543 !!!cp ('t227.5');
4544 #
4545 }
4546 } elsif ($token->{tag_name} eq 'input') {
4547 if (not $open_tables->[-1]->[1]) { # tainted
4548 if ($token->{attributes}->{type}) { ## TODO: case
4549 my $type = lc $token->{attributes}->{type}->{value};
4550 if ($type eq 'hidden') {
4551 !!!cp ('t227.3');
4552 !!!parse-error (type => 'in table:'.$token->{tag_name});
4553
4554 !!!insert-element ($token->{tag_name}, $token->{attributes});
4555
4556 ## TODO: form element pointer
4557
4558 pop @{$self->{open_elements}};
4559
4560 !!!next-token;
4561 redo B;
4562 } else {
4563 !!!cp ('t227.2');
4564 #
4565 }
4566 } else {
4567 !!!cp ('t227.1');
4568 #
4569 }
4570 } else {
4571 !!!cp ('t227.4');
4572 #
4573 }
4574 } else {
4575 !!!cp ('t227');
4576 #
4577 }
4578
4579 !!!parse-error (type => 'in table:'.$token->{tag_name});
4580
4581 $insert = $insert_to_foster;
4582 #
4583 } elsif ($token->{type} == END_TAG_TOKEN) {
4584 if ($token->{tag_name} eq 'tr' and
4585 $self->{insertion_mode} == IN_ROW_IM) {
4586 ## have an element in table scope
4587 my $i;
4588 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4589 my $node = $self->{open_elements}->[$_];
4590 if ($node->[1] eq $token->{tag_name}) {
4591 !!!cp ('t228');
4592 $i = $_;
4593 last INSCOPE;
4594 } elsif ({
4595 table => 1, html => 1,
4596 }->{$node->[1]}) {
4597 !!!cp ('t229');
4598 last INSCOPE;
4599 }
4600 } # INSCOPE
4601 unless (defined $i) {
4602 !!!cp ('t230');
4603 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4604 ## Ignore the token
4605 !!!next-token;
4606 redo B;
4607 } else {
4608 !!!cp ('t232');
4609 }
4610
4611 ## Clear back to table row context
4612 while (not {
4613 tr => 1, html => 1,
4614 }->{$self->{open_elements}->[-1]->[1]}) {
4615 !!!cp ('t231');
4616 ## ISSUE: Can this state be reached?
4617 pop @{$self->{open_elements}};
4618 }
4619
4620 pop @{$self->{open_elements}}; # tr
4621 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4622 !!!next-token;
4623 redo B;
4624 } elsif ($token->{tag_name} eq 'table') {
4625 if ($self->{insertion_mode} == IN_ROW_IM) {
4626 ## As if </tr>
4627 ## have an element in table scope
4628 my $i;
4629 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4630 my $node = $self->{open_elements}->[$_];
4631 if ($node->[1] eq 'tr') {
4632 !!!cp ('t233');
4633 $i = $_;
4634 last INSCOPE;
4635 } elsif ({
4636 table => 1, html => 1,
4637 }->{$node->[1]}) {
4638 !!!cp ('t234');
4639 last INSCOPE;
4640 }
4641 } # INSCOPE
4642 unless (defined $i) { ## the lookup above found no |tr| element in table scope
4643 !!!cp ('t235');
4644 ## TODO: The error type may still be wrong: the unmatched tag is the implied </tr>, not the token itself.
4645 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4646 ## Ignore the token
4647 !!!next-token;
4648 redo B;
4649 }
4650
4651 ## Clear back to table row context
4652 while (not {
4653 tr => 1, html => 1,
4654 }->{$self->{open_elements}->[-1]->[1]}) {
4655 !!!cp ('t236');
4656 ## ISSUE: Can this state be reached?
4657 pop @{$self->{open_elements}};
4658 }
4659
4660 pop @{$self->{open_elements}}; # tr
4661 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4662 ## reprocess in the "in table body" insertion mode...
4663 }
4664
4665 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4666 ## have an element in table scope
4667 my $i;
4668 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4669 my $node = $self->{open_elements}->[$_];
4670 if ({
4671 tbody => 1, thead => 1, tfoot => 1,
4672 }->{$node->[1]}) {
4673 !!!cp ('t237');
4674 $i = $_;
4675 last INSCOPE;
4676 } elsif ({
4677 table => 1, html => 1,
4678 }->{$node->[1]}) {
4679 !!!cp ('t238');
4680 last INSCOPE;
4681 }
4682 } # INSCOPE
4683 unless (defined $i) {
4684 !!!cp ('t239');
4685 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4686 ## Ignore the token
4687 !!!next-token;
4688 redo B;
4689 }
4690
4691 ## Clear back to table body context
4692 while (not {
4693 tbody => 1, tfoot => 1, thead => 1, html => 1,
4694 }->{$self->{open_elements}->[-1]->[1]}) {
4695 !!!cp ('t240');
4696 pop @{$self->{open_elements}};
4697 }
4698
4699 ## As if </{current node}>
4700 ## have an element in table scope
4701 ## true by definition
4702
4703 ## Clear back to table body context
4704 ## nop by definition
4705
4706 pop @{$self->{open_elements}};
4707 $self->{insertion_mode} = IN_TABLE_IM;
4708 ## reprocess in the "in table" insertion mode...
4709 }
4710
4711 ## NOTE: </table> in the "in table" insertion mode.
4712 ## When you edit the code fragment below, please ensure that
4713 ## the code for <table> in the "in table" insertion mode
4714 ## is synced with it.
4715
4716 ## have a table element in table scope
4717 my $i;
4718 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4719 my $node = $self->{open_elements}->[$_];
4720 if ($node->[1] eq $token->{tag_name}) {
4721 !!!cp ('t241');
4722 $i = $_;
4723 last INSCOPE;
4724 } elsif ({
4725 table => 1, html => 1,
4726 }->{$node->[1]}) {
4727 !!!cp ('t242');
4728 last INSCOPE;
4729 }
4730 } # INSCOPE
4731 unless (defined $i) {
4732 !!!cp ('t243');
4733 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4734 ## Ignore the token
4735 !!!next-token;
4736 redo B;
4737 }
4738
4739 splice @{$self->{open_elements}}, $i;
4740 pop @{$open_tables};
4741
4742 $self->_reset_insertion_mode;
4743
4744 !!!next-token;
4745 redo B;
4746 } elsif ({
4747 tbody => 1, tfoot => 1, thead => 1,
4748 }->{$token->{tag_name}} and
4749 $self->{insertion_mode} & ROW_IMS) {
4750 if ($self->{insertion_mode} == IN_ROW_IM) {
4751 ## have an element in table scope
4752 my $i;
4753 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4754 my $node = $self->{open_elements}->[$_];
4755 if ($node->[1] eq $token->{tag_name}) {
4756 !!!cp ('t247');
4757 $i = $_;
4758 last INSCOPE;
4759 } elsif ({
4760 table => 1, html => 1,
4761 }->{$node->[1]}) {
4762 !!!cp ('t248');
4763 last INSCOPE;
4764 }
4765 } # INSCOPE
4766 unless (defined $i) {
4767 !!!cp ('t249');
4768 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4769 ## Ignore the token
4770 !!!next-token;
4771 redo B;
4772 }
4773
4774 ## As if </tr>
4775 ## have an element in table scope
4776 undef $i; ## reuse the lexical declared earlier in this block; a second |my $i| would mask it
4777 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4778 my $node = $self->{open_elements}->[$_];
4779 if ($node->[1] eq 'tr') {
4780 !!!cp ('t250');
4781 $i = $_;
4782 last INSCOPE;
4783 } elsif ({
4784 table => 1, html => 1,
4785 }->{$node->[1]}) {
4786 !!!cp ('t251');
4787 last INSCOPE;
4788 }
4789 } # INSCOPE
4790 unless (defined $i) {
4791 !!!cp ('t252');
4792 !!!parse-error (type => 'unmatched end tag:tr');
4793 ## Ignore the token
4794 !!!next-token;
4795 redo B;
4796 }
4797
4798 ## Clear back to table row context
4799 while (not {
4800 tr => 1, html => 1,
4801 }->{$self->{open_elements}->[-1]->[1]}) {
4802 !!!cp ('t253');
4803 ## ISSUE: Can this case be reached?
4804 pop @{$self->{open_elements}};
4805 }
4806
4807 pop @{$self->{open_elements}}; # tr
4808 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4809 ## reprocess in the "in table body" insertion mode...
4810 }
4811
4812 ## have an element in table scope
4813 my $i;
4814 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4815 my $node = $self->{open_elements}->[$_];
4816 if ($node->[1] eq $token->{tag_name}) {
4817 !!!cp ('t254');
4818 $i = $_;
4819 last INSCOPE;
4820 } elsif ({
4821 table => 1, html => 1,
4822 }->{$node->[1]}) {
4823 !!!cp ('t255');
4824 last INSCOPE;
4825 }
4826 } # INSCOPE
4827 unless (defined $i) {
4828 !!!cp ('t256');
4829 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4830 ## Ignore the token
4831 !!!next-token;
4832 redo B;
4833 }
4834
4835 ## Clear back to table body context
4836 while (not {
4837 tbody => 1, tfoot => 1, thead => 1, html => 1,
4838 }->{$self->{open_elements}->[-1]->[1]}) {
4839 !!!cp ('t257');
4840 ## ISSUE: Can this case be reached?
4841 pop @{$self->{open_elements}};
4842 }
4843
4844 pop @{$self->{open_elements}};
4845 $self->{insertion_mode} = IN_TABLE_IM;
4846 !!!next-token;
4847 redo B;
4848 } elsif ({
4849 body => 1, caption => 1, col => 1, colgroup => 1,
4850 html => 1, td => 1, th => 1,
4851 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4852 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
4853 }->{$token->{tag_name}}) {
4854 !!!cp ('t258');
4855 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4856 ## Ignore the token
4857 !!!next-token;
4858 redo B;
4859 } else {
4860 !!!cp ('t259');
4861 !!!parse-error (type => 'in table:/'.$token->{tag_name});
4862
4863 $insert = $insert_to_foster;
4864 #
4865 }
4866 } else {
4867 die "$0: $token->{type}: Unknown token type";
4868 }
4869 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4870 if ($token->{type} == CHARACTER_TOKEN) {
4871 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4872 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4873 unless (length $token->{data}) {
4874 !!!cp ('t260');
4875 !!!next-token;
4876 redo B;
4877 }
4878 }
4879
4880 !!!cp ('t261');
4881 #
4882 } elsif ($token->{type} == START_TAG_TOKEN) {
4883 if ($token->{tag_name} eq 'col') {
4884 !!!cp ('t262');
4885 !!!insert-element ($token->{tag_name}, $token->{attributes});
4886 pop @{$self->{open_elements}};
4887 !!!next-token;
4888 redo B;
4889 } else {
4890 !!!cp ('t263');
4891 #
4892 }
4893 } elsif ($token->{type} == END_TAG_TOKEN) {
4894 if ($token->{tag_name} eq 'colgroup') {
4895 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4896 !!!cp ('t264');
4897 !!!parse-error (type => 'unmatched end tag:colgroup');
4898 ## Ignore the token
4899 !!!next-token;
4900 redo B;
4901 } else {
4902 !!!cp ('t265');
4903 pop @{$self->{open_elements}}; # colgroup
4904 $self->{insertion_mode} = IN_TABLE_IM;
4905 !!!next-token;
4906 redo B;
4907 }
4908 } elsif ($token->{tag_name} eq 'col') {
4909 !!!cp ('t266');
4910 !!!parse-error (type => 'unmatched end tag:col');
4911 ## Ignore the token
4912 !!!next-token;
4913 redo B;
4914 } else {
4915 !!!cp ('t267');
4916 #
4917 }
4918 } else {
4919 die "$0: $token->{type}: Unknown token type";
4920 }
4921
4922 ## As if </colgroup>
4923 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4924 !!!cp ('t269');
4925 !!!parse-error (type => 'unmatched end tag:colgroup');
4926 ## Ignore the token
4927 !!!next-token;
4928 redo B;
4929 } else {
4930 !!!cp ('t270');
4931 pop @{$self->{open_elements}}; # colgroup
4932 $self->{insertion_mode} = IN_TABLE_IM;
4933 ## reprocess
4934 redo B;
4935 }
4936 } elsif ($self->{insertion_mode} & SELECT_IMS) {
4937 if ($token->{type} == CHARACTER_TOKEN) {
4938 !!!cp ('t271');
4939 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4940 !!!next-token;
4941 redo B;
4942 } elsif ($token->{type} == START_TAG_TOKEN) {
4943 if ($token->{tag_name} eq 'option') {
4944 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4945 !!!cp ('t272');
4946 ## As if </option>
4947 pop @{$self->{open_elements}};
4948 } else {
4949 !!!cp ('t273');
4950 }
4951
4952 !!!insert-element ($token->{tag_name}, $token->{attributes});
4953 !!!next-token;
4954 redo B;
4955 } elsif ($token->{tag_name} eq 'optgroup') {
4956 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4957 !!!cp ('t274');
4958 ## As if </option>
4959 pop @{$self->{open_elements}};
4960 } else {
4961 !!!cp ('t275');
4962 }
4963
4964 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4965 !!!cp ('t276');
4966 ## As if </optgroup>
4967 pop @{$self->{open_elements}};
4968 } else {
4969 !!!cp ('t277');
4970 }
4971
4972 !!!insert-element ($token->{tag_name}, $token->{attributes});
4973 !!!next-token;
4974 redo B;
4975 } elsif ($token->{tag_name} eq 'select' or
4976 $token->{tag_name} eq 'input' or
4977 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
4978 {
4979 caption => 1, table => 1,
4980 tbody => 1, tfoot => 1, thead => 1,
4981 tr => 1, td => 1, th => 1,
4982 }->{$token->{tag_name}})) {
4983 ## TODO: The type below is not good - <select> is replaced by </select>
4984 !!!parse-error (type => 'not closed:select');
4985 ## NOTE: As if the token were </select> (<select> case) or
4986 ## as if there were </select> (otherwise).
4987 ## have an element in table scope
4988 my $i;
4989 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4990 my $node = $self->{open_elements}->[$_];
4991 if ($node->[1] eq 'select') {
4992 !!!cp ('t278');
4993 $i = $_;
4994 last INSCOPE;
4995 } elsif ({
4996 table => 1, html => 1,
4997 }->{$node->[1]}) {
4998 !!!cp ('t279');
4999 last INSCOPE;
5000 }
5001 } # INSCOPE
5002 unless (defined $i) {
5003 !!!cp ('t280');
5004 !!!parse-error (type => 'unmatched end tag:select');
5005 ## Ignore the token
5006 !!!next-token;
5007 redo B;
5008 }
5009
5010 !!!cp ('t281');
5011 splice @{$self->{open_elements}}, $i;
5012
5013 $self->_reset_insertion_mode;
5014
5015 if ($token->{tag_name} eq 'select') {
5016 !!!cp ('t281.2');
5017 !!!next-token;
5018 redo B;
5019 } else {
5020 !!!cp ('t281.1');
5021 ## Reprocess the token.
5022 redo B;
5023 }
5024 } else {
5025 !!!cp ('t282');
5026 !!!parse-error (type => 'in select:'.$token->{tag_name});
5027 ## Ignore the token
5028 !!!next-token;
5029 redo B;
5030 }
5031 } elsif ($token->{type} == END_TAG_TOKEN) {
5032 if ($token->{tag_name} eq 'optgroup') {
5033 if ($self->{open_elements}->[-1]->[1] eq 'option' and
5034 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
5035 !!!cp ('t283');
5036 ## As if </option>
5037 splice @{$self->{open_elements}}, -2;
5038 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
5039 !!!cp ('t284');
5040 pop @{$self->{open_elements}};
5041 } else {
5042 !!!cp ('t285');
5043 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5044 ## Ignore the token
5045 }
5046 !!!next-token;
5047 redo B;
5048 } elsif ($token->{tag_name} eq 'option') {
5049 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5050 !!!cp ('t286');
5051 pop @{$self->{open_elements}};
5052 } else {
5053 !!!cp ('t287');
5054 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5055 ## Ignore the token
5056 }
5057 !!!next-token;
5058 redo B;
5059 } elsif ($token->{tag_name} eq 'select') {
5060 ## have an element in table scope
5061 my $i;
5062 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5063 my $node = $self->{open_elements}->[$_];
5064 if ($node->[1] eq $token->{tag_name}) {
5065 !!!cp ('t288');
5066 $i = $_;
5067 last INSCOPE;
5068 } elsif ({
5069 table => 1, html => 1,
5070 }->{$node->[1]}) {
5071 !!!cp ('t289');
5072 last INSCOPE;
5073 }
5074 } # INSCOPE
5075 unless (defined $i) {
5076 !!!cp ('t290');
5077 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5078 ## Ignore the token
5079 !!!next-token;
5080 redo B;
5081 }
5082
5083 !!!cp ('t291');
5084 splice @{$self->{open_elements}}, $i;
5085
5086 $self->_reset_insertion_mode;
5087
5088 !!!next-token;
5089 redo B;
5090 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5091 {
5092 caption => 1, table => 1, tbody => 1,
5093 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5094 }->{$token->{tag_name}}) {
5095 ## TODO: The following is wrong?
5096 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5097
5098 ## have an element in table scope
5099 my $i;
5100 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5101 my $node = $self->{open_elements}->[$_];
5102 if ($node->[1] eq $token->{tag_name}) {
5103 !!!cp ('t292');
5104 $i = $_;
5105 last INSCOPE;
5106 } elsif ({
5107 table => 1, html => 1,
5108 }->{$node->[1]}) {
5109 !!!cp ('t293');
5110 last INSCOPE;
5111 }
5112 } # INSCOPE
5113 unless (defined $i) {
5114 !!!cp ('t294');
5115 ## Ignore the token
5116 !!!next-token;
5117 redo B;
5118 }
5119
5120 ## As if </select>
5121 ## have an element in table scope
5122 undef $i;
5123 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5124 my $node = $self->{open_elements}->[$_];
5125 if ($node->[1] eq 'select') {
5126 !!!cp ('t295');
5127 $i = $_;
5128 last INSCOPE;
5129 } elsif ({
5130 table => 1, html => 1,
5131 }->{$node->[1]}) {
5132 ## ISSUE: Can this state be reached?
5133 !!!cp ('t296');
5134 last INSCOPE;
5135 }
5136 } # INSCOPE
5137 unless (defined $i) {
5138 !!!cp ('t297');
5139 ## TODO: The following error type is correct?
5140 !!!parse-error (type => 'unmatched end tag:select');
5141 ## Ignore the </select> token
5142 !!!next-token; ## TODO: ok?
5143 redo B;
5144 }
5145
5146 !!!cp ('t298');
5147 splice @{$self->{open_elements}}, $i;
5148
5149 $self->_reset_insertion_mode;
5150
5151 ## reprocess
5152 redo B;
5153 } else {
5154 !!!cp ('t299');
5155 !!!parse-error (type => 'in select:/'.$token->{tag_name});
5156 ## Ignore the token
5157 !!!next-token;
5158 redo B;
5159 }
5160 } else {
5161 die "$0: $token->{type}: Unknown token type";
5162 }
5163 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
5164 if ($token->{type} == CHARACTER_TOKEN) {
5165 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5166 my $data = $1; ## leading white space stripped from the token, saved before the call below
5167 ## As if in body
5168 $reconstruct_active_formatting_elements->($insert_to_current);
5169
5170 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
5171
5172 unless (length $token->{data}) {
5173 !!!cp ('t300');
5174 !!!next-token;
5175 redo B;
5176 }
5177 }
5178
5179 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5180 !!!cp ('t301');
5181 !!!parse-error (type => 'after html:#character');
5182
5183 ## Reprocess in the "after body" insertion mode.
5184 } else {
5185 !!!cp ('t302');
5186 }
5187
5188 ## "after body" insertion mode
5189 !!!parse-error (type => 'after body:#character');
5190
5191 $self->{insertion_mode} = IN_BODY_IM;
5192 ## reprocess
5193 redo B;
5194 } elsif ($token->{type} == START_TAG_TOKEN) {
5195 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5196 !!!cp ('t303');
5197 !!!parse-error (type => 'after html:'.$token->{tag_name});
5198
5199 ## Reprocess in the "after body" insertion mode.
5200 } else {
5201 !!!cp ('t304');
5202 }
5203
5204 ## "after body" insertion mode
5205 !!!parse-error (type => 'after body:'.$token->{tag_name});
5206
5207 $self->{insertion_mode} = IN_BODY_IM;
5208 ## reprocess
5209 redo B;
5210 } elsif ($token->{type} == END_TAG_TOKEN) {
5211 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5212 !!!cp ('t305');
5213 !!!parse-error (type => 'after html:/'.$token->{tag_name});
5214
5215 $self->{insertion_mode} = AFTER_BODY_IM;
5216 ## Reprocess in the "after body" insertion mode.
5217 } else {
5218 !!!cp ('t306');
5219 }
5220
5221 ## "after body" insertion mode
5222 if ($token->{tag_name} eq 'html') {
5223 if (defined $self->{inner_html_node}) {
5224 !!!cp ('t307');
5225 !!!parse-error (type => 'unmatched end tag:html');
5226 ## Ignore the token
5227 !!!next-token;
5228 redo B;
5229 } else {
5230 !!!cp ('t308');
5231 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
5232 !!!next-token;
5233 redo B;
5234 }
5235 } else {
5236 !!!cp ('t309');
5237 !!!parse-error (type => 'after body:/'.$token->{tag_name});
5238
5239 $self->{insertion_mode} = IN_BODY_IM;
5240 ## reprocess
5241 redo B;
5242 }
5243 } else {
5244 die "$0: $token->{type}: Unknown token type";
5245 }
5246 } elsif ($self->{insertion_mode} & FRAME_IMS) {
5247 if ($token->{type} == CHARACTER_TOKEN) {
5248 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5249 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5250
5251 unless (length $token->{data}) {
5252 !!!cp ('t310');
5253 !!!next-token;
5254 redo B;
5255 }
5256 }
5257
5258 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5259 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5260 !!!cp ('t311');
5261 !!!parse-error (type => 'in frameset:#character');
5262 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
5263 !!!cp ('t312');
5264 !!!parse-error (type => 'after frameset:#character');
5265 } else { # "after html frameset"
5266 !!!cp ('t313');
5267 !!!parse-error (type => 'after html:#character');
5268
5269 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5270 ## Reprocess in the "after frameset" insertion mode.
5271 !!!parse-error (type => 'after frameset:#character');
5272 }
5273
5274 ## Ignore the token.
5275 if (length $token->{data}) {
5276 !!!cp ('t314');
5277 ## reprocess the rest of characters
5278 } else {
5279 !!!cp ('t315');
5280 !!!next-token;
5281 }
5282 redo B;
5283 }
5284
5285 die qq[$0: Character "$token->{data}"];
5286 } elsif ($token->{type} == START_TAG_TOKEN) {
5287 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5288 !!!cp ('t316');
5289 !!!parse-error (type => 'after html:'.$token->{tag_name});
5290
5291 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5292 ## Process in the "after frameset" insertion mode.
5293 } else {
5294 !!!cp ('t317');
5295 }
5296
5297 if ($token->{tag_name} eq 'frameset' and
5298 $self->{insertion_mode} == IN_FRAMESET_IM) {
5299 !!!cp ('t318');
5300 !!!insert-element ($token->{tag_name}, $token->{attributes});
5301 !!!next-token;
5302 redo B;
5303 } elsif ($token->{tag_name} eq 'frame' and
5304 $self->{insertion_mode} == IN_FRAMESET_IM) {
5305 !!!cp ('t319');
5306 !!!insert-element ($token->{tag_name}, $token->{attributes});
5307 pop @{$self->{open_elements}};
5308 !!!next-token;
5309 redo B;
5310 } elsif ($token->{tag_name} eq 'noframes') {
5311 !!!cp ('t320');
5312 ## NOTE: As if in body.
5313 $parse_rcdata->(CDATA_CONTENT_MODEL);
5314 redo B;
5315 } else {
5316 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5317 !!!cp ('t321');
5318 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
5319 } else {
5320 !!!cp ('t322');
5321 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
5322 }
5323 ## Ignore the token
5324 !!!next-token;
5325 redo B;
5326 }
5327 } elsif ($token->{type} == END_TAG_TOKEN) {
5328 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5329 !!!cp ('t323');
5330 !!!parse-error (type => 'after html:/'.$token->{tag_name});
5331
5332 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5333 ## Process in the "after frameset" insertion mode.
5334 } else {
5335 !!!cp ('t324');
5336 }
5337
5338 if ($token->{tag_name} eq 'frameset' and
5339 $self->{insertion_mode} == IN_FRAMESET_IM) {
5340 if ($self->{open_elements}->[-1]->[1] eq 'html' and
5341 @{$self->{open_elements}} == 1) {
5342 !!!cp ('t325');
5343 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5344 ## Ignore the token
5345 !!!next-token;
5346 } else {
5347 !!!cp ('t326');
5348 pop @{$self->{open_elements}};
5349 !!!next-token;
5350 }
5351
5352 if (not defined $self->{inner_html_node} and
5353 $self->{open_elements}->[-1]->[1] ne 'frameset') {
5354 !!!cp ('t327');
5355 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5356 } else {
5357 !!!cp ('t328');
5358 }
5359 redo B;
5360 } elsif ($token->{tag_name} eq 'html' and
5361 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
5362 !!!cp ('t329');
5363 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
5364 !!!next-token;
5365 redo B;
5366 } else {
5367 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5368 !!!cp ('t330');
5369 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
5370 } else {
5371 !!!cp ('t331');
5372 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
5373 }
5374 ## Ignore the token
5375 !!!next-token;
5376 redo B;
5377 }
5378 } else {
5379 die "$0: $token->{type}: Unknown token type";
5380 }
5381
5382 ## ISSUE: An issue in spec here
5383 } else {
5384 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5385 }
5386
5387 ## "in body" insertion mode
5388 if ($token->{type} == START_TAG_TOKEN) {
5389 if ($token->{tag_name} eq 'script') {
5390 !!!cp ('t332');
5391 ## NOTE: This is an "as if in head" code clone
5392 $script_start_tag->();
5393 redo B;
5394 } elsif ($token->{tag_name} eq 'style') {
5395 !!!cp ('t333');
5396 ## NOTE: This is an "as if in head" code clone
5397 $parse_rcdata->(CDATA_CONTENT_MODEL);
5398 redo B;
5399 } elsif ({
5400 base => 1, link => 1,
5401 }->{$token->{tag_name}}) {
5402 !!!cp ('t334');
5403 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5404 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5405 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5406 !!!next-token;
5407 redo B;
5408 } elsif ($token->{tag_name} eq 'meta') {
5409 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5410 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5411 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5412
5413 unless ($self->{confident}) {
5414 if ($token->{attributes}->{charset}) { ## TODO: And if supported
5415 !!!cp ('t335');
5416 $self->{change_encoding}
5417 ->($self, $token->{attributes}->{charset}->{value});
5418
5419 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5420 ->set_user_data (manakai_has_reference =>
5421 $token->{attributes}->{charset}
5422 ->{has_reference});
5423 } elsif ($token->{attributes}->{content}) {
5424 ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
5425 if ($token->{attributes}->{content}->{value}
5426 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
5427 [\x09-\x0D\x20]*=
5428 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
5429 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
5430 !!!cp ('t336');
5431 $self->{change_encoding}
5432 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
5433 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5434 ->set_user_data (manakai_has_reference =>
5435 $token->{attributes}->{content}
5436 ->{has_reference});
5437 }
5438 }
5439 } else {
5440 if ($token->{attributes}->{charset}) {
5441 !!!cp ('t337');
5442 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5443 ->set_user_data (manakai_has_reference =>
5444 $token->{attributes}->{charset}
5445 ->{has_reference});
5446 }
5447 if ($token->{attributes}->{content}) {
5448 !!!cp ('t338');
5449 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5450 ->set_user_data (manakai_has_reference =>
5451 $token->{attributes}->{content}
5452 ->{has_reference});
5453 }
5454 }
5455
5456 !!!next-token;
5457 redo B;
5458 } elsif ($token->{tag_name} eq 'title') {
5459 !!!cp ('t341');
5460 ## NOTE: This is an "as if in head" code clone
5461 $parse_rcdata->(RCDATA_CONTENT_MODEL);
5462 redo B;
5463 } elsif ($token->{tag_name} eq 'body') {
5464 !!!parse-error (type => 'in body:body');
5465
5466 if (@{$self->{open_elements}} == 1 or
5467 $self->{open_elements}->[1]->[1] ne 'body') {
5468 !!!cp ('t342');
5469 ## Ignore the token
5470 } else {
5471 my $body_el = $self->{open_elements}->[1]->[0];
5472 for my $attr_name (keys %{$token->{attributes}}) {
5473 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
5474 !!!cp ('t343');
5475 $body_el->set_attribute_ns
5476 (undef, [undef, $attr_name],
5477 $token->{attributes}->{$attr_name}->{value});
5478 }
5479 }
5480 }
5481 !!!next-token;
5482 redo B;
5483 } elsif ({
5484 address => 1, blockquote => 1, center => 1, dir => 1,
5485 div => 1, dl => 1, fieldset => 1,
5486 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5487 menu => 1, ol => 1, p => 1, ul => 1,
5488 pre => 1, listing => 1,
5489 }->{$token->{tag_name}}) {
5490 ## has a p element in scope
5491 INSCOPE: for (reverse @{$self->{open_elements}}) {
5492 if ($_->[1] eq 'p') {
5493 !!!cp ('t344');
5494 !!!back-token;
5495 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5496 redo B;
5497 } elsif ({
5498 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5499 button => 1, marquee => 1, object => 1, html => 1,
5500 }->{$_->[1]}) {
5501 !!!cp ('t345');
5502 last INSCOPE;
5503 }
5504 } # INSCOPE
5505
5506 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5507 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
5508 !!!next-token;
5509 if ($token->{type} == CHARACTER_TOKEN) {
5510 $token->{data} =~ s/^\x0A//;
5511 unless (length $token->{data}) {
5512 !!!cp ('t346');
5513 !!!next-token;
5514 } else {
5515 !!!cp ('t349');
5516 }
5517 } else {
5518 !!!cp ('t348');
5519 }
5520 } else {
5521 !!!cp ('t347');
5522 !!!next-token;
5523 }
5524 redo B;
5525 } elsif ($token->{tag_name} eq 'form') {
5526 if (defined $self->{form_element}) {
5527 !!!cp ('t350');
5528 !!!parse-error (type => 'in form:form');
5529 ## Ignore the token
5530 !!!next-token;
5531 redo B;
5532 } else {
5533 ## has a p element in scope
5534 INSCOPE: for (reverse @{$self->{open_elements}}) {
5535 if ($_->[1] eq 'p') {
5536 !!!cp ('t351');
5537 !!!back-token;
5538 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5539 redo B;
5540 } elsif ({
5541 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5542 button => 1, marquee => 1, object => 1, html => 1,
5543 }->{$_->[1]}) {
5544 !!!cp ('t352');
5545 last INSCOPE;
5546 }
5547 } # INSCOPE
5548
5549 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5550 $self->{form_element} = $self->{open_elements}->[-1]->[0];
5551 !!!next-token;
5552 redo B;
5553 }
5554 } elsif ($token->{tag_name} eq 'li') {
5555 ## has a p element in scope
5556 INSCOPE: for (reverse @{$self->{open_elements}}) {
5557 if ($_->[1] eq 'p') {
5558 !!!cp ('t353');
5559 !!!back-token;
5560 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5561 redo B;
5562 } elsif ({
5563 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5564 button => 1, marquee => 1, object => 1, html => 1,
5565 }->{$_->[1]}) {
5566 !!!cp ('t354');
5567 last INSCOPE;
5568 }
5569 } # INSCOPE
5570
5571 ## Step 1
5572 my $i = -1;
5573 my $node = $self->{open_elements}->[$i];
5574 LI: {
5575 ## Step 2
5576 if ($node->[1] eq 'li') {
5577 if ($i != -1) {
5578 !!!cp ('t355');
5579 !!!parse-error (type => 'end tag missing:'.
5580 $self->{open_elements}->[-1]->[1]);
5581 } else {
5582 !!!cp ('t356');
5583 }
5584 splice @{$self->{open_elements}}, $i;
5585 last LI;
5586 } else {
5587 !!!cp ('t357');
5588 }
5589
5590 ## Step 3
5591 if (not $formatting_category->{$node->[1]} and
5592 #not $phrasing_category->{$node->[1]} and
5593 ($special_category->{$node->[1]} or
5594 $scoping_category->{$node->[1]}) and
5595 $node->[1] ne 'address' and $node->[1] ne 'div') {
5596 !!!cp ('t358');
5597 last LI;
5598 }
5599
5600 !!!cp ('t359');
5601 ## Step 4
5602 $i--;
5603 $node = $self->{open_elements}->[$i];
5604 redo LI;
5605 } # LI
5606
5607 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5608 !!!next-token;
5609 redo B;
5610 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
5611 ## has a p element in scope
5612 INSCOPE: for (reverse @{$self->{open_elements}}) {
5613 if ($_->[1] eq 'p') {
5614 !!!cp ('t360');
5615 !!!back-token;
5616 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5617 redo B;
5618 } elsif ({
5619 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5620 button => 1, marquee => 1, object => 1, html => 1,
5621 }->{$_->[1]}) {
5622 !!!cp ('t361');
5623 last INSCOPE;
5624 }
5625 } # INSCOPE
5626
5627 ## Step 1
5628 my $i = -1;
5629 my $node = $self->{open_elements}->[$i];
5630 LI: {
5631 ## Step 2
5632 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
5633 if ($i != -1) {
5634 !!!cp ('t362');
5635 !!!parse-error (type => 'end tag missing:'.
5636 $self->{open_elements}->[-1]->[1]);
5637 } else {
5638 !!!cp ('t363');
5639 }
5640 splice @{$self->{open_elements}}, $i;
5641 last LI;
5642 } else {
5643 !!!cp ('t364');
5644 }
5645
5646 ## Step 3
5647 if (not $formatting_category->{$node->[1]} and
5648 #not $phrasing_category->{$node->[1]} and
5649 ($special_category->{$node->[1]} or
5650 $scoping_category->{$node->[1]}) and
5651 $node->[1] ne 'address' and $node->[1] ne 'div') {
5652 !!!cp ('t365');
5653 last LI;
5654 }
5655
5656 !!!cp ('t366');
5657 ## Step 4
5658 $i--;
5659 $node = $self->{open_elements}->[$i];
5660 redo LI;
5661 } # LI
5662
5663 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5664 !!!next-token;
5665 redo B;
5666 } elsif ($token->{tag_name} eq 'plaintext') {
5667 ## has a p element in scope
5668 INSCOPE: for (reverse @{$self->{open_elements}}) {
5669 if ($_->[1] eq 'p') {
5670 !!!cp ('t367');
5671 !!!back-token;
5672 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5673 redo B;
5674 } elsif ({
5675 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5676 button => 1, marquee => 1, object => 1, html => 1,
5677 }->{$_->[1]}) {
5678 !!!cp ('t368');
5679 last INSCOPE;
5680 }
5681 } # INSCOPE
5682
5683 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5684
5685 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
5686
5687 !!!next-token;
5688 redo B;
5689 } elsif ($token->{tag_name} eq 'a') {
5690 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
5691 my $node = $active_formatting_elements->[$i];
5692 if ($node->[1] eq 'a') {
5693 !!!cp ('t371');
5694 !!!parse-error (type => 'in a:a');
5695
5696 !!!back-token;
5697 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
5698 $formatting_end_tag->($token->{tag_name});
5699
5700 AFE2: for (reverse 0..$#$active_formatting_elements) {
5701 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
5702 !!!cp ('t372');
5703 splice @$active_formatting_elements, $_, 1;
5704 last AFE2;
5705 }
5706 } # AFE2
5707 OE: for (reverse 0..$#{$self->{open_elements}}) {
5708 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
5709 !!!cp ('t373');
5710 splice @{$self->{open_elements}}, $_, 1;
5711 last OE;
5712 }
5713 } # OE
5714 last AFE;
5715 } elsif ($node->[0] eq '#marker') {
5716 !!!cp ('t374');
5717 last AFE;
5718 }
5719 } # AFE
5720
5721 $reconstruct_active_formatting_elements->($insert_to_current);
5722
5723 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5724 push @$active_formatting_elements, $self->{open_elements}->[-1];
5725
5726 !!!next-token;
5727 redo B;
5728 } elsif ({
5729 b => 1, big => 1, em => 1, font => 1, i => 1,
5730 s => 1, small => 1, strike => 1, ## NOTE: Keep in sync with the formatting end tag list.
5731 strong => 1, tt => 1, u => 1,
5732 }->{$token->{tag_name}}) {
5733 !!!cp ('t375');
5734 $reconstruct_active_formatting_elements->($insert_to_current);
5735
5736 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5737 push @$active_formatting_elements, $self->{open_elements}->[-1];
5738
5739 !!!next-token;
5740 redo B;
5741 } elsif ($token->{tag_name} eq 'nobr') {
5742 $reconstruct_active_formatting_elements->($insert_to_current);
5743
5744 ## has a |nobr| element in scope
5745 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5746 my $node = $self->{open_elements}->[$_];
5747 if ($node->[1] eq 'nobr') {
5748 !!!cp ('t376');
5749 !!!parse-error (type => 'in nobr:nobr');
5750 !!!back-token;
5751 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
5752 redo B;
5753 } elsif ({
5754 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5755 button => 1, marquee => 1, object => 1, html => 1,
5756 }->{$node->[1]}) {
5757 !!!cp ('t377');
5758 last INSCOPE;
5759 }
5760 } # INSCOPE
5761
5762 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5763 push @$active_formatting_elements, $self->{open_elements}->[-1];
5764
5765 !!!next-token;
5766 redo B;
5767 } elsif ($token->{tag_name} eq 'button') {
5768 ## has a button element in scope
5769 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5770 my $node = $self->{open_elements}->[$_];
5771 if ($node->[1] eq 'button') {
5772 !!!cp ('t378');
5773 !!!parse-error (type => 'in button:button');
5774 !!!back-token;
5775 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
5776 redo B;
5777 } elsif ({
5778 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5779 button => 1, marquee => 1, object => 1, html => 1,
5780 }->{$node->[1]}) {
5781 !!!cp ('t379');
5782 last INSCOPE;
5783 }
5784 } # INSCOPE
5785
5786 $reconstruct_active_formatting_elements->($insert_to_current);
5787
5788 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5789
5790 ## TODO: associate with $self->{form_element} if defined
5791
5792 push @$active_formatting_elements, ['#marker', ''];
5793
5794 !!!next-token;
5795 redo B;
5796 } elsif ({
5797 applet => 1, marquee => 1, object => 1,
5798 }->{$token->{tag_name}}) {
5799 !!!cp ('t380');
5800 $reconstruct_active_formatting_elements->($insert_to_current);
5801
5802 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5803 push @$active_formatting_elements, ['#marker', ''];
5804
5805 !!!next-token;
5806 redo B;
5807 } elsif ($token->{tag_name} eq 'xmp') {
5808 !!!cp ('t381');
5809 $reconstruct_active_formatting_elements->($insert_to_current);
5810 $parse_rcdata->(CDATA_CONTENT_MODEL);
5811 redo B;
5812 } elsif ($token->{tag_name} eq 'table') {
5813 ## has a p element in scope
5814 INSCOPE: for (reverse @{$self->{open_elements}}) {
5815 if ($_->[1] eq 'p') {
5816 !!!cp ('t382');
5817 !!!back-token;
5818 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5819 redo B;
5820 } elsif ({
5821 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5822 button => 1, marquee => 1, object => 1, html => 1,
5823 }->{$_->[1]}) {
5824 !!!cp ('t383');
5825 last INSCOPE;
5826 }
5827 } # INSCOPE
5828
5829 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5830 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
5831
5832 $self->{insertion_mode} = IN_TABLE_IM;
5833
5834 !!!next-token;
5835 redo B;
5836 } elsif ({
5837 area => 1, basefont => 1, bgsound => 1, br => 1,
5838 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
5839 image => 1,
5840 }->{$token->{tag_name}}) {
5841 if ($token->{tag_name} eq 'image') {
5842 !!!cp ('t384');
5843 !!!parse-error (type => 'image');
5844 $token->{tag_name} = 'img';
5845 } else {
5846 !!!cp ('t385');
5847 }
5848
5849 ## NOTE: There is an "as if <br>" code clone.
5850 $reconstruct_active_formatting_elements->($insert_to_current);
5851
5852 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5853 pop @{$self->{open_elements}};
5854
5855 !!!next-token;
5856 redo B;
5857 } elsif ($token->{tag_name} eq 'hr') {
5858 ## has a p element in scope
5859 INSCOPE: for (reverse @{$self->{open_elements}}) {
5860 if ($_->[1] eq 'p') {
5861 !!!cp ('t386');
5862 !!!back-token;
5863 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5864 redo B;
5865 } elsif ({
5866 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5867 button => 1, marquee => 1, object => 1, html => 1,
5868 }->{$_->[1]}) {
5869 !!!cp ('t387');
5870 last INSCOPE;
5871 }
5872 } # INSCOPE
5873
5874 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5875 pop @{$self->{open_elements}};
5876
5877 !!!next-token;
5878 redo B;
5879 } elsif ($token->{tag_name} eq 'input') {
5880 !!!cp ('t388');
5881 $reconstruct_active_formatting_elements->($insert_to_current);
5882
5883 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5884 ## TODO: associate with $self->{form_element} if defined
5885 pop @{$self->{open_elements}};
5886
5887 !!!next-token;
5888 redo B;
5889 } elsif ($token->{tag_name} eq 'isindex') {
5890 !!!parse-error (type => 'isindex');
5891
5892 if (defined $self->{form_element}) {
5893 !!!cp ('t389');
5894 ## Ignore the token
5895 !!!next-token;
5896 redo B;
5897 } else {
5898 my $at = $token->{attributes};
5899 my $form_attrs;
5900 $form_attrs->{action} = $at->{action} if $at->{action};
5901 my $prompt_attr = $at->{prompt};
5902 $at->{name} = {name => 'name', value => 'isindex'};
5903 delete $at->{action};
5904 delete $at->{prompt};
5905 my @tokens = (
5906 {type => START_TAG_TOKEN, tag_name => 'form',
5907 attributes => $form_attrs},
5908 {type => START_TAG_TOKEN, tag_name => 'hr'},
5909 {type => START_TAG_TOKEN, tag_name => 'p'},
5910 {type => START_TAG_TOKEN, tag_name => 'label'},
5911 );
5912 if ($prompt_attr) {
5913 !!!cp ('t390');
5914 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
5915 } else {
5916 !!!cp ('t391');
5917 push @tokens, {type => CHARACTER_TOKEN,
5918 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
5919 ## TODO: make this configurable
5920 }
5921 push @tokens,
5922 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
5923 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
5924 {type => END_TAG_TOKEN, tag_name => 'label'},
5925 {type => END_TAG_TOKEN, tag_name => 'p'},
5926 {type => START_TAG_TOKEN, tag_name => 'hr'},
5927 {type => END_TAG_TOKEN, tag_name => 'form'};
5928 $token = shift @tokens;
5929 !!!back-token (@tokens);
5930 redo B;
5931 }
5932 } elsif ($token->{tag_name} eq 'textarea') {
5933 my $tag_name = $token->{tag_name};
5934 my $el;
5935 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
5936
5937 ## TODO: $self->{form_element} if defined
5938 $self->{content_model} = RCDATA_CONTENT_MODEL;
5939 delete $self->{escape}; # MUST
5940
5941 $insert->($el);
5942
5943 my $text = '';
5944 !!!next-token;
5945 if ($token->{type} == CHARACTER_TOKEN) {
5946 $token->{data} =~ s/^\x0A//;
5947 unless (length $token->{data}) {
5948 !!!cp ('t392');
5949 !!!next-token;
5950 } else {
5951 !!!cp ('t393');
5952 }
5953 } else {
5954 !!!cp ('t394');
5955 }
5956 while ($token->{type} == CHARACTER_TOKEN) {
5957 !!!cp ('t395');
5958 $text .= $token->{data};
5959 !!!next-token;
5960 }
5961 if (length $text) {
5962 !!!cp ('t396');
5963 $el->manakai_append_text ($text);
5964 }
5965
5966 $self->{content_model} = PCDATA_CONTENT_MODEL;
5967
5968 if ($token->{type} == END_TAG_TOKEN and
5969 $token->{tag_name} eq $tag_name) {
5970 !!!cp ('t397');
5971 ## Ignore the token
5972 } else {
5973 !!!cp ('t398');
5974 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
5975 }
5976 !!!next-token;
5977 redo B;
5978 } elsif ({
5979 iframe => 1,
5980 noembed => 1,
5981 noframes => 1,
5982 noscript => 0, ## TODO: 1 if scripting is enabled
5983 }->{$token->{tag_name}}) {
5984 !!!cp ('t399');
5985 ## NOTE: There is an "as if in body" code clone.
5986 $parse_rcdata->(CDATA_CONTENT_MODEL);
5987 redo B;
5988 } elsif ($token->{tag_name} eq 'select') {
5989 !!!cp ('t400');
5990 $reconstruct_active_formatting_elements->($insert_to_current);
5991
5992 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5993
5994 ## TODO: associate with $self->{form_element} if defined
5995
5996 if ($self->{insertion_mode} & TABLE_IMS or
5997 $self->{insertion_mode} & BODY_TABLE_IMS or
5998 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5999 !!!cp ('t400.1');
6000 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
6001 } else {
6002 !!!cp ('t400.2');
6003 $self->{insertion_mode} = IN_SELECT_IM;
6004 }
6005 !!!next-token;
6006 redo B;
6007 } elsif ({
6008 caption => 1, col => 1, colgroup => 1, frame => 1,
6009 frameset => 1, head => 1, option => 1, optgroup => 1,
6010 tbody => 1, td => 1, tfoot => 1, th => 1,
6011 thead => 1, tr => 1,
6012 }->{$token->{tag_name}}) {
6013 !!!cp ('t401');
6014 !!!parse-error (type => 'in body:'.$token->{tag_name});
6015 ## Ignore the token
6016 !!!next-token;
6017 redo B;
6018
6019 ## ISSUE: An issue on HTML5 new elements in the spec.
6020 } else {
6021 !!!cp ('t402');
6022 $reconstruct_active_formatting_elements->($insert_to_current);
6023
6024 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
6025
6026 !!!next-token;
6027 redo B;
6028 }
6029 } elsif ($token->{type} == END_TAG_TOKEN) {
6030 if ($token->{tag_name} eq 'body') {
6031 if (@{$self->{open_elements}} > 1 and
6032 $self->{open_elements}->[1]->[1] eq 'body') {
6033 for (@{$self->{open_elements}}) {
6034 unless ({
6035 dd => 1, dt => 1, li => 1, p => 1, td => 1,
6036 th => 1, tr => 1, body => 1, html => 1,
6037 tbody => 1, tfoot => 1, thead => 1,
6038 }->{$_->[1]}) {
6039 !!!cp ('t403');
6040 !!!parse-error (type => 'not closed:'.$_->[1]);
6041 } else {
6042 !!!cp ('t404');
6043 }
6044 }
6045
6046 $self->{insertion_mode} = AFTER_BODY_IM;
6047 !!!next-token;
6048 redo B;
6049 } else {
6050 !!!cp ('t405');
6051 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6052 ## Ignore the token
6053 !!!next-token;
6054 redo B;
6055 }
6056 } elsif ($token->{tag_name} eq 'html') {
6057 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
6058 ## ISSUE: There is an issue in the spec.
6059 if ($self->{open_elements}->[-1]->[1] ne 'body') {
6060 !!!cp ('t406');
6061 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
6062 } else {
6063 !!!cp ('t407');
6064 }
6065 $self->{insertion_mode} = AFTER_BODY_IM;
6066 ## reprocess
6067 redo B;
6068 } else {
6069 !!!cp ('t408');
6070 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6071 ## Ignore the token
6072 !!!next-token;
6073 redo B;
6074 }
6075 } elsif ({
6076 address => 1, blockquote => 1, center => 1, dir => 1,
6077 div => 1, dl => 1, fieldset => 1, listing => 1,
6078 menu => 1, ol => 1, pre => 1, ul => 1,
6079 dd => 1, dt => 1, li => 1,
6080 applet => 1, button => 1, marquee => 1, object => 1,
6081 }->{$token->{tag_name}}) {
6082 ## has an element in scope
6083 my $i;
6084 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6085 my $node = $self->{open_elements}->[$_];
6086 if ($node->[1] eq $token->{tag_name}) {
6087 !!!cp ('t410');
6088 $i = $_;
6089 last INSCOPE;
6090 } elsif ({
6091 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6092 button => 1, marquee => 1, object => 1, html => 1,
6093 }->{$node->[1]}) {
6094 !!!cp ('t411');
6095 last INSCOPE;
6096 }
6097 } # INSCOPE
6098
6099 unless (defined $i) { # has an element in scope
6100 !!!cp ('t413');
6101 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6102 } else {
6103 ## Step 1. generate implied end tags
6104 while ({
6105 dd => ($token->{tag_name} ne 'dd'),
6106 dt => ($token->{tag_name} ne 'dt'),
6107 li => ($token->{tag_name} ne 'li'),
6108 p => 1,
6109 }->{$self->{open_elements}->[-1]->[1]}) {
6110 !!!cp ('t409');
6111 pop @{$self->{open_elements}};
6112 }
6113
6114 ## Step 2.
6115 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6116 !!!cp ('t412');
6117 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6118 } else {
6119 !!!cp ('t414');
6120 }
6121
6122 ## Step 3.
6123 splice @{$self->{open_elements}}, $i;
6124
6125 ## Step 4.
6126 $clear_up_to_marker->()
6127 if {
6128 applet => 1, button => 1, marquee => 1, object => 1,
6129 }->{$token->{tag_name}};
6130 }
6131 !!!next-token;
6132 redo B;
6133 } elsif ($token->{tag_name} eq 'form') {
6134 undef $self->{form_element};
6135
6136 ## has an element in scope
6137 my $i;
6138 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6139 my $node = $self->{open_elements}->[$_];
6140 if ($node->[1] eq $token->{tag_name}) {
6141 !!!cp ('t418');
6142 $i = $_;
6143 last INSCOPE;
6144 } elsif ({
6145 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6146 button => 1, marquee => 1, object => 1, html => 1,
6147 }->{$node->[1]}) {
6148 !!!cp ('t419');
6149 last INSCOPE;
6150 }
6151 } # INSCOPE
6152
6153 unless (defined $i) { # has an element in scope
6154 !!!cp ('t421');
6155 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6156 } else {
6157 ## Step 1. generate implied end tags
6158 while ({
6159 dd => 1, dt => 1, li => 1, p => 1,
6160 }->{$self->{open_elements}->[-1]->[1]}) {
6161 !!!cp ('t417');
6162 pop @{$self->{open_elements}};
6163 }
6164
6165 ## Step 2.
6166 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6167 !!!cp ('t417.1');
6168 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6169 } else {
6170 !!!cp ('t420');
6171 }
6172
6173 ## Step 3.
6174 splice @{$self->{open_elements}}, $i;
6175 }
6176
6177 !!!next-token;
6178 redo B;
6179 } elsif ({
6180 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6181 }->{$token->{tag_name}}) {
6182 ## has an element in scope
6183 my $i;
6184 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6185 my $node = $self->{open_elements}->[$_];
6186 if ({
6187 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6188 }->{$node->[1]}) {
6189 !!!cp ('t423');
6190 $i = $_;
6191 last INSCOPE;
6192 } elsif ({
6193 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6194 button => 1, marquee => 1, object => 1, html => 1,
6195 }->{$node->[1]}) {
6196 !!!cp ('t424');
6197 last INSCOPE;
6198 }
6199 } # INSCOPE
6200
6201 unless (defined $i) { # has an element in scope
6202 !!!cp ('t425.1');
6203 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6204 } else {
6205 ## Step 1. generate implied end tags
6206 while ({
6207 dd => 1, dt => 1, li => 1, p => 1,
6208 }->{$self->{open_elements}->[-1]->[1]}) {
6209 !!!cp ('t422');
6210 pop @{$self->{open_elements}};
6211 }
6212
6213 ## Step 2.
6214 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6215 !!!cp ('t425');
6216 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6217 } else {
6218 !!!cp ('t426');
6219 }
6220
6221 ## Step 3.
6222 splice @{$self->{open_elements}}, $i;
6223 }
6224
6225 !!!next-token;
6226 redo B;
6227 } elsif ($token->{tag_name} eq 'p') {
6228 ## has an element in scope
6229 my $i;
6230 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6231 my $node = $self->{open_elements}->[$_];
6232 if ($node->[1] eq $token->{tag_name}) {
6233 !!!cp ('t410.1');
6234 $i = $_;
6235 last INSCOPE;
6236 } elsif ({
6237 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6238 button => 1, marquee => 1, object => 1, html => 1,
6239 }->{$node->[1]}) {
6240 !!!cp ('t411.1');
6241 last INSCOPE;
6242 }
6243 } # INSCOPE
6244
6245 if (defined $i) {
6246 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6247 !!!cp ('t412.1');
6248 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6249 } else {
6250 !!!cp ('t414.1');
6251 }
6252
6253 splice @{$self->{open_elements}}, $i;
6254 } else {
6255 !!!cp ('t413.1');
6256 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6257
6258 !!!cp ('t415.1');
6259 ## As if <p>, then reprocess the current token
6260 my $el;
6261 !!!create-element ($el, 'p');
6262 $insert->($el);
6263 ## NOTE: Not inserted into |$self->{open_elements}|.
6264 }
6265
6266 !!!next-token;
6267 redo B;
6268 } elsif ({
6269 a => 1,
6270 b => 1, big => 1, em => 1, font => 1, i => 1,
6271 nobr => 1, s => 1, small => 1, strike => 1, ## NOTE: Keep in sync with the formatting start tag branches.
6272 strong => 1, tt => 1, u => 1,
6273 }->{$token->{tag_name}}) {
6274 !!!cp ('t427');
6275 $formatting_end_tag->($token->{tag_name});
6276 redo B;
6277 } elsif ($token->{tag_name} eq 'br') {
6278 !!!cp ('t428');
6279 !!!parse-error (type => 'unmatched end tag:br');
6280
6281 ## As if <br>
6282 $reconstruct_active_formatting_elements->($insert_to_current);
6283
6284 my $el;
6285 !!!create-element ($el, 'br');
6286 $insert->($el);
6287
6288 ## Ignore the token.
6289 !!!next-token;
6290 redo B;
6291 } elsif ({
6292 caption => 1, col => 1, colgroup => 1, frame => 1,
6293 frameset => 1, head => 1, option => 1, optgroup => 1,
6294 tbody => 1, td => 1, tfoot => 1, th => 1,
6295 thead => 1, tr => 1,
6296 area => 1, basefont => 1, bgsound => 1,
6297 embed => 1, hr => 1, iframe => 1, image => 1,
6298 img => 1, input => 1, isindex => 1, noembed => 1,
6299 noframes => 1, param => 1, select => 1, spacer => 1,
6300 table => 1, textarea => 1, wbr => 1,
6301 noscript => 0, ## TODO: if scripting is enabled
6302 }->{$token->{tag_name}}) {
6303 !!!cp ('t429');
6304 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6305 ## Ignore the token
6306 !!!next-token;
6307 redo B;
6308
6309 ## ISSUE: Issue on HTML5 new elements in spec
6310
6311 } else {
6312 ## Step 1
6313 my $node_i = -1;
6314 my $node = $self->{open_elements}->[$node_i];
6315
6316 ## Step 2
6317 S2: {
6318 if ($node->[1] eq $token->{tag_name}) {
6319 ## Step 1
6320 ## generate implied end tags
6321 while ({
6322 dd => 1, dt => 1, li => 1, p => 1,
6323 }->{$self->{open_elements}->[-1]->[1]}) {
6324 !!!cp ('t430');
6325 ## ISSUE: Can this case be reached?
6326 pop @{$self->{open_elements}};
6327 }
6328
6329 ## Step 2
6330 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
6331 !!!cp ('t431');
6332 ## NOTE: <x><y></x>
6333 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6334 } else {
6335 !!!cp ('t432');
6336 }
6337
6338 ## Step 3
6339 splice @{$self->{open_elements}}, $node_i;
6340
6341 !!!next-token;
6342 last S2;
6343 } else {
6344 ## Step 3
6345 if (not $formatting_category->{$node->[1]} and
6346 #not $phrasing_category->{$node->[1]} and
6347 ($special_category->{$node->[1]} or
6348 $scoping_category->{$node->[1]})) {
6349 !!!cp ('t433');
6350 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6351 ## Ignore the token
6352 !!!next-token;
6353 last S2;
6354 }
6355
6356 !!!cp ('t434');
6357 }
6358
6359 ## Step 4
6360 $node_i--;
6361 $node = $self->{open_elements}->[$node_i];
6362
6363 ## Step 5;
6364 redo S2;
6365 } # S2
6366 redo B;
6367 }
6368 }
6369 redo B;
6370 } # B
6371
6372 ## Stop parsing # MUST
6373
6374 ## TODO: script stuffs
6375 } # _tree_construct_main
6376
6377 sub set_inner_html ($$$) {
## Replaces the children of |$node| with the result of parsing a string
## as HTML.
##
## Arguments:
##   invocant - Used as |$class| to call |parse_string| and |new|.
##   $node    - The node whose content is replaced.  Node types 9
##              (document) and 1 (element) are handled; any other type
##              makes the sub |die|.
##   string   - The markup to parse (|$_[0]| after the two |shift|s).
##              Only a reference to the caller's scalar is kept in |$s|;
##              the text is always read through |$$s|.
##   $onerror - Optional error handler code reference (|$_[1]| after the
##              two |shift|s).
##
## NOTE(review): The prototype lists three scalars although a fourth
## argument (|$onerror|) is read.  The sub is written as a class method,
## and Perl does not apply prototypes to method calls, so this presumably
## has no effect - confirm that there are no function-style callers.
##
## NOTE(review): Lines starting with |!!!| are not Perl statements;
## presumably they are expanded when the module is generated from this
## |.src| file - confirm against the build script.
6378 my $class = shift;
6379 my $node = shift;
6380 my $s = \$_[0];
6381 my $onerror = $_[1];
6382
6383 ## ISSUE: Should {confident} be true?
6384
6385 my $nt = $node->node_type;
6386 if ($nt == 9) {
## Document node: empty the document and parse the string into it.
6387 # MUST
6388
6389 ## Step 1 # MUST
6390 ## TODO: If the document has an active parser, ...
6391 ## ISSUE: There is an issue in the spec.
6392
6393 ## Step 2 # MUST
## The child list is copied into |@cn| before any child is removed.
## NOTE(review): Presumably because |child_nodes| is a live list - confirm.
6394 my @cn = @{$node->child_nodes};
6395 for (@cn) {
6396 $node->remove_child ($_);
6397 }
6398
6399 ## Step 3, 4, 5 # MUST
6400 $class->parse_string ($$s => $node, $onerror);
6401 } elsif ($nt == 1) {
## Element node: parse the string with a separate parser into a
## temporary document, then move the result under |$node|.
6402 ## TODO: If non-html element
6403
6404 ## NOTE: Most of this code is copied from |parse_string|
6405
6406 ## Step 1 # MUST
## The temporary document |$doc| is created by the implementation of
## the document that owns |$node| and is flagged as HTML.
6407 my $this_doc = $node->owner_document;
6408 my $doc = $this_doc->implementation->create_document;
6409 $doc->manakai_is_html (1);
6410 my $p = $class->new;
6411 $p->{document} = $doc;
6412
6413 ## Step 8 # MUST
## |$i| is the offset of the next unread character in |$$s|; |$line| and
## |$column| track the position reported with parse errors.
6414 my $i = 0;
6415 my $line = 1;
6416 my $column = 0;
## Input callback: stores the code point of the next character (or -1 at
## the end of the string) in |$self->{next_char}|.
6417 $p->{set_next_char} = sub {
6418 my $self = shift;
6419
## Shift the history: drop the last |prev_char| entry and put the
## current |next_char| at the front.
6420 pop @{$self->{prev_char}};
6421 unshift @{$self->{prev_char}}, $self->{next_char};
6422
## The assignment of -1 is a true value, so |return| always runs once
## the end of the string has been reached.
6423 $self->{next_char} = -1 and return if $i >= length $$s;
6424 $self->{next_char} = ord substr $$s, $i++, 1;
6425 $column++;
6426
6427 if ($self->{next_char} == 0x000A) { # LF
6428 $line++;
6429 $column = 0;
6430 !!!cp ('i1');
6431 } elsif ($self->{next_char} == 0x000D) { # CR
## A CR, or a CR immediately followed by LF, is delivered as one LF.
6432 $i++ if substr ($$s, $i, 1) eq "\x0A";
6433 $self->{next_char} = 0x000A; # LF # MUST
6434 $line++;
6435 $column = 0;
6436 !!!cp ('i2');
6437 } elsif ($self->{next_char} > 0x10FFFF) {
6438 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
6439 !!!cp ('i3');
6440 } elsif ($self->{next_char} == 0x0000) { # NULL
6441 !!!cp ('i4');
6442 !!!parse-error (type => 'NULL');
6443 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
6444 }
6445 };
## Initial state: three history slots and no current character, all -1.
6446 $p->{prev_char} = [-1, -1, -1];
6447 $p->{next_char} = -1;
6448
## Without a caller-supplied |$onerror|, parse errors are reported with
## |warn|.
6449 my $ponerror = $onerror || sub {
6450 my (%opt) = @_;
6451 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
6452 };
## The handler installed on the parser appends the current |line| and
## |column| to whatever arguments the error was raised with.
6453 $p->{parse_error} = sub {
6454 $ponerror->(@_, line => $line, column => $column);
6455 };
6456
6457 $p->_initialize_tokenizer;
6458 $p->_initialize_tree_constructor;
6459
6460 ## Step 2
## The initial content model is chosen from the local name of |$node|;
## names missing from the table fall back to |PCDATA_CONTENT_MODEL|.
6461 my $node_ln = $node->manakai_local_name;
6462 $p->{content_model} = {
6463 title => RCDATA_CONTENT_MODEL,
6464 textarea => RCDATA_CONTENT_MODEL,
6465 style => CDATA_CONTENT_MODEL,
6466 script => CDATA_CONTENT_MODEL,
6467 xmp => CDATA_CONTENT_MODEL,
6468 iframe => CDATA_CONTENT_MODEL,
6469 noembed => CDATA_CONTENT_MODEL,
6470 noframes => CDATA_CONTENT_MODEL,
6471 noscript => CDATA_CONTENT_MODEL,
6472 plaintext => PLAINTEXT_CONTENT_MODEL,
6473 }->{$node_ln};
6474 $p->{content_model} = PCDATA_CONTENT_MODEL
6475 unless defined $p->{content_model};
6476 ## ISSUE: What is "the name of the element"? local name?
6477
## The context element is stored as a [node, local name] pair.
6478 $p->{inner_html_node} = [$node, $node_ln];
6479
6480 ## Step 3
6481 my $root = $doc->create_element_ns
6482 ('http://www.w3.org/1999/xhtml', [undef, 'html']);
6483
6484 ## Step 4 # MUST
6485 $doc->append_child ($root);
6486
6487 ## Step 5 # MUST
## The stack of open elements starts with the root |html| element, again
## as a [node, local name] pair.
6488 push @{$p->{open_elements}}, [$root, 'html'];
6489
6490 undef $p->{head_element};
6491
6492 ## Step 6 # MUST
6493 $p->_reset_insertion_mode;
6494
6495 ## Step 7 # MUST
## Walk from |$node| upwards through |parent_node| and remember the
## nearest element in the XHTML namespace whose local name is |form| as
## the parser's |form_element|.  |$node| itself is examined first.
6496 my $anode = $node;
6497 AN: while (defined $anode) {
6498 if ($anode->node_type == 1) {
6499 my $nsuri = $anode->namespace_uri;
6500 if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
6501 if ($anode->manakai_local_name eq 'form') {
6502 !!!cp ('i5');
6503 $p->{form_element} = $anode;
6504 last AN;
6505 }
6506 }
6507 }
6508 $anode = $anode->parent_node;
6509 } # AN
6510
6511 ## Step 9 # MUST
## NOTE(review): |$self| is declared here only for this block;
## presumably the |!!!next-token| expansion refers to |$self| and must
## operate on |$p| - confirm against the macro definition.
6512 {
6513 my $self = $p;
6514 !!!next-token;
6515 }
6516 $p->_tree_construction_main;
6517
6518 ## Step 10 # MUST
## Remove the existing children of |$node|, iterating over a copy of
## the child list.
6519 my @cn = @{$node->child_nodes};
6520 for (@cn) {
6521 $node->remove_child ($_);
6522 }
6523 ## ISSUE: mutation events? read-only?
6524
6525 ## Step 11 # MUST
## Move every child of the temporary root under |$node|, adopting each
## one into the document that owns |$node| first.
6526 @cn = @{$root->child_nodes};
6527 for (@cn) {
6528 $this_doc->adopt_node ($_);
6529 $node->append_child ($_);
6530 }
6531 ## ISSUE: mutation events?
6532
6533 $p->_terminate_tree_constructor;
6534 } else {
## Any node type other than 9 or 1 is rejected.
6535 die "$0: |set_inner_html| is not defined for node of type $nt";
6536 }
6537 } # set_inner_html
6538
6539 } # tree construction stage
6540
## A class that inherits from |Error|; the |Error| module is loaded at
## the top of this file with |use Error qw(:try)|.
## NOTE(review): Presumably thrown to request that parsing be restarted -
## confirm against the code that raises it.
6541 package Whatpm::HTML::RestartParser;
## |our @ISA| declares the package variable and |push| adds the base
## class to it when this statement runs.
6542 push our @ISA, 'Error';
6543
## True value so that loading the file with |require| or |use| succeeds.
6544 1;
6545 # $Date: 2008/03/09 06:29:25 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24