/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.104 - (show annotations) (download) (as text)
Sun Mar 9 08:47:33 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.103: +127 -40 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	9 Mar 2008 08:47:28 -0000
	* HTML.pm.src: New end-of-file token implementation (HTML5
	revision 1348).

2008-03-09  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.103 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12 ## TODO: 1252 parse error (revision 1264)
13 ## TODO: 8859-11 = 874 (revision 1271)
14
## Start tag names for which the tokenizer accepts a trailing "/>"
## without reporting a 'nestc' parse error.
my $permitted_slash_tag_name = {
  map { $_ => 1 } qw(
    base link meta hr br img embed param area col input
  )
};
28
## Maps each code point in 0x80..0x9F to a replacement code point.
## The values coincide with the Windows-1252 assignments for those
## bytes; the five slots Windows-1252 leaves unassigned (0x81, 0x8D,
## 0x8F, 0x90, 0x9D) map to U+FFFD REPLACEMENT CHARACTER.
## NOTE(review): presumably consulted when resolving numeric character
## references; the consumer is outside this part of the file.
my $c1_entity_char = do {
  ## One entry per key, in key order starting at 0x80.
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  +{ map { (0x80 + $_) => $replacement[$_] } 0 .. $#replacement };
}; # $c1_entity_char
63
## Set of element names belonging to the "special" category.
## (Elements listed in none of the category tables are "phrasing".)
my $special_category = {
  map { $_ => 1 } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  )
};
## Set of element names belonging to the "scoping" category.
my $scoping_category = {
  map { $_ => 1 } qw(
    applet button caption html marquee object
    table td th
  )
};
## Set of element names belonging to the "formatting" category.
## The entry for |strike| used to be misspelled "strile", which left
## <strike> outside this category.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
84 # $phrasing_category: all other elements
85
## Decode a byte string and parse it as an HTML document.
##
##   $result = $parser_or_class->parse_byte_string
##       ($charset, $bytes, @parse_char_string_args);
##
## - Invocant: a parser object, or a class name (then |->new| is called).
## - $charset: encoding label used to decode $bytes.  If undef, the
##   encoding is guessed from the first 1024 bytes by
##   Whatpm::Charset::UniversalCharDet, falling back to windows-1252,
##   and the parser is marked as not confident about the encoding.
## - $bytes: the byte string, or a reference to it.
## - Remaining arguments are forwarded to |parse_char_string|.
##
## Returns whatever |parse_char_string| returns.  If the
## |change_encoding| hook installed here throws
## Whatpm::HTML::RestartParser, the bytes are decoded again with the
## requested charset and parsed a second time.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## Loaded up front: both branches below and the restart handler
  ## call Encode::decode, so it must not depend on which branch ran.
  require Encode; ## TODO: decode(utf8) don't delete BOM

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 1;
  } else {
    ## TODO: Implement HTML5 detection algorithm
    require Whatpm::Charset::UniversalCharDet;
    $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
        (substr ($$bytes_s, 0, 1024));
    $charset ||= 'windows-1252';
    $s = \ (Encode::decode ($charset, $$bytes_s));
    ## Lowercased like the explicit-charset branch, because
    ## |change_encoding| compares against a lowercased label.
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 0;
  }

  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    if (defined $self->{input_encoding} and
        $self->{input_encoding} eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
        ':'.$charset, level => 'w');

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    throw Whatpm::HTML::RestartParser (charset => $charset);
  }; # $self->{change_encoding}

  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Re-decode the original bytes with the charset requested by
    ## |change_encoding| and parse again from the start.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: normalize
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
155
156 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
157 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
158 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
159 ## because the core part of our HTML parser expects a string of characters,
160 ## not a string of bytes or code units or anything which might contain a BOM.
161 ## Therefore, any parser interface that accepts a string of bytes,
162 ## such as |parse_byte_string| in this module, must ensure that it does
163 ## strip the BOM and never strip any ZWNBSP.
164
165 *parse_char_string = \&parse_string;
166
## Parse a character string into a document.
##
##   $doc = $parser_or_class->parse_string ($chars, $doc, $onerror);
##
## - Invocant: a parser object, or a class name (then |->new| is called).
## - $chars: the character string, or a reference to it.
## - $doc: document object to populate; its |child_nodes| list is
##   emptied first and |input_encoding| is set if the parser knows one.
## - $onerror: optional code reference called with key/value pairs
##   describing a parse error, followed by |line| and |column|.  The
##   default handler emits a warning.
##
## Returns $doc.
sub parse_string ($$$;$) {
  my $self = shift;
  $self = $self->new unless ref $self;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  exists $self->{confident} or $self->{confident} = 1;
  if (defined $self->{input_encoding}) {
    $self->{document}->input_encoding ($self->{input_encoding});
  }

  my $pos = 0;
  my $line = 1;
  my $column = 0;
  $self->{set_next_char} = sub {
    my $self = shift;

    ## Slide the three-character look-behind window.
    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    ## End of input is signalled by -1.
    if ($pos >= length $$s) {
      $self->{next_char} = -1;
      return;
    }

    my $code = ord substr $$s, $pos++, 1;
    $self->{next_char} = $code;
    $column++;

    if ($code == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($code == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $pos++ if substr ($$s, $pos, 1) eq "\x0A";
      $self->{next_char} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_char} = [-1, -1, -1];
  $self->{next_char} = -1;

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Append the current input position to every reported error.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
225
## Create a parser object with default (mostly no-op) hooks.
##
##   $parser = Whatpm::HTML->new;
##
## The hooks |set_next_char|, |parse_error|, |change_encoding| and
## |application_cache_selection| are code references stored on the
## object; the parsing methods replace some of them before parsing.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_char} = sub {
    ## Take the parser from the argument list, as the other
    ## |set_next_char| and |change_encoding| hooks in this file do,
    ## instead of closing over the outer $self.  Capturing $self here
    ## would create a reference cycle that keeps every parser object
    ## alive forever.
    my $self = shift;
    $self->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
246
## Bit flags describing which kinds of markup are recognized in data.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, expressed as combinations of the flags above.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
255
## Tokenizer states, numbered consecutively from 0.
use constant {
  DATA_STATE                                    => 0,
  ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
};
290
## Values of a token's |type| member.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
297
## Insertion mode group flags.  Each group occupies one bit; the two
## lowest bits are left free to tell apart modes within a group.
use constant {
  AFTER_HTML_IMS => 1 << 2,
  HEAD_IMS       => 1 << 3,
  BODY_IMS       => 1 << 4,
  BODY_TABLE_IMS => 1 << 5,
  TABLE_IMS      => 1 << 6,
  ROW_IMS        => 1 << 7,
  BODY_AFTER_IMS => 1 << 8,
  FRAME_IMS      => 1 << 9,
  SELECT_IMS     => 1 << 10,
};

## NOTE: "initial" and "before html" insertion modes have no constants.

## Insertion modes, built from the group flags above.
use constant {
  ## NOTE: "after after body" insertion mode.
  AFTER_HTML_BODY_IM     => AFTER_HTML_IMS | BODY_AFTER_IMS,

  ## NOTE: "after after frameset" insertion mode.
  AFTER_HTML_FRAMESET_IM => AFTER_HTML_IMS | FRAME_IMS,

  IN_HEAD_IM             => HEAD_IMS | 0b00,
  IN_HEAD_NOSCRIPT_IM    => HEAD_IMS | 0b01,
  AFTER_HEAD_IM          => HEAD_IMS | 0b10,
  BEFORE_HEAD_IM         => HEAD_IMS | 0b11,
  IN_BODY_IM             => BODY_IMS,
  IN_CELL_IM             => BODY_IMS | BODY_TABLE_IMS | 0b01,
  IN_CAPTION_IM          => BODY_IMS | BODY_TABLE_IMS | 0b10,
  IN_ROW_IM              => TABLE_IMS | ROW_IMS | 0b01,
  IN_TABLE_BODY_IM       => TABLE_IMS | ROW_IMS | 0b10,
  IN_TABLE_IM            => TABLE_IMS,
  AFTER_BODY_IM          => BODY_AFTER_IMS,
  IN_FRAMESET_IM         => FRAME_IMS | 0b01,
  AFTER_FRAMESET_IM      => FRAME_IMS | 0b10,
  IN_SELECT_IM           => SELECT_IMS | 0b01,
  IN_SELECT_IN_TABLE_IM  => SELECT_IMS | 0b10,
  IN_COLUMN_GROUP_IM     => 0b10,
};
332
333 ## Implementations MUST act as if they used the state machine in the spec
334
## Reset the tokenizer: data state, PCDATA content model, no pending
## token or attribute, empty token queue, and the first input
## character loaded into |next_char|.
sub _initialize_tokenizer ($) {
  my $self = shift;
  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  ## |current_token| is a start tag, end tag, comment, or DOCTYPE
  ## while one is being built; nothing is being built yet.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);
  $self->{char} = [];
  # $self->{next_char}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
349
350 ## A token has:
351 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
352 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
353 ## ->{name} (DOCTYPE_TOKEN)
354 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
355 ## ->{public_identifier} (DOCTYPE_TOKEN)
356 ## ->{system_identifier} (DOCTYPE_TOKEN)
357 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
358 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
359 ## ->{name}
360 ## ->{value}
361 ## ->{has_reference} == 1 or 0
362 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
363
364 ## Emitted token MUST immediately be handled by the tree construction state.
365
366 ## Before each step, UA MAY check to see if either one of the scripts in
367 ## "list of scripts that will execute as soon as possible" or the first
368 ## script in the "list of scripts that will execute asynchronously",
369 ## has completed loading. If one has, then it MUST be executed
370 ## and removed from the list.
371
372 ## NOTE: HTML5 "Writing HTML documents" section, applied to
373 ## documents and not to user agents and conformance checkers,
374 ## contains some requirements that are not detected by the
375 ## parsing algorithm:
376 ## - Some requirements on character encoding declarations. ## TODO
377 ## - "Elements MUST NOT contain content that their content model disallows."
378 ## ... Some are parse error, some are not (will be reported by c.c.).
379 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
380 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
381 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
382
383 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
384 ## be detected by the HTML5 parsing algorithm:
385 ## - Text,
386
387 sub _get_next_token ($) {
388 my $self = shift;
389 if (@{$self->{token}}) {
390 return shift @{$self->{token}};
391 }
392
393 A: {
394 if ($self->{state} == DATA_STATE) {
395 if ($self->{next_char} == 0x0026) { # &
396 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
397 not $self->{escape}) {
398 !!!cp (1);
399 $self->{state} = ENTITY_DATA_STATE;
400 !!!next-input-character;
401 redo A;
402 } else {
403 !!!cp (2);
404 #
405 }
406 } elsif ($self->{next_char} == 0x002D) { # -
407 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
408 unless ($self->{escape}) {
409 if ($self->{prev_char}->[0] == 0x002D and # -
410 $self->{prev_char}->[1] == 0x0021 and # !
411 $self->{prev_char}->[2] == 0x003C) { # <
412 !!!cp (3);
413 $self->{escape} = 1;
414 } else {
415 !!!cp (4);
416 }
417 } else {
418 !!!cp (5);
419 }
420 }
421
422 #
423 } elsif ($self->{next_char} == 0x003C) { # <
424 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
425 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
426 not $self->{escape})) {
427 !!!cp (6);
428 $self->{state} = TAG_OPEN_STATE;
429 !!!next-input-character;
430 redo A;
431 } else {
432 !!!cp (7);
433 #
434 }
435 } elsif ($self->{next_char} == 0x003E) { # >
436 if ($self->{escape} and
437 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
438 if ($self->{prev_char}->[0] == 0x002D and # -
439 $self->{prev_char}->[1] == 0x002D) { # -
440 !!!cp (8);
441 delete $self->{escape};
442 } else {
443 !!!cp (9);
444 }
445 } else {
446 !!!cp (10);
447 }
448
449 #
450 } elsif ($self->{next_char} == -1) {
451 !!!cp (11);
452 !!!emit ({type => END_OF_FILE_TOKEN});
453 last A; ## TODO: ok?
454 } else {
455 !!!cp (12);
456 }
457 # Anything else
458 my $token = {type => CHARACTER_TOKEN,
459 data => chr $self->{next_char}};
460 ## Stay in the data state
461 !!!next-input-character;
462
463 !!!emit ($token);
464
465 redo A;
466 } elsif ($self->{state} == ENTITY_DATA_STATE) {
467 ## (cannot happen in CDATA state)
468
469 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
470
471 $self->{state} = DATA_STATE;
472 # next-input-character is already done
473
474 unless (defined $token) {
475 !!!cp (13);
476 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
477 } else {
478 !!!cp (14);
479 !!!emit ($token);
480 }
481
482 redo A;
483 } elsif ($self->{state} == TAG_OPEN_STATE) {
484 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
485 if ($self->{next_char} == 0x002F) { # /
486 !!!cp (15);
487 !!!next-input-character;
488 $self->{state} = CLOSE_TAG_OPEN_STATE;
489 redo A;
490 } else {
491 !!!cp (16);
492 ## reconsume
493 $self->{state} = DATA_STATE;
494
495 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
496
497 redo A;
498 }
499 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
500 if ($self->{next_char} == 0x0021) { # !
501 !!!cp (17);
502 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
503 !!!next-input-character;
504 redo A;
505 } elsif ($self->{next_char} == 0x002F) { # /
506 !!!cp (18);
507 $self->{state} = CLOSE_TAG_OPEN_STATE;
508 !!!next-input-character;
509 redo A;
510 } elsif (0x0041 <= $self->{next_char} and
511 $self->{next_char} <= 0x005A) { # A..Z
512 !!!cp (19);
513 $self->{current_token}
514 = {type => START_TAG_TOKEN,
515 tag_name => chr ($self->{next_char} + 0x0020)};
516 $self->{state} = TAG_NAME_STATE;
517 !!!next-input-character;
518 redo A;
519 } elsif (0x0061 <= $self->{next_char} and
520 $self->{next_char} <= 0x007A) { # a..z
521 !!!cp (20);
522 $self->{current_token} = {type => START_TAG_TOKEN,
523 tag_name => chr ($self->{next_char})};
524 $self->{state} = TAG_NAME_STATE;
525 !!!next-input-character;
526 redo A;
527 } elsif ($self->{next_char} == 0x003E) { # >
528 !!!cp (21);
529 !!!parse-error (type => 'empty start tag');
530 $self->{state} = DATA_STATE;
531 !!!next-input-character;
532
533 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
534
535 redo A;
536 } elsif ($self->{next_char} == 0x003F) { # ?
537 !!!cp (22);
538 !!!parse-error (type => 'pio');
539 $self->{state} = BOGUS_COMMENT_STATE;
540 ## $self->{next_char} is intentionally left as is
541 redo A;
542 } else {
543 !!!cp (23);
544 !!!parse-error (type => 'bare stago');
545 $self->{state} = DATA_STATE;
546 ## reconsume
547
548 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
549
550 redo A;
551 }
552 } else {
553 die "$0: $self->{content_model} in tag open";
554 }
555 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
556 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
557 if (defined $self->{last_emitted_start_tag_name}) {
558 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
559 my @next_char;
560 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
561 push @next_char, $self->{next_char};
562 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
563 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
564 if ($self->{next_char} == $c or $self->{next_char} == $C) {
565 !!!cp (24);
566 !!!next-input-character;
567 next TAGNAME;
568 } else {
569 !!!cp (25);
570 $self->{next_char} = shift @next_char; # reconsume
571 !!!back-next-input-character (@next_char);
572 $self->{state} = DATA_STATE;
573
574 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
575
576 redo A;
577 }
578 }
579 push @next_char, $self->{next_char};
580
581 unless ($self->{next_char} == 0x0009 or # HT
582 $self->{next_char} == 0x000A or # LF
583 $self->{next_char} == 0x000B or # VT
584 $self->{next_char} == 0x000C or # FF
585 $self->{next_char} == 0x0020 or # SP
586 $self->{next_char} == 0x003E or # >
587 $self->{next_char} == 0x002F or # /
588 $self->{next_char} == -1) {
589 !!!cp (26);
590 $self->{next_char} = shift @next_char; # reconsume
591 !!!back-next-input-character (@next_char);
592 $self->{state} = DATA_STATE;
593 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
594 redo A;
595 } else {
596 !!!cp (27);
597 $self->{next_char} = shift @next_char;
598 !!!back-next-input-character (@next_char);
599 # and consume...
600 }
601 } else {
602 ## No start tag token has ever been emitted
603 !!!cp (28);
604 # next-input-character is already done
605 $self->{state} = DATA_STATE;
606 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
607 redo A;
608 }
609 }
610
611 if (0x0041 <= $self->{next_char} and
612 $self->{next_char} <= 0x005A) { # A..Z
613 !!!cp (29);
614 $self->{current_token} = {type => END_TAG_TOKEN,
615 tag_name => chr ($self->{next_char} + 0x0020)};
616 $self->{state} = TAG_NAME_STATE;
617 !!!next-input-character;
618 redo A;
619 } elsif (0x0061 <= $self->{next_char} and
620 $self->{next_char} <= 0x007A) { # a..z
621 !!!cp (30);
622 $self->{current_token} = {type => END_TAG_TOKEN,
623 tag_name => chr ($self->{next_char})};
624 $self->{state} = TAG_NAME_STATE;
625 !!!next-input-character;
626 redo A;
627 } elsif ($self->{next_char} == 0x003E) { # >
628 !!!cp (31);
629 !!!parse-error (type => 'empty end tag');
630 $self->{state} = DATA_STATE;
631 !!!next-input-character;
632 redo A;
633 } elsif ($self->{next_char} == -1) {
634 !!!cp (32);
635 !!!parse-error (type => 'bare etago');
636 $self->{state} = DATA_STATE;
637 # reconsume
638
639 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
640
641 redo A;
642 } else {
643 !!!cp (33);
644 !!!parse-error (type => 'bogus end tag');
645 $self->{state} = BOGUS_COMMENT_STATE;
646 ## $self->{next_char} is intentionally left as is
647 redo A;
648 }
649 } elsif ($self->{state} == TAG_NAME_STATE) {
650 if ($self->{next_char} == 0x0009 or # HT
651 $self->{next_char} == 0x000A or # LF
652 $self->{next_char} == 0x000B or # VT
653 $self->{next_char} == 0x000C or # FF
654 $self->{next_char} == 0x0020) { # SP
655 !!!cp (34);
656 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
657 !!!next-input-character;
658 redo A;
659 } elsif ($self->{next_char} == 0x003E) { # >
660 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
661 !!!cp (35);
662 $self->{current_token}->{first_start_tag}
663 = not defined $self->{last_emitted_start_tag_name};
664 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
665 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
666 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
667 #if ($self->{current_token}->{attributes}) {
668 # ## NOTE: This should never be reached.
669 # !!! cp (36);
670 # !!! parse-error (type => 'end tag attribute');
671 #} else {
672 !!!cp (37);
673 #}
674 } else {
675 die "$0: $self->{current_token}->{type}: Unknown token type";
676 }
677 $self->{state} = DATA_STATE;
678 !!!next-input-character;
679
680 !!!emit ($self->{current_token}); # start tag or end tag
681
682 redo A;
683 } elsif (0x0041 <= $self->{next_char} and
684 $self->{next_char} <= 0x005A) { # A..Z
685 !!!cp (38);
686 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
687 # start tag or end tag
688 ## Stay in this state
689 !!!next-input-character;
690 redo A;
691 } elsif ($self->{next_char} == -1) {
692 !!!parse-error (type => 'unclosed tag');
693 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
694 !!!cp (39);
695 $self->{current_token}->{first_start_tag}
696 = not defined $self->{last_emitted_start_tag_name};
697 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
698 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
699 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
700 #if ($self->{current_token}->{attributes}) {
701 # ## NOTE: This state should never be reached.
702 # !!! cp (40);
703 # !!! parse-error (type => 'end tag attribute');
704 #} else {
705 !!!cp (41);
706 #}
707 } else {
708 die "$0: $self->{current_token}->{type}: Unknown token type";
709 }
710 $self->{state} = DATA_STATE;
711 # reconsume
712
713 !!!emit ($self->{current_token}); # start tag or end tag
714
715 redo A;
716 } elsif ($self->{next_char} == 0x002F) { # /
717 !!!next-input-character;
718 if ($self->{next_char} == 0x003E and # >
719 $self->{current_token}->{type} == START_TAG_TOKEN and
720 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
721 # permitted slash
722 !!!cp (42);
723 #
724 } else {
725 !!!cp (43);
726 !!!parse-error (type => 'nestc');
727 }
728 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
729 # next-input-character is already done
730 redo A;
731 } else {
732 !!!cp (44);
733 $self->{current_token}->{tag_name} .= chr $self->{next_char};
734 # start tag or end tag
735 ## Stay in the state
736 !!!next-input-character;
737 redo A;
738 }
739 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
740 if ($self->{next_char} == 0x0009 or # HT
741 $self->{next_char} == 0x000A or # LF
742 $self->{next_char} == 0x000B or # VT
743 $self->{next_char} == 0x000C or # FF
744 $self->{next_char} == 0x0020) { # SP
745 !!!cp (45);
746 ## Stay in the state
747 !!!next-input-character;
748 redo A;
749 } elsif ($self->{next_char} == 0x003E) { # >
750 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
751 !!!cp (46);
752 $self->{current_token}->{first_start_tag}
753 = not defined $self->{last_emitted_start_tag_name};
754 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
755 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
756 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
757 if ($self->{current_token}->{attributes}) {
758 !!!cp (47);
759 !!!parse-error (type => 'end tag attribute');
760 } else {
761 !!!cp (48);
762 }
763 } else {
764 die "$0: $self->{current_token}->{type}: Unknown token type";
765 }
766 $self->{state} = DATA_STATE;
767 !!!next-input-character;
768
769 !!!emit ($self->{current_token}); # start tag or end tag
770
771 redo A;
772 } elsif (0x0041 <= $self->{next_char} and
773 $self->{next_char} <= 0x005A) { # A..Z
774 !!!cp (49);
775 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
776 value => ''};
777 $self->{state} = ATTRIBUTE_NAME_STATE;
778 !!!next-input-character;
779 redo A;
780 } elsif ($self->{next_char} == 0x002F) { # /
781 !!!next-input-character;
782 if ($self->{next_char} == 0x003E and # >
783 $self->{current_token}->{type} == START_TAG_TOKEN and
784 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
785 # permitted slash
786 !!!cp (50);
787 #
788 } else {
789 !!!cp (51);
790 !!!parse-error (type => 'nestc');
791 }
792 ## Stay in the state
793 # next-input-character is already done
794 redo A;
795 } elsif ($self->{next_char} == -1) {
796 !!!parse-error (type => 'unclosed tag');
797 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
798 !!!cp (52);
799 $self->{current_token}->{first_start_tag}
800 = not defined $self->{last_emitted_start_tag_name};
801 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
802 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
803 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
804 if ($self->{current_token}->{attributes}) {
805 !!!cp (53);
806 !!!parse-error (type => 'end tag attribute');
807 } else {
808 !!!cp (54);
809 }
810 } else {
811 die "$0: $self->{current_token}->{type}: Unknown token type";
812 }
813 $self->{state} = DATA_STATE;
814 # reconsume
815
816 !!!emit ($self->{current_token}); # start tag or end tag
817
818 redo A;
819 } else {
820 if ({
821 0x0022 => 1, # "
822 0x0027 => 1, # '
823 0x003D => 1, # =
824 }->{$self->{next_char}}) {
825 !!!cp (55);
826 !!!parse-error (type => 'bad attribute name');
827 } else {
828 !!!cp (56);
829 }
830 $self->{current_attribute} = {name => chr ($self->{next_char}),
831 value => ''};
832 $self->{state} = ATTRIBUTE_NAME_STATE;
833 !!!next-input-character;
834 redo A;
835 }
836 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
837 my $before_leave = sub {
838 if (exists $self->{current_token}->{attributes} # start tag or end tag
839 ->{$self->{current_attribute}->{name}}) { # MUST
840 !!!cp (57);
841 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
842 ## Discard $self->{current_attribute} # MUST
843 } else {
844 !!!cp (58);
845 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
846 = $self->{current_attribute};
847 }
848 }; # $before_leave
849
850 if ($self->{next_char} == 0x0009 or # HT
851 $self->{next_char} == 0x000A or # LF
852 $self->{next_char} == 0x000B or # VT
853 $self->{next_char} == 0x000C or # FF
854 $self->{next_char} == 0x0020) { # SP
855 !!!cp (59);
856 $before_leave->();
857 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
858 !!!next-input-character;
859 redo A;
860 } elsif ($self->{next_char} == 0x003D) { # =
861 !!!cp (60);
862 $before_leave->();
863 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
864 !!!next-input-character;
865 redo A;
866 } elsif ($self->{next_char} == 0x003E) { # >
867 $before_leave->();
868 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
869 !!!cp (61);
870 $self->{current_token}->{first_start_tag}
871 = not defined $self->{last_emitted_start_tag_name};
872 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
873 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
874 !!!cp (62);
875 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
876 if ($self->{current_token}->{attributes}) {
877 !!!parse-error (type => 'end tag attribute');
878 }
879 } else {
880 die "$0: $self->{current_token}->{type}: Unknown token type";
881 }
882 $self->{state} = DATA_STATE;
883 !!!next-input-character;
884
885 !!!emit ($self->{current_token}); # start tag or end tag
886
887 redo A;
888 } elsif (0x0041 <= $self->{next_char} and
889 $self->{next_char} <= 0x005A) { # A..Z
890 !!!cp (63);
891 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
892 ## Stay in the state
893 !!!next-input-character;
894 redo A;
895 } elsif ($self->{next_char} == 0x002F) { # /
896 $before_leave->();
897 !!!next-input-character;
898 if ($self->{next_char} == 0x003E and # >
899 $self->{current_token}->{type} == START_TAG_TOKEN and
900 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
901 # permitted slash
902 !!!cp (64);
903 #
904 } else {
905 !!!cp (65);
906 !!!parse-error (type => 'nestc');
907 }
908 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
909 # next-input-character is already done
910 redo A;
911 } elsif ($self->{next_char} == -1) {
912 !!!parse-error (type => 'unclosed tag');
913 $before_leave->();
914 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
915 !!!cp (66);
916 $self->{current_token}->{first_start_tag}
917 = not defined $self->{last_emitted_start_tag_name};
918 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
919 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
920 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
921 if ($self->{current_token}->{attributes}) {
922 !!!cp (67);
923 !!!parse-error (type => 'end tag attribute');
924 } else {
925 ## NOTE: This state should never be reached.
926 !!!cp (68);
927 }
928 } else {
929 die "$0: $self->{current_token}->{type}: Unknown token type";
930 }
931 $self->{state} = DATA_STATE;
932 # reconsume
933
934 !!!emit ($self->{current_token}); # start tag or end tag
935
936 redo A;
937 } else {
938 if ($self->{next_char} == 0x0022 or # "
939 $self->{next_char} == 0x0027) { # '
940 !!!cp (69);
941 !!!parse-error (type => 'bad attribute name');
942 } else {
943 !!!cp (70);
944 }
945 $self->{current_attribute}->{name} .= chr ($self->{next_char});
946 ## Stay in the state
947 !!!next-input-character;
948 redo A;
949 }
950 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
951 if ($self->{next_char} == 0x0009 or # HT
952 $self->{next_char} == 0x000A or # LF
953 $self->{next_char} == 0x000B or # VT
954 $self->{next_char} == 0x000C or # FF
955 $self->{next_char} == 0x0020) { # SP
956 !!!cp (71);
957 ## Stay in the state
958 !!!next-input-character;
959 redo A;
960 } elsif ($self->{next_char} == 0x003D) { # =
961 !!!cp (72);
962 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
963 !!!next-input-character;
964 redo A;
965 } elsif ($self->{next_char} == 0x003E) { # >
966 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
967 !!!cp (73);
968 $self->{current_token}->{first_start_tag}
969 = not defined $self->{last_emitted_start_tag_name};
970 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
971 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
972 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
973 if ($self->{current_token}->{attributes}) {
974 !!!cp (74);
975 !!!parse-error (type => 'end tag attribute');
976 } else {
977 ## NOTE: This state should never be reached.
978 !!!cp (75);
979 }
980 } else {
981 die "$0: $self->{current_token}->{type}: Unknown token type";
982 }
983 $self->{state} = DATA_STATE;
984 !!!next-input-character;
985
986 !!!emit ($self->{current_token}); # start tag or end tag
987
988 redo A;
989 } elsif (0x0041 <= $self->{next_char} and
990 $self->{next_char} <= 0x005A) { # A..Z
991 !!!cp (76);
992 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
993 value => ''};
994 $self->{state} = ATTRIBUTE_NAME_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{next_char} == 0x002F) { # /
998 !!!next-input-character;
999 if ($self->{next_char} == 0x003E and # >
1000 $self->{current_token}->{type} == START_TAG_TOKEN and
1001 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1002 # permitted slash
1003 !!!cp (77);
1004 #
1005 } else {
1006 !!!cp (78);
1007 !!!parse-error (type => 'nestc');
1008 ## TODO: Different error type for <aa / bb> than <aa/>
1009 }
1010 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1011 # next-input-character is already done
1012 redo A;
1013 } elsif ($self->{next_char} == -1) {
1014 !!!parse-error (type => 'unclosed tag');
1015 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1016 !!!cp (79);
1017 $self->{current_token}->{first_start_tag}
1018 = not defined $self->{last_emitted_start_tag_name};
1019 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1020 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1021 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1022 if ($self->{current_token}->{attributes}) {
1023 !!!cp (80);
1024 !!!parse-error (type => 'end tag attribute');
1025 } else {
1026 ## NOTE: This state should never be reached.
1027 !!!cp (81);
1028 }
1029 } else {
1030 die "$0: $self->{current_token}->{type}: Unknown token type";
1031 }
1032 $self->{state} = DATA_STATE;
1033 # reconsume
1034
1035 !!!emit ($self->{current_token}); # start tag or end tag
1036
1037 redo A;
1038 } else {
1039 !!!cp (82);
1040 $self->{current_attribute} = {name => chr ($self->{next_char}),
1041 value => ''};
1042 $self->{state} = ATTRIBUTE_NAME_STATE;
1043 !!!next-input-character;
1044 redo A;
1045 }
1046 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1047 if ($self->{next_char} == 0x0009 or # HT
1048 $self->{next_char} == 0x000A or # LF
1049 $self->{next_char} == 0x000B or # VT
1050 $self->{next_char} == 0x000C or # FF
1051 $self->{next_char} == 0x0020) { # SP
1052 !!!cp (83);
1053 ## Stay in the state
1054 !!!next-input-character;
1055 redo A;
1056 } elsif ($self->{next_char} == 0x0022) { # "
1057 !!!cp (84);
1058 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1059 !!!next-input-character;
1060 redo A;
1061 } elsif ($self->{next_char} == 0x0026) { # &
1062 !!!cp (85);
1063 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1064 ## reconsume
1065 redo A;
1066 } elsif ($self->{next_char} == 0x0027) { # '
1067 !!!cp (86);
1068 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1069 !!!next-input-character;
1070 redo A;
1071 } elsif ($self->{next_char} == 0x003E) { # >
1072 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1073 !!!cp (87);
1074 $self->{current_token}->{first_start_tag}
1075 = not defined $self->{last_emitted_start_tag_name};
1076 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1077 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1078 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1079 if ($self->{current_token}->{attributes}) {
1080 !!!cp (88);
1081 !!!parse-error (type => 'end tag attribute');
1082 } else {
1083 ## NOTE: This state should never be reached.
1084 !!!cp (89);
1085 }
1086 } else {
1087 die "$0: $self->{current_token}->{type}: Unknown token type";
1088 }
1089 $self->{state} = DATA_STATE;
1090 !!!next-input-character;
1091
1092 !!!emit ($self->{current_token}); # start tag or end tag
1093
1094 redo A;
1095 } elsif ($self->{next_char} == -1) {
1096 !!!parse-error (type => 'unclosed tag');
1097 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1098 !!!cp (90);
1099 $self->{current_token}->{first_start_tag}
1100 = not defined $self->{last_emitted_start_tag_name};
1101 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1102 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1103 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1104 if ($self->{current_token}->{attributes}) {
1105 !!!cp (91);
1106 !!!parse-error (type => 'end tag attribute');
1107 } else {
1108 ## NOTE: This state should never be reached.
1109 !!!cp (92);
1110 }
1111 } else {
1112 die "$0: $self->{current_token}->{type}: Unknown token type";
1113 }
1114 $self->{state} = DATA_STATE;
1115 ## reconsume
1116
1117 !!!emit ($self->{current_token}); # start tag or end tag
1118
1119 redo A;
1120 } else {
1121 if ($self->{next_char} == 0x003D) { # =
1122 !!!cp (93);
1123 !!!parse-error (type => 'bad attribute value');
1124 } else {
1125 !!!cp (94);
1126 }
1127 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1128 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1129 !!!next-input-character;
1130 redo A;
1131 }
1132 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1133 if ($self->{next_char} == 0x0022) { # "
1134 !!!cp (95);
1135 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1136 !!!next-input-character;
1137 redo A;
1138 } elsif ($self->{next_char} == 0x0026) { # &
1139 !!!cp (96);
1140 $self->{last_attribute_value_state} = $self->{state};
1141 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1142 !!!next-input-character;
1143 redo A;
1144 } elsif ($self->{next_char} == -1) {
1145 !!!parse-error (type => 'unclosed attribute value');
1146 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1147 !!!cp (97);
1148 $self->{current_token}->{first_start_tag}
1149 = not defined $self->{last_emitted_start_tag_name};
1150 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1151 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1152 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1153 if ($self->{current_token}->{attributes}) {
1154 !!!cp (98);
1155 !!!parse-error (type => 'end tag attribute');
1156 } else {
1157 ## NOTE: This state should never be reached.
1158 !!!cp (99);
1159 }
1160 } else {
1161 die "$0: $self->{current_token}->{type}: Unknown token type";
1162 }
1163 $self->{state} = DATA_STATE;
1164 ## reconsume
1165
1166 !!!emit ($self->{current_token}); # start tag or end tag
1167
1168 redo A;
1169 } else {
1170 !!!cp (100);
1171 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1172 ## Stay in the state
1173 !!!next-input-character;
1174 redo A;
1175 }
1176 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1177 if ($self->{next_char} == 0x0027) { # '
1178 !!!cp (101);
1179 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1180 !!!next-input-character;
1181 redo A;
1182 } elsif ($self->{next_char} == 0x0026) { # &
1183 !!!cp (102);
1184 $self->{last_attribute_value_state} = $self->{state};
1185 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1186 !!!next-input-character;
1187 redo A;
1188 } elsif ($self->{next_char} == -1) {
1189 !!!parse-error (type => 'unclosed attribute value');
1190 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1191 !!!cp (103);
1192 $self->{current_token}->{first_start_tag}
1193 = not defined $self->{last_emitted_start_tag_name};
1194 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1195 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1196 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1197 if ($self->{current_token}->{attributes}) {
1198 !!!cp (104);
1199 !!!parse-error (type => 'end tag attribute');
1200 } else {
1201 ## NOTE: This state should never be reached.
1202 !!!cp (105);
1203 }
1204 } else {
1205 die "$0: $self->{current_token}->{type}: Unknown token type";
1206 }
1207 $self->{state} = DATA_STATE;
1208 ## reconsume
1209
1210 !!!emit ($self->{current_token}); # start tag or end tag
1211
1212 redo A;
1213 } else {
1214 !!!cp (106);
1215 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1216 ## Stay in the state
1217 !!!next-input-character;
1218 redo A;
1219 }
1220 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1221 if ($self->{next_char} == 0x0009 or # HT
1222 $self->{next_char} == 0x000A or # LF
1223 $self->{next_char} == 0x000B or # VT
1224 $self->{next_char} == 0x000C or # FF
1225 $self->{next_char} == 0x0020) { # SP
1226 !!!cp (107);
1227 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1228 !!!next-input-character;
1229 redo A;
1230 } elsif ($self->{next_char} == 0x0026) { # &
1231 !!!cp (108);
1232 $self->{last_attribute_value_state} = $self->{state};
1233 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1234 !!!next-input-character;
1235 redo A;
1236 } elsif ($self->{next_char} == 0x003E) { # >
1237 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1238 !!!cp (109);
1239 $self->{current_token}->{first_start_tag}
1240 = not defined $self->{last_emitted_start_tag_name};
1241 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1242 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1243 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1244 if ($self->{current_token}->{attributes}) {
1245 !!!cp (110);
1246 !!!parse-error (type => 'end tag attribute');
1247 } else {
1248 ## NOTE: This state should never be reached.
1249 !!!cp (111);
1250 }
1251 } else {
1252 die "$0: $self->{current_token}->{type}: Unknown token type";
1253 }
1254 $self->{state} = DATA_STATE;
1255 !!!next-input-character;
1256
1257 !!!emit ($self->{current_token}); # start tag or end tag
1258
1259 redo A;
1260 } elsif ($self->{next_char} == -1) {
1261 !!!parse-error (type => 'unclosed tag');
1262 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1263 !!!cp (112);
1264 $self->{current_token}->{first_start_tag}
1265 = not defined $self->{last_emitted_start_tag_name};
1266 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1267 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1268 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1269 if ($self->{current_token}->{attributes}) {
1270 !!!cp (113);
1271 !!!parse-error (type => 'end tag attribute');
1272 } else {
1273 ## NOTE: This state should never be reached.
1274 !!!cp (114);
1275 }
1276 } else {
1277 die "$0: $self->{current_token}->{type}: Unknown token type";
1278 }
1279 $self->{state} = DATA_STATE;
1280 ## reconsume
1281
1282 !!!emit ($self->{current_token}); # start tag or end tag
1283
1284 redo A;
1285 } else {
1286 if ({
1287 0x0022 => 1, # "
1288 0x0027 => 1, # '
1289 0x003D => 1, # =
1290 }->{$self->{next_char}}) {
1291 !!!cp (115);
1292 !!!parse-error (type => 'bad attribute value');
1293 } else {
1294 !!!cp (116);
1295 }
1296 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1297 ## Stay in the state
1298 !!!next-input-character;
1299 redo A;
1300 }
1301 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1302 my $token = $self->_tokenize_attempt_to_consume_an_entity
1303 (1,
1304 $self->{last_attribute_value_state}
1305 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1306 $self->{last_attribute_value_state}
1307 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1308 -1);
1309
1310 unless (defined $token) {
1311 !!!cp (117);
1312 $self->{current_attribute}->{value} .= '&';
1313 } else {
1314 !!!cp (118);
1315 $self->{current_attribute}->{value} .= $token->{data};
1316 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1317 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1318 }
1319
1320 $self->{state} = $self->{last_attribute_value_state};
1321 # next-input-character is already done
1322 redo A;
1323 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1324 if ($self->{next_char} == 0x0009 or # HT
1325 $self->{next_char} == 0x000A or # LF
1326 $self->{next_char} == 0x000B or # VT
1327 $self->{next_char} == 0x000C or # FF
1328 $self->{next_char} == 0x0020) { # SP
1329 !!!cp (118);
1330 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1331 !!!next-input-character;
1332 redo A;
1333 } elsif ($self->{next_char} == 0x003E) { # >
1334 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1335 !!!cp (119);
1336 $self->{current_token}->{first_start_tag}
1337 = not defined $self->{last_emitted_start_tag_name};
1338 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1339 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1340 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1341 if ($self->{current_token}->{attributes}) {
1342 !!!cp (120);
1343 !!!parse-error (type => 'end tag attribute');
1344 } else {
1345 ## NOTE: This state should never be reached.
1346 !!!cp (121);
1347 }
1348 } else {
1349 die "$0: $self->{current_token}->{type}: Unknown token type";
1350 }
1351 $self->{state} = DATA_STATE;
1352 !!!next-input-character;
1353
1354 !!!emit ($self->{current_token}); # start tag or end tag
1355
1356 redo A;
1357 } elsif ($self->{next_char} == 0x002F) { # /
1358 !!!next-input-character;
1359 if ($self->{next_char} == 0x003E and # >
1360 $self->{current_token}->{type} == START_TAG_TOKEN and
1361 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1362 # permitted slash
1363 !!!cp (122);
1364 #
1365 } else {
1366 !!!cp (123);
1367 !!!parse-error (type => 'nestc');
1368 }
1369 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1370 # next-input-character is already done
1371 redo A;
1372 } else {
1373 !!!cp (124);
1374 !!!parse-error (type => 'no space between attributes');
1375 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1376 ## reconsume
1377 redo A;
1378 }
1379 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1380 ## (only happens in the PCDATA state)
1381
1382 my $token = {type => COMMENT_TOKEN, data => ''};
1383
1384 BC: {
1385 if ($self->{next_char} == 0x003E) { # >
1386 !!!cp (124);
1387 $self->{state} = DATA_STATE;
1388 !!!next-input-character;
1389
1390 !!!emit ($token);
1391
1392 redo A;
1393 } elsif ($self->{next_char} == -1) {
1394 !!!cp (125);
1395 $self->{state} = DATA_STATE;
1396 ## reconsume
1397
1398 !!!emit ($token);
1399
1400 redo A;
1401 } else {
1402 !!!cp (126);
1403 $token->{data} .= chr ($self->{next_char});
1404 !!!next-input-character;
1405 redo BC;
1406 }
1407 } # BC
1408
1409 die "$0: _get_next_token: unexpected case [BC]";
1410 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1411 ## (only happens in the PCDATA state)
1412
1413 my @next_char;
1414 push @next_char, $self->{next_char};
1415
1416 if ($self->{next_char} == 0x002D) { # -
1417 !!!next-input-character;
1418 push @next_char, $self->{next_char};
1419 if ($self->{next_char} == 0x002D) { # -
1420 !!!cp (127);
1421 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1422 $self->{state} = COMMENT_START_STATE;
1423 !!!next-input-character;
1424 redo A;
1425 } else {
1426 !!!cp (128);
1427 }
1428 } elsif ($self->{next_char} == 0x0044 or # D
1429 $self->{next_char} == 0x0064) { # d
1430 !!!next-input-character;
1431 push @next_char, $self->{next_char};
1432 if ($self->{next_char} == 0x004F or # O
1433 $self->{next_char} == 0x006F) { # o
1434 !!!next-input-character;
1435 push @next_char, $self->{next_char};
1436 if ($self->{next_char} == 0x0043 or # C
1437 $self->{next_char} == 0x0063) { # c
1438 !!!next-input-character;
1439 push @next_char, $self->{next_char};
1440 if ($self->{next_char} == 0x0054 or # T
1441 $self->{next_char} == 0x0074) { # t
1442 !!!next-input-character;
1443 push @next_char, $self->{next_char};
1444 if ($self->{next_char} == 0x0059 or # Y
1445 $self->{next_char} == 0x0079) { # y
1446 !!!next-input-character;
1447 push @next_char, $self->{next_char};
1448 if ($self->{next_char} == 0x0050 or # P
1449 $self->{next_char} == 0x0070) { # p
1450 !!!next-input-character;
1451 push @next_char, $self->{next_char};
1452 if ($self->{next_char} == 0x0045 or # E
1453 $self->{next_char} == 0x0065) { # e
1454 !!!cp (129);
1455 ## TODO: This letter-by-letter nested match of "DOCTYPE" is clumsy; rewrite it.
1456 $self->{state} = DOCTYPE_STATE;
1457 !!!next-input-character;
1458 redo A;
1459 } else {
1460 !!!cp (130);
1461 }
1462 } else {
1463 !!!cp (131);
1464 }
1465 } else {
1466 !!!cp (132);
1467 }
1468 } else {
1469 !!!cp (133);
1470 }
1471 } else {
1472 !!!cp (134);
1473 }
1474 } else {
1475 !!!cp (135);
1476 }
1477 } else {
1478 !!!cp (136);
1479 }
1480
1481 !!!parse-error (type => 'bogus comment');
1482 $self->{next_char} = shift @next_char;
1483 !!!back-next-input-character (@next_char);
1484 $self->{state} = BOGUS_COMMENT_STATE;
1485 redo A;
1486
1487 ## ISSUE: typos in spec: chacacters, is is a parse error
1488 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1489 } elsif ($self->{state} == COMMENT_START_STATE) {
1490 if ($self->{next_char} == 0x002D) { # -
1491 !!!cp (137);
1492 $self->{state} = COMMENT_START_DASH_STATE;
1493 !!!next-input-character;
1494 redo A;
1495 } elsif ($self->{next_char} == 0x003E) { # >
1496 !!!cp (138);
1497 !!!parse-error (type => 'bogus comment');
1498 $self->{state} = DATA_STATE;
1499 !!!next-input-character;
1500
1501 !!!emit ($self->{current_token}); # comment
1502
1503 redo A;
1504 } elsif ($self->{next_char} == -1) {
1505 !!!cp (139);
1506 !!!parse-error (type => 'unclosed comment');
1507 $self->{state} = DATA_STATE;
1508 ## reconsume
1509
1510 !!!emit ($self->{current_token}); # comment
1511
1512 redo A;
1513 } else {
1514 !!!cp (140);
1515 $self->{current_token}->{data} # comment
1516 .= chr ($self->{next_char});
1517 $self->{state} = COMMENT_STATE;
1518 !!!next-input-character;
1519 redo A;
1520 }
1521 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1522 if ($self->{next_char} == 0x002D) { # -
1523 !!!cp (141);
1524 $self->{state} = COMMENT_END_STATE;
1525 !!!next-input-character;
1526 redo A;
1527 } elsif ($self->{next_char} == 0x003E) { # >
1528 !!!cp (142);
1529 !!!parse-error (type => 'bogus comment');
1530 $self->{state} = DATA_STATE;
1531 !!!next-input-character;
1532
1533 !!!emit ($self->{current_token}); # comment
1534
1535 redo A;
1536 } elsif ($self->{next_char} == -1) {
1537 !!!cp (143);
1538 !!!parse-error (type => 'unclosed comment');
1539 $self->{state} = DATA_STATE;
1540 ## reconsume
1541
1542 !!!emit ($self->{current_token}); # comment
1543
1544 redo A;
1545 } else {
1546 !!!cp (144);
1547 $self->{current_token}->{data} # comment
1548 .= '-' . chr ($self->{next_char});
1549 $self->{state} = COMMENT_STATE;
1550 !!!next-input-character;
1551 redo A;
1552 }
1553 } elsif ($self->{state} == COMMENT_STATE) {
1554 if ($self->{next_char} == 0x002D) { # -
1555 !!!cp (145);
1556 $self->{state} = COMMENT_END_DASH_STATE;
1557 !!!next-input-character;
1558 redo A;
1559 } elsif ($self->{next_char} == -1) {
1560 !!!cp (146);
1561 !!!parse-error (type => 'unclosed comment');
1562 $self->{state} = DATA_STATE;
1563 ## reconsume
1564
1565 !!!emit ($self->{current_token}); # comment
1566
1567 redo A;
1568 } else {
1569 !!!cp (147);
1570 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1571 ## Stay in the state
1572 !!!next-input-character;
1573 redo A;
1574 }
1575 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1576 if ($self->{next_char} == 0x002D) { # -
1577 !!!cp (148);
1578 $self->{state} = COMMENT_END_STATE;
1579 !!!next-input-character;
1580 redo A;
1581 } elsif ($self->{next_char} == -1) {
1582 !!!cp (149);
1583 !!!parse-error (type => 'unclosed comment');
1584 $self->{state} = DATA_STATE;
1585 ## reconsume
1586
1587 !!!emit ($self->{current_token}); # comment
1588
1589 redo A;
1590 } else {
1591 !!!cp (150);
1592 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
1593 $self->{state} = COMMENT_STATE;
1594 !!!next-input-character;
1595 redo A;
1596 }
1597 } elsif ($self->{state} == COMMENT_END_STATE) {
1598 if ($self->{next_char} == 0x003E) { # >
1599 !!!cp (151);
1600 $self->{state} = DATA_STATE;
1601 !!!next-input-character;
1602
1603 !!!emit ($self->{current_token}); # comment
1604
1605 redo A;
1606 } elsif ($self->{next_char} == 0x002D) { # -
1607 !!!cp (152);
1608 !!!parse-error (type => 'dash in comment');
1609 $self->{current_token}->{data} .= '-'; # comment
1610 ## Stay in the state
1611 !!!next-input-character;
1612 redo A;
1613 } elsif ($self->{next_char} == -1) {
1614 !!!cp (153);
1615 !!!parse-error (type => 'unclosed comment');
1616 $self->{state} = DATA_STATE;
1617 ## reconsume
1618
1619 !!!emit ($self->{current_token}); # comment
1620
1621 redo A;
1622 } else {
1623 !!!cp (154);
1624 !!!parse-error (type => 'dash in comment');
1625 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
1626 $self->{state} = COMMENT_STATE;
1627 !!!next-input-character;
1628 redo A;
1629 }
1630 } elsif ($self->{state} == DOCTYPE_STATE) {
1631 if ($self->{next_char} == 0x0009 or # HT
1632 $self->{next_char} == 0x000A or # LF
1633 $self->{next_char} == 0x000B or # VT
1634 $self->{next_char} == 0x000C or # FF
1635 $self->{next_char} == 0x0020) { # SP
1636 !!!cp (155);
1637 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1638 !!!next-input-character;
1639 redo A;
1640 } else {
1641 !!!cp (156);
1642 !!!parse-error (type => 'no space before DOCTYPE name');
1643 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1644 ## reconsume
1645 redo A;
1646 }
1647 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1648 if ($self->{next_char} == 0x0009 or # HT
1649 $self->{next_char} == 0x000A or # LF
1650 $self->{next_char} == 0x000B or # VT
1651 $self->{next_char} == 0x000C or # FF
1652 $self->{next_char} == 0x0020) { # SP
1653 !!!cp (157);
1654 ## Stay in the state
1655 !!!next-input-character;
1656 redo A;
1657 } elsif ($self->{next_char} == 0x003E) { # >
1658 !!!cp (158);
1659 !!!parse-error (type => 'no DOCTYPE name');
1660 $self->{state} = DATA_STATE;
1661 !!!next-input-character;
1662
1663 !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
1664
1665 redo A;
1666 } elsif ($self->{next_char} == -1) {
1667 !!!cp (159);
1668 !!!parse-error (type => 'no DOCTYPE name');
1669 $self->{state} = DATA_STATE;
1670 ## reconsume
1671
1672 !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
1673
1674 redo A;
1675 } else {
1676 !!!cp (160);
1677 $self->{current_token}
1678 = {type => DOCTYPE_TOKEN,
1679 name => chr ($self->{next_char}),
1680 #quirks => 0,
1681 };
1682 ## ISSUE: "Set the token's name name to the" in the spec
1683 $self->{state} = DOCTYPE_NAME_STATE;
1684 !!!next-input-character;
1685 redo A;
1686 }
1687 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1688 ## ISSUE: Redundant "First," in the spec.
1689 if ($self->{next_char} == 0x0009 or # HT
1690 $self->{next_char} == 0x000A or # LF
1691 $self->{next_char} == 0x000B or # VT
1692 $self->{next_char} == 0x000C or # FF
1693 $self->{next_char} == 0x0020) { # SP
1694 !!!cp (161);
1695 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1696 !!!next-input-character;
1697 redo A;
1698 } elsif ($self->{next_char} == 0x003E) { # >
1699 !!!cp (162);
1700 $self->{state} = DATA_STATE;
1701 !!!next-input-character;
1702
1703 !!!emit ($self->{current_token}); # DOCTYPE
1704
1705 redo A;
1706 } elsif ($self->{next_char} == -1) {
1707 !!!cp (163);
1708 !!!parse-error (type => 'unclosed DOCTYPE');
1709 $self->{state} = DATA_STATE;
1710 ## reconsume
1711
1712 $self->{current_token}->{quirks} = 1;
1713 !!!emit ($self->{current_token}); # DOCTYPE
1714
1715 redo A;
1716 } else {
1717 !!!cp (164);
1718 $self->{current_token}->{name}
1719 .= chr ($self->{next_char}); # DOCTYPE
1720 ## Stay in the state
1721 !!!next-input-character;
1722 redo A;
1723 }
1724 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1725 if ($self->{next_char} == 0x0009 or # HT
1726 $self->{next_char} == 0x000A or # LF
1727 $self->{next_char} == 0x000B or # VT
1728 $self->{next_char} == 0x000C or # FF
1729 $self->{next_char} == 0x0020) { # SP
1730 !!!cp (165);
1731 ## Stay in the state
1732 !!!next-input-character;
1733 redo A;
1734 } elsif ($self->{next_char} == 0x003E) { # >
1735 !!!cp (166);
1736 $self->{state} = DATA_STATE;
1737 !!!next-input-character;
1738
1739 !!!emit ($self->{current_token}); # DOCTYPE
1740
1741 redo A;
1742 } elsif ($self->{next_char} == -1) {
1743 !!!cp (167);
1744 !!!parse-error (type => 'unclosed DOCTYPE');
1745 $self->{state} = DATA_STATE;
1746 ## reconsume
1747
1748 $self->{current_token}->{quirks} = 1;
1749 !!!emit ($self->{current_token}); # DOCTYPE
1750
1751 redo A;
1752 } elsif ($self->{next_char} == 0x0050 or # P
1753 $self->{next_char} == 0x0070) { # p
1754 !!!next-input-character;
1755 if ($self->{next_char} == 0x0055 or # U
1756 $self->{next_char} == 0x0075) { # u
1757 !!!next-input-character;
1758 if ($self->{next_char} == 0x0042 or # B
1759 $self->{next_char} == 0x0062) { # b
1760 !!!next-input-character;
1761 if ($self->{next_char} == 0x004C or # L
1762 $self->{next_char} == 0x006C) { # l
1763 !!!next-input-character;
1764 if ($self->{next_char} == 0x0049 or # I
1765 $self->{next_char} == 0x0069) { # i
1766 !!!next-input-character;
1767 if ($self->{next_char} == 0x0043 or # C
1768 $self->{next_char} == 0x0063) { # c
1769 !!!cp (168);
1770 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1771 !!!next-input-character;
1772 redo A;
1773 } else {
1774 !!!cp (169);
1775 }
1776 } else {
1777 !!!cp (170);
1778 }
1779 } else {
1780 !!!cp (171);
1781 }
1782 } else {
1783 !!!cp (172);
1784 }
1785 } else {
1786 !!!cp (173);
1787 }
1788
1789 #
1790 } elsif ($self->{next_char} == 0x0053 or # S
1791 $self->{next_char} == 0x0073) { # s
1792 !!!next-input-character;
1793 if ($self->{next_char} == 0x0059 or # Y
1794 $self->{next_char} == 0x0079) { # y
1795 !!!next-input-character;
1796 if ($self->{next_char} == 0x0053 or # S
1797 $self->{next_char} == 0x0073) { # s
1798 !!!next-input-character;
1799 if ($self->{next_char} == 0x0054 or # T
1800 $self->{next_char} == 0x0074) { # t
1801 !!!next-input-character;
1802 if ($self->{next_char} == 0x0045 or # E
1803 $self->{next_char} == 0x0065) { # e
1804 !!!next-input-character;
1805 if ($self->{next_char} == 0x004D or # M
1806 $self->{next_char} == 0x006D) { # m
1807 !!!cp (174);
1808 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1809 !!!next-input-character;
1810 redo A;
1811 } else {
1812 !!!cp (175);
1813 }
1814 } else {
1815 !!!cp (176);
1816 }
1817 } else {
1818 !!!cp (177);
1819 }
1820 } else {
1821 !!!cp (178);
1822 }
1823 } else {
1824 !!!cp (179);
1825 }
1826
1827 #
1828 } else {
1829 !!!cp (180);
1830 !!!next-input-character;
1831 #
1832 }
1833
1834 !!!parse-error (type => 'string after DOCTYPE name');
1835 $self->{current_token}->{quirks} = 1;
1836
1837 $self->{state} = BOGUS_DOCTYPE_STATE;
1838 # next-input-character is already done
1839 redo A;
1840 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1841 if ({
1842 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1843 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1844 }->{$self->{next_char}}) {
1845 !!!cp (181);
1846 ## Stay in the state
1847 !!!next-input-character;
1848 redo A;
1849 } elsif ($self->{next_char} eq 0x0022) { # "
1850 !!!cp (182);
1851 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1852 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1853 !!!next-input-character;
1854 redo A;
1855 } elsif ($self->{next_char} eq 0x0027) { # '
1856 !!!cp (183);
1857 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1858 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1859 !!!next-input-character;
1860 redo A;
1861 } elsif ($self->{next_char} eq 0x003E) { # >
1862 !!!cp (184);
1863 !!!parse-error (type => 'no PUBLIC literal');
1864
1865 $self->{state} = DATA_STATE;
1866 !!!next-input-character;
1867
1868 $self->{current_token}->{quirks} = 1;
1869 !!!emit ($self->{current_token}); # DOCTYPE
1870
1871 redo A;
1872 } elsif ($self->{next_char} == -1) {
1873 !!!cp (185);
1874 !!!parse-error (type => 'unclosed DOCTYPE');
1875
1876 $self->{state} = DATA_STATE;
1877 ## reconsume
1878
1879 $self->{current_token}->{quirks} = 1;
1880 !!!emit ($self->{current_token}); # DOCTYPE
1881
1882 redo A;
1883 } else {
1884 !!!cp (186);
1885 !!!parse-error (type => 'string after PUBLIC');
1886 $self->{current_token}->{quirks} = 1;
1887
1888 $self->{state} = BOGUS_DOCTYPE_STATE;
1889 !!!next-input-character;
1890 redo A;
1891 }
1892 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1893 if ($self->{next_char} == 0x0022) { # "
1894 !!!cp (187);
1895 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1896 !!!next-input-character;
1897 redo A;
1898 } elsif ($self->{next_char} == 0x003E) { # >
1899 !!!cp (188);
1900 !!!parse-error (type => 'unclosed PUBLIC literal');
1901
1902 $self->{state} = DATA_STATE;
1903 !!!next-input-character;
1904
1905 $self->{current_token}->{quirks} = 1;
1906 !!!emit ($self->{current_token}); # DOCTYPE
1907
1908 redo A;
1909 } elsif ($self->{next_char} == -1) {
1910 !!!cp (189);
1911 !!!parse-error (type => 'unclosed PUBLIC literal');
1912
1913 $self->{state} = DATA_STATE;
1914 ## reconsume
1915
1916 $self->{current_token}->{quirks} = 1;
1917 !!!emit ($self->{current_token}); # DOCTYPE
1918
1919 redo A;
1920 } else {
1921 !!!cp (190);
1922 $self->{current_token}->{public_identifier} # DOCTYPE
1923 .= chr $self->{next_char};
1924 ## Stay in the state
1925 !!!next-input-character;
1926 redo A;
1927 }
1928 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1929 if ($self->{next_char} == 0x0027) { # '
1930 !!!cp (191);
1931 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1932 !!!next-input-character;
1933 redo A;
1934 } elsif ($self->{next_char} == 0x003E) { # >
1935 !!!cp (192);
1936 !!!parse-error (type => 'unclosed PUBLIC literal');
1937
1938 $self->{state} = DATA_STATE;
1939 !!!next-input-character;
1940
1941 $self->{current_token}->{quirks} = 1;
1942 !!!emit ($self->{current_token}); # DOCTYPE
1943
1944 redo A;
1945 } elsif ($self->{next_char} == -1) {
1946 !!!cp (193);
1947 !!!parse-error (type => 'unclosed PUBLIC literal');
1948
1949 $self->{state} = DATA_STATE;
1950 ## reconsume
1951
1952 $self->{current_token}->{quirks} = 1;
1953 !!!emit ($self->{current_token}); # DOCTYPE
1954
1955 redo A;
1956 } else {
1957 !!!cp (194);
1958 $self->{current_token}->{public_identifier} # DOCTYPE
1959 .= chr $self->{next_char};
1960 ## Stay in the state
1961 !!!next-input-character;
1962 redo A;
1963 }
1964 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1965 if ({
1966 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1967 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1968 }->{$self->{next_char}}) {
1969 !!!cp (195);
1970 ## Stay in the state
1971 !!!next-input-character;
1972 redo A;
1973 } elsif ($self->{next_char} == 0x0022) { # "
1974 !!!cp (196);
1975 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1976 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1977 !!!next-input-character;
1978 redo A;
1979 } elsif ($self->{next_char} == 0x0027) { # '
1980 !!!cp (197);
1981 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1982 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1983 !!!next-input-character;
1984 redo A;
1985 } elsif ($self->{next_char} == 0x003E) { # >
1986 !!!cp (198);
1987 $self->{state} = DATA_STATE;
1988 !!!next-input-character;
1989
1990 !!!emit ($self->{current_token}); # DOCTYPE
1991
1992 redo A;
1993 } elsif ($self->{next_char} == -1) {
1994 !!!cp (199);
1995 !!!parse-error (type => 'unclosed DOCTYPE');
1996
1997 $self->{state} = DATA_STATE;
1998 ## reconsume
1999
2000 $self->{current_token}->{quirks} = 1;
2001 !!!emit ($self->{current_token}); # DOCTYPE
2002
2003 redo A;
2004 } else {
2005 !!!cp (200);
2006 !!!parse-error (type => 'string after PUBLIC literal');
2007 $self->{current_token}->{quirks} = 1;
2008
2009 $self->{state} = BOGUS_DOCTYPE_STATE;
2010 !!!next-input-character;
2011 redo A;
2012 }
2013 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2014 if ({
2015 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2016 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2017 }->{$self->{next_char}}) {
2018 !!!cp (201);
2019 ## Stay in the state
2020 !!!next-input-character;
2021 redo A;
2022 } elsif ($self->{next_char} == 0x0022) { # "
2023 !!!cp (202);
2024 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2025 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2026 !!!next-input-character;
2027 redo A;
2028 } elsif ($self->{next_char} == 0x0027) { # '
2029 !!!cp (203);
2030 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2031 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2032 !!!next-input-character;
2033 redo A;
2034 } elsif ($self->{next_char} == 0x003E) { # >
2035 !!!cp (204);
2036 !!!parse-error (type => 'no SYSTEM literal');
2037 $self->{state} = DATA_STATE;
2038 !!!next-input-character;
2039
2040 $self->{current_token}->{quirks} = 1;
2041 !!!emit ($self->{current_token}); # DOCTYPE
2042
2043 redo A;
2044 } elsif ($self->{next_char} == -1) {
2045 !!!cp (205);
2046 !!!parse-error (type => 'unclosed DOCTYPE');
2047
2048 $self->{state} = DATA_STATE;
2049 ## reconsume
2050
2051 $self->{current_token}->{quirks} = 1;
2052 !!!emit ($self->{current_token}); # DOCTYPE
2053
2054 redo A;
2055 } else {
2056 !!!cp (206);
2057 !!!parse-error (type => 'string after SYSTEM');
2058 $self->{current_token}->{quirks} = 1;
2059
2060 $self->{state} = BOGUS_DOCTYPE_STATE;
2061 !!!next-input-character;
2062 redo A;
2063 }
2064 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2065 if ($self->{next_char} == 0x0022) { # "
2066 !!!cp (207);
2067 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2068 !!!next-input-character;
2069 redo A;
2070 } elsif ($self->{next_char} == 0x003E) { # >
2071 !!!cp (208);
2072 !!!parse-error (type => 'unclosed PUBLIC literal');
2073
2074 $self->{state} = DATA_STATE;
2075 !!!next-input-character;
2076
2077 $self->{current_token}->{quirks} = 1;
2078 !!!emit ($self->{current_token}); # DOCTYPE
2079
2080 redo A;
2081 } elsif ($self->{next_char} == -1) {
2082 !!!cp (209);
2083 !!!parse-error (type => 'unclosed SYSTEM literal');
2084
2085 $self->{state} = DATA_STATE;
2086 ## reconsume
2087
2088 $self->{current_token}->{quirks} = 1;
2089 !!!emit ($self->{current_token}); # DOCTYPE
2090
2091 redo A;
2092 } else {
2093 !!!cp (210);
2094 $self->{current_token}->{system_identifier} # DOCTYPE
2095 .= chr $self->{next_char};
2096 ## Stay in the state
2097 !!!next-input-character;
2098 redo A;
2099 }
2100 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2101 if ($self->{next_char} == 0x0027) { # '
2102 !!!cp (211);
2103 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2104 !!!next-input-character;
2105 redo A;
2106 } elsif ($self->{next_char} == 0x003E) { # >
2107 !!!cp (212);
2108 !!!parse-error (type => 'unclosed PUBLIC literal');
2109
2110 $self->{state} = DATA_STATE;
2111 !!!next-input-character;
2112
2113 $self->{current_token}->{quirks} = 1;
2114 !!!emit ($self->{current_token}); # DOCTYPE
2115
2116 redo A;
2117 } elsif ($self->{next_char} == -1) {
2118 !!!cp (213);
2119 !!!parse-error (type => 'unclosed SYSTEM literal');
2120
2121 $self->{state} = DATA_STATE;
2122 ## reconsume
2123
2124 $self->{current_token}->{quirks} = 1;
2125 !!!emit ($self->{current_token}); # DOCTYPE
2126
2127 redo A;
2128 } else {
2129 !!!cp (214);
2130 $self->{current_token}->{system_identifier} # DOCTYPE
2131 .= chr $self->{next_char};
2132 ## Stay in the state
2133 !!!next-input-character;
2134 redo A;
2135 }
2136 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2137 if ({
2138 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2139 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2140 }->{$self->{next_char}}) {
2141 !!!cp (215);
2142 ## Stay in the state
2143 !!!next-input-character;
2144 redo A;
2145 } elsif ($self->{next_char} == 0x003E) { # >
2146 !!!cp (216);
2147 $self->{state} = DATA_STATE;
2148 !!!next-input-character;
2149
2150 !!!emit ($self->{current_token}); # DOCTYPE
2151
2152 redo A;
2153 } elsif ($self->{next_char} == -1) {
2154 !!!cp (217);
2155 !!!parse-error (type => 'unclosed DOCTYPE');
2156
2157 $self->{state} = DATA_STATE;
2158 ## reconsume
2159
2160 $self->{current_token}->{quirks} = 1;
2161 !!!emit ($self->{current_token}); # DOCTYPE
2162
2163 redo A;
2164 } else {
2165 !!!cp (218);
2166 !!!parse-error (type => 'string after SYSTEM literal');
2167 #$self->{current_token}->{quirks} = 1;
2168
2169 $self->{state} = BOGUS_DOCTYPE_STATE;
2170 !!!next-input-character;
2171 redo A;
2172 }
2173 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2174 if ($self->{next_char} == 0x003E) { # >
2175 !!!cp (219);
2176 $self->{state} = DATA_STATE;
2177 !!!next-input-character;
2178
2179 !!!emit ($self->{current_token}); # DOCTYPE
2180
2181 redo A;
2182 } elsif ($self->{next_char} == -1) {
2183 !!!cp (220);
2184 !!!parse-error (type => 'unclosed DOCTYPE');
2185 $self->{state} = DATA_STATE;
2186 ## reconsume
2187
2188 !!!emit ($self->{current_token}); # DOCTYPE
2189
2190 redo A;
2191 } else {
2192 !!!cp (221);
2193 ## Stay in the state
2194 !!!next-input-character;
2195 redo A;
2196 }
2197 } else {
2198 die "$0: $self->{state}: Unknown state";
2199 }
2200 } # A
2201
2202 die "$0: _get_next_token: unexpected case";
2203 } # _get_next_token
2204
## _tokenize_attempt_to_consume_an_entity ($self, $in_attr, $additional)
##
## Tries to consume a character reference starting at the current
## |$self->{next_char}| (the character following an "&").
##
## Arguments:
##   $in_attr    - Only consulted when a named reference is matched
##                 without a terminating ";" and further name characters
##                 follow it (presumably true while tokenizing an
##                 attribute value - confirm against the callers).
##   $additional - One extra code point that, like white space, "<",
##                 "&" and end-of-file, prevents any attempt to consume
##                 a reference.
##
## Returns a CHARACTER_TOKEN hash reference (|data| holds the resulting
## text; |has_reference| is set when a reference was actually resolved),
## or |undef| when nothing was recognized as a reference.
2205 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2206 my ($self, $in_attr, $additional) = @_;
2207
## Characters after which no reference is attempted at all.
2208 if ({
2209 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2210 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
2211 $additional => 1,
2212 }->{$self->{next_char}}) {
2213 !!!cp (1001);
2214 ## Don't consume
2215 ## No error
2216 return undef;
2217 } elsif ($self->{next_char} == 0x0023) { # #
## Numeric character reference: "&#" has been seen.
2218 !!!next-input-character;
2219 if ($self->{next_char} == 0x0078 or # x
2220 $self->{next_char} == 0x0058) { # X
## Hexadecimal form.  |$code| stays undefined until the first hex
## digit has been seen, which is how "&#x" with no digit is detected.
2221 my $code;
2222 X: {
## Each pass first remembers the current character and then advances;
## on the first pass the remembered character is the "x"/"X" itself.
2223 my $x_char = $self->{next_char};
2224 !!!next-input-character;
2225 if (0x0030 <= $self->{next_char} and
2226 $self->{next_char} <= 0x0039) { # 0..9
2227 !!!cp (1002);
2228 $code ||= 0;
2229 $code *= 0x10;
2230 $code += $self->{next_char} - 0x0030;
2231 redo X;
2232 } elsif (0x0061 <= $self->{next_char} and
2233 $self->{next_char} <= 0x0066) { # a..f
2234 !!!cp (1003);
2235 $code ||= 0;
2236 $code *= 0x10;
## "a" (0x61) - 0x60 + 9 == 10, ..., "f" == 15.
2237 $code += $self->{next_char} - 0x0060 + 9;
2238 redo X;
2239 } elsif (0x0041 <= $self->{next_char} and
2240 $self->{next_char} <= 0x0046) { # A..F
2241 !!!cp (1004);
2242 $code ||= 0;
2243 $code *= 0x10;
## "A" (0x41) - 0x40 + 9 == 10, ..., "F" == 15.
2244 $code += $self->{next_char} - 0x0040 + 9;
2245 redo X;
2246 } elsif (not defined $code) { # no hexadecimal digit
2247 !!!cp (1005);
2248 !!!parse-error (type => 'bare hcro');
## Hand the "x"/"X" and the current character to the push-back macro
## (presumably un-consuming them - confirm against the macro
## definition) and make "#" the next input character again.
2249 !!!back-next-input-character ($x_char, $self->{next_char});
2250 $self->{next_char} = 0x0023; # #
2251 return undef;
2252 } elsif ($self->{next_char} == 0x003B) { # ;
2253 !!!cp (1006);
2254 !!!next-input-character;
2255 } else {
2256 !!!cp (1007);
2257 !!!parse-error (type => 'no refc');
2258 }
2259
## Replace code points that must not be produced by a reference:
## zero and surrogates become U+FFFD, values above U+10FFFF become
## U+FFFD, CR becomes LF, and 0x80..0x9F are remapped through the
## |$c1_entity_char| table defined near the top of this file.
2260 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2261 !!!cp (1008);
2262 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
2263 $code = 0xFFFD;
2264 } elsif ($code > 0x10FFFF) {
2265 !!!cp (1009);
2266 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
2267 $code = 0xFFFD;
2268 } elsif ($code == 0x000D) {
2269 !!!cp (1010);
2270 !!!parse-error (type => 'CR character reference');
2271 $code = 0x000A;
2272 } elsif (0x80 <= $code and $code <= 0x9F) {
2273 !!!cp (1011);
2274 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
2275 $code = $c1_entity_char->{$code};
2276 }
2277
2278 return {type => CHARACTER_TOKEN, data => chr $code,
2279 has_reference => 1};
2280 } # X
2281 } elsif (0x0030 <= $self->{next_char} and
2282 $self->{next_char} <= 0x0039) { # 0..9
## Decimal form: accumulate digits until a non-digit is seen.
2283 my $code = $self->{next_char} - 0x0030;
2284 !!!next-input-character;
2285
2286 while (0x0030 <= $self->{next_char} and
2287 $self->{next_char} <= 0x0039) { # 0..9
2288 !!!cp (1012);
2289 $code *= 10;
2290 $code += $self->{next_char} - 0x0030;
2291
2292 !!!next-input-character;
2293 }
2294
2295 if ($self->{next_char} == 0x003B) { # ;
2296 !!!cp (1013);
2297 !!!next-input-character;
2298 } else {
2299 !!!cp (1014);
2300 !!!parse-error (type => 'no refc');
2301 }
2302
## Same code point fix-ups as in the hexadecimal branch above.
2303 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2304 !!!cp (1015);
2305 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
2306 $code = 0xFFFD;
2307 } elsif ($code > 0x10FFFF) {
2308 !!!cp (1016);
2309 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
2310 $code = 0xFFFD;
2311 } elsif ($code == 0x000D) {
2312 !!!cp (1017);
2313 !!!parse-error (type => 'CR character reference');
2314 $code = 0x000A;
2315 } elsif (0x80 <= $code and $code <= 0x9F) {
2316 !!!cp (1018);
2317 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
2318 $code = $c1_entity_char->{$code};
2319 }
2320
2321 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
2322 } else {
## "&#" followed by neither "x"/"X" nor a digit.
2323 !!!cp (1019);
2324 !!!parse-error (type => 'bare nero');
2325 !!!back-next-input-character ($self->{next_char});
2326 $self->{next_char} = 0x0023; # #
2327 return undef;
2328 }
2329 } elsif ((0x0041 <= $self->{next_char} and
2330 $self->{next_char} <= 0x005A) or
2331 (0x0061 <= $self->{next_char} and
2332 $self->{next_char} <= 0x007A)) {
## Named character reference: the name starts with an ASCII letter.
2333 my $entity_name = chr $self->{next_char};
2334 !!!next-input-character;
2335
## |$value| is the text to emit: the replacement of the most recent
## match followed by any characters consumed after it, or the raw
## name when nothing has matched.
## |$match| is 1 for a match terminated by ";", -1 for a match without
## ";", and 0 for no match.  It is doubled for every character
## consumed after a match, so a value below -1 means that more name
## characters followed a ";"-less match.
2336 my $value = $entity_name;
2337 my $match = 0;
## The name table is loaded on first use (presumably the module
## populates |$EntityChar| - confirm against Whatpm::_NamedEntityList).
2338 require Whatpm::_NamedEntityList;
2339 our $EntityChar;
2340
2341 while (length $entity_name < 10 and
2342 ## NOTE: Some number greater than the maximum length of entity name
2343 ((0x0041 <= $self->{next_char} and # a
2344 $self->{next_char} <= 0x005A) or # x
2345 (0x0061 <= $self->{next_char} and # a
2346 $self->{next_char} <= 0x007A) or # z
2347 (0x0030 <= $self->{next_char} and # 0
2348 $self->{next_char} <= 0x0039) or # 9
2349 $self->{next_char} == 0x003B)) { # ;
2350 $entity_name .= chr $self->{next_char};
2351 if (defined $EntityChar->{$entity_name}) {
2352 if ($self->{next_char} == 0x003B) { # ;
2353 !!!cp (1020);
2354 $value = $EntityChar->{$entity_name};
2355 $match = 1;
2356 !!!next-input-character;
2357 last;
2358 } else {
2359 !!!cp (1021);
2360 $value = $EntityChar->{$entity_name};
2361 $match = -1;
2362 !!!next-input-character;
2363 }
2364 } else {
2365 !!!cp (1022);
2366 $value .= chr $self->{next_char};
2367 $match *= 2;
2368 !!!next-input-character;
2369 }
2370 }
2371
2372 if ($match > 0) {
2373 !!!cp (1023);
2374 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
2375 } elsif ($match < 0) {
2376 !!!parse-error (type => 'no refc');
## When $in_attr is true and extra name characters followed the
## ";"-less match, the text is returned literally (no has_reference).
2377 if ($in_attr and $match < -1) {
2378 !!!cp (1024);
2379 return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
2380 } else {
2381 !!!cp (1025);
2382 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
2383 }
2384 } else {
2385 !!!cp (1026);
2386 !!!parse-error (type => 'bare ero');
2387 ## NOTE: "No characters are consumed" in the spec.
## Here the characters have been consumed, so they are returned as
## literal text prefixed by "&".
2388 return {type => CHARACTER_TOKEN, data => '&'.$value};
2389 }
2390 } else {
2391 !!!cp (1027);
2392 ## no characters are consumed
2393 !!!parse-error (type => 'bare ero');
2394 return undef;
2395 }
2396 } # _tokenize_attempt_to_consume_an_entity
2397
## Prepares |$self->{document}| for tree construction: strict error
## checking is switched off and the document is flagged as an HTML
## document.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  ## NOTE: The caller MUST have set $self->{document} before calling
  ## this method.
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2406
## Counterpart of |_initialize_tree_constructor|: switches strict error
## checking on |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;

  my $doc = $self->{document};
  $doc->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
2412
2413 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2414
2415 { # tree construction stage
2416 my $token;
2417
## Entry point of the tree construction stage.  Fetches the first
## token, resets the per-parse state and then runs the "initial",
## "before html" and remaining insertion modes in that order.
sub _construct_tree ($) {
  my $self = shift;

  ## It is not defined when an interactive UA makes $self->{document}
  ## available to the user, nor when it begins accepting user input.

  ## Appending a character: collect it together with all consecutive
  ## characters that follow and insert a single Text node whose data
  ## is the concatenation of those characters. # MUST

  !!!next-token;

  ## Reset the state shared by the insertion mode handlers.
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree
2446
## Implements the "initial" insertion mode.
##
## Works on the file-scoped |$token| of the tree construction stage.
## A DOCTYPE token is turned into a document type definition node that
## is appended to |$self->{document}|, and the document's compatibility
## mode is chosen from the DOCTYPE name, public identifier and system
## identifier.  Comments are appended to the document and leading white
## space is dropped.  Any other token causes a 'no DOCTYPE' parse error,
## selects quirks mode and is left in |$token| to be reprocessed by the
## "before html" insertion mode.
##
## Returns (no meaningful value) once the insertion mode is to be left.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared ASCII case-insensitively
        ## by upper-casing it first.  (The replacement range used to be
        ## spelled "A-z"; only its first 26 characters were ever used,
        ## so the result is the same, but "A-Z" states the intent.)
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }

      ## The system identifier is checked independently of the public
      ## identifier and is compared after lower-casing.
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is removed from the token in place; only
      ## what remains (if anything) counts as document content.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
2648
## Implements the "before html" insertion mode.
##
## Works on the file-scoped |$token| of the tree construction stage.
## DOCTYPE tokens are reported and ignored, comments are appended to
## the document and leading white space is dropped.  An |html| start
## tag creates the root element from the token; any other token makes
## an |html| root element be created implicitly, leaving the token in
## |$token| to be reprocessed.  In both cases the root element is
## appended to |$self->{document}| and pushed onto
## |$self->{open_elements}|, and the
## |$self->{application_cache_selection}| callback is invoked exactly
## once: with the value of the |manifest| attribute when the |html|
## start tag carries one, with |undef| otherwise.
##
## Returns (no meaningful value) when the "before head" insertion mode
## is to be entered.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is deliberately
      ## not invoked here.  It is invoked below, after the implied root
      ## element has been created; invoking it here as well made it run
      ## twice for a single character token.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $token->{tag_name}, $token->{attributes});
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}}, [$root_element, 'html'];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: No relative reference resolution?
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": create the root element implicitly.
    my $root_element; !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
2735
## Resets |$self->{insertion_mode}| according to the current contents
## of |$self->{open_elements}|.
##
## The stack of open elements is walked from its last entry towards
## its first one.  Each entry is an array reference whose first item
## is the node and whose second item is the element name.  The first
## entry whose name has a dedicated insertion mode decides the result.
## When the first entry of the stack is reached and
## |$self->{inner_html_node}| is defined, that node is examined in its
## place unless it is a |td| or |th|.
##
## Returns nothing; the result is stored in |$self->{insertion_mode}|.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Element name to insertion mode (steps 4..13).  The table does not
  ## depend on the node being examined, so it is built only once.
  my $mode_of = {
    select => IN_SELECT_IM,
    ## NOTE: |option| and |optgroup| do not set
    ## insertion mode to "in select" by themselves.
    td => IN_CELL_IM,
    th => IN_CELL_IM,
    tr => IN_ROW_IM,
    tbody => IN_TABLE_BODY_IM,
    thead => IN_TABLE_BODY_IM,
    tfoot => IN_TABLE_BODY_IM,
    caption => IN_CAPTION_IM,
    colgroup => IN_COLUMN_GROUP_IM,
    table => IN_TABLE_IM,
    head => IN_BODY_IM, # not in head!
    body => IN_BODY_IM,
    frameset => IN_FRAMESET_IM,
  };

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          !!!cp ('t27');
          #
        } else {
          !!!cp ('t28');
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    my $new_mode = $mode_of->{$node->[1]};
    if (defined $new_mode) {
      ## NOTE: Assign and return as two separate statements.  The
      ## former "assign and return" one-liner returned only when the
      ## assigned value was true, so a defined but false mode constant
      ## would have fallen through to the following steps.
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
2810
2811 sub _tree_construction_main ($) {
2812 my $self = shift;
2813
2814 my $active_formatting_elements = [];
2815
2816 my $reconstruct_active_formatting_elements = sub { # MUST
2817 my $insert = shift;
2818
2819 ## Step 1
2820 return unless @$active_formatting_elements;
2821
2822 ## Step 3
2823 my $i = -1;
2824 my $entry = $active_formatting_elements->[$i];
2825
2826 ## Step 2
2827 return if $entry->[0] eq '#marker';
2828 for (@{$self->{open_elements}}) {
2829 if ($entry->[0] eq $_->[0]) {
2830 !!!cp ('t32');
2831 return;
2832 }
2833 }
2834
2835 S4: {
2836 ## Step 4
2837 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2838
2839 ## Step 5
2840 $i--;
2841 $entry = $active_formatting_elements->[$i];
2842
2843 ## Step 6
2844 if ($entry->[0] eq '#marker') {
2845 !!!cp ('t33_1');
2846 #
2847 } else {
2848 my $in_open_elements;
2849 OE: for (@{$self->{open_elements}}) {
2850 if ($entry->[0] eq $_->[0]) {
2851 !!!cp ('t33');
2852 $in_open_elements = 1;
2853 last OE;
2854 }
2855 }
2856 if ($in_open_elements) {
2857 !!!cp ('t34');
2858 #
2859 } else {
2860 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
2861 !!!cp ('t35');
2862 redo S4;
2863 }
2864 }
2865
2866 ## Step 7
2867 $i++;
2868 $entry = $active_formatting_elements->[$i];
2869 } # S4
2870
2871 S7: {
2872 ## Step 8
2873 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2874
2875 ## Step 9
2876 $insert->($clone->[0]);
2877 push @{$self->{open_elements}}, $clone;
2878
2879 ## Step 10
2880 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2881
2882 ## Step 11
2883 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2884 !!!cp ('t36');
2885 ## Step 7'
2886 $i++;
2887 $entry = $active_formatting_elements->[$i];
2888
2889 redo S7;
2890 }
2891
2892 !!!cp ('t37');
2893 } # S7
2894 }; # $reconstruct_active_formatting_elements
2895
2896 my $clear_up_to_marker = sub {
2897 for (reverse 0..$#$active_formatting_elements) {
2898 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
2899 !!!cp ('t38');
2900 splice @$active_formatting_elements, $_;
2901 return;
2902 }
2903 }
2904
2905 !!!cp ('t39');
2906 }; # $clear_up_to_marker
2907
2908 my $insert;
2909
2910 my $parse_rcdata = sub ($) {
2911 my ($content_model_flag) = @_;
2912
2913 ## Step 1
2914 my $start_tag_name = $token->{tag_name};
2915 my $el;
2916 !!!create-element ($el, $start_tag_name, $token->{attributes});
2917
2918 ## Step 2
2919 $insert->($el);
2920
2921 ## Step 3
2922 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2923 delete $self->{escape}; # MUST
2924
2925 ## Step 4
2926 my $text = '';
2927 !!!next-token;
2928 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2929 !!!cp ('t40');
2930 $text .= $token->{data};
2931 !!!next-token;
2932 }
2933
2934 ## Step 5
2935 if (length $text) {
2936 !!!cp ('t41');
2937 my $text = $self->{document}->create_text_node ($text);
2938 $el->append_child ($text);
2939 }
2940
2941 ## Step 6
2942 $self->{content_model} = PCDATA_CONTENT_MODEL;
2943
2944 ## Step 7
2945 if ($token->{type} == END_TAG_TOKEN and
2946 $token->{tag_name} eq $start_tag_name) {
2947 !!!cp ('t42');
2948 ## Ignore the token
2949 } else {
2950 ## NOTE: An end-of-file token.
2951 if ($content_model_flag == CDATA_CONTENT_MODEL) {
2952 !!!cp ('t43');
2953 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2954 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2955 !!!cp ('t44');
2956 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2957 } else {
2958 die "$0: $content_model_flag in parse_rcdata";
2959 }
2960 }
2961 !!!next-token;
2962 }; # $parse_rcdata
2963
2964 my $script_start_tag = sub () {
2965 my $script_el;
2966 !!!create-element ($script_el, 'script', $token->{attributes});
2967 ## TODO: mark as "parser-inserted"
2968
2969 $self->{content_model} = CDATA_CONTENT_MODEL;
2970 delete $self->{escape}; # MUST
2971
2972 my $text = '';
2973 !!!next-token;
2974 while ($token->{type} == CHARACTER_TOKEN) {
2975 !!!cp ('t45');
2976 $text .= $token->{data};
2977 !!!next-token;
2978 } # stop if non-character token or tokenizer stops tokenising
2979 if (length $text) {
2980 !!!cp ('t46');
2981 $script_el->manakai_append_text ($text);
2982 }
2983
2984 $self->{content_model} = PCDATA_CONTENT_MODEL;
2985
2986 if ($token->{type} == END_TAG_TOKEN and
2987 $token->{tag_name} eq 'script') {
2988 !!!cp ('t47');
2989 ## Ignore the token
2990 } else {
2991 !!!cp ('t48');
2992 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2993 ## ISSUE: And ignore?
2994 ## TODO: mark as "already executed"
2995 }
2996
2997 if (defined $self->{inner_html_node}) {
2998 !!!cp ('t49');
2999 ## TODO: mark as "already executed"
3000 } else {
3001 !!!cp ('t50');
3002 ## TODO: $old_insertion_point = current insertion point
3003 ## TODO: insertion point = just before the next input character
3004
3005 $insert->($script_el);
3006
3007 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3008
3009 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3010 }
3011
3012 !!!next-token;
3013 }; # $script_start_tag
3014
3015 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3016 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3017 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3018
## Process an end tag whose tag name is |$tag_name| for a formatting
## element, using the adoption agency algorithm (AAA).
##
## Argument: the tag name of the end tag.
##
## The closure reads and updates |$active_formatting_elements|,
## |$self->{open_elements}| and |$open_tables|, and moves nodes within
## the DOM tree.  Entries of both lists are array references whose
## first item is a node (or the string '#marker' in the list of active
## formatting elements) and whose second item is a tag name.
##
## The algorithm is repeated (|redo FET|) until one of the terminating
## branches is reached.  Every terminating branch fetches the next
## token before returning, so the caller must not fetch it again.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    ## Find the last entry with the tag name in the list of active
    ## formatting elements, not looking beyond the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                applet => 1, table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## The formatting element is not in the stack of open elements:
      ## remove exactly that entry from the list of active formatting
      ## elements.  It is not necessarily the last entry of the list,
      ## so a plain |pop| could remove an unrelated entry.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## Scan the stack from the current node towards the formatting
    ## element.  The variable is overwritten by every qualifying node,
    ## so it ends up as the qualifying node nearest to the formatting
    ## element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    ## No furthest block: remove the formatting element and everything
    ## after it from the stack, remove its entry from the list, and
    ## finish.
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is remembered as the node of the entry preceding
    ## the formatting element in the list.
    ## NOTE(review): if the formatting element is the first entry, the
    ## index is -1, which addresses the last entry of the list; confirm
    ## that this is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
        ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## If the node has no entry in the list of active formatting
      ## elements, remove it from the stack and go on to the previous
      ## node.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      ## A node that has children is replaced, in both lists, by a
      ## clone made with |clone_node (0)|.
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    ## Put the last node under the common ancestor.  If the common
    ## ancestor is a table-related element, the node is foster
    ## parented instead and the current table is marked as tainted.
    if ({
         table => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1,
        }->{$common_ancestor_node->[1]}) {
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] eq 'table') {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## The child list is copied first, since the children are moved
    ## while being iterated over.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list of active
    ## formatting elements and insert the clone after the bookmark.
    ## |$i| is the index of the bookmark; it is decremented when the
    ## formatting element is removed from a position before it.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack of open elements
    ## and insert the clone immediately after the furthest block.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## The length argument is 0 so that the clone is inserted.  A
    ## length of 1 would replace (and so drop from the stack) the
    ## element that follows the furthest block, if there is one.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3249
## Append the node given as the first argument to the current node,
## i.e. the node of the last entry in the stack of open elements.
## This closure is also assigned to |$insert|.
$insert = my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
3253
## Insert the node given as the first argument, foster parenting it
## when the current node is a table-related element (|table|, |tbody|,
## |tfoot|, |thead| or |tr|).  Otherwise the node is simply appended
## to the current node.
my $insert_to_foster = sub {
  my $child = shift;
  my $is_table_related = {
    table => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1,
  }->{$self->{open_elements}->[-1]->[1]};
  if (not $is_table_related) {
    !!!cp ('t72');
    $self->{open_elements}->[-1]->[0]->append_child ($child);
  } else {
    # MUST
    ## Look for the last |table| entry in the stack of open elements.
    my ($foster_parent_element, $next_sibling);
    OE: for my $i (reverse 0..$#{$self->{open_elements}}) {
      my $entry = $self->{open_elements}->[$i];
      next OE unless $entry->[1] eq 'table';

      my $parent = $entry->[0]->parent_node;
      if (defined $parent and $parent->node_type == 1) {
        !!!cp ('t70');
        ## The table has an element parent: insert into that parent,
        ## just before the table.
        $foster_parent_element = $parent;
        $next_sibling = $entry->[0];
      } else {
        !!!cp ('t71');
        ## Otherwise, use the node of the entry preceding the table in
        ## the stack.
        $foster_parent_element = $self->{open_elements}->[$i - 1]->[0];
      }
      last OE;
    } # OE

    ## Fall back to the node of the first entry in the stack.
    unless (defined $foster_parent_element) {
      $foster_parent_element = $self->{open_elements}->[0]->[0];
    }
    $foster_parent_element->insert_before ($child, $next_sibling);
    $open_tables->[-1]->[1] = 1; # tainted
  }
}; # $insert_to_foster
3287
3288 B: {
3289 if ($token->{type} == DOCTYPE_TOKEN) {
3290 !!!cp ('t73');
3291 !!!parse-error (type => 'DOCTYPE in the middle');
3292 ## Ignore the token
3293 ## Stay in the phase
3294 !!!next-token;
3295 redo B;
3296 } elsif ($token->{type} == START_TAG_TOKEN and
3297 $token->{tag_name} eq 'html') {
3298 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3299 !!!cp ('t79');
3300 !!!parse-error (type => 'after html:html');
3301 $self->{insertion_mode} = AFTER_BODY_IM;
3302 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3303 !!!cp ('t80');
3304 !!!parse-error (type => 'after html:html');
3305 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3306 } else {
3307 !!!cp ('t81');
3308 }
3309
3310 !!!cp ('t82');
3311 !!!parse-error (type => 'not first start tag');
3312 my $top_el = $self->{open_elements}->[0]->[0];
3313 for my $attr_name (keys %{$token->{attributes}}) {
3314 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3315 !!!cp ('t84');
3316 $top_el->set_attribute_ns
3317 (undef, [undef, $attr_name],
3318 $token->{attributes}->{$attr_name}->{value});
3319 }
3320 }
3321 !!!next-token;
3322 redo B;
3323 } elsif ($token->{type} == COMMENT_TOKEN) {
3324 my $comment = $self->{document}->create_comment ($token->{data});
3325 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3326 !!!cp ('t85');
3327 $self->{document}->append_child ($comment);
3328 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3329 !!!cp ('t86');
3330 $self->{open_elements}->[0]->[0]->append_child ($comment);
3331 } else {
3332 !!!cp ('t87');
3333 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3334 }
3335 !!!next-token;
3336 redo B;
3337 } elsif ($self->{insertion_mode} & HEAD_IMS) {
3338 if ($token->{type} == CHARACTER_TOKEN) {
3339 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3340 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3341 !!!cp ('t88.2');
3342 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3343 } else {
3344 !!!cp ('t88.1');
3345 ## Ignore the token.
3346 !!!next-token;
3347 redo B;
3348 }
3349 unless (length $token->{data}) {
3350 !!!cp ('t88');
3351 !!!next-token;
3352 redo B;
3353 }
3354 }
3355
3356 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3357 !!!cp ('t89');
3358 ## As if <head>
3359 !!!create-element ($self->{head_element}, 'head');
3360 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3361 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3362
3363 ## Reprocess in the "in head" insertion mode...
3364 pop @{$self->{open_elements}};
3365
3366 ## Reprocess in the "after head" insertion mode...
3367 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3368 !!!cp ('t90');
3369 ## As if </noscript>
3370 pop @{$self->{open_elements}};
3371 !!!parse-error (type => 'in noscript:#character');
3372
3373 ## Reprocess in the "in head" insertion mode...
3374 ## As if </head>
3375 pop @{$self->{open_elements}};
3376
3377 ## Reprocess in the "after head" insertion mode...
3378 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3379 !!!cp ('t91');
3380 pop @{$self->{open_elements}};
3381
3382 ## Reprocess in the "after head" insertion mode...
3383 } else {
3384 !!!cp ('t92');
3385 }
3386
3387 ## "after head" insertion mode
3388 ## As if <body>
3389 !!!insert-element ('body');
3390 $self->{insertion_mode} = IN_BODY_IM;
3391 ## reprocess
3392 redo B;
3393 } elsif ($token->{type} == START_TAG_TOKEN) {
3394 if ($token->{tag_name} eq 'head') {
3395 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3396 !!!cp ('t93');
3397 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
3398 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3399 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
3400 $self->{insertion_mode} = IN_HEAD_IM;
3401 !!!next-token;
3402 redo B;
3403 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3404 !!!cp ('t94');
3405 #
3406 } else {
3407 !!!cp ('t95');
3408 !!!parse-error (type => 'in head:head'); # or in head noscript
3409 ## Ignore the token
3410 !!!next-token;
3411 redo B;
3412 }
3413 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3414 !!!cp ('t96');
3415 ## As if <head>
3416 !!!create-element ($self->{head_element}, 'head');
3417 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3418 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3419
3420 $self->{insertion_mode} = IN_HEAD_IM;
3421 ## Reprocess in the "in head" insertion mode...
3422 } else {
3423 !!!cp ('t97');
3424 }
3425
3426 if ($token->{tag_name} eq 'base') {
3427 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3428 !!!cp ('t98');
3429 ## As if </noscript>
3430 pop @{$self->{open_elements}};
3431 !!!parse-error (type => 'in noscript:base');
3432
3433 $self->{insertion_mode} = IN_HEAD_IM;
3434 ## Reprocess in the "in head" insertion mode...
3435 } else {
3436 !!!cp ('t99');
3437 }
3438
3439 ## NOTE: There is a "as if in head" code clone.
3440 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3441 !!!cp ('t100');
3442 !!!parse-error (type => 'after head:'.$token->{tag_name});
3443 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3444 } else {
3445 !!!cp ('t101');
3446 }
3447 !!!insert-element ($token->{tag_name}, $token->{attributes});
3448 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3449 pop @{$self->{open_elements}} # <head>
3450 if $self->{insertion_mode} == AFTER_HEAD_IM;
3451 !!!next-token;
3452 redo B;
3453 } elsif ($token->{tag_name} eq 'link') {
3454 ## NOTE: There is a "as if in head" code clone.
3455 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3456 !!!cp ('t102');
3457 !!!parse-error (type => 'after head:'.$token->{tag_name});
3458 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3459 } else {
3460 !!!cp ('t103');
3461 }
3462 !!!insert-element ($token->{tag_name}, $token->{attributes});
3463 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3464 pop @{$self->{open_elements}} # <head>
3465 if $self->{insertion_mode} == AFTER_HEAD_IM;
3466 !!!next-token;
3467 redo B;
3468 } elsif ($token->{tag_name} eq 'meta') {
3469 ## NOTE: There is a "as if in head" code clone.
3470 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3471 !!!cp ('t104');
3472 !!!parse-error (type => 'after head:'.$token->{tag_name});
3473 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3474 } else {
3475 !!!cp ('t105');
3476 }
3477 !!!insert-element ($token->{tag_name}, $token->{attributes});
3478 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3479
3480 unless ($self->{confident}) {
3481 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3482 !!!cp ('t106');
3483 $self->{change_encoding}
3484 ->($self, $token->{attributes}->{charset}->{value});
3485
3486 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3487 ->set_user_data (manakai_has_reference =>
3488 $token->{attributes}->{charset}
3489 ->{has_reference});
3490 } elsif ($token->{attributes}->{content}) {
3491 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3492 if ($token->{attributes}->{content}->{value}
3493 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3494 [\x09-\x0D\x20]*=
3495 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3496 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3497 !!!cp ('t107');
3498 $self->{change_encoding}
3499 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
3500 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3501 ->set_user_data (manakai_has_reference =>
3502 $token->{attributes}->{content}
3503 ->{has_reference});
3504 } else {
3505 !!!cp ('t108');
3506 }
3507 }
3508 } else {
3509 if ($token->{attributes}->{charset}) {
3510 !!!cp ('t109');
3511 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3512 ->set_user_data (manakai_has_reference =>
3513 $token->{attributes}->{charset}
3514 ->{has_reference});
3515 }
3516 if ($token->{attributes}->{content}) {
3517 !!!cp ('t110');
3518 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3519 ->set_user_data (manakai_has_reference =>
3520 $token->{attributes}->{content}
3521 ->{has_reference});
3522 }
3523 }
3524
3525 pop @{$self->{open_elements}} # <head>
3526 if $self->{insertion_mode} == AFTER_HEAD_IM;
3527 !!!next-token;
3528 redo B;
3529 } elsif ($token->{tag_name} eq 'title') {
3530 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3531 !!!cp ('t111');
3532 ## As if </noscript>
3533 pop @{$self->{open_elements}};
3534 !!!parse-error (type => 'in noscript:title');
3535
3536 $self->{insertion_mode} = IN_HEAD_IM;
3537 ## Reprocess in the "in head" insertion mode...
3538 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3539 !!!cp ('t112');
3540 !!!parse-error (type => 'after head:'.$token->{tag_name});
3541 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3542 } else {
3543 !!!cp ('t113');
3544 }
3545
3546 ## NOTE: There is a "as if in head" code clone.
3547 my $parent = defined $self->{head_element} ? $self->{head_element}
3548 : $self->{open_elements}->[-1]->[0];
3549 $parse_rcdata->(RCDATA_CONTENT_MODEL);
3550 pop @{$self->{open_elements}} # <head>
3551 if $self->{insertion_mode} == AFTER_HEAD_IM;
3552 redo B;
3553 } elsif ($token->{tag_name} eq 'style') {
3554 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3555 ## insertion mode IN_HEAD_IM)
3556 ## NOTE: There is a "as if in head" code clone.
3557 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3558 !!!cp ('t114');
3559 !!!parse-error (type => 'after head:'.$token->{tag_name});
3560 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3561 } else {
3562 !!!cp ('t115');
3563 }
3564 $parse_rcdata->(CDATA_CONTENT_MODEL);
3565 pop @{$self->{open_elements}} # <head>
3566 if $self->{insertion_mode} == AFTER_HEAD_IM;
3567 redo B;
3568 } elsif ($token->{tag_name} eq 'noscript') {
3569 if ($self->{insertion_mode} == IN_HEAD_IM) {
3570 !!!cp ('t116');
3571 ## NOTE: and scripting is disabled
3572 !!!insert-element ($token->{tag_name}, $token->{attributes});
3573 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
3574 !!!next-token;
3575 redo B;
3576 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3577 !!!cp ('t117');
3578 !!!parse-error (type => 'in noscript:noscript');
3579 ## Ignore the token
3580 !!!next-token;
3581 redo B;
3582 } else {
3583 !!!cp ('t118');
3584 #
3585 }
3586 } elsif ($token->{tag_name} eq 'script') {
3587 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3588 !!!cp ('t119');
3589 ## As if </noscript>
3590 pop @{$self->{open_elements}};
3591 !!!parse-error (type => 'in noscript:script');
3592
3593 $self->{insertion_mode} = IN_HEAD_IM;
3594 ## Reprocess in the "in head" insertion mode...
3595 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3596 !!!cp ('t120');
3597 !!!parse-error (type => 'after head:'.$token->{tag_name});
3598 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3599 } else {
3600 !!!cp ('t121');
3601 }
3602
3603 ## NOTE: There is a "as if in head" code clone.
3604 $script_start_tag->();
3605 pop @{$self->{open_elements}} # <head>
3606 if $self->{insertion_mode} == AFTER_HEAD_IM;
3607 redo B;
3608 } elsif ($token->{tag_name} eq 'body' or
3609 $token->{tag_name} eq 'frameset') {
3610 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3611 !!!cp ('t122');
3612 ## As if </noscript>
3613 pop @{$self->{open_elements}};
3614 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
3615
3616 ## Reprocess in the "in head" insertion mode...
3617 ## As if </head>
3618 pop @{$self->{open_elements}};
3619
3620 ## Reprocess in the "after head" insertion mode...
3621 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3622 !!!cp ('t124');
3623 pop @{$self->{open_elements}};
3624
3625 ## Reprocess in the "after head" insertion mode...
3626 } else {
3627 !!!cp ('t125');
3628 }
3629
3630 ## "after head" insertion mode
3631 !!!insert-element ($token->{tag_name}, $token->{attributes});
3632 if ($token->{tag_name} eq 'body') {
3633 !!!cp ('t126');
3634 $self->{insertion_mode} = IN_BODY_IM;
3635 } elsif ($token->{tag_name} eq 'frameset') {
3636 !!!cp ('t127');
3637 $self->{insertion_mode} = IN_FRAMESET_IM;
3638 } else {
3639 die "$0: tag name: $self->{tag_name}";
3640 }
3641 !!!next-token;
3642 redo B;
3643 } else {
3644 !!!cp ('t128');
3645 #
3646 }
3647
3648 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3649 !!!cp ('t129');
3650 ## As if </noscript>
3651 pop @{$self->{open_elements}};
3652 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3653
3654 ## Reprocess in the "in head" insertion mode...
3655 ## As if </head>
3656 pop @{$self->{open_elements}};
3657
3658 ## Reprocess in the "after head" insertion mode...
3659 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3660 !!!cp ('t130');
3661 ## As if </head>
3662 pop @{$self->{open_elements}};
3663
3664 ## Reprocess in the "after head" insertion mode...
3665 } else {
3666 !!!cp ('t131');
3667 }
3668
3669 ## "after head" insertion mode
3670 ## As if <body>
3671 !!!insert-element ('body');
3672 $self->{insertion_mode} = IN_BODY_IM;
3673 ## reprocess
3674 redo B;
3675 } elsif ($token->{type} == END_TAG_TOKEN) {
3676 if ($token->{tag_name} eq 'head') {
3677 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3678 !!!cp ('t132');
3679 ## As if <head>
3680 !!!create-element ($self->{head_element}, 'head');
3681 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3682 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3683
3684 ## Reprocess in the "in head" insertion mode...
3685 pop @{$self->{open_elements}};
3686 $self->{insertion_mode} = AFTER_HEAD_IM;
3687 !!!next-token;
3688 redo B;
3689 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3690 !!!cp ('t133');
3691 ## As if </noscript>
3692 pop @{$self->{open_elements}};
3693 !!!parse-error (type => 'in noscript:/head');
3694
3695 ## Reprocess in the "in head" insertion mode...
3696 pop @{$self->{open_elements}};
3697 $self->{insertion_mode} = AFTER_HEAD_IM;
3698 !!!next-token;
3699 redo B;
3700 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3701 !!!cp ('t134');
3702 pop @{$self->{open_elements}};
3703 $self->{insertion_mode} = AFTER_HEAD_IM;
3704 !!!next-token;
3705 redo B;
3706 } else {
3707 !!!cp ('t135');
3708 #
3709 }
3710 } elsif ($token->{tag_name} eq 'noscript') {
3711 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3712 !!!cp ('t136');
3713 pop @{$self->{open_elements}};
3714 $self->{insertion_mode} = IN_HEAD_IM;
3715 !!!next-token;
3716 redo B;
3717 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3718 !!!cp ('t137');
3719 !!!parse-error (type => 'unmatched end tag:noscript');
3720 ## Ignore the token ## ISSUE: An issue in the spec.
3721 !!!next-token;
3722 redo B;
3723 } else {
3724 !!!cp ('t138');
3725 #
3726 }
3727 } elsif ({
3728 body => 1, html => 1,
3729 }->{$token->{tag_name}}) {
3730 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3731 !!!cp ('t139');
3732 ## As if <head>
3733 !!!create-element ($self->{head_element}, 'head');
3734 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3735 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3736
3737 $self->{insertion_mode} = IN_HEAD_IM;
3738 ## Reprocess in the "in head" insertion mode...
3739 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3740 !!!cp ('t140');
3741 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3742 ## Ignore the token
3743 !!!next-token;
3744 redo B;
3745 } else {
3746 !!!cp ('t141');
3747 }
3748
3749 #
3750 } elsif ({
3751 p => 1, br => 1,
3752 }->{$token->{tag_name}}) {
3753 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3754 !!!cp ('t142');
3755 ## As if <head>
3756 !!!create-element ($self->{head_element}, 'head');
3757 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3758 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3759
3760 $self->{insertion_mode} = IN_HEAD_IM;
3761 ## Reprocess in the "in head" insertion mode...
3762 } else {
3763 !!!cp ('t143');
3764 }
3765
3766 #
3767 } else {
3768 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3769 !!!cp ('t144');
3770 #
3771 } else {
3772 !!!cp ('t145');
3773 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3774 ## Ignore the token
3775 !!!next-token;
3776 redo B;
3777 }
3778 }
3779
3780 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3781 !!!cp ('t146');
3782 ## As if </noscript>
3783 pop @{$self->{open_elements}};
3784 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3785
3786 ## Reprocess in the "in head" insertion mode...
3787 ## As if </head>
3788 pop @{$self->{open_elements}};
3789
3790 ## Reprocess in the "after head" insertion mode...
3791 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3792 !!!cp ('t147');
3793 ## As if </head>
3794 pop @{$self->{open_elements}};
3795
3796 ## Reprocess in the "after head" insertion mode...
3797 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3798 ## ISSUE: This case cannot be reached?
3799 !!!cp ('t148');
3800 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3801 ## Ignore the token ## ISSUE: An issue in the spec.
3802 !!!next-token;
3803 redo B;
3804 } else {
3805 !!!cp ('t149');
3806 }
3807
3808 ## "after head" insertion mode
3809 ## As if <body>
3810 !!!insert-element ('body');
3811 $self->{insertion_mode} = IN_BODY_IM;
3812 ## reprocess
3813 redo B;
3814 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
3815 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3816 !!!cp ('t149.1');
3817
3818 ## NOTE: As if <head>
3819 !!!create-element ($self->{head_element}, 'head');
3820 $self->{open_elements}->[-1]->[0]->append_child
3821 ($self->{head_element});
3822 #push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3823 #$self->{insertion_mode} = IN_HEAD_IM;
3824 ## NOTE: Reprocess.
3825
3826 ## NOTE: As if </head>
3827 #pop @{$self->{open_elements}};
3828 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
3829 ## NOTE: Reprocess.
3830
3831 #
3832 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3833 !!!cp ('t149.2');
3834
3835 ## NOTE: As if </head>
3836 pop @{$self->{open_elements}};
3837 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
3838 ## NOTE: Reprocess.
3839
3840 #
3841 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3842 !!!cp ('t149.3');
3843
3844 !!!parse-error (type => 'in noscript:#eof');
3845
3846 ## As if </noscript>
3847 pop @{$self->{open_elements}};
3848 #$self->{insertion_mode} = IN_HEAD_IM;
3849 ## NOTE: Reprocess.
3850
3851 ## NOTE: As if </head>
3852 pop @{$self->{open_elements}};
3853 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
3854 ## NOTE: Reprocess.
3855
3856 #
3857 } else {
3858 !!!cp ('t149.4');
3859 #
3860 }
3861
3862 ## NOTE: As if <body>
3863 !!!insert-element ('body');
3864 $self->{insertion_mode} = IN_BODY_IM;
3865 ## NOTE: Reprocess.
3866 redo B;
3867 } else {
3868 die "$0: $token->{type}: Unknown token type";
3869 }
3870
3871 ## ISSUE: An issue in the spec.
3872 } elsif ($self->{insertion_mode} & BODY_IMS) {
3873 if ($token->{type} == CHARACTER_TOKEN) {
3874 !!!cp ('t150');
3875 ## NOTE: There is a code clone of "character in body".
3876 $reconstruct_active_formatting_elements->($insert_to_current);
3877
3878 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3879
3880 !!!next-token;
3881 redo B;
3882 } elsif ($token->{type} == START_TAG_TOKEN) {
3883 if ({
3884 caption => 1, col => 1, colgroup => 1, tbody => 1,
3885 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3886 }->{$token->{tag_name}}) {
3887 if ($self->{insertion_mode} == IN_CELL_IM) {
3888 ## have an element in table scope
3889 my $tn;
3890 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3891 my $node = $self->{open_elements}->[$_];
3892 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3893 !!!cp ('t151');
3894 $tn = $node->[1];
3895 last INSCOPE;
3896 } elsif ({
3897 table => 1, html => 1,
3898 }->{$node->[1]}) {
3899 !!!cp ('t152');
3900 last INSCOPE;
3901 }
3902 } # INSCOPE
3903 unless (defined $tn) {
3904 !!!cp ('t153');
3905 ## TODO: This error type is wrong.
3906 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3907 ## Ignore the token
3908 !!!next-token;
3909 redo B;
3910 }
3911
3912 !!!cp ('t154');
3913 ## Close the cell
3914 !!!back-token; # <?>
3915 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3916 redo B;
3917 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3918 !!!parse-error (type => 'not closed:caption');
3919
3920 ## As if </caption>
3921 ## have a table element in table scope
3922 my $i;
3923 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3924 my $node = $self->{open_elements}->[$_];
3925 if ($node->[1] eq 'caption') {
3926 !!!cp ('t155');
3927 $i = $_;
3928 last INSCOPE;
3929 } elsif ({
3930 table => 1, html => 1,
3931 }->{$node->[1]}) {
3932 !!!cp ('t156');
3933 last INSCOPE;
3934 }
3935 } # INSCOPE
3936 unless (defined $i) {
3937 !!!cp ('t157');
3938 ## TODO: this type is wrong.
3939 !!!parse-error (type => 'unmatched end tag:caption');
3940 ## Ignore the token
3941 !!!next-token;
3942 redo B;
3943 }
3944
3945 ## generate implied end tags
3946 while ({
3947 dd => 1, dt => 1, li => 1, p => 1,
3948 }->{$self->{open_elements}->[-1]->[1]}) {
3949 !!!cp ('t158');
3950 pop @{$self->{open_elements}};
3951 }
3952
3953 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3954 !!!cp ('t159');
3955 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3956 } else {
3957 !!!cp ('t160');
3958 }
3959
3960 splice @{$self->{open_elements}}, $i;
3961
3962 $clear_up_to_marker->();
3963
3964 $self->{insertion_mode} = IN_TABLE_IM;
3965
3966 ## reprocess
3967 redo B;
3968 } else {
3969 !!!cp ('t161');
3970 #
3971 }
3972 } else {
3973 !!!cp ('t162');
3974 #
3975 }
3976 } elsif ($token->{type} == END_TAG_TOKEN) {
3977 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3978 if ($self->{insertion_mode} == IN_CELL_IM) {
3979 ## have an element in table scope
3980 my $i;
3981 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3982 my $node = $self->{open_elements}->[$_];
3983 if ($node->[1] eq $token->{tag_name}) {
3984 !!!cp ('t163');
3985 $i = $_;
3986 last INSCOPE;
3987 } elsif ({
3988 table => 1, html => 1,
3989 }->{$node->[1]}) {
3990 !!!cp ('t164');
3991 last INSCOPE;
3992 }
3993 } # INSCOPE
3994 unless (defined $i) {
3995 !!!cp ('t165');
3996 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3997 ## Ignore the token
3998 !!!next-token;
3999 redo B;
4000 }
4001
4002 ## generate implied end tags
4003 while ({
4004 dd => 1, dt => 1, li => 1, p => 1,
4005 }->{$self->{open_elements}->[-1]->[1]}) {
4006 !!!cp ('t166');
4007 pop @{$self->{open_elements}};
4008 }
4009
4010 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4011 !!!cp ('t167');
4012 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4013 } else {
4014 !!!cp ('t168');
4015 }
4016
4017 splice @{$self->{open_elements}}, $i;
4018
4019 $clear_up_to_marker->();
4020
4021 $self->{insertion_mode} = IN_ROW_IM;
4022
4023 !!!next-token;
4024 redo B;
4025 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4026 !!!cp ('t169');
4027 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4028 ## Ignore the token
4029 !!!next-token;
4030 redo B;
4031 } else {
4032 !!!cp ('t170');
4033 #
4034 }
4035 } elsif ($token->{tag_name} eq 'caption') {
4036 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4037 ## have a table element in table scope
4038 my $i;
4039 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4040 my $node = $self->{open_elements}->[$_];
4041 if ($node->[1] eq $token->{tag_name}) {
4042 !!!cp ('t171');
4043 $i = $_;
4044 last INSCOPE;
4045 } elsif ({
4046 table => 1, html => 1,
4047 }->{$node->[1]}) {
4048 !!!cp ('t172');
4049 last INSCOPE;
4050 }
4051 } # INSCOPE
4052 unless (defined $i) {
4053 !!!cp ('t173');
4054 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4055 ## Ignore the token
4056 !!!next-token;
4057 redo B;
4058 }
4059
4060 ## generate implied end tags
4061 while ({
4062 dd => 1, dt => 1, li => 1, p => 1,
4063 }->{$self->{open_elements}->[-1]->[1]}) {
4064 !!!cp ('t174');
4065 pop @{$self->{open_elements}};
4066 }
4067
4068 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4069 !!!cp ('t175');
4070 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4071 } else {
4072 !!!cp ('t176');
4073 }
4074
4075 splice @{$self->{open_elements}}, $i;
4076
4077 $clear_up_to_marker->();
4078
4079 $self->{insertion_mode} = IN_TABLE_IM;
4080
4081 !!!next-token;
4082 redo B;
4083 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4084 !!!cp ('t177');
4085 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4086 ## Ignore the token
4087 !!!next-token;
4088 redo B;
4089 } else {
4090 !!!cp ('t178');
4091 #
4092 }
4093 } elsif ({
4094 table => 1, tbody => 1, tfoot => 1,
4095 thead => 1, tr => 1,
4096 }->{$token->{tag_name}} and
4097 $self->{insertion_mode} == IN_CELL_IM) {
4098 ## have an element in table scope
4099 my $i;
4100 my $tn;
4101 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4102 my $node = $self->{open_elements}->[$_];
4103 if ($node->[1] eq $token->{tag_name}) {
4104 !!!cp ('t179');
4105 $i = $_;
4106 last INSCOPE;
4107 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4108 !!!cp ('t180');
4109 $tn = $node->[1];
4110 ## NOTE: There is exactly one |td| or |th| element
4111 ## in scope in the stack of open elements by definition.
4112 } elsif ({
4113 table => 1, html => 1,
4114 }->{$node->[1]}) {
4115 !!!cp ('t181');
4116 last INSCOPE;
4117 }
4118 } # INSCOPE
4119 unless (defined $i) {
4120 !!!cp ('t182');
4121 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4122 ## Ignore the token
4123 !!!next-token;
4124 redo B;
4125 } else {
4126 !!!cp ('t183');
4127 }
4128
4129 ## Close the cell
4130 !!!back-token; # </?>
4131 $token = {type => END_TAG_TOKEN, tag_name => $tn};
4132 redo B;
4133 } elsif ($token->{tag_name} eq 'table' and
4134 $self->{insertion_mode} == IN_CAPTION_IM) {
4135 !!!parse-error (type => 'not closed:caption');
4136
4137 ## As if </caption>
4138 ## have a table element in table scope
4139 my $i;
4140 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4141 my $node = $self->{open_elements}->[$_];
4142 if ($node->[1] eq 'caption') {
4143 !!!cp ('t184');
4144 $i = $_;
4145 last INSCOPE;
4146 } elsif ({
4147 table => 1, html => 1,
4148 }->{$node->[1]}) {
4149 !!!cp ('t185');
4150 last INSCOPE;
4151 }
4152 } # INSCOPE
4153 unless (defined $i) {
4154 !!!cp ('t186');
4155 !!!parse-error (type => 'unmatched end tag:caption');
4156 ## Ignore the token
4157 !!!next-token;
4158 redo B;
4159 }
4160
4161 ## generate implied end tags
4162 while ({
4163 dd => 1, dt => 1, li => 1, p => 1,
4164 }->{$self->{open_elements}->[-1]->[1]}) {
4165 !!!cp ('t187');
4166 pop @{$self->{open_elements}};
4167 }
4168
4169 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4170 !!!cp ('t188');
4171 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4172 } else {
4173 !!!cp ('t189');
4174 }
4175
4176 splice @{$self->{open_elements}}, $i;
4177
4178 $clear_up_to_marker->();
4179
4180 $self->{insertion_mode} = IN_TABLE_IM;
4181
4182 ## reprocess
4183 redo B;
4184 } elsif ({
4185 body => 1, col => 1, colgroup => 1, html => 1,
4186 }->{$token->{tag_name}}) {
4187 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
4188 !!!cp ('t190');
4189 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4190 ## Ignore the token
4191 !!!next-token;
4192 redo B;
4193 } else {
4194 !!!cp ('t191');
4195 #
4196 }
4197 } elsif ({
4198 tbody => 1, tfoot => 1,
4199 thead => 1, tr => 1,
4200 }->{$token->{tag_name}} and
4201 $self->{insertion_mode} == IN_CAPTION_IM) {
4202 !!!cp ('t192');
4203 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4204 ## Ignore the token
4205 !!!next-token;
4206 redo B;
4207 } else {
4208 !!!cp ('t193');
4209 #
4210 }
4211 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4212 for my $entry (@{$self->{open_elements}}) {
4213 if (not {
4214 dd => 1, dt => 1, li => 1, p => 1, tbody => 1, td => 1, tfoot => 1,
4215 th => 1, thead => 1, tr => 1, body => 1, html => 1,
4216 }->{$entry->[1]}) {
4217 !!!cp ('t75');
4218 !!!parse-error (type => 'in body:#eof');
4219 last;
4220 }
4221 }
4222
4223 ## Stop parsing.
4224 last B;
4225 } else {
4226 die "$0: $token->{type}: Unknown token type";
4227 }
4228
4229 $insert = $insert_to_current;
4230 #
4231 } elsif ($self->{insertion_mode} & TABLE_IMS) {
4232 if ($token->{type} == CHARACTER_TOKEN) {
4233 if (not $open_tables->[-1]->[1] and # tainted
4234 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4235 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4236
4237 unless (length $token->{data}) {
4238 !!!cp ('t194');
4239 !!!next-token;
4240 redo B;
4241 } else {
4242 !!!cp ('t195');
4243 }
4244 }
4245
4246 !!!parse-error (type => 'in table:#character');
4247
4248 ## As if in body, but insert into foster parent element
4249 ## ISSUE: Spec says that "whenever a node would be inserted
4250 ## into the current node" while characters might not
4251 ## result in a new Text node.
4252 $reconstruct_active_formatting_elements->($insert_to_foster);
4253
4254 if ({
4255 table => 1, tbody => 1, tfoot => 1,
4256 thead => 1, tr => 1,
4257 }->{$self->{open_elements}->[-1]->[1]}) {
4258 # MUST
4259 my $foster_parent_element;
4260 my $next_sibling;
4261 my $prev_sibling;
4262 OE: for (reverse 0..$#{$self->{open_elements}}) {
4263 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4264 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4265 if (defined $parent and $parent->node_type == 1) {
4266 !!!cp ('t196');
4267 $foster_parent_element = $parent;
4268 $next_sibling = $self->{open_elements}->[$_]->[0];
4269 $prev_sibling = $next_sibling->previous_sibling;
4270 } else {
4271 !!!cp ('t197');
4272 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4273 $prev_sibling = $foster_parent_element->last_child;
4274 }
4275 last OE;
4276 }
4277 } # OE
4278 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4279 $prev_sibling = $foster_parent_element->last_child
4280 unless defined $foster_parent_element;
4281 if (defined $prev_sibling and
4282 $prev_sibling->node_type == 3) {
4283 !!!cp ('t198');
4284 $prev_sibling->manakai_append_text ($token->{data});
4285 } else {
4286 !!!cp ('t199');
4287 $foster_parent_element->insert_before
4288 ($self->{document}->create_text_node ($token->{data}),
4289 $next_sibling);
4290 }
4291 $open_tables->[-1]->[1] = 1; # tainted
4292 } else {
4293 !!!cp ('t200');
4294 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4295 }
4296
4297 !!!next-token;
4298 redo B;
4299 } elsif ($token->{type} == START_TAG_TOKEN) {
4300 if ({
4301 tr => ($self->{insertion_mode} != IN_ROW_IM),
4302 th => 1, td => 1,
4303 }->{$token->{tag_name}}) {
4304 if ($self->{insertion_mode} == IN_TABLE_IM) {
4305 ## Clear back to table context
4306 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4307 $self->{open_elements}->[-1]->[1] ne 'html') {
4308 !!!cp ('t201');
4309 pop @{$self->{open_elements}};
4310 }
4311
4312 !!!insert-element ('tbody');
4313 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4314 ## reprocess in the "in table body" insertion mode...
4315 }
4316
4317 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4318 unless ($token->{tag_name} eq 'tr') {
4319 !!!cp ('t202');
4320 !!!parse-error (type => 'missing start tag:tr');
4321 }
4322
4323 ## Clear back to table body context
4324 while (not {
4325 tbody => 1, tfoot => 1, thead => 1, html => 1,
4326 }->{$self->{open_elements}->[-1]->[1]}) {
4327 !!!cp ('t203');
4328 ## ISSUE: Can this case be reached?
4329 pop @{$self->{open_elements}};
4330 }
4331
4332 $self->{insertion_mode} = IN_ROW_IM;
4333 if ($token->{tag_name} eq 'tr') {
4334 !!!cp ('t204');
4335 !!!insert-element ($token->{tag_name}, $token->{attributes});
4336 !!!next-token;
4337 redo B;
4338 } else {
4339 !!!cp ('t205');
4340 !!!insert-element ('tr');
4341 ## reprocess in the "in row" insertion mode
4342 }
4343 } else {
4344 !!!cp ('t206');
4345 }
4346
4347 ## Clear back to table row context
4348 while (not {
4349 tr => 1, html => 1,
4350 }->{$self->{open_elements}->[-1]->[1]}) {
4351 !!!cp ('t207');
4352 pop @{$self->{open_elements}};
4353 }
4354
4355 !!!insert-element ($token->{tag_name}, $token->{attributes});
4356 $self->{insertion_mode} = IN_CELL_IM;
4357
4358 push @$active_formatting_elements, ['#marker', ''];
4359
4360 !!!next-token;
4361 redo B;
4362 } elsif ({
4363 caption => 1, col => 1, colgroup => 1,
4364 tbody => 1, tfoot => 1, thead => 1,
4365 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4366 }->{$token->{tag_name}}) {
4367 if ($self->{insertion_mode} == IN_ROW_IM) {
4368 ## As if </tr>
4369 ## have an element in table scope
4370 my $i;
4371 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4372 my $node = $self->{open_elements}->[$_];
4373 if ($node->[1] eq 'tr') {
4374 !!!cp ('t208');
4375 $i = $_;
4376 last INSCOPE;
4377 } elsif ({
4378 html => 1,
4379
4380 ## NOTE: This element does not appear here, maybe.
4381 table => 1,
4382 }->{$node->[1]}) {
4383 !!!cp ('t209');
4384 last INSCOPE;
4385 }
4386 } # INSCOPE
4387 unless (defined $i) { # no |tr| element in table scope for the implied </tr>
4388 !!!cp ('t210');
4389 ## TODO: This type is wrong.
4390 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4391 ## Ignore the token
4392 !!!next-token;
4393 redo B;
4394 }
4395
4396 ## Clear back to table row context
4397 while (not {
4398 tr => 1, html => 1,
4399 }->{$self->{open_elements}->[-1]->[1]}) {
4400 !!!cp ('t211');
4401 ## ISSUE: Can this case be reached?
4402 pop @{$self->{open_elements}};
4403 }
4404
4405 pop @{$self->{open_elements}}; # tr
4406 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4407 if ($token->{tag_name} eq 'tr') {
4408 !!!cp ('t212');
4409 ## reprocess
4410 redo B;
4411 } else {
4412 !!!cp ('t213');
4413 ## reprocess in the "in table body" insertion mode...
4414 }
4415 }
4416
4417 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4418 ## have an element in table scope
4419 my $i;
4420 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4421 my $node = $self->{open_elements}->[$_];
4422 if ({
4423 tbody => 1, thead => 1, tfoot => 1,
4424 }->{$node->[1]}) {
4425 !!!cp ('t214');
4426 $i = $_;
4427 last INSCOPE;
4428 } elsif ({
4429 table => 1, html => 1,
4430 }->{$node->[1]}) {
4431 !!!cp ('t215');
4432 last INSCOPE;
4433 }
4434 } # INSCOPE
4435 unless (defined $i) {
4436 !!!cp ('t216');
4437 ## TODO: This error type is wrong.
4438 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4439 ## Ignore the token
4440 !!!next-token;
4441 redo B;
4442 }
4443
4444 ## Clear back to table body context
4445 while (not {
4446 tbody => 1, tfoot => 1, thead => 1, html => 1,
4447 }->{$self->{open_elements}->[-1]->[1]}) {
4448 !!!cp ('t217');
4449 ## ISSUE: Can this state be reached?
4450 pop @{$self->{open_elements}};
4451 }
4452
4453 ## As if <{current node}>
4454 ## have an element in table scope
4455 ## true by definition
4456
4457 ## Clear back to table body context
4458 ## nop by definition
4459
4460 pop @{$self->{open_elements}};
4461 $self->{insertion_mode} = IN_TABLE_IM;
4462 ## reprocess in "in table" insertion mode...
4463 } else {
4464 !!!cp ('t218');
4465 }
4466
4467 if ($token->{tag_name} eq 'col') {
4468 ## Clear back to table context
4469 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4470 $self->{open_elements}->[-1]->[1] ne 'html') {
4471 !!!cp ('t219');
4472 ## ISSUE: Can this state be reached?
4473 pop @{$self->{open_elements}};
4474 }
4475
4476 !!!insert-element ('colgroup');
4477 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
4478 ## reprocess
4479 redo B;
4480 } elsif ({
4481 caption => 1,
4482 colgroup => 1,
4483 tbody => 1, tfoot => 1, thead => 1,
4484 }->{$token->{tag_name}}) {
4485 ## Clear back to table context
4486 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4487 $self->{open_elements}->[-1]->[1] ne 'html') {
4488 !!!cp ('t220');
4489 ## ISSUE: Can this state be reached?
4490 pop @{$self->{open_elements}};
4491 }
4492
4493 push @$active_formatting_elements, ['#marker', '']
4494 if $token->{tag_name} eq 'caption';
4495
4496 !!!insert-element ($token->{tag_name}, $token->{attributes});
4497 $self->{insertion_mode} = {
4498 caption => IN_CAPTION_IM,
4499 colgroup => IN_COLUMN_GROUP_IM,
4500 tbody => IN_TABLE_BODY_IM,
4501 tfoot => IN_TABLE_BODY_IM,
4502 thead => IN_TABLE_BODY_IM,
4503 }->{$token->{tag_name}};
4504 !!!next-token;
4505 redo B;
4506 } else {
4507 die "$0: in table: <>: $token->{tag_name}";
4508 }
4509 } elsif ($token->{tag_name} eq 'table') {
4510 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4511
4512 ## As if </table>
4513 ## have a table element in table scope
4514 my $i;
4515 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4516 my $node = $self->{open_elements}->[$_];
4517 if ($node->[1] eq 'table') {
4518 !!!cp ('t221');
4519 $i = $_;
4520 last INSCOPE;
4521 } elsif ({
4522 #table => 1,
4523 html => 1,
4524 }->{$node->[1]}) {
4525 !!!cp ('t222');
4526 last INSCOPE;
4527 }
4528 } # INSCOPE
4529 unless (defined $i) {
4530 !!!cp ('t223');
4531 ## TODO: The following is wrong, maybe.
4532 !!!parse-error (type => 'unmatched end tag:table');
4533 ## Ignore tokens </table><table>
4534 !!!next-token;
4535 redo B;
4536 }
4537
4538 ## generate implied end tags
4539 while ({
4540 dd => 1, dt => 1, li => 1, p => 1,
4541 }->{$self->{open_elements}->[-1]->[1]}) {
4542 !!!cp ('t224');
4543 pop @{$self->{open_elements}};
4544 }
4545
4546 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4547 !!!cp ('t225');
4548 ## ISSUE: Can this case be reached?
4549 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4550 } else {
4551 !!!cp ('t226');
4552 }
4553
4554 splice @{$self->{open_elements}}, $i;
4555 pop @{$open_tables};
4556
4557 $self->_reset_insertion_mode;
4558
4559 ## reprocess
4560 redo B;
4561 } elsif ($token->{tag_name} eq 'style') {
4562 if (not $open_tables->[-1]->[1]) { # tainted
4563 !!!cp ('t227.8');
4564 ## NOTE: This is a "as if in head" code clone.
4565 $parse_rcdata->(CDATA_CONTENT_MODEL);
4566 redo B;
4567 } else {
4568 !!!cp ('t227.7');
4569 #
4570 }
4571 } elsif ($token->{tag_name} eq 'script') {
4572 if (not $open_tables->[-1]->[1]) { # tainted
4573 !!!cp ('t227.6');
4574 ## NOTE: This is a "as if in head" code clone.
4575 $script_start_tag->();
4576 redo B;
4577 } else {
4578 !!!cp ('t227.5');
4579 #
4580 }
4581 } elsif ($token->{tag_name} eq 'input') {
4582 if (not $open_tables->[-1]->[1]) { # tainted
4583 if ($token->{attributes}->{type}) { ## TODO: case
4584 my $type = lc $token->{attributes}->{type}->{value};
4585 if ($type eq 'hidden') {
4586 !!!cp ('t227.3');
4587 !!!parse-error (type => 'in table:'.$token->{tag_name});
4588
4589 !!!insert-element ($token->{tag_name}, $token->{attributes});
4590
4591 ## TODO: form element pointer
4592
4593 pop @{$self->{open_elements}};
4594
4595 !!!next-token;
4596 redo B;
4597 } else {
4598 !!!cp ('t227.2');
4599 #
4600 }
4601 } else {
4602 !!!cp ('t227.1');
4603 #
4604 }
4605 } else {
4606 !!!cp ('t227.4');
4607 #
4608 }
4609 } else {
4610 !!!cp ('t227');
4611 #
4612 }
4613
4614 !!!parse-error (type => 'in table:'.$token->{tag_name});
4615
4616 $insert = $insert_to_foster;
4617 #
4618 } elsif ($token->{type} == END_TAG_TOKEN) {
4619 if ($token->{tag_name} eq 'tr' and
4620 $self->{insertion_mode} == IN_ROW_IM) {
4621 ## have an element in table scope
4622 my $i;
4623 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4624 my $node = $self->{open_elements}->[$_];
4625 if ($node->[1] eq $token->{tag_name}) {
4626 !!!cp ('t228');
4627 $i = $_;
4628 last INSCOPE;
4629 } elsif ({
4630 table => 1, html => 1,
4631 }->{$node->[1]}) {
4632 !!!cp ('t229');
4633 last INSCOPE;
4634 }
4635 } # INSCOPE
4636 unless (defined $i) {
4637 !!!cp ('t230');
4638 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4639 ## Ignore the token
4640 !!!next-token;
4641 redo B;
4642 } else {
4643 !!!cp ('t232');
4644 }
4645
4646 ## Clear back to table row context
4647 while (not {
4648 tr => 1, html => 1,
4649 }->{$self->{open_elements}->[-1]->[1]}) {
4650 !!!cp ('t231');
4651 ## ISSUE: Can this state be reached?
4652 pop @{$self->{open_elements}};
4653 }
4654
4655 pop @{$self->{open_elements}}; # tr
4656 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4657 !!!next-token;
4658 redo B;
4659 } elsif ($token->{tag_name} eq 'table') {
4660 if ($self->{insertion_mode} == IN_ROW_IM) {
4661 ## As if </tr>
4662 ## have an element in table scope
4663 my $i;
4664 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4665 my $node = $self->{open_elements}->[$_];
4666 if ($node->[1] eq 'tr') {
4667 !!!cp ('t233');
4668 $i = $_;
4669 last INSCOPE;
4670 } elsif ({
4671 table => 1, html => 1,
4672 }->{$node->[1]}) {
4673 !!!cp ('t234');
4674 last INSCOPE;
4675 }
4676 } # INSCOPE
4677 unless (defined $i) { # no |tr| element in table scope for the implied </tr>
4678 !!!cp ('t235');
4679 ## TODO: The following is wrong.
4680 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}); # tag name, not the numeric token type
4681 ## Ignore the token
4682 !!!next-token;
4683 redo B;
4684 }
4685
4686 ## Clear back to table row context
4687 while (not {
4688 tr => 1, html => 1,
4689 }->{$self->{open_elements}->[-1]->[1]}) {
4690 !!!cp ('t236');
4691 ## ISSUE: Can this state be reached?
4692 pop @{$self->{open_elements}};
4693 }
4694
4695 pop @{$self->{open_elements}}; # tr
4696 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4697 ## reprocess in the "in table body" insertion mode...
4698 }
4699
4700 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4701 ## have an element in table scope
4702 my $i;
4703 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4704 my $node = $self->{open_elements}->[$_];
4705 if ({
4706 tbody => 1, thead => 1, tfoot => 1,
4707 }->{$node->[1]}) {
4708 !!!cp ('t237');
4709 $i = $_;
4710 last INSCOPE;
4711 } elsif ({
4712 table => 1, html => 1,
4713 }->{$node->[1]}) {
4714 !!!cp ('t238');
4715 last INSCOPE;
4716 }
4717 } # INSCOPE
4718 unless (defined $i) {
4719 !!!cp ('t239');
4720 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4721 ## Ignore the token
4722 !!!next-token;
4723 redo B;
4724 }
4725
4726 ## Clear back to table body context
4727 while (not {
4728 tbody => 1, tfoot => 1, thead => 1, html => 1,
4729 }->{$self->{open_elements}->[-1]->[1]}) {
4730 !!!cp ('t240');
4731 pop @{$self->{open_elements}};
4732 }
4733
4734 ## As if <{current node}>
4735 ## have an element in table scope
4736 ## true by definition
4737
4738 ## Clear back to table body context
4739 ## nop by definition
4740
4741 pop @{$self->{open_elements}};
4742 $self->{insertion_mode} = IN_TABLE_IM;
4743 ## reprocess in the "in table" insertion mode...
4744 }
4745
4746 ## NOTE: </table> in the "in table" insertion mode.
4747 ## When you edit the code fragment below, please ensure that
4748 ## the code for <table> in the "in table" insertion mode
4749 ## is synced with it.
4750
4751 ## have a table element in table scope
4752 my $i;
4753 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4754 my $node = $self->{open_elements}->[$_];
4755 if ($node->[1] eq $token->{tag_name}) {
4756 !!!cp ('t241');
4757 $i = $_;
4758 last INSCOPE;
4759 } elsif ({
4760 table => 1, html => 1,
4761 }->{$node->[1]}) {
4762 !!!cp ('t242');
4763 last INSCOPE;
4764 }
4765 } # INSCOPE
4766 unless (defined $i) {
4767 !!!cp ('t243');
4768 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4769 ## Ignore the token
4770 !!!next-token;
4771 redo B;
4772 }
4773
4774 splice @{$self->{open_elements}}, $i;
4775 pop @{$open_tables};
4776
4777 $self->_reset_insertion_mode;
4778
4779 !!!next-token;
4780 redo B;
4781 } elsif ({
4782 tbody => 1, tfoot => 1, thead => 1,
4783 }->{$token->{tag_name}} and
4784 $self->{insertion_mode} & ROW_IMS) {
4785 if ($self->{insertion_mode} == IN_ROW_IM) {
4786 ## have an element in table scope
4787 my $i;
4788 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4789 my $node = $self->{open_elements}->[$_];
4790 if ($node->[1] eq $token->{tag_name}) {
4791 !!!cp ('t247');
4792 $i = $_;
4793 last INSCOPE;
4794 } elsif ({
4795 table => 1, html => 1,
4796 }->{$node->[1]}) {
4797 !!!cp ('t248');
4798 last INSCOPE;
4799 }
4800 } # INSCOPE
4801 unless (defined $i) {
4802 !!!cp ('t249');
4803 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4804 ## Ignore the token
4805 !!!next-token;
4806 redo B;
4807 }
4808
4809 ## As if </tr>
4810 ## have an element in table scope
4811 undef $i; # reuse the variable declared above instead of redeclaring it in the same scope
4812 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4813 my $node = $self->{open_elements}->[$_];
4814 if ($node->[1] eq 'tr') {
4815 !!!cp ('t250');
4816 $i = $_;
4817 last INSCOPE;
4818 } elsif ({
4819 table => 1, html => 1,
4820 }->{$node->[1]}) {
4821 !!!cp ('t251');
4822 last INSCOPE;
4823 }
4824 } # INSCOPE
4825 unless (defined $i) {
4826 !!!cp ('t252');
4827 !!!parse-error (type => 'unmatched end tag:tr');
4828 ## Ignore the token
4829 !!!next-token;
4830 redo B;
4831 }
4832
4833 ## Clear back to table row context
4834 while (not {
4835 tr => 1, html => 1,
4836 }->{$self->{open_elements}->[-1]->[1]}) {
4837 !!!cp ('t253');
4838 ## ISSUE: Can this case be reached?
4839 pop @{$self->{open_elements}};
4840 }
4841
4842 pop @{$self->{open_elements}}; # tr
4843 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4844 ## reprocess in the "in table body" insertion mode...
4845 }
4846
4847 ## have an element in table scope
4848 my $i;
4849 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4850 my $node = $self->{open_elements}->[$_];
4851 if ($node->[1] eq $token->{tag_name}) {
4852 !!!cp ('t254');
4853 $i = $_;
4854 last INSCOPE;
4855 } elsif ({
4856 table => 1, html => 1,
4857 }->{$node->[1]}) {
4858 !!!cp ('t255');
4859 last INSCOPE;
4860 }
4861 } # INSCOPE
4862 unless (defined $i) {
4863 !!!cp ('t256');
4864 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4865 ## Ignore the token
4866 !!!next-token;
4867 redo B;
4868 }
4869
4870 ## Clear back to table body context
4871 while (not {
4872 tbody => 1, tfoot => 1, thead => 1, html => 1,
4873 }->{$self->{open_elements}->[-1]->[1]}) {
4874 !!!cp ('t257');
4875 ## ISSUE: Can this case be reached?
4876 pop @{$self->{open_elements}};
4877 }
4878
4879 pop @{$self->{open_elements}};
4880 $self->{insertion_mode} = IN_TABLE_IM;
4881 !!!next-token;
4882 redo B;
4883 } elsif ({
4884 body => 1, caption => 1, col => 1, colgroup => 1,
4885 html => 1, td => 1, th => 1,
4886 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4887 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
4888 }->{$token->{tag_name}}) {
4889 !!!cp ('t258');
4890 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4891 ## Ignore the token
4892 !!!next-token;
4893 redo B;
4894 } else {
4895 !!!cp ('t259');
4896 !!!parse-error (type => 'in table:/'.$token->{tag_name});
4897
4898 $insert = $insert_to_foster;
4899 #
4900 }
4901 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4902 unless ($self->{open_elements}->[-1]->[1] eq 'html' and
4903 @{$self->{open_elements}} == 1) { # redundant, maybe
4904 !!!cp ('t259.1');
4905 !!!parse-error (type => 'in body:#eof');
4906 } else {
4907 !!!cp ('t259.2');
4908 }
4909
4910 ## Stop parsing
4911 last B;
4912 } else {
4913 die "$0: $token->{type}: Unknown token type";
4914 }
4915 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4916 if ($token->{type} == CHARACTER_TOKEN) {
4917 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4918 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4919 unless (length $token->{data}) {
4920 !!!cp ('t260');
4921 !!!next-token;
4922 redo B;
4923 }
4924 }
4925
4926 !!!cp ('t261');
4927 #
4928 } elsif ($token->{type} == START_TAG_TOKEN) {
4929 if ($token->{tag_name} eq 'col') {
4930 !!!cp ('t262');
4931 !!!insert-element ($token->{tag_name}, $token->{attributes});
4932 pop @{$self->{open_elements}};
4933 !!!next-token;
4934 redo B;
4935 } else {
4936 !!!cp ('t263');
4937 #
4938 }
4939 } elsif ($token->{type} == END_TAG_TOKEN) {
4940 if ($token->{tag_name} eq 'colgroup') {
4941 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4942 !!!cp ('t264');
4943 !!!parse-error (type => 'unmatched end tag:colgroup');
4944 ## Ignore the token
4945 !!!next-token;
4946 redo B;
4947 } else {
4948 !!!cp ('t265');
4949 pop @{$self->{open_elements}}; # colgroup
4950 $self->{insertion_mode} = IN_TABLE_IM;
4951 !!!next-token;
4952 redo B;
4953 }
4954 } elsif ($token->{tag_name} eq 'col') {
4955 !!!cp ('t266');
4956 !!!parse-error (type => 'unmatched end tag:col');
4957 ## Ignore the token
4958 !!!next-token;
4959 redo B;
4960 } else {
4961 !!!cp ('t267');
4962 #
4963 }
4964 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4965 if ($self->{open_elements}->[-1]->[1] eq 'html' or
4966 @{$self->{open_elements}} == 1) { # redundant, maybe
4967 !!!cp ('t270.2');
4968 ## Stop parsing.
4969 last B;
4970 } else {
4971 ## NOTE: As if </colgroup>.
4972 !!!cp ('t270.1');
4973 pop @{$self->{open_elements}}; # colgroup
4974 $self->{insertion_mode} = IN_TABLE_IM;
4975 ## Reprocess.
4976 redo B;
4977 }
4978 } else {
4979 die "$0: $token->{type}: Unknown token type";
4980 }
4981
4982 ## As if </colgroup>
4983 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4984 !!!cp ('t269');
4985 ## TODO: Wrong error type?
4986 !!!parse-error (type => 'unmatched end tag:colgroup');
4987 ## Ignore the token
4988 !!!next-token;
4989 redo B;
4990 } else {
4991 !!!cp ('t270');
4992 pop @{$self->{open_elements}}; # colgroup
4993 $self->{insertion_mode} = IN_TABLE_IM;
4994 ## reprocess
4995 redo B;
4996 }
4997 } elsif ($self->{insertion_mode} & SELECT_IMS) {
4998 if ($token->{type} == CHARACTER_TOKEN) {
4999 !!!cp ('t271');
5000 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5001 !!!next-token;
5002 redo B;
5003 } elsif ($token->{type} == START_TAG_TOKEN) {
5004 if ($token->{tag_name} eq 'option') {
5005 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5006 !!!cp ('t272');
5007 ## As if </option>
5008 pop @{$self->{open_elements}};
5009 } else {
5010 !!!cp ('t273');
5011 }
5012
5013 !!!insert-element ($token->{tag_name}, $token->{attributes});
5014 !!!next-token;
5015 redo B;
5016 } elsif ($token->{tag_name} eq 'optgroup') {
5017 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5018 !!!cp ('t274');
5019 ## As if </option>
5020 pop @{$self->{open_elements}};
5021 } else {
5022 !!!cp ('t275');
5023 }
5024
5025 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
5026 !!!cp ('t276');
5027 ## As if </optgroup>
5028 pop @{$self->{open_elements}};
5029 } else {
5030 !!!cp ('t277');
5031 }
5032
5033 !!!insert-element ($token->{tag_name}, $token->{attributes});
5034 !!!next-token;
5035 redo B;
5036 } elsif ($token->{tag_name} eq 'select' or
5037 $token->{tag_name} eq 'input' or
5038 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5039 {
5040 caption => 1, table => 1,
5041 tbody => 1, tfoot => 1, thead => 1,
5042 tr => 1, td => 1, th => 1,
5043 }->{$token->{tag_name}})) {
5044 ## TODO: The type below is not good - <select> is replaced by </select>
5045 !!!parse-error (type => 'not closed:select');
5046 ## NOTE: As if the token were </select> (<select> case) or
5047 ## as if there were </select> (otherwise).
5048 ## have an element in table scope
5049 my $i;
5050 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5051 my $node = $self->{open_elements}->[$_];
5052 if ($node->[1] eq 'select') {
5053 !!!cp ('t278');
5054 $i = $_;
5055 last INSCOPE;
5056 } elsif ({
5057 table => 1, html => 1,
5058 }->{$node->[1]}) {
5059 !!!cp ('t279');
5060 last INSCOPE;
5061 }
5062 } # INSCOPE
5063 unless (defined $i) {
5064 !!!cp ('t280');
5065 !!!parse-error (type => 'unmatched end tag:select');
5066 ## Ignore the token
5067 !!!next-token;
5068 redo B;
5069 }
5070
5071 !!!cp ('t281');
5072 splice @{$self->{open_elements}}, $i;
5073
5074 $self->_reset_insertion_mode;
5075
5076 if ($token->{tag_name} eq 'select') {
5077 !!!cp ('t281.2');
5078 !!!next-token;
5079 redo B;
5080 } else {
5081 !!!cp ('t281.1');
5082 ## Reprocess the token.
5083 redo B;
5084 }
5085 } else {
5086 !!!cp ('t282');
5087 !!!parse-error (type => 'in select:'.$token->{tag_name});
5088 ## Ignore the token
5089 !!!next-token;
5090 redo B;
5091 }
5092 } elsif ($token->{type} == END_TAG_TOKEN) {
5093 if ($token->{tag_name} eq 'optgroup') {
5094 if ($self->{open_elements}->[-1]->[1] eq 'option' and
5095 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
5096 !!!cp ('t283');
5097 ## As if </option>
5098 splice @{$self->{open_elements}}, -2;
5099 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
5100 !!!cp ('t284');
5101 pop @{$self->{open_elements}};
5102 } else {
5103 !!!cp ('t285');
5104 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5105 ## Ignore the token
5106 }
5107 !!!next-token;
5108 redo B;
5109 } elsif ($token->{tag_name} eq 'option') {
5110 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5111 !!!cp ('t286');
5112 pop @{$self->{open_elements}};
5113 } else {
5114 !!!cp ('t287');
5115 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5116 ## Ignore the token
5117 }
5118 !!!next-token;
5119 redo B;
5120 } elsif ($token->{tag_name} eq 'select') {
5121 ## have an element in table scope
5122 my $i;
5123 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5124 my $node = $self->{open_elements}->[$_];
5125 if ($node->[1] eq $token->{tag_name}) {
5126 !!!cp ('t288');
5127 $i = $_;
5128 last INSCOPE;
5129 } elsif ({
5130 table => 1, html => 1,
5131 }->{$node->[1]}) {
5132 !!!cp ('t289');
5133 last INSCOPE;
5134 }
5135 } # INSCOPE
5136 unless (defined $i) {
5137 !!!cp ('t290');
5138 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5139 ## Ignore the token
5140 !!!next-token;
5141 redo B;
5142 }
5143
5144 !!!cp ('t291');
5145 splice @{$self->{open_elements}}, $i;
5146
5147 $self->_reset_insertion_mode;
5148
5149 !!!next-token;
5150 redo B;
5151 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5152 {
5153 caption => 1, table => 1, tbody => 1,
5154 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5155 }->{$token->{tag_name}}) {
5156 ## TODO: The following is wrong?
5157 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5158
5159 ## have an element in table scope
5160 my $i;
5161 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5162 my $node = $self->{open_elements}->[$_];
5163 if ($node->[1] eq $token->{tag_name}) {
5164 !!!cp ('t292');
5165 $i = $_;
5166 last INSCOPE;
5167 } elsif ({
5168 table => 1, html => 1,
5169 }->{$node->[1]}) {
5170 !!!cp ('t293');
5171 last INSCOPE;
5172 }
5173 } # INSCOPE
5174 unless (defined $i) {
5175 !!!cp ('t294');
5176 ## Ignore the token
5177 !!!next-token;
5178 redo B;
5179 }
5180
5181 ## As if </select>
5182 ## have an element in table scope
5183 undef $i;
5184 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5185 my $node = $self->{open_elements}->[$_];
5186 if ($node->[1] eq 'select') {
5187 !!!cp ('t295');
5188 $i = $_;
5189 last INSCOPE;
5190 } elsif ({
5191 table => 1, html => 1,
5192 }->{$node->[1]}) {
5193 ## ISSUE: Can this state be reached?
5194 !!!cp ('t296');
5195 last INSCOPE;
5196 }
5197 } # INSCOPE
5198 unless (defined $i) {
5199 !!!cp ('t297');
5200 ## TODO: The following error type is correct?
5201 !!!parse-error (type => 'unmatched end tag:select');
5202 ## Ignore the </select> token
5203 !!!next-token; ## TODO: ok?
5204 redo B;
5205 }
5206
5207 !!!cp ('t298');
5208 splice @{$self->{open_elements}}, $i;
5209
5210 $self->_reset_insertion_mode;
5211
5212 ## reprocess
5213 redo B;
5214 } else {
5215 !!!cp ('t299');
5216 !!!parse-error (type => 'in select:/'.$token->{tag_name});
5217 ## Ignore the token
5218 !!!next-token;
5219 redo B;
5220 }
5221 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5222 unless ($self->{open_elements}->[-1]->[1] eq 'html' and
5223 @{$self->{open_elements}} == 1) { # redundant, maybe
5224 !!!cp ('t299.1');
5225 !!!parse-error (type => 'in body:#eof');
5226 } else {
5227 !!!cp ('t299.2');
5228 }
5229
5230 ## Stop parsing.
5231 last B;
5232 } else {
5233 die "$0: $token->{type}: Unknown token type";
5234 }
5235 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
5236 if ($token->{type} == CHARACTER_TOKEN) {
5237 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { ## Strip leading white space off the character token.
5238 my $data = $1; ## Keep a copy: |$1| must not be relied on after other code has run.
5239 ## As if in body
5240 $reconstruct_active_formatting_elements->($insert_to_current);
5241
5242 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data); ## Append the white space to the current node.
5243
5244 unless (length $token->{data}) { ## Nothing but white space: this token is done.
5245 !!!cp ('t300');
5246 !!!next-token;
5247 redo B;
5248 }
5249 }
5250
5251 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5252 !!!cp ('t301');
5253 !!!parse-error (type => 'after html:#character');
5254
5255 ## Reprocess in the "after body" insertion mode.
5256 } else {
5257 !!!cp ('t302');
5258 }
5259
5260 ## "after body" insertion mode
5261 !!!parse-error (type => 'after body:#character');
5262
5263 $self->{insertion_mode} = IN_BODY_IM;
5264 ## reprocess
5265 redo B;
5266 } elsif ($token->{type} == START_TAG_TOKEN) {
5267 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5268 !!!cp ('t303');
5269 !!!parse-error (type => 'after html:'.$token->{tag_name});
5270
5271 ## Reprocess in the "after body" insertion mode.
5272 } else {
5273 !!!cp ('t304');
5274 }
5275
5276 ## "after body" insertion mode
5277 !!!parse-error (type => 'after body:'.$token->{tag_name});
5278
5279 $self->{insertion_mode} = IN_BODY_IM;
5280 ## reprocess
5281 redo B;
5282 } elsif ($token->{type} == END_TAG_TOKEN) {
5283 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5284 !!!cp ('t305');
5285 !!!parse-error (type => 'after html:/'.$token->{tag_name});
5286
5287 $self->{insertion_mode} = AFTER_BODY_IM;
5288 ## Reprocess in the "after body" insertion mode.
5289 } else {
5290 !!!cp ('t306');
5291 }
5292
5293 ## "after body" insertion mode
5294 if ($token->{tag_name} eq 'html') {
5295 if (defined $self->{inner_html_node}) {
5296 !!!cp ('t307');
5297 !!!parse-error (type => 'unmatched end tag:html');
5298 ## Ignore the token
5299 !!!next-token;
5300 redo B;
5301 } else {
5302 !!!cp ('t308');
5303 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
5304 !!!next-token;
5305 redo B;
5306 }
5307 } else {
5308 !!!cp ('t309');
5309 !!!parse-error (type => 'after body:/'.$token->{tag_name});
5310
5311 $self->{insertion_mode} = IN_BODY_IM;
5312 ## reprocess
5313 redo B;
5314 }
5315 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5316 !!!cp ('t309.2');
5317 ## Stop parsing
5318 last B;
5319 } else {
5320 die "$0: $token->{type}: Unknown token type";
5321 }
5322 } elsif ($self->{insertion_mode} & FRAME_IMS) {
5323 if ($token->{type} == CHARACTER_TOKEN) {
5324 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5325 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5326
5327 unless (length $token->{data}) {
5328 !!!cp ('t310');
5329 !!!next-token;
5330 redo B;
5331 }
5332 }
5333
5334 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5335 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5336 !!!cp ('t311');
5337 !!!parse-error (type => 'in frameset:#character');
5338 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
5339 !!!cp ('t312');
5340 !!!parse-error (type => 'after frameset:#character');
5341 } else { # "after html frameset"
5342 !!!cp ('t313');
5343 !!!parse-error (type => 'after html:#character');
5344
5345 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5346 ## Reprocess in the "after frameset" insertion mode.
5347 !!!parse-error (type => 'after frameset:#character');
5348 }
5349
5350 ## Ignore the token.
5351 if (length $token->{data}) {
5352 !!!cp ('t314');
5353 ## reprocess the rest of characters
5354 } else {
5355 !!!cp ('t315');
5356 !!!next-token;
5357 }
5358 redo B;
5359 }
5360
5361 die qq[$0: Character "$token->{data}"];
5362 } elsif ($token->{type} == START_TAG_TOKEN) {
5363 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5364 !!!cp ('t316');
5365 !!!parse-error (type => 'after html:'.$token->{tag_name});
5366
5367 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5368 ## Process in the "after frameset" insertion mode.
5369 } else {
5370 !!!cp ('t317');
5371 }
5372
5373 if ($token->{tag_name} eq 'frameset' and
5374 $self->{insertion_mode} == IN_FRAMESET_IM) {
5375 !!!cp ('t318');
5376 !!!insert-element ($token->{tag_name}, $token->{attributes});
5377 !!!next-token;
5378 redo B;
5379 } elsif ($token->{tag_name} eq 'frame' and
5380 $self->{insertion_mode} == IN_FRAMESET_IM) {
5381 !!!cp ('t319');
5382 !!!insert-element ($token->{tag_name}, $token->{attributes});
5383 pop @{$self->{open_elements}};
5384 !!!next-token;
5385 redo B;
5386 } elsif ($token->{tag_name} eq 'noframes') {
5387 !!!cp ('t320');
5388 ## NOTE: As if in body.
5389 $parse_rcdata->(CDATA_CONTENT_MODEL);
5390 redo B;
5391 } else {
5392 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5393 !!!cp ('t321');
5394 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
5395 } else {
5396 !!!cp ('t322');
5397 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
5398 }
5399 ## Ignore the token
5400 !!!next-token;
5401 redo B;
5402 }
5403 } elsif ($token->{type} == END_TAG_TOKEN) {
5404 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5405 !!!cp ('t323');
5406 !!!parse-error (type => 'after html:/'.$token->{tag_name});
5407
5408 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5409 ## Process in the "after frameset" insertion mode.
5410 } else {
5411 !!!cp ('t324');
5412 }
5413
5414 if ($token->{tag_name} eq 'frameset' and
5415 $self->{insertion_mode} == IN_FRAMESET_IM) {
5416 if ($self->{open_elements}->[-1]->[1] eq 'html' and
5417 @{$self->{open_elements}} == 1) {
5418 !!!cp ('t325');
5419 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5420 ## Ignore the token
5421 !!!next-token;
5422 } else {
5423 !!!cp ('t326');
5424 pop @{$self->{open_elements}};
5425 !!!next-token;
5426 }
5427
5428 if (not defined $self->{inner_html_node} and
5429 $self->{open_elements}->[-1]->[1] ne 'frameset') {
5430 !!!cp ('t327');
5431 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5432 } else {
5433 !!!cp ('t328');
5434 }
5435 redo B;
5436 } elsif ($token->{tag_name} eq 'html' and
5437 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
5438 !!!cp ('t329');
5439 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
5440 !!!next-token;
5441 redo B;
5442 } else {
5443 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5444 !!!cp ('t330');
5445 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
5446 } else {
5447 !!!cp ('t331');
5448 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
5449 }
5450 ## Ignore the token
5451 !!!next-token;
5452 redo B;
5453 }
5454 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5455 unless ($self->{open_elements}->[-1]->[1] eq 'html' and
5456 @{$self->{open_elements}} == 1) { # redundant, maybe
5457 !!!cp ('t331.1');
5458 !!!parse-error (type => 'in body:#eof');
5459 } else {
5460 !!!cp ('t331.2');
5461 }
5462
5463 ## Stop parsing
5464 last B;
5465 } else {
5466 die "$0: $token->{type}: Unknown token type";
5467 }
5468
5469 ## ISSUE: An issue in spec here
5470 } else {
5471 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5472 }
5473
5474 ## "in body" insertion mode
5475 if ($token->{type} == START_TAG_TOKEN) {
5476 if ($token->{tag_name} eq 'script') {
5477 !!!cp ('t332');
5478 ## NOTE: This is an "as if in head" code clone
5479 $script_start_tag->();
5480 redo B;
5481 } elsif ($token->{tag_name} eq 'style') {
5482 !!!cp ('t333');
5483 ## NOTE: This is an "as if in head" code clone
5484 $parse_rcdata->(CDATA_CONTENT_MODEL);
5485 redo B;
5486 } elsif ({
5487 base => 1, link => 1,
5488 }->{$token->{tag_name}}) {
5489 !!!cp ('t334');
5490 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5491 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5492 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5493 !!!next-token;
5494 redo B;
5495 } elsif ($token->{tag_name} eq 'meta') {
5496 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5497 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5498 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5499
5500 unless ($self->{confident}) {
5501 if ($token->{attributes}->{charset}) { ## TODO: And if supported
5502 !!!cp ('t335');
5503 $self->{change_encoding}
5504 ->($self, $token->{attributes}->{charset}->{value});
5505
5506 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5507 ->set_user_data (manakai_has_reference =>
5508 $token->{attributes}->{charset}
5509 ->{has_reference});
5510 } elsif ($token->{attributes}->{content}) {
5511 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
5512 if ($token->{attributes}->{content}->{value}
5513 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
5514 [\x09-\x0D\x20]*=
5515 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
5516 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
5517 !!!cp ('t336');
5518 $self->{change_encoding}
5519 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
5520 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5521 ->set_user_data (manakai_has_reference =>
5522 $token->{attributes}->{content}
5523 ->{has_reference});
5524 }
5525 }
5526 } else {
5527 if ($token->{attributes}->{charset}) {
5528 !!!cp ('t337');
5529 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5530 ->set_user_data (manakai_has_reference =>
5531 $token->{attributes}->{charset}
5532 ->{has_reference});
5533 }
5534 if ($token->{attributes}->{content}) {
5535 !!!cp ('t338');
5536 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5537 ->set_user_data (manakai_has_reference =>
5538 $token->{attributes}->{content}
5539 ->{has_reference});
5540 }
5541 }
5542
5543 !!!next-token;
5544 redo B;
5545 } elsif ($token->{tag_name} eq 'title') {
5546 !!!cp ('t341');
5547 ## NOTE: This is an "as if in head" code clone
5548 $parse_rcdata->(RCDATA_CONTENT_MODEL);
5549 redo B;
5550 } elsif ($token->{tag_name} eq 'body') {
5551 !!!parse-error (type => 'in body:body');
5552
5553 if (@{$self->{open_elements}} == 1 or
5554 $self->{open_elements}->[1]->[1] ne 'body') {
5555 !!!cp ('t342');
5556 ## Ignore the token
5557 } else {
5558 my $body_el = $self->{open_elements}->[1]->[0];
5559 for my $attr_name (keys %{$token->{attributes}}) {
5560 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
5561 !!!cp ('t343');
5562 $body_el->set_attribute_ns
5563 (undef, [undef, $attr_name],
5564 $token->{attributes}->{$attr_name}->{value});
5565 }
5566 }
5567 }
5568 !!!next-token;
5569 redo B;
5570 } elsif ({
5571 address => 1, blockquote => 1, center => 1, dir => 1,
5572 div => 1, dl => 1, fieldset => 1,
5573 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5574 menu => 1, ol => 1, p => 1, ul => 1,
5575 pre => 1, listing => 1,
5576 }->{$token->{tag_name}}) {
5577 ## has a p element in scope
5578 INSCOPE: for (reverse @{$self->{open_elements}}) {
5579 if ($_->[1] eq 'p') {
5580 !!!cp ('t344');
5581 !!!back-token;
5582 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5583 redo B;
5584 } elsif ({
5585 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5586 button => 1, marquee => 1, object => 1, html => 1,
5587 }->{$_->[1]}) {
5588 !!!cp ('t345');
5589 last INSCOPE;
5590 }
5591 } # INSCOPE
5592
5593 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5594 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
5595 !!!next-token;
5596 if ($token->{type} == CHARACTER_TOKEN) {
5597 $token->{data} =~ s/^\x0A//;
5598 unless (length $token->{data}) {
5599 !!!cp ('t346');
5600 !!!next-token;
5601 } else {
5602 !!!cp ('t349');
5603 }
5604 } else {
5605 !!!cp ('t348');
5606 }
5607 } else {
5608 !!!cp ('t347');
5609 !!!next-token;
5610 }
5611 redo B;
5612 } elsif ($token->{tag_name} eq 'form') {
5613 if (defined $self->{form_element}) {
5614 !!!cp ('t350');
5615 !!!parse-error (type => 'in form:form');
5616 ## Ignore the token
5617 !!!next-token;
5618 redo B;
5619 } else {
5620 ## has a p element in scope
5621 INSCOPE: for (reverse @{$self->{open_elements}}) {
5622 if ($_->[1] eq 'p') {
5623 !!!cp ('t351');
5624 !!!back-token;
5625 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5626 redo B;
5627 } elsif ({
5628 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5629 button => 1, marquee => 1, object => 1, html => 1,
5630 }->{$_->[1]}) {
5631 !!!cp ('t352');
5632 last INSCOPE;
5633 }
5634 } # INSCOPE
5635
5636 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5637 $self->{form_element} = $self->{open_elements}->[-1]->[0];
5638 !!!next-token;
5639 redo B;
5640 }
5641 } elsif ($token->{tag_name} eq 'li') {
5642 ## has a p element in scope: if so, a synthesized </p> end tag is processed first (the <li> token is presumably re-queued by |!!!back-token| -- confirm against the macro definition)
5643 INSCOPE: for (reverse @{$self->{open_elements}}) {
5644 if ($_->[1] eq 'p') {
5645 !!!cp ('t353');
5646 !!!back-token;
5647 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5648 redo B;
5649 } elsif ({
5650 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5651 button => 1, marquee => 1, object => 1, html => 1,
5652 }->{$_->[1]}) {
5653 !!!cp ('t354');
5654 last INSCOPE;
5655 }
5656 } # INSCOPE
5657
5658 ## Step 1: start from the current node (index -1 is the top of the stack of open elements)
5659 my $i = -1;
5660 my $node = $self->{open_elements}->[$i];
5661 LI: {
5662 ## Step 2: an open |li| was found; report an error if it is not the current node
5663 if ($node->[1] eq 'li') {
5664 if ($i != -1) {
5665 !!!cp ('t355');
5666 !!!parse-error (type => 'end tag missing:'.
5667 $self->{open_elements}->[-1]->[1]);
5668 } else {
5669 !!!cp ('t356');
5670 }
5671 splice @{$self->{open_elements}}, $i; ## Negative offset: removes the |li| and every node above it.
5672 last LI;
5673 } else {
5674 !!!cp ('t357');
5675 }
5676
5677 ## Step 3: stop at a non-formatting node in the special or scoping category, except |address| and |div|
5678 if (not $formatting_category->{$node->[1]} and
5679 #not $phrasing_category->{$node->[1]} and
5680 ($special_category->{$node->[1]} or
5681 $scoping_category->{$node->[1]}) and
5682 $node->[1] ne 'address' and $node->[1] ne 'div') {
5683 !!!cp ('t358');
5684 last LI;
5685 }
5686
5687 !!!cp ('t359');
5688 ## Step 4: move one entry down the stack and repeat from Step 2
5689 $i--;
5690 $node = $self->{open_elements}->[$i]; ## NOTE(review): assumes a Step 3 stopper is always reached before $i runs past the bottom of the stack -- confirm.
5691 redo LI;
5692 } # LI
5693
5694 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5695 !!!next-token;
5696 redo B;
5697 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
5698 ## has a p element in scope
5699 INSCOPE: for (reverse @{$self->{open_elements}}) {
5700 if ($_->[1] eq 'p') {
5701 !!!cp ('t360');
5702 !!!back-token;
5703 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5704 redo B;
5705 } elsif ({
5706 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5707 button => 1, marquee => 1, object => 1, html => 1,
5708 }->{$_->[1]}) {
5709 !!!cp ('t361');
5710 last INSCOPE;
5711 }
5712 } # INSCOPE
5713
5714 ## Step 1
5715 my $i = -1;
5716 my $node = $self->{open_elements}->[$i];
5717 LI: {
5718 ## Step 2
5719 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
5720 if ($i != -1) {
5721 !!!cp ('t362');
5722 !!!parse-error (type => 'end tag missing:'.
5723 $self->{open_elements}->[-1]->[1]);
5724 } else {
5725 !!!cp ('t363');
5726 }
5727 splice @{$self->{open_elements}}, $i;
5728 last LI;
5729 } else {
5730 !!!cp ('t364');
5731 }
5732
5733 ## Step 3
5734 if (not $formatting_category->{$node->[1]} and
5735 #not $phrasing_category->{$node->[1]} and
5736 ($special_category->{$node->[1]} or
5737 $scoping_category->{$node->[1]}) and
5738 $node->[1] ne 'address' and $node->[1] ne 'div') {
5739 !!!cp ('t365');
5740 last LI;
5741 }
5742
5743 !!!cp ('t366');
5744 ## Step 4
5745 $i--;
5746 $node = $self->{open_elements}->[$i];
5747 redo LI;
5748 } # LI
5749
5750 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5751 !!!next-token;
5752 redo B;
5753 } elsif ($token->{tag_name} eq 'plaintext') {
5754 ## has a p element in scope
5755 INSCOPE: for (reverse @{$self->{open_elements}}) {
5756 if ($_->[1] eq 'p') {
5757 !!!cp ('t367');
5758 !!!back-token;
5759 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5760 redo B;
5761 } elsif ({
5762 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5763 button => 1, marquee => 1, object => 1, html => 1,
5764 }->{$_->[1]}) {
5765 !!!cp ('t368');
5766 last INSCOPE;
5767 }
5768 } # INSCOPE
5769
5770 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5771
5772 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
5773
5774 !!!next-token;
5775 redo B;
5776 } elsif ($token->{tag_name} eq 'a') {
5777 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
5778 my $node = $active_formatting_elements->[$i];
5779 if ($node->[1] eq 'a') {
5780 !!!cp ('t371');
5781 !!!parse-error (type => 'in a:a');
5782
5783 !!!back-token;
5784 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
5785 $formatting_end_tag->($token->{tag_name});
5786
5787 AFE2: for (reverse 0..$#$active_formatting_elements) {
5788 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
5789 !!!cp ('t372');
5790 splice @$active_formatting_elements, $_, 1;
5791 last AFE2;
5792 }
5793 } # AFE2
5794 OE: for (reverse 0..$#{$self->{open_elements}}) {
5795 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
5796 !!!cp ('t373');
5797 splice @{$self->{open_elements}}, $_, 1;
5798 last OE;
5799 }
5800 } # OE
5801 last AFE;
5802 } elsif ($node->[0] eq '#marker') {
5803 !!!cp ('t374');
5804 last AFE;
5805 }
5806 } # AFE
5807
5808 $reconstruct_active_formatting_elements->($insert_to_current);
5809
5810 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5811 push @$active_formatting_elements, $self->{open_elements}->[-1];
5812
5813 !!!next-token;
5814 redo B;
5815 } elsif ({ ## Start tags of formatting elements (|a| and |nobr| have their own branches).
5816 b => 1, big => 1, em => 1, font => 1, i => 1,
5817 s => 1, small => 1, strike => 1,
5818 strong => 1, tt => 1, u => 1,
5819 }->{$token->{tag_name}}) {
5820 !!!cp ('t375');
5821 $reconstruct_active_formatting_elements->($insert_to_current);
5822
5823 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5824 push @$active_formatting_elements, $self->{open_elements}->[-1]; ## Record the newly inserted element as an active formatting element.
5825
5826 !!!next-token;
5827 redo B;
5828 } elsif ($token->{tag_name} eq 'nobr') {
5829 $reconstruct_active_formatting_elements->($insert_to_current);
5830
5831 ## has a |nobr| element in scope
5832 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5833 my $node = $self->{open_elements}->[$_];
5834 if ($node->[1] eq 'nobr') {
5835 !!!cp ('t376');
5836 !!!parse-error (type => 'in nobr:nobr');
5837 !!!back-token;
5838 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
5839 redo B;
5840 } elsif ({
5841 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5842 button => 1, marquee => 1, object => 1, html => 1,
5843 }->{$node->[1]}) {
5844 !!!cp ('t377');
5845 last INSCOPE;
5846 }
5847 } # INSCOPE
5848
5849 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5850 push @$active_formatting_elements, $self->{open_elements}->[-1];
5851
5852 !!!next-token;
5853 redo B;
5854 } elsif ($token->{tag_name} eq 'button') {
5855 ## has a button element in scope
5856 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5857 my $node = $self->{open_elements}->[$_];
5858 if ($node->[1] eq 'button') {
5859 !!!cp ('t378');
5860 !!!parse-error (type => 'in button:button');
5861 !!!back-token;
5862 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
5863 redo B;
5864 } elsif ({
5865 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5866 button => 1, marquee => 1, object => 1, html => 1,
5867 }->{$node->[1]}) {
5868 !!!cp ('t379');
5869 last INSCOPE;
5870 }
5871 } # INSCOPE
5872
5873 $reconstruct_active_formatting_elements->($insert_to_current);
5874
5875 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5876
5877 ## TODO: associate with $self->{form_element} if defined
5878
5879 push @$active_formatting_elements, ['#marker', ''];
5880
5881 !!!next-token;
5882 redo B;
5883 } elsif ({
5884 applet => 1, marquee => 1, object => 1,
5885 }->{$token->{tag_name}}) {
5886 !!!cp ('t380');
5887 $reconstruct_active_formatting_elements->($insert_to_current);
5888
5889 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5890 push @$active_formatting_elements, ['#marker', ''];
5891
5892 !!!next-token;
5893 redo B;
5894 } elsif ($token->{tag_name} eq 'xmp') {
5895 !!!cp ('t381');
5896 $reconstruct_active_formatting_elements->($insert_to_current);
5897 $parse_rcdata->(CDATA_CONTENT_MODEL);
5898 redo B;
5899 } elsif ($token->{tag_name} eq 'table') {
5900 ## has a p element in scope
5901 INSCOPE: for (reverse @{$self->{open_elements}}) {
5902 if ($_->[1] eq 'p') {
5903 !!!cp ('t382');
5904 !!!back-token;
5905 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5906 redo B;
5907 } elsif ({
5908 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5909 button => 1, marquee => 1, object => 1, html => 1,
5910 }->{$_->[1]}) {
5911 !!!cp ('t383');
5912 last INSCOPE;
5913 }
5914 } # INSCOPE
5915
5916 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5917 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
5918
5919 $self->{insertion_mode} = IN_TABLE_IM;
5920
5921 !!!next-token;
5922 redo B;
5923 } elsif ({
5924 area => 1, basefont => 1, bgsound => 1, br => 1,
5925 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
5926 image => 1,
5927 }->{$token->{tag_name}}) {
5928 if ($token->{tag_name} eq 'image') {
5929 !!!cp ('t384');
5930 !!!parse-error (type => 'image');
5931 $token->{tag_name} = 'img';
5932 } else {
5933 !!!cp ('t385');
5934 }
5935
5936 ## NOTE: There is an "as if <br>" code clone.
5937 $reconstruct_active_formatting_elements->($insert_to_current);
5938
5939 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5940 pop @{$self->{open_elements}};
5941
5942 !!!next-token;
5943 redo B;
5944 } elsif ($token->{tag_name} eq 'hr') {
5945 ## has a p element in scope
5946 INSCOPE: for (reverse @{$self->{open_elements}}) {
5947 if ($_->[1] eq 'p') {
5948 !!!cp ('t386');
5949 !!!back-token;
5950 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5951 redo B;
5952 } elsif ({
5953 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5954 button => 1, marquee => 1, object => 1, html => 1,
5955 }->{$_->[1]}) {
5956 !!!cp ('t387');
5957 last INSCOPE;
5958 }
5959 } # INSCOPE
5960
5961 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5962 pop @{$self->{open_elements}};
5963
5964 !!!next-token;
5965 redo B;
5966 } elsif ($token->{tag_name} eq 'input') {
5967 !!!cp ('t388');
5968 $reconstruct_active_formatting_elements->($insert_to_current);
5969
5970 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5971 ## TODO: associate with $self->{form_element} if defined
5972 pop @{$self->{open_elements}};
5973
5974 !!!next-token;
5975 redo B;
5976 } elsif ($token->{tag_name} eq 'isindex') {
5977 !!!parse-error (type => 'isindex');
5978
5979 if (defined $self->{form_element}) {
5980 !!!cp ('t389');
5981 ## Ignore the token
5982 !!!next-token;
5983 redo B;
5984 } else {
5985 my $at = $token->{attributes};
5986 my $form_attrs;
5987 $form_attrs->{action} = $at->{action} if $at->{action};
5988 my $prompt_attr = $at->{prompt};
5989 $at->{name} = {name => 'name', value => 'isindex'};
5990 delete $at->{action};
5991 delete $at->{prompt};
5992 my @tokens = (
5993 {type => START_TAG_TOKEN, tag_name => 'form',
5994 attributes => $form_attrs},
5995 {type => START_TAG_TOKEN, tag_name => 'hr'},
5996 {type => START_TAG_TOKEN, tag_name => 'p'},
5997 {type => START_TAG_TOKEN, tag_name => 'label'},
5998 );
5999 if ($prompt_attr) {
6000 !!!cp ('t390');
6001 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
6002 } else {
6003 !!!cp ('t391');
6004 push @tokens, {type => CHARACTER_TOKEN,
6005 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
6006 ## TODO: make this configurable
6007 }
6008 push @tokens,
6009 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
6010 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6011 {type => END_TAG_TOKEN, tag_name => 'label'},
6012 {type => END_TAG_TOKEN, tag_name => 'p'},
6013 {type => START_TAG_TOKEN, tag_name => 'hr'},
6014 {type => END_TAG_TOKEN, tag_name => 'form'};
6015 $token = shift @tokens;
6016 !!!back-token (@tokens);
6017 redo B;
6018 }
6019 } elsif ($token->{tag_name} eq 'textarea') {
6020 my $tag_name = $token->{tag_name};
6021 my $el;
6022 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
6023
6024 ## TODO: $self->{form_element} if defined
6025 $self->{content_model} = RCDATA_CONTENT_MODEL;
6026 delete $self->{escape}; # MUST
6027
6028 $insert->($el);
6029
6030 my $text = '';
6031 !!!next-token;
6032 if ($token->{type} == CHARACTER_TOKEN) {
6033 $token->{data} =~ s/^\x0A//;
6034 unless (length $token->{data}) {
6035 !!!cp ('t392');
6036 !!!next-token;
6037 } else {
6038 !!!cp ('t393');
6039 }
6040 } else {
6041 !!!cp ('t394');
6042 }
6043 while ($token->{type} == CHARACTER_TOKEN) {
6044 !!!cp ('t395');
6045 $text .= $token->{data};
6046 !!!next-token;
6047 }
6048 if (length $text) {
6049 !!!cp ('t396');
6050 $el->manakai_append_text ($text);
6051 }
6052
6053 $self->{content_model} = PCDATA_CONTENT_MODEL;
6054
6055 if ($token->{type} == END_TAG_TOKEN and
6056 $token->{tag_name} eq $tag_name) {
6057 !!!cp ('t397');
6058 ## Ignore the token
6059 } else {
6060 !!!cp ('t398');
6061 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
6062 }
6063 !!!next-token;
6064 redo B;
6065 } elsif ({
6066 iframe => 1,
6067 noembed => 1,
6068 noframes => 1,
6069 noscript => 0, ## TODO: 1 if scripting is enabled
6070 }->{$token->{tag_name}}) {
6071 !!!cp ('t399');
6072 ## NOTE: There is an "as if in body" code clone.
6073 $parse_rcdata->(CDATA_CONTENT_MODEL);
6074 redo B;
6075 } elsif ($token->{tag_name} eq 'select') {
6076 !!!cp ('t400');
6077 $reconstruct_active_formatting_elements->($insert_to_current);
6078
6079 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
6080
6081 ## TODO: associate with $self->{form_element} if defined
6082
6083 if ($self->{insertion_mode} & TABLE_IMS or
6084 $self->{insertion_mode} & BODY_TABLE_IMS or
6085 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6086 !!!cp ('t400.1');
6087 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
6088 } else {
6089 !!!cp ('t400.2');
6090 $self->{insertion_mode} = IN_SELECT_IM;
6091 }
6092 !!!next-token;
6093 redo B;
6094 } elsif ({
6095 caption => 1, col => 1, colgroup => 1, frame => 1,
6096 frameset => 1, head => 1, option => 1, optgroup => 1,
6097 tbody => 1, td => 1, tfoot => 1, th => 1,
6098 thead => 1, tr => 1,
6099 }->{$token->{tag_name}}) {
6100 !!!cp ('t401');
6101 !!!parse-error (type => 'in body:'.$token->{tag_name});
6102 ## Ignore the token
6103 !!!next-token;
6104 redo B;
6105
6106 ## ISSUE: An issue on HTML5 new elements in the spec.
6107 } else {
6108 !!!cp ('t402');
6109 $reconstruct_active_formatting_elements->($insert_to_current);
6110
6111 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
6112
6113 !!!next-token;
6114 redo B;
6115 }
6116 } elsif ($token->{type} == END_TAG_TOKEN) {
6117 if ($token->{tag_name} eq 'body') {
6118 if (@{$self->{open_elements}} > 1 and
6119 $self->{open_elements}->[1]->[1] eq 'body') {
6120 for (@{$self->{open_elements}}) {
6121 unless ({
6122 dd => 1, dt => 1, li => 1, p => 1, td => 1,
6123 th => 1, tr => 1, body => 1, html => 1,
6124 tbody => 1, tfoot => 1, thead => 1,
6125 }->{$_->[1]}) {
6126 !!!cp ('t403');
6127 !!!parse-error (type => 'not closed:'.$_->[1]);
6128 } else {
6129 !!!cp ('t404');
6130 }
6131 }
6132
6133 $self->{insertion_mode} = AFTER_BODY_IM;
6134 !!!next-token;
6135 redo B;
6136 } else {
6137 !!!cp ('t405');
6138 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6139 ## Ignore the token
6140 !!!next-token;
6141 redo B;
6142 }
6143 } elsif ($token->{tag_name} eq 'html') {
6144 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') { ## A |body| element is open as the second entry of the stack.
6145 ## ISSUE: There is an issue in the spec.
6146 if ($self->{open_elements}->[-1]->[1] ne 'body') { ## The current node is something other than |body|: it is left unclosed.
6147 !!!cp ('t406');
6148 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6149 } else {
6150 !!!cp ('t407');
6151 }
6152 $self->{insertion_mode} = AFTER_BODY_IM;
6153 ## reprocess the </html> token in the "after body" insertion mode (no token is consumed here)
6154 redo B;
6155 } else {
6156 !!!cp ('t408');
6157 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6158 ## Ignore the token
6159 !!!next-token;
6160 redo B;
6161 }
6162 } elsif ({
6163 address => 1, blockquote => 1, center => 1, dir => 1,
6164 div => 1, dl => 1, fieldset => 1, listing => 1,
6165 menu => 1, ol => 1, pre => 1, ul => 1,
6166 dd => 1, dt => 1, li => 1,
6167 applet => 1, button => 1, marquee => 1, object => 1,
6168 }->{$token->{tag_name}}) {
6169 ## has an element in scope
6170 my $i;
6171 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6172 my $node = $self->{open_elements}->[$_];
6173 if ($node->[1] eq $token->{tag_name}) {
6174 !!!cp ('t410');
6175 $i = $_;
6176 last INSCOPE;
6177 } elsif ({
6178 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6179 button => 1, marquee => 1, object => 1, html => 1,
6180 }->{$node->[1]}) {
6181 !!!cp ('t411');
6182 last INSCOPE;
6183 }
6184 } # INSCOPE
6185
6186 unless (defined $i) { # has an element in scope
6187 !!!cp ('t413');
6188 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6189 } else {
6190 ## Step 1. generate implied end tags
6191 while ({
6192 dd => ($token->{tag_name} ne 'dd'),
6193 dt => ($token->{tag_name} ne 'dt'),
6194 li => ($token->{tag_name} ne 'li'),
6195 p => 1,
6196 }->{$self->{open_elements}->[-1]->[1]}) {
6197 !!!cp ('t409');
6198 pop @{$self->{open_elements}};
6199 }
6200
6201 ## Step 2.
6202 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6203 !!!cp ('t412');
6204 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6205 } else {
6206 !!!cp ('t414');
6207 }
6208
6209 ## Step 3.
6210 splice @{$self->{open_elements}}, $i;
6211
6212 ## Step 4.
6213 $clear_up_to_marker->()
6214 if {
6215 applet => 1, button => 1, marquee => 1, object => 1,
6216 }->{$token->{tag_name}};
6217 }
6218 !!!next-token;
6219 redo B;
6220 } elsif ($token->{tag_name} eq 'form') {
6221 undef $self->{form_element};
6222
6223 ## has an element in scope
6224 my $i;
6225 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6226 my $node = $self->{open_elements}->[$_];
6227 if ($node->[1] eq $token->{tag_name}) {
6228 !!!cp ('t418');
6229 $i = $_;
6230 last INSCOPE;
6231 } elsif ({
6232 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6233 button => 1, marquee => 1, object => 1, html => 1,
6234 }->{$node->[1]}) {
6235 !!!cp ('t419');
6236 last INSCOPE;
6237 }
6238 } # INSCOPE
6239
6240 unless (defined $i) { # has an element in scope
6241 !!!cp ('t421');
6242 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6243 } else {
6244 ## Step 1. generate implied end tags
6245 while ({
6246 dd => 1, dt => 1, li => 1, p => 1,
6247 }->{$self->{open_elements}->[-1]->[1]}) {
6248 !!!cp ('t417');
6249 pop @{$self->{open_elements}};
6250 }
6251
6252 ## Step 2.
6253 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6254 !!!cp ('t417.1');
6255 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6256 } else {
6257 !!!cp ('t420');
6258 }
6259
6260 ## Step 3.
6261 splice @{$self->{open_elements}}, $i;
6262 }
6263
6264 !!!next-token;
6265 redo B;
6266 } elsif ({
6267 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6268 }->{$token->{tag_name}}) {
6269 ## has an element in scope
6270 my $i;
6271 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6272 my $node = $self->{open_elements}->[$_];
6273 if ({
6274 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6275 }->{$node->[1]}) {
6276 !!!cp ('t423');
6277 $i = $_;
6278 last INSCOPE;
6279 } elsif ({
6280 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6281 button => 1, marquee => 1, object => 1, html => 1,
6282 }->{$node->[1]}) {
6283 !!!cp ('t424');
6284 last INSCOPE;
6285 }
6286 } # INSCOPE
6287
6288 unless (defined $i) { # has an element in scope
6289 !!!cp ('t425.1');
6290 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6291 } else {
6292 ## Step 1. generate implied end tags
6293 while ({
6294 dd => 1, dt => 1, li => 1, p => 1,
6295 }->{$self->{open_elements}->[-1]->[1]}) {
6296 !!!cp ('t422');
6297 pop @{$self->{open_elements}};
6298 }
6299
6300 ## Step 2.
6301 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6302 !!!cp ('t425');
6303 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6304 } else {
6305 !!!cp ('t426');
6306 }
6307
6308 ## Step 3.
6309 splice @{$self->{open_elements}}, $i;
6310 }
6311
6312 !!!next-token;
6313 redo B;
6314 } elsif ($token->{tag_name} eq 'p') {
6315 ## has an element in scope
6316 my $i;
6317 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6318 my $node = $self->{open_elements}->[$_];
6319 if ($node->[1] eq $token->{tag_name}) {
6320 !!!cp ('t410.1');
6321 $i = $_;
6322 last INSCOPE;
6323 } elsif ({
6324 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6325 button => 1, marquee => 1, object => 1, html => 1,
6326 }->{$node->[1]}) {
6327 !!!cp ('t411.1');
6328 last INSCOPE;
6329 }
6330 } # INSCOPE
6331
6332 if (defined $i) {
6333 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6334 !!!cp ('t412.1');
6335 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6336 } else {
6337 !!!cp ('t414.1');
6338 }
6339
6340 splice @{$self->{open_elements}}, $i;
6341 } else {
6342 !!!cp ('t413.1');
6343 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6344
6345 !!!cp ('t415.1');
6346 ## As if <p>, then reprocess the current token
6347 my $el;
6348 !!!create-element ($el, 'p');
6349 $insert->($el);
6350 ## NOTE: Not inserted into |$self->{open_elements}|.
6351 }
6352
6353 !!!next-token;
6354 redo B;
6355 } elsif ({
6356 a => 1,
6357 b => 1, big => 1, em => 1, font => 1, i => 1,
6358 nobr => 1, s => 1, small => 1, strile => 1,
6359 strong => 1, tt => 1, u => 1,
6360 }->{$token->{tag_name}}) {
6361 !!!cp ('t427');
6362 $formatting_end_tag->($token->{tag_name});
6363 redo B;
6364 } elsif ($token->{tag_name} eq 'br') {
6365 !!!cp ('t428');
6366 !!!parse-error (type => 'unmatched end tag:br');
6367
6368 ## As if <br>
6369 $reconstruct_active_formatting_elements->($insert_to_current);
6370
6371 my $el;
6372 !!!create-element ($el, 'br');
6373 $insert->($el);
6374
6375 ## Ignore the token.
6376 !!!next-token;
6377 redo B;
6378 } elsif ({
6379 caption => 1, col => 1, colgroup => 1, frame => 1,
6380 frameset => 1, head => 1, option => 1, optgroup => 1,
6381 tbody => 1, td => 1, tfoot => 1, th => 1,
6382 thead => 1, tr => 1,
6383 area => 1, basefont => 1, bgsound => 1,
6384 embed => 1, hr => 1, iframe => 1, image => 1,
6385 img => 1, input => 1, isindex => 1, noembed => 1,
6386 noframes => 1, param => 1, select => 1, spacer => 1,
6387 table => 1, textarea => 1, wbr => 1,
6388 noscript => 0, ## TODO: if scripting is enabled
6389 }->{$token->{tag_name}}) {
6390 !!!cp ('t429');
6391 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6392 ## Ignore the token
6393 !!!next-token;
6394 redo B;
6395
6396 ## ISSUE: Issue on HTML5 new elements in spec
6397
6398 } else {
6399 ## Step 1
6400 my $node_i = -1;
6401 my $node = $self->{open_elements}->[$node_i];
6402
6403 ## Step 2
6404 S2: {
6405 if ($node->[1] eq $token->{tag_name}) {
6406 ## Step 1
6407 ## generate implied end tags
6408 while ({
6409 dd => 1, dt => 1, li => 1, p => 1,
6410 }->{$self->{open_elements}->[-1]->[1]}) {
6411 !!!cp ('t430');
6412 ## ISSUE: Can this case be reached?
6413 pop @{$self->{open_elements}};
6414 }
6415
6416 ## Step 2
6417 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
6418 !!!cp ('t431');
6419 ## NOTE: <x><y></x>
6420 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6421 } else {
6422 !!!cp ('t432');
6423 }
6424
6425 ## Step 3
6426 splice @{$self->{open_elements}}, $node_i;
6427
6428 !!!next-token;
6429 last S2;
6430 } else {
6431 ## Step 3
6432 if (not $formatting_category->{$node->[1]} and
6433 #not $phrasing_category->{$node->[1]} and
6434 ($special_category->{$node->[1]} or
6435 $scoping_category->{$node->[1]})) {
6436 !!!cp ('t433');
6437 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6438 ## Ignore the token
6439 !!!next-token;
6440 last S2;
6441 }
6442
6443 !!!cp ('t434');
6444 }
6445
6446 ## Step 4
6447 $node_i--;
6448 $node = $self->{open_elements}->[$node_i];
6449
6450 ## Step 5;
6451 redo S2;
6452 } # S2
6453 redo B;
6454 }
6455 }
6456 redo B;
6457 } # B
6458
6459 ## Stop parsing # MUST
6460
6461 ## TODO: script stuffs
6462 } # _tree_construct_main
6463
## Replaces the children of |$node| with the result of parsing a string
## as HTML (the |innerHTML| setter).
##
## Invocation:
##   Whatpm::HTML->set_inner_html ($node, $string, $onerror);
##
##   $node    - Node whose |node_type| is 9 (a document node in DOM
##              terms) or 1 (an element node).
##   $string  - Text to be parsed.  It is accessed through a reference
##              to the caller's argument, not through a copy.
##   $onerror - Optional parse error callback.  For an element node it
##              is invoked with the arguments given by the parser
##              followed by |line => $line, column => $column|.  If it
##              is omitted (or false), errors are reported by |warn|.
##
## Dies if |$node| is of any other node type.  The subroutine has no
## explicit return value.
##
## NOTE: The prototype declares the error handler as an optional fourth
## argument because the body reads it from |@_|; with a three-argument
## prototype a prototype-checked call that passes a handler would be
## rejected at compile time.  Method calls ignore prototypes, so
## existing callers are not affected.
sub set_inner_html ($$$;$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Iterate over a copy of the child list, not over |child_nodes|
    ## itself, while the children are being removed.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## The fragment is parsed into a fresh document created from the
    ## implementation of the document that owns |$node|.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    ## Input stream state shared by the closures below: |$i| is the
    ## index of the next character in |$$s|; |$line| and |$column| are
    ## the position reported with parse errors.
    my $i = 0;
    my $line = 1;
    my $column = 0;
    $p->{set_next_char} = sub {
      my $self = shift;

      ## Shift the list of previously read characters by one entry.
      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      ## End of input is represented by -1.
      if ($i >= length $$s) {
        $self->{next_char} = -1;
        return;
      }

      $self->{next_char} = ord substr $$s, $i++, 1;
      $column++;

      if ($self->{next_char} == 0x000A) { # LF
        $line++;
        $column = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## A CR, or a CR LF pair, is delivered as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $line++;
        $column = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Error reporting: fall back to |warn| when no handler is given,
    ## and always append the current line and column.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
    };
    $p->{parse_error} = sub {
      $ponerror->(@_, line => $line, column => $column);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model is selected by the local name of the
    ## context element; any name not listed gets PCDATA.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
        ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $node_ln];

    ## Step 3
    my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, 'html'];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Starting at |$node| itself, walk up through the parents and use
    ## the nearest XHTML-namespace |form| element as the form element.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## NOTE: The |!!!next-token| macro is written here with a lexical
      ## |$self| in scope, bound to the fragment parser.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    ## Remove the existing children of the context node (again working
    ## on a copy of the child list).
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    ## Move the parsed nodes from the temporary root element into the
    ## context node, adopting each into the original document first.
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
6625
6626 } # tree construction stage
6627
## |Whatpm::HTML::RestartParser| inherits from |Error|.
## NOTE(review): presumably this is the exception used to make the
## parser restart -- confirm against the throw sites, which are not
## part of this section of the file.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

## True value returned when the file is loaded.
1;
6632 # $Date: 2008/03/09 07:57:29 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24