/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.110 - (show annotations) (download) (as text)
Tue Mar 11 01:23:50 2008 UTC (16 years, 7 months ago) by wakaba
Branch: MAIN
Changes since 1.109: +37 -49 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	11 Mar 2008 01:23:47 -0000
	* HTML.pm.src: Similar codes are merged together, again.

2008-03-11  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.109 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12 ## TODO: 1252 parse error (revision 1264)
13 ## TODO: 8859-11 = 874 (revision 1271)
14
## Start tags on which a "/" immediately followed by ">" (e.g. |<br/>|)
## is accepted silently.  The tokenizer looks the current tag name up
## here when it meets such a slash; for any other tag name it reports
## a |nestc| parse error.
my $permitted_slash_tag_name = {
  map { $_ => 1 } qw(
    base link meta hr br img embed param area col input
  )
};
28
## Replacement code point for each code point in the range 0x80..0x9F.
## Entries with no assigned replacement map to U+FFFD REPLACEMENT
## CHARACTER.  (The values appear to follow the windows-1252 layout;
## the code that consults this table is outside this part of the file.)
my $c1_entity_char = do {
  my %replacement;
  @replacement{0x80 .. 0x9F} = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  \%replacement;
}; # $c1_entity_char
63
## Element name tables, one per element category ("special", "scoping"
## and "formatting").  A name that appears in none of them belongs to
## the "phrasing" category.  Every value is 1 so that a table can be
## used as a simple truth lookup by element name.  (The code that
## consults these tables lies outside this part of the file.)
my $special_category = {
  address => 1, area => 1, base => 1, basefont => 1, bgsound => 1,
  blockquote => 1, body => 1, br => 1, center => 1, col => 1, colgroup => 1,
  dd => 1, dir => 1, div => 1, dl => 1, dt => 1, embed => 1, fieldset => 1,
  form => 1, frame => 1, frameset => 1, h1 => 1, h2 => 1, h3 => 1,
  h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, iframe => 1, image => 1,
  img => 1, input => 1, isindex => 1, li => 1, link => 1, listing => 1,
  menu => 1, meta => 1, noembed => 1, noframes => 1, noscript => 1,
  ol => 1, optgroup => 1, option => 1, p => 1, param => 1, plaintext => 1,
  pre => 1, script => 1, select => 1, spacer => 1, style => 1, tbody => 1,
  textarea => 1, tfoot => 1, thead => 1, title => 1, tr => 1, ul => 1, wbr => 1,
};
my $scoping_category = {
  applet => 1, button => 1, caption => 1, html => 1, marquee => 1, object => 1,
  table => 1, td => 1, th => 1,
};
## NOTE: The key is |strike|; it used to be misspelled |strile|, which
## left the |strike| element out of the formatting category.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
# $phrasing_category: all other elements
85
## Parse a string of bytes.
##
## Arguments:
##   invocant - a parser object, or a class name (a parser is then
##              created with |new|)
##   charset  - encoding label used to decode the bytes; when it is
##              |undef| the encoding is guessed from the first 1024
##              bytes, with |windows-1252| as the fallback
##   bytes    - the byte string, or a reference to it
##   others   - handed to |parse_char_string| unchanged, after the
##              decoded string
##
## Returns whatever |parse_char_string| returns.
##
## If the |change_encoding| callback installed here is invoked with a
## label different from the encoding in use, it throws
## |Whatpm::HTML::RestartParser|; the bytes are then decoded again with
## the new encoding and parsed a second time.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## |Encode::decode| is called on every path below (both branches and
  ## the restart handler), so the module is loaded once, up front,
  ## rather than only when a charset is supplied.
  require Encode; ## TODO: decode(utf8) don't delete BOM

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 1;
  } else {
    ## TODO: Implement HTML5 detection algorithm
    require Whatpm::Charset::UniversalCharDet;
    $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
        (substr ($$bytes_s, 0, 1024));
    $charset ||= 'windows-1252';
    $s = \ (Encode::decode ($charset, $$bytes_s));
    ## Lowercased like in the branch above: |change_encoding| compares
    ## this value with a lowercased label using |eq|.
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 0;
  }

  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    if (defined $self->{input_encoding} and
        $self->{input_encoding} eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
        ':'.$charset, level => 'w');

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    throw Whatpm::HTML::RestartParser (charset => $charset);
  }; # $self->{change_encoding}

  ## What remains in |@_| is the byte string followed by the extra
  ## arguments; drop the byte string, the decoded one is passed instead.
  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Restart: decode the same bytes with the newly requested charset.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: normalize
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
155
156 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
157 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
158 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
159 ## because the core part of our HTML parser expects a string of characters,
160 ## not a string of bytes or code units or anything which might contain a BOM.
161 ## Therefore, any parser interface that accepts a string of bytes,
162 ## such as |parse_byte_string| in this module, must ensure that it does
163 ## strip the BOM and never strips any ZWNBSP.
164
## |parse_char_string| is another name for |parse_string|.
165 *parse_char_string = \&parse_string;
166
## Parse a string of characters into a document.
##
## Arguments:
##   invocant - a parser object, or a class name (a parser is then
##              created with |new|)
##   string   - the character string, or a reference to it
##   document - the document object to fill; its |child_nodes| list is
##              emptied first
##   onerror  - optional error callback; it receives the error's
##              key/value pairs followed by |line| and |column|.  The
##              default callback prints a warning.
##
## Returns the document object.
167 sub parse_string ($$$;$) {
168 my $self = ref $_[0] ? shift : shift->new;
169 my $s = ref $_[0] ? $_[0] : \($_[0]);
170 $self->{document} = $_[1];
171 @{$self->{document}->child_nodes} = ();
172
173 ## NOTE: |set_inner_html| copies most of this method's code
174
## |confident| is only defaulted here, so a value already stored on the
## object (as |parse_byte_string| does) is kept.
175 $self->{confident} = 1 unless exists $self->{confident};
176 $self->{document}->input_encoding ($self->{input_encoding})
177 if defined $self->{input_encoding};
178
## Read position in the string, and the line/column that error reports
## carry.  All three are shared with the closures defined below.
179 my $i = 0;
180 my $line = 1;
181 my $column = 0;
## Input callback: makes the next code point of the string available in
## |$self->{next_char}|, or -1 once the end of the string is reached.
182 $self->{set_next_char} = sub {
183 my $self = shift;
184
## |prev_char| keeps the three most recent values of |next_char|,
## newest first.
185 pop @{$self->{prev_char}};
186 unshift @{$self->{prev_char}}, $self->{next_char};
187
## NOTE: |=| binds tighter than |and|, and -1 is a true value, so this
## line stores -1 and then returns whenever the input is exhausted.
188 $self->{next_char} = -1 and return if $i >= length $$s;
189 $self->{next_char} = ord substr $$s, $i++, 1;
190 $column++;
191
## A line feed starts a new line.  A carriage return is reported as a
## line feed, and a line feed directly after it is skipped.  A code
## point above 0x10FFFF, or a NULL (after a parse error), is replaced
## by U+FFFD.
192 if ($self->{next_char} == 0x000A) { # LF
193 $line++;
194 $column = 0;
195 } elsif ($self->{next_char} == 0x000D) { # CR
196 $i++ if substr ($$s, $i, 1) eq "\x0A";
197 $self->{next_char} = 0x000A; # LF # MUST
198 $line++;
199 $column = 0;
200 } elsif ($self->{next_char} > 0x10FFFF) {
201 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
202 } elsif ($self->{next_char} == 0x0000) { # NULL
203 !!!parse-error (type => 'NULL');
204 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
205 }
206 };
## Nothing has been read yet: -1 everywhere.
207 $self->{prev_char} = [-1, -1, -1];
208 $self->{next_char} = -1;
209
## Error reporting: use the caller's callback if one was given,
## otherwise print a warning.
210 my $onerror = $_[2] || sub {
211 my (%opt) = @_;
212 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
213 };
## The parser reports errors through |parse_error|, which appends the
## current line and column before calling the error callback.
214 $self->{parse_error} = sub {
215 $onerror->(@_, line => $line, column => $column);
216 };
217
## Run the parse: set up the tokenizer and the tree constructor, build
## the tree, then shut the tree constructor down.
218 $self->_initialize_tokenizer;
219 $self->_initialize_tree_constructor;
220 $self->_construct_tree;
221 $self->_terminate_tree_constructor;
222
223 return $self->{document};
224 } # parse_string
225
## Create a parser object.
##
## The object starts out with placeholder callbacks:
##   set_next_char               - reports "end of input" (-1)
##   parse_error                 - does nothing
##   change_encoding             - does nothing
##   application_cache_selection - does nothing
##
## Returns the new object, blessed into the invocant class.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The |set_next_char| placeholder has to write to the parser object.
  ## A closure over |$self| stored inside |$self| would form a reference
  ## cycle, and the object would then never be freed.  The closure
  ## therefore captures a weakened copy of the reference.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_char} = sub {
    $weak_self->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
246
## Content model flags: which kinds of markup are recognized in data.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## The four content models, expressed as combinations of the flags.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
255
## Tokenizer states.  |$self->{state}| holds one of these numbers.
## NOTE: |AFTER_ATTRIBUTE_VALUE_QUOTED_STATE| carries the last number
## (33) although it belongs with the attribute value states.
use constant {
  DATA_STATE                                    => 0,
  ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
};
290
## Token types.  The |type| member of a token holds one of these.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
297
## Insertion mode group flags (|*_IMS|).  Each one is a single bit;
## the insertion modes below are built from them.
use constant {
  AFTER_HTML_IMS => 0b100,
  HEAD_IMS       => 0b1000,
  BODY_IMS       => 0b10000,
  BODY_TABLE_IMS => 0b100000,
  TABLE_IMS      => 0b1000000,
  ROW_IMS        => 0b10000000,
  BODY_AFTER_IMS => 0b100000000,
  FRAME_IMS      => 0b1000000000,
  SELECT_IMS     => 0b10000000000,
};

## NOTE: "initial" and "before html" insertion modes have no constants.

## Insertion modes (|*_IM|): group flags combined, where several modes
## share the same groups, with a number in the two lowest bits.
use constant {
  ## NOTE: "after after body" insertion mode.
  AFTER_HTML_BODY_IM     => AFTER_HTML_IMS | BODY_AFTER_IMS,

  ## NOTE: "after after frameset" insertion mode.
  AFTER_HTML_FRAMESET_IM => AFTER_HTML_IMS | FRAME_IMS,

  IN_HEAD_IM             => HEAD_IMS | 0b00,
  IN_HEAD_NOSCRIPT_IM    => HEAD_IMS | 0b01,
  AFTER_HEAD_IM          => HEAD_IMS | 0b10,
  BEFORE_HEAD_IM         => HEAD_IMS | 0b11,
  IN_BODY_IM             => BODY_IMS,
  IN_CELL_IM             => BODY_IMS | BODY_TABLE_IMS | 0b01,
  IN_CAPTION_IM          => BODY_IMS | BODY_TABLE_IMS | 0b10,
  IN_ROW_IM              => TABLE_IMS | ROW_IMS | 0b01,
  IN_TABLE_BODY_IM       => TABLE_IMS | ROW_IMS | 0b10,
  IN_TABLE_IM            => TABLE_IMS,
  AFTER_BODY_IM          => BODY_AFTER_IMS,
  IN_FRAMESET_IM         => FRAME_IMS | 0b01,
  AFTER_FRAMESET_IM      => FRAME_IMS | 0b10,
  IN_SELECT_IM           => SELECT_IMS | 0b01,
  IN_SELECT_IN_TABLE_IM  => SELECT_IMS | 0b10,
  IN_COLUMN_GROUP_IM     => 0b10,
};
332
333 ## Implementations MUST act as if they used the state machine in the spec
334
## Put the tokenizer into its starting condition: data state, PCDATA
## content model, no token or attribute under construction, no record
## of an emitted start tag, and empty |char| and |token| lists.
## |$self->{token}| is the list from which |_get_next_token| hands out
## tokens before it tokenizes any further input.
335 sub _initialize_tokenizer ($) {
336 my $self = shift;
337 $self->{state} = DATA_STATE; # MUST
338 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
339 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
340 undef $self->{current_attribute};
341 undef $self->{last_emitted_start_tag_name};
342 undef $self->{last_attribute_value_state};
343 $self->{char} = [];
344 # $self->{next_char}
## |!!!next-input-character| is a macro expanded when the module is
## built from this source.  NOTE(review): it presumably invokes the
## |set_next_char| callback to load the first input character; confirm
## against the macro definition.
345 !!!next-input-character;
346 $self->{token} = [];
347 # $self->{escape}
348 } # _initialize_tokenizer
349
350 ## A token has:
351 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
352 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
353 ## ->{name} (DOCTYPE_TOKEN)
354 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
355 ## ->{public_identifier} (DOCTYPE_TOKEN)
356 ## ->{system_identifier} (DOCTYPE_TOKEN)
357 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
358 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
359 ## ->{name}
360 ## ->{value}
361 ## ->{has_reference} == 1 or 0
362 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
363
364 ## Emitted token MUST immediately be handled by the tree construction state.
365
366 ## Before each step, UA MAY check to see if either one of the scripts in
367 ## "list of scripts that will execute as soon as possible" or the first
368 ## script in the "list of scripts that will execute asynchronously",
369 ## has completed loading. If one has, then it MUST be executed
370 ## and removed from the list.
371
372 ## NOTE: HTML5 "Writing HTML documents" section, applied to
373 ## documents and not to user agents and conformance checkers,
374 ## contains some requirements that are not detected by the
375 ## parsing algorithm:
376 ## - Some requirements on character encoding declarations. ## TODO
377 ## - "Elements MUST NOT contain content that their content model disallows."
378 ## ... Some are parse error, some are not (will be reported by c.c.).
379 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
380 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
381 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
382
383 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
384 ## be detected by the HTML5 parsing algorithm:
385 ## - Text,
386
387 sub _get_next_token ($) {
388 my $self = shift;
389 if (@{$self->{token}}) {
390 return shift @{$self->{token}};
391 }
392
393 A: {
394 if ($self->{state} == DATA_STATE) {
395 if ($self->{next_char} == 0x0026) { # &
396 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
397 not $self->{escape}) {
398 !!!cp (1);
399 $self->{state} = ENTITY_DATA_STATE;
400 !!!next-input-character;
401 redo A;
402 } else {
403 !!!cp (2);
404 #
405 }
406 } elsif ($self->{next_char} == 0x002D) { # -
407 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
408 unless ($self->{escape}) {
409 if ($self->{prev_char}->[0] == 0x002D and # -
410 $self->{prev_char}->[1] == 0x0021 and # !
411 $self->{prev_char}->[2] == 0x003C) { # <
412 !!!cp (3);
413 $self->{escape} = 1;
414 } else {
415 !!!cp (4);
416 }
417 } else {
418 !!!cp (5);
419 }
420 }
421
422 #
423 } elsif ($self->{next_char} == 0x003C) { # <
424 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
425 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
426 not $self->{escape})) {
427 !!!cp (6);
428 $self->{state} = TAG_OPEN_STATE;
429 !!!next-input-character;
430 redo A;
431 } else {
432 !!!cp (7);
433 #
434 }
435 } elsif ($self->{next_char} == 0x003E) { # >
436 if ($self->{escape} and
437 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
438 if ($self->{prev_char}->[0] == 0x002D and # -
439 $self->{prev_char}->[1] == 0x002D) { # -
440 !!!cp (8);
441 delete $self->{escape};
442 } else {
443 !!!cp (9);
444 }
445 } else {
446 !!!cp (10);
447 }
448
449 #
450 } elsif ($self->{next_char} == -1) {
451 !!!cp (11);
452 !!!emit ({type => END_OF_FILE_TOKEN});
453 last A; ## TODO: ok?
454 } else {
455 !!!cp (12);
456 }
457 # Anything else
458 my $token = {type => CHARACTER_TOKEN,
459 data => chr $self->{next_char}};
460 ## Stay in the data state
461 !!!next-input-character;
462
463 !!!emit ($token);
464
465 redo A;
466 } elsif ($self->{state} == ENTITY_DATA_STATE) {
467 ## (cannot happen in CDATA state)
468
469 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
470
471 $self->{state} = DATA_STATE;
472 # next-input-character is already done
473
474 unless (defined $token) {
475 !!!cp (13);
476 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
477 } else {
478 !!!cp (14);
479 !!!emit ($token);
480 }
481
482 redo A;
483 } elsif ($self->{state} == TAG_OPEN_STATE) {
484 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
485 if ($self->{next_char} == 0x002F) { # /
486 !!!cp (15);
487 !!!next-input-character;
488 $self->{state} = CLOSE_TAG_OPEN_STATE;
489 redo A;
490 } else {
491 !!!cp (16);
492 ## reconsume
493 $self->{state} = DATA_STATE;
494
495 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
496
497 redo A;
498 }
499 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
500 if ($self->{next_char} == 0x0021) { # !
501 !!!cp (17);
502 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
503 !!!next-input-character;
504 redo A;
505 } elsif ($self->{next_char} == 0x002F) { # /
506 !!!cp (18);
507 $self->{state} = CLOSE_TAG_OPEN_STATE;
508 !!!next-input-character;
509 redo A;
510 } elsif (0x0041 <= $self->{next_char} and
511 $self->{next_char} <= 0x005A) { # A..Z
512 !!!cp (19);
513 $self->{current_token}
514 = {type => START_TAG_TOKEN,
515 tag_name => chr ($self->{next_char} + 0x0020)};
516 $self->{state} = TAG_NAME_STATE;
517 !!!next-input-character;
518 redo A;
519 } elsif (0x0061 <= $self->{next_char} and
520 $self->{next_char} <= 0x007A) { # a..z
521 !!!cp (20);
522 $self->{current_token} = {type => START_TAG_TOKEN,
523 tag_name => chr ($self->{next_char})};
524 $self->{state} = TAG_NAME_STATE;
525 !!!next-input-character;
526 redo A;
527 } elsif ($self->{next_char} == 0x003E) { # >
528 !!!cp (21);
529 !!!parse-error (type => 'empty start tag');
530 $self->{state} = DATA_STATE;
531 !!!next-input-character;
532
533 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
534
535 redo A;
536 } elsif ($self->{next_char} == 0x003F) { # ?
537 !!!cp (22);
538 !!!parse-error (type => 'pio');
539 $self->{state} = BOGUS_COMMENT_STATE;
540 ## $self->{next_char} is intentionally left as is
541 redo A;
542 } else {
543 !!!cp (23);
544 !!!parse-error (type => 'bare stago');
545 $self->{state} = DATA_STATE;
546 ## reconsume
547
548 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
549
550 redo A;
551 }
552 } else {
553 die "$0: $self->{content_model} in tag open";
554 }
555 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
556 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
557 if (defined $self->{last_emitted_start_tag_name}) {
558 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
559 my @next_char;
560 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
561 push @next_char, $self->{next_char};
562 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
563 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
564 if ($self->{next_char} == $c or $self->{next_char} == $C) {
565 !!!cp (24);
566 !!!next-input-character;
567 next TAGNAME;
568 } else {
569 !!!cp (25);
570 $self->{next_char} = shift @next_char; # reconsume
571 !!!back-next-input-character (@next_char);
572 $self->{state} = DATA_STATE;
573
574 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
575
576 redo A;
577 }
578 }
579 push @next_char, $self->{next_char};
580
581 unless ($self->{next_char} == 0x0009 or # HT
582 $self->{next_char} == 0x000A or # LF
583 $self->{next_char} == 0x000B or # VT
584 $self->{next_char} == 0x000C or # FF
585 $self->{next_char} == 0x0020 or # SP
586 $self->{next_char} == 0x003E or # >
587 $self->{next_char} == 0x002F or # /
588 $self->{next_char} == -1) {
589 !!!cp (26);
590 $self->{next_char} = shift @next_char; # reconsume
591 !!!back-next-input-character (@next_char);
592 $self->{state} = DATA_STATE;
593 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
594 redo A;
595 } else {
596 !!!cp (27);
597 $self->{next_char} = shift @next_char;
598 !!!back-next-input-character (@next_char);
599 # and consume...
600 }
601 } else {
602 ## No start tag token has ever been emitted
603 !!!cp (28);
604 # next-input-character is already done
605 $self->{state} = DATA_STATE;
606 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
607 redo A;
608 }
609 }
610
611 if (0x0041 <= $self->{next_char} and
612 $self->{next_char} <= 0x005A) { # A..Z
613 !!!cp (29);
614 $self->{current_token} = {type => END_TAG_TOKEN,
615 tag_name => chr ($self->{next_char} + 0x0020)};
616 $self->{state} = TAG_NAME_STATE;
617 !!!next-input-character;
618 redo A;
619 } elsif (0x0061 <= $self->{next_char} and
620 $self->{next_char} <= 0x007A) { # a..z
621 !!!cp (30);
622 $self->{current_token} = {type => END_TAG_TOKEN,
623 tag_name => chr ($self->{next_char})};
624 $self->{state} = TAG_NAME_STATE;
625 !!!next-input-character;
626 redo A;
627 } elsif ($self->{next_char} == 0x003E) { # >
628 !!!cp (31);
629 !!!parse-error (type => 'empty end tag');
630 $self->{state} = DATA_STATE;
631 !!!next-input-character;
632 redo A;
633 } elsif ($self->{next_char} == -1) {
634 !!!cp (32);
635 !!!parse-error (type => 'bare etago');
636 $self->{state} = DATA_STATE;
637 # reconsume
638
639 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
640
641 redo A;
642 } else {
643 !!!cp (33);
644 !!!parse-error (type => 'bogus end tag');
645 $self->{state} = BOGUS_COMMENT_STATE;
646 ## $self->{next_char} is intentionally left as is
647 redo A;
648 }
649 } elsif ($self->{state} == TAG_NAME_STATE) {
650 if ($self->{next_char} == 0x0009 or # HT
651 $self->{next_char} == 0x000A or # LF
652 $self->{next_char} == 0x000B or # VT
653 $self->{next_char} == 0x000C or # FF
654 $self->{next_char} == 0x0020) { # SP
655 !!!cp (34);
656 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
657 !!!next-input-character;
658 redo A;
659 } elsif ($self->{next_char} == 0x003E) { # >
660 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
661 !!!cp (35);
662 $self->{current_token}->{first_start_tag}
663 = not defined $self->{last_emitted_start_tag_name};
664 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
665 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
666 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
667 #if ($self->{current_token}->{attributes}) {
668 # ## NOTE: This should never be reached.
669 # !!! cp (36);
670 # !!! parse-error (type => 'end tag attribute');
671 #} else {
672 !!!cp (37);
673 #}
674 } else {
675 die "$0: $self->{current_token}->{type}: Unknown token type";
676 }
677 $self->{state} = DATA_STATE;
678 !!!next-input-character;
679
680 !!!emit ($self->{current_token}); # start tag or end tag
681
682 redo A;
683 } elsif (0x0041 <= $self->{next_char} and
684 $self->{next_char} <= 0x005A) { # A..Z
685 !!!cp (38);
686 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
687 # start tag or end tag
688 ## Stay in this state
689 !!!next-input-character;
690 redo A;
691 } elsif ($self->{next_char} == -1) {
692 !!!parse-error (type => 'unclosed tag');
693 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
694 !!!cp (39);
695 $self->{current_token}->{first_start_tag}
696 = not defined $self->{last_emitted_start_tag_name};
697 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
698 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
699 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
700 #if ($self->{current_token}->{attributes}) {
701 # ## NOTE: This state should never be reached.
702 # !!! cp (40);
703 # !!! parse-error (type => 'end tag attribute');
704 #} else {
705 !!!cp (41);
706 #}
707 } else {
708 die "$0: $self->{current_token}->{type}: Unknown token type";
709 }
710 $self->{state} = DATA_STATE;
711 # reconsume
712
713 !!!emit ($self->{current_token}); # start tag or end tag
714
715 redo A;
716 } elsif ($self->{next_char} == 0x002F) { # /
717 !!!next-input-character;
718 if ($self->{next_char} == 0x003E and # >
719 $self->{current_token}->{type} == START_TAG_TOKEN and
720 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
721 # permitted slash
722 !!!cp (42);
723 #
724 } else {
725 !!!cp (43);
726 !!!parse-error (type => 'nestc');
727 }
728 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
729 # next-input-character is already done
730 redo A;
731 } else {
732 !!!cp (44);
733 $self->{current_token}->{tag_name} .= chr $self->{next_char};
734 # start tag or end tag
735 ## Stay in the state
736 !!!next-input-character;
737 redo A;
738 }
739 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
740 if ($self->{next_char} == 0x0009 or # HT
741 $self->{next_char} == 0x000A or # LF
742 $self->{next_char} == 0x000B or # VT
743 $self->{next_char} == 0x000C or # FF
744 $self->{next_char} == 0x0020) { # SP
745 !!!cp (45);
746 ## Stay in the state
747 !!!next-input-character;
748 redo A;
749 } elsif ($self->{next_char} == 0x003E) { # >
750 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
751 !!!cp (46);
752 $self->{current_token}->{first_start_tag}
753 = not defined $self->{last_emitted_start_tag_name};
754 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
755 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
756 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
757 if ($self->{current_token}->{attributes}) {
758 !!!cp (47);
759 !!!parse-error (type => 'end tag attribute');
760 } else {
761 !!!cp (48);
762 }
763 } else {
764 die "$0: $self->{current_token}->{type}: Unknown token type";
765 }
766 $self->{state} = DATA_STATE;
767 !!!next-input-character;
768
769 !!!emit ($self->{current_token}); # start tag or end tag
770
771 redo A;
772 } elsif (0x0041 <= $self->{next_char} and
773 $self->{next_char} <= 0x005A) { # A..Z
774 !!!cp (49);
775 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
776 value => ''};
777 $self->{state} = ATTRIBUTE_NAME_STATE;
778 !!!next-input-character;
779 redo A;
780 } elsif ($self->{next_char} == 0x002F) { # /
781 !!!next-input-character;
782 if ($self->{next_char} == 0x003E and # >
783 $self->{current_token}->{type} == START_TAG_TOKEN and
784 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
785 # permitted slash
786 !!!cp (50);
787 #
788 } else {
789 !!!cp (51);
790 !!!parse-error (type => 'nestc');
791 }
792 ## Stay in the state
793 # next-input-character is already done
794 redo A;
795 } elsif ($self->{next_char} == -1) {
796 !!!parse-error (type => 'unclosed tag');
797 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
798 !!!cp (52);
799 $self->{current_token}->{first_start_tag}
800 = not defined $self->{last_emitted_start_tag_name};
801 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
802 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
803 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
804 if ($self->{current_token}->{attributes}) {
805 !!!cp (53);
806 !!!parse-error (type => 'end tag attribute');
807 } else {
808 !!!cp (54);
809 }
810 } else {
811 die "$0: $self->{current_token}->{type}: Unknown token type";
812 }
813 $self->{state} = DATA_STATE;
814 # reconsume
815
816 !!!emit ($self->{current_token}); # start tag or end tag
817
818 redo A;
819 } else {
820 if ({
821 0x0022 => 1, # "
822 0x0027 => 1, # '
823 0x003D => 1, # =
824 }->{$self->{next_char}}) {
825 !!!cp (55);
826 !!!parse-error (type => 'bad attribute name');
827 } else {
828 !!!cp (56);
829 }
830 $self->{current_attribute} = {name => chr ($self->{next_char}),
831 value => ''};
832 $self->{state} = ATTRIBUTE_NAME_STATE;
833 !!!next-input-character;
834 redo A;
835 }
836 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
837 my $before_leave = sub {
838 if (exists $self->{current_token}->{attributes} # start tag or end tag
839 ->{$self->{current_attribute}->{name}}) { # MUST
840 !!!cp (57);
841 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
842 ## Discard $self->{current_attribute} # MUST
843 } else {
844 !!!cp (58);
845 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
846 = $self->{current_attribute};
847 }
848 }; # $before_leave
849
850 if ($self->{next_char} == 0x0009 or # HT
851 $self->{next_char} == 0x000A or # LF
852 $self->{next_char} == 0x000B or # VT
853 $self->{next_char} == 0x000C or # FF
854 $self->{next_char} == 0x0020) { # SP
855 !!!cp (59);
856 $before_leave->();
857 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
858 !!!next-input-character;
859 redo A;
860 } elsif ($self->{next_char} == 0x003D) { # =
861 !!!cp (60);
862 $before_leave->();
863 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
864 !!!next-input-character;
865 redo A;
866 } elsif ($self->{next_char} == 0x003E) { # >
867 $before_leave->();
868 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
869 !!!cp (61);
870 $self->{current_token}->{first_start_tag}
871 = not defined $self->{last_emitted_start_tag_name};
872 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
873 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
874 !!!cp (62);
875 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
876 if ($self->{current_token}->{attributes}) {
877 !!!parse-error (type => 'end tag attribute');
878 }
879 } else {
880 die "$0: $self->{current_token}->{type}: Unknown token type";
881 }
882 $self->{state} = DATA_STATE;
883 !!!next-input-character;
884
885 !!!emit ($self->{current_token}); # start tag or end tag
886
887 redo A;
888 } elsif (0x0041 <= $self->{next_char} and
889 $self->{next_char} <= 0x005A) { # A..Z
890 !!!cp (63);
891 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
892 ## Stay in the state
893 !!!next-input-character;
894 redo A;
895 } elsif ($self->{next_char} == 0x002F) { # /
896 $before_leave->();
897 !!!next-input-character;
898 if ($self->{next_char} == 0x003E and # >
899 $self->{current_token}->{type} == START_TAG_TOKEN and
900 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
901 # permitted slash
902 !!!cp (64);
903 #
904 } else {
905 !!!cp (65);
906 !!!parse-error (type => 'nestc');
907 }
908 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
909 # next-input-character is already done
910 redo A;
911 } elsif ($self->{next_char} == -1) {
912 !!!parse-error (type => 'unclosed tag');
913 $before_leave->();
914 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
915 !!!cp (66);
916 $self->{current_token}->{first_start_tag}
917 = not defined $self->{last_emitted_start_tag_name};
918 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
919 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
920 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
921 if ($self->{current_token}->{attributes}) {
922 !!!cp (67);
923 !!!parse-error (type => 'end tag attribute');
924 } else {
925 ## NOTE: This state should never be reached.
926 !!!cp (68);
927 }
928 } else {
929 die "$0: $self->{current_token}->{type}: Unknown token type";
930 }
931 $self->{state} = DATA_STATE;
932 # reconsume
933
934 !!!emit ($self->{current_token}); # start tag or end tag
935
936 redo A;
937 } else {
938 if ($self->{next_char} == 0x0022 or # "
939 $self->{next_char} == 0x0027) { # '
940 !!!cp (69);
941 !!!parse-error (type => 'bad attribute name');
942 } else {
943 !!!cp (70);
944 }
945 $self->{current_attribute}->{name} .= chr ($self->{next_char});
946 ## Stay in the state
947 !!!next-input-character;
948 redo A;
949 }
950 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
951 if ($self->{next_char} == 0x0009 or # HT
952 $self->{next_char} == 0x000A or # LF
953 $self->{next_char} == 0x000B or # VT
954 $self->{next_char} == 0x000C or # FF
955 $self->{next_char} == 0x0020) { # SP
956 !!!cp (71);
957 ## Stay in the state
958 !!!next-input-character;
959 redo A;
960 } elsif ($self->{next_char} == 0x003D) { # =
961 !!!cp (72);
962 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
963 !!!next-input-character;
964 redo A;
965 } elsif ($self->{next_char} == 0x003E) { # >
966 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
967 !!!cp (73);
968 $self->{current_token}->{first_start_tag}
969 = not defined $self->{last_emitted_start_tag_name};
970 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
971 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
972 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
973 if ($self->{current_token}->{attributes}) {
974 !!!cp (74);
975 !!!parse-error (type => 'end tag attribute');
976 } else {
977 ## NOTE: This state should never be reached.
978 !!!cp (75);
979 }
980 } else {
981 die "$0: $self->{current_token}->{type}: Unknown token type";
982 }
983 $self->{state} = DATA_STATE;
984 !!!next-input-character;
985
986 !!!emit ($self->{current_token}); # start tag or end tag
987
988 redo A;
989 } elsif (0x0041 <= $self->{next_char} and
990 $self->{next_char} <= 0x005A) { # A..Z
991 !!!cp (76);
992 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
993 value => ''};
994 $self->{state} = ATTRIBUTE_NAME_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{next_char} == 0x002F) { # /
998 !!!next-input-character;
999 if ($self->{next_char} == 0x003E and # >
1000 $self->{current_token}->{type} == START_TAG_TOKEN and
1001 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1002 # permitted slash
1003 !!!cp (77);
1004 #
1005 } else {
1006 !!!cp (78);
1007 !!!parse-error (type => 'nestc');
1008 ## TODO: Different error type for <aa / bb> than <aa/>
1009 }
1010 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1011 # next-input-character is already done
1012 redo A;
1013 } elsif ($self->{next_char} == -1) {
1014 !!!parse-error (type => 'unclosed tag');
1015 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1016 !!!cp (79);
1017 $self->{current_token}->{first_start_tag}
1018 = not defined $self->{last_emitted_start_tag_name};
1019 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1020 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1021 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1022 if ($self->{current_token}->{attributes}) {
1023 !!!cp (80);
1024 !!!parse-error (type => 'end tag attribute');
1025 } else {
1026 ## NOTE: This state should never be reached.
1027 !!!cp (81);
1028 }
1029 } else {
1030 die "$0: $self->{current_token}->{type}: Unknown token type";
1031 }
1032 $self->{state} = DATA_STATE;
1033 # reconsume
1034
1035 !!!emit ($self->{current_token}); # start tag or end tag
1036
1037 redo A;
1038 } else {
1039 !!!cp (82);
1040 $self->{current_attribute} = {name => chr ($self->{next_char}),
1041 value => ''};
1042 $self->{state} = ATTRIBUTE_NAME_STATE;
1043 !!!next-input-character;
1044 redo A;
1045 }
1046 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1047 if ($self->{next_char} == 0x0009 or # HT
1048 $self->{next_char} == 0x000A or # LF
1049 $self->{next_char} == 0x000B or # VT
1050 $self->{next_char} == 0x000C or # FF
1051 $self->{next_char} == 0x0020) { # SP
1052 !!!cp (83);
1053 ## Stay in the state
1054 !!!next-input-character;
1055 redo A;
1056 } elsif ($self->{next_char} == 0x0022) { # "
1057 !!!cp (84);
1058 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1059 !!!next-input-character;
1060 redo A;
1061 } elsif ($self->{next_char} == 0x0026) { # &
1062 !!!cp (85);
1063 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1064 ## reconsume
1065 redo A;
1066 } elsif ($self->{next_char} == 0x0027) { # '
1067 !!!cp (86);
1068 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1069 !!!next-input-character;
1070 redo A;
1071 } elsif ($self->{next_char} == 0x003E) { # >
1072 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1073 !!!cp (87);
1074 $self->{current_token}->{first_start_tag}
1075 = not defined $self->{last_emitted_start_tag_name};
1076 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1077 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1078 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1079 if ($self->{current_token}->{attributes}) {
1080 !!!cp (88);
1081 !!!parse-error (type => 'end tag attribute');
1082 } else {
1083 ## NOTE: This state should never be reached.
1084 !!!cp (89);
1085 }
1086 } else {
1087 die "$0: $self->{current_token}->{type}: Unknown token type";
1088 }
1089 $self->{state} = DATA_STATE;
1090 !!!next-input-character;
1091
1092 !!!emit ($self->{current_token}); # start tag or end tag
1093
1094 redo A;
1095 } elsif ($self->{next_char} == -1) {
1096 !!!parse-error (type => 'unclosed tag');
1097 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1098 !!!cp (90);
1099 $self->{current_token}->{first_start_tag}
1100 = not defined $self->{last_emitted_start_tag_name};
1101 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1102 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1103 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1104 if ($self->{current_token}->{attributes}) {
1105 !!!cp (91);
1106 !!!parse-error (type => 'end tag attribute');
1107 } else {
1108 ## NOTE: This state should never be reached.
1109 !!!cp (92);
1110 }
1111 } else {
1112 die "$0: $self->{current_token}->{type}: Unknown token type";
1113 }
1114 $self->{state} = DATA_STATE;
1115 ## reconsume
1116
1117 !!!emit ($self->{current_token}); # start tag or end tag
1118
1119 redo A;
1120 } else {
1121 if ($self->{next_char} == 0x003D) { # =
1122 !!!cp (93);
1123 !!!parse-error (type => 'bad attribute value');
1124 } else {
1125 !!!cp (94);
1126 }
1127 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1128 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1129 !!!next-input-character;
1130 redo A;
1131 }
1132 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1133 if ($self->{next_char} == 0x0022) { # "
1134 !!!cp (95);
1135 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1136 !!!next-input-character;
1137 redo A;
1138 } elsif ($self->{next_char} == 0x0026) { # &
1139 !!!cp (96);
1140 $self->{last_attribute_value_state} = $self->{state};
1141 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1142 !!!next-input-character;
1143 redo A;
1144 } elsif ($self->{next_char} == -1) {
1145 !!!parse-error (type => 'unclosed attribute value');
1146 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1147 !!!cp (97);
1148 $self->{current_token}->{first_start_tag}
1149 = not defined $self->{last_emitted_start_tag_name};
1150 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1151 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1152 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1153 if ($self->{current_token}->{attributes}) {
1154 !!!cp (98);
1155 !!!parse-error (type => 'end tag attribute');
1156 } else {
1157 ## NOTE: This state should never be reached.
1158 !!!cp (99);
1159 }
1160 } else {
1161 die "$0: $self->{current_token}->{type}: Unknown token type";
1162 }
1163 $self->{state} = DATA_STATE;
1164 ## reconsume
1165
1166 !!!emit ($self->{current_token}); # start tag or end tag
1167
1168 redo A;
1169 } else {
1170 !!!cp (100);
1171 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1172 ## Stay in the state
1173 !!!next-input-character;
1174 redo A;
1175 }
1176 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1177 if ($self->{next_char} == 0x0027) { # '
1178 !!!cp (101);
1179 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1180 !!!next-input-character;
1181 redo A;
1182 } elsif ($self->{next_char} == 0x0026) { # &
1183 !!!cp (102);
1184 $self->{last_attribute_value_state} = $self->{state};
1185 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1186 !!!next-input-character;
1187 redo A;
1188 } elsif ($self->{next_char} == -1) {
1189 !!!parse-error (type => 'unclosed attribute value');
1190 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1191 !!!cp (103);
1192 $self->{current_token}->{first_start_tag}
1193 = not defined $self->{last_emitted_start_tag_name};
1194 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1195 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1196 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1197 if ($self->{current_token}->{attributes}) {
1198 !!!cp (104);
1199 !!!parse-error (type => 'end tag attribute');
1200 } else {
1201 ## NOTE: This state should never be reached.
1202 !!!cp (105);
1203 }
1204 } else {
1205 die "$0: $self->{current_token}->{type}: Unknown token type";
1206 }
1207 $self->{state} = DATA_STATE;
1208 ## reconsume
1209
1210 !!!emit ($self->{current_token}); # start tag or end tag
1211
1212 redo A;
1213 } else {
1214 !!!cp (106);
1215 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1216 ## Stay in the state
1217 !!!next-input-character;
1218 redo A;
1219 }
1220 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1221 if ($self->{next_char} == 0x0009 or # HT
1222 $self->{next_char} == 0x000A or # LF
1223 $self->{next_char} == 0x000B or # VT
1224 $self->{next_char} == 0x000C or # FF
1225 $self->{next_char} == 0x0020) { # SP
1226 !!!cp (107);
1227 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1228 !!!next-input-character;
1229 redo A;
1230 } elsif ($self->{next_char} == 0x0026) { # &
1231 !!!cp (108);
1232 $self->{last_attribute_value_state} = $self->{state};
1233 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1234 !!!next-input-character;
1235 redo A;
1236 } elsif ($self->{next_char} == 0x003E) { # >
1237 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1238 !!!cp (109);
1239 $self->{current_token}->{first_start_tag}
1240 = not defined $self->{last_emitted_start_tag_name};
1241 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1242 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1243 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1244 if ($self->{current_token}->{attributes}) {
1245 !!!cp (110);
1246 !!!parse-error (type => 'end tag attribute');
1247 } else {
1248 ## NOTE: This state should never be reached.
1249 !!!cp (111);
1250 }
1251 } else {
1252 die "$0: $self->{current_token}->{type}: Unknown token type";
1253 }
1254 $self->{state} = DATA_STATE;
1255 !!!next-input-character;
1256
1257 !!!emit ($self->{current_token}); # start tag or end tag
1258
1259 redo A;
1260 } elsif ($self->{next_char} == -1) {
1261 !!!parse-error (type => 'unclosed tag');
1262 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1263 !!!cp (112);
1264 $self->{current_token}->{first_start_tag}
1265 = not defined $self->{last_emitted_start_tag_name};
1266 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1267 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1268 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1269 if ($self->{current_token}->{attributes}) {
1270 !!!cp (113);
1271 !!!parse-error (type => 'end tag attribute');
1272 } else {
1273 ## NOTE: This state should never be reached.
1274 !!!cp (114);
1275 }
1276 } else {
1277 die "$0: $self->{current_token}->{type}: Unknown token type";
1278 }
1279 $self->{state} = DATA_STATE;
1280 ## reconsume
1281
1282 !!!emit ($self->{current_token}); # start tag or end tag
1283
1284 redo A;
1285 } else {
1286 if ({
1287 0x0022 => 1, # "
1288 0x0027 => 1, # '
1289 0x003D => 1, # =
1290 }->{$self->{next_char}}) {
1291 !!!cp (115);
1292 !!!parse-error (type => 'bad attribute value');
1293 } else {
1294 !!!cp (116);
1295 }
1296 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1297 ## Stay in the state
1298 !!!next-input-character;
1299 redo A;
1300 }
1301 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1302 my $token = $self->_tokenize_attempt_to_consume_an_entity
1303 (1,
1304 $self->{last_attribute_value_state}
1305 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1306 $self->{last_attribute_value_state}
1307 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1308 -1);
1309
1310 unless (defined $token) {
1311 !!!cp (117);
1312 $self->{current_attribute}->{value} .= '&';
1313 } else {
1314 !!!cp (118);
1315 $self->{current_attribute}->{value} .= $token->{data};
1316 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1317 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1318 }
1319
1320 $self->{state} = $self->{last_attribute_value_state};
1321 # next-input-character is already done
1322 redo A;
1323 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1324 if ($self->{next_char} == 0x0009 or # HT
1325 $self->{next_char} == 0x000A or # LF
1326 $self->{next_char} == 0x000B or # VT
1327 $self->{next_char} == 0x000C or # FF
1328 $self->{next_char} == 0x0020) { # SP
1329 !!!cp (118);
1330 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1331 !!!next-input-character;
1332 redo A;
1333 } elsif ($self->{next_char} == 0x003E) { # >
1334 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1335 !!!cp (119);
1336 $self->{current_token}->{first_start_tag}
1337 = not defined $self->{last_emitted_start_tag_name};
1338 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1339 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1340 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1341 if ($self->{current_token}->{attributes}) {
1342 !!!cp (120);
1343 !!!parse-error (type => 'end tag attribute');
1344 } else {
1345 ## NOTE: This state should never be reached.
1346 !!!cp (121);
1347 }
1348 } else {
1349 die "$0: $self->{current_token}->{type}: Unknown token type";
1350 }
1351 $self->{state} = DATA_STATE;
1352 !!!next-input-character;
1353
1354 !!!emit ($self->{current_token}); # start tag or end tag
1355
1356 redo A;
1357 } elsif ($self->{next_char} == 0x002F) { # /
1358 !!!next-input-character;
1359 if ($self->{next_char} == 0x003E and # >
1360 $self->{current_token}->{type} == START_TAG_TOKEN and
1361 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1362 # permitted slash
1363 !!!cp (122);
1364 #
1365 } else {
1366 !!!cp (123);
1367 !!!parse-error (type => 'nestc');
1368 }
1369 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1370 # next-input-character is already done
1371 redo A;
1372 } else {
1373 !!!cp (124);
1374 !!!parse-error (type => 'no space between attributes');
1375 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1376 ## reconsume
1377 redo A;
1378 }
1379 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1380 ## (only happens in the PCDATA state)
1381
1382 my $token = {type => COMMENT_TOKEN, data => ''};
1383
1384 BC: {
1385 if ($self->{next_char} == 0x003E) { # >
1386 !!!cp (124);
1387 $self->{state} = DATA_STATE;
1388 !!!next-input-character;
1389
1390 !!!emit ($token);
1391
1392 redo A;
1393 } elsif ($self->{next_char} == -1) {
1394 !!!cp (125);
1395 $self->{state} = DATA_STATE;
1396 ## reconsume
1397
1398 !!!emit ($token);
1399
1400 redo A;
1401 } else {
1402 !!!cp (126);
1403 $token->{data} .= chr ($self->{next_char});
1404 !!!next-input-character;
1405 redo BC;
1406 }
1407 } # BC
1408
1409 die "$0: _get_next_token: unexpected case [BC]";
1410 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1411 ## (only happens in the PCDATA state)
1412
1413 my @next_char;
1414 push @next_char, $self->{next_char};
1415
1416 if ($self->{next_char} == 0x002D) { # -
1417 !!!next-input-character;
1418 push @next_char, $self->{next_char};
1419 if ($self->{next_char} == 0x002D) { # -
1420 !!!cp (127);
1421 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1422 $self->{state} = COMMENT_START_STATE;
1423 !!!next-input-character;
1424 redo A;
1425 } else {
1426 !!!cp (128);
1427 }
1428 } elsif ($self->{next_char} == 0x0044 or # D
1429 $self->{next_char} == 0x0064) { # d
1430 !!!next-input-character;
1431 push @next_char, $self->{next_char};
1432 if ($self->{next_char} == 0x004F or # O
1433 $self->{next_char} == 0x006F) { # o
1434 !!!next-input-character;
1435 push @next_char, $self->{next_char};
1436 if ($self->{next_char} == 0x0043 or # C
1437 $self->{next_char} == 0x0063) { # c
1438 !!!next-input-character;
1439 push @next_char, $self->{next_char};
1440 if ($self->{next_char} == 0x0054 or # T
1441 $self->{next_char} == 0x0074) { # t
1442 !!!next-input-character;
1443 push @next_char, $self->{next_char};
1444 if ($self->{next_char} == 0x0059 or # Y
1445 $self->{next_char} == 0x0079) { # y
1446 !!!next-input-character;
1447 push @next_char, $self->{next_char};
1448 if ($self->{next_char} == 0x0050 or # P
1449 $self->{next_char} == 0x0070) { # p
1450 !!!next-input-character;
1451 push @next_char, $self->{next_char};
1452 if ($self->{next_char} == 0x0045 or # E
1453 $self->{next_char} == 0x0065) { # e
1454 !!!cp (129);
1455 ## TODO: Replace this nested letter-by-letter match of "DOCTYPE" with a loop or table-driven lookahead.
1456 $self->{state} = DOCTYPE_STATE;
1457 !!!next-input-character;
1458 redo A;
1459 } else {
1460 !!!cp (130);
1461 }
1462 } else {
1463 !!!cp (131);
1464 }
1465 } else {
1466 !!!cp (132);
1467 }
1468 } else {
1469 !!!cp (133);
1470 }
1471 } else {
1472 !!!cp (134);
1473 }
1474 } else {
1475 !!!cp (135);
1476 }
1477 } else {
1478 !!!cp (136);
1479 }
1480
1481 !!!parse-error (type => 'bogus comment');
1482 $self->{next_char} = shift @next_char;
1483 !!!back-next-input-character (@next_char);
1484 $self->{state} = BOGUS_COMMENT_STATE;
1485 redo A;
1486
1487 ## ISSUE: typos in spec: chacacters, is is a parse error
1488 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1489 } elsif ($self->{state} == COMMENT_START_STATE) {
1490 if ($self->{next_char} == 0x002D) { # -
1491 !!!cp (137);
1492 $self->{state} = COMMENT_START_DASH_STATE;
1493 !!!next-input-character;
1494 redo A;
1495 } elsif ($self->{next_char} == 0x003E) { # >
1496 !!!cp (138);
1497 !!!parse-error (type => 'bogus comment');
1498 $self->{state} = DATA_STATE;
1499 !!!next-input-character;
1500
1501 !!!emit ($self->{current_token}); # comment
1502
1503 redo A;
1504 } elsif ($self->{next_char} == -1) {
1505 !!!cp (139);
1506 !!!parse-error (type => 'unclosed comment');
1507 $self->{state} = DATA_STATE;
1508 ## reconsume
1509
1510 !!!emit ($self->{current_token}); # comment
1511
1512 redo A;
1513 } else {
1514 !!!cp (140);
1515 $self->{current_token}->{data} # comment
1516 .= chr ($self->{next_char});
1517 $self->{state} = COMMENT_STATE;
1518 !!!next-input-character;
1519 redo A;
1520 }
1521 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1522 if ($self->{next_char} == 0x002D) { # -
1523 !!!cp (141);
1524 $self->{state} = COMMENT_END_STATE;
1525 !!!next-input-character;
1526 redo A;
1527 } elsif ($self->{next_char} == 0x003E) { # >
1528 !!!cp (142);
1529 !!!parse-error (type => 'bogus comment');
1530 $self->{state} = DATA_STATE;
1531 !!!next-input-character;
1532
1533 !!!emit ($self->{current_token}); # comment
1534
1535 redo A;
1536 } elsif ($self->{next_char} == -1) {
1537 !!!cp (143);
1538 !!!parse-error (type => 'unclosed comment');
1539 $self->{state} = DATA_STATE;
1540 ## reconsume
1541
1542 !!!emit ($self->{current_token}); # comment
1543
1544 redo A;
1545 } else {
1546 !!!cp (144);
1547 $self->{current_token}->{data} # comment
1548 .= '-' . chr ($self->{next_char});
1549 $self->{state} = COMMENT_STATE;
1550 !!!next-input-character;
1551 redo A;
1552 }
1553 } elsif ($self->{state} == COMMENT_STATE) {
1554 if ($self->{next_char} == 0x002D) { # -
1555 !!!cp (145);
1556 $self->{state} = COMMENT_END_DASH_STATE;
1557 !!!next-input-character;
1558 redo A;
1559 } elsif ($self->{next_char} == -1) {
1560 !!!cp (146);
1561 !!!parse-error (type => 'unclosed comment');
1562 $self->{state} = DATA_STATE;
1563 ## reconsume
1564
1565 !!!emit ($self->{current_token}); # comment
1566
1567 redo A;
1568 } else {
1569 !!!cp (147);
1570 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1571 ## Stay in the state
1572 !!!next-input-character;
1573 redo A;
1574 }
1575 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1576 if ($self->{next_char} == 0x002D) { # -
1577 !!!cp (148);
1578 $self->{state} = COMMENT_END_STATE;
1579 !!!next-input-character;
1580 redo A;
1581 } elsif ($self->{next_char} == -1) {
1582 !!!cp (149);
1583 !!!parse-error (type => 'unclosed comment');
1584 $self->{state} = DATA_STATE;
1585 ## reconsume
1586
1587 !!!emit ($self->{current_token}); # comment
1588
1589 redo A;
1590 } else {
1591 !!!cp (150);
1592 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
1593 $self->{state} = COMMENT_STATE;
1594 !!!next-input-character;
1595 redo A;
1596 }
1597 } elsif ($self->{state} == COMMENT_END_STATE) {
1598 if ($self->{next_char} == 0x003E) { # >
1599 !!!cp (151);
1600 $self->{state} = DATA_STATE;
1601 !!!next-input-character;
1602
1603 !!!emit ($self->{current_token}); # comment
1604
1605 redo A;
1606 } elsif ($self->{next_char} == 0x002D) { # -
1607 !!!cp (152);
1608 !!!parse-error (type => 'dash in comment');
1609 $self->{current_token}->{data} .= '-'; # comment
1610 ## Stay in the state
1611 !!!next-input-character;
1612 redo A;
1613 } elsif ($self->{next_char} == -1) {
1614 !!!cp (153);
1615 !!!parse-error (type => 'unclosed comment');
1616 $self->{state} = DATA_STATE;
1617 ## reconsume
1618
1619 !!!emit ($self->{current_token}); # comment
1620
1621 redo A;
1622 } else {
1623 !!!cp (154);
1624 !!!parse-error (type => 'dash in comment');
1625 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
1626 $self->{state} = COMMENT_STATE;
1627 !!!next-input-character;
1628 redo A;
1629 }
1630 } elsif ($self->{state} == DOCTYPE_STATE) {
1631 if ($self->{next_char} == 0x0009 or # HT
1632 $self->{next_char} == 0x000A or # LF
1633 $self->{next_char} == 0x000B or # VT
1634 $self->{next_char} == 0x000C or # FF
1635 $self->{next_char} == 0x0020) { # SP
1636 !!!cp (155);
1637 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1638 !!!next-input-character;
1639 redo A;
1640 } else {
1641 !!!cp (156);
1642 !!!parse-error (type => 'no space before DOCTYPE name');
1643 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1644 ## reconsume
1645 redo A;
1646 }
1647 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1648 if ($self->{next_char} == 0x0009 or # HT
1649 $self->{next_char} == 0x000A or # LF
1650 $self->{next_char} == 0x000B or # VT
1651 $self->{next_char} == 0x000C or # FF
1652 $self->{next_char} == 0x0020) { # SP
1653 !!!cp (157);
1654 ## Stay in the state
1655 !!!next-input-character;
1656 redo A;
1657 } elsif ($self->{next_char} == 0x003E) { # >
1658 !!!cp (158);
1659 !!!parse-error (type => 'no DOCTYPE name');
1660 $self->{state} = DATA_STATE;
1661 !!!next-input-character;
1662
1663 !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
1664
1665 redo A;
1666 } elsif ($self->{next_char} == -1) {
1667 !!!cp (159);
1668 !!!parse-error (type => 'no DOCTYPE name');
1669 $self->{state} = DATA_STATE;
1670 ## reconsume
1671
1672 !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
1673
1674 redo A;
1675 } else {
1676 !!!cp (160);
1677 $self->{current_token}
1678 = {type => DOCTYPE_TOKEN,
1679 name => chr ($self->{next_char}),
1680 #quirks => 0,
1681 };
1682 ## ISSUE: "Set the token's name name to the" in the spec
1683 $self->{state} = DOCTYPE_NAME_STATE;
1684 !!!next-input-character;
1685 redo A;
1686 }
1687 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1688 ## ISSUE: Redundant "First," in the spec.
1689 if ($self->{next_char} == 0x0009 or # HT
1690 $self->{next_char} == 0x000A or # LF
1691 $self->{next_char} == 0x000B or # VT
1692 $self->{next_char} == 0x000C or # FF
1693 $self->{next_char} == 0x0020) { # SP
1694 !!!cp (161);
1695 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1696 !!!next-input-character;
1697 redo A;
1698 } elsif ($self->{next_char} == 0x003E) { # >
1699 !!!cp (162);
1700 $self->{state} = DATA_STATE;
1701 !!!next-input-character;
1702
1703 !!!emit ($self->{current_token}); # DOCTYPE
1704
1705 redo A;
1706 } elsif ($self->{next_char} == -1) {
1707 !!!cp (163);
1708 !!!parse-error (type => 'unclosed DOCTYPE');
1709 $self->{state} = DATA_STATE;
1710 ## reconsume
1711
1712 $self->{current_token}->{quirks} = 1;
1713 !!!emit ($self->{current_token}); # DOCTYPE
1714
1715 redo A;
1716 } else {
1717 !!!cp (164);
1718 $self->{current_token}->{name}
1719 .= chr ($self->{next_char}); # DOCTYPE
1720 ## Stay in the state
1721 !!!next-input-character;
1722 redo A;
1723 }
1724 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1725 if ($self->{next_char} == 0x0009 or # HT
1726 $self->{next_char} == 0x000A or # LF
1727 $self->{next_char} == 0x000B or # VT
1728 $self->{next_char} == 0x000C or # FF
1729 $self->{next_char} == 0x0020) { # SP
1730 !!!cp (165);
1731 ## Stay in the state
1732 !!!next-input-character;
1733 redo A;
1734 } elsif ($self->{next_char} == 0x003E) { # >
1735 !!!cp (166);
1736 $self->{state} = DATA_STATE;
1737 !!!next-input-character;
1738
1739 !!!emit ($self->{current_token}); # DOCTYPE
1740
1741 redo A;
1742 } elsif ($self->{next_char} == -1) {
1743 !!!cp (167);
1744 !!!parse-error (type => 'unclosed DOCTYPE');
1745 $self->{state} = DATA_STATE;
1746 ## reconsume
1747
1748 $self->{current_token}->{quirks} = 1;
1749 !!!emit ($self->{current_token}); # DOCTYPE
1750
1751 redo A;
1752 } elsif ($self->{next_char} == 0x0050 or # P
1753 $self->{next_char} == 0x0070) { # p
1754 !!!next-input-character;
1755 if ($self->{next_char} == 0x0055 or # U
1756 $self->{next_char} == 0x0075) { # u
1757 !!!next-input-character;
1758 if ($self->{next_char} == 0x0042 or # B
1759 $self->{next_char} == 0x0062) { # b
1760 !!!next-input-character;
1761 if ($self->{next_char} == 0x004C or # L
1762 $self->{next_char} == 0x006C) { # l
1763 !!!next-input-character;
1764 if ($self->{next_char} == 0x0049 or # I
1765 $self->{next_char} == 0x0069) { # i
1766 !!!next-input-character;
1767 if ($self->{next_char} == 0x0043 or # C
1768 $self->{next_char} == 0x0063) { # c
1769 !!!cp (168);
1770 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1771 !!!next-input-character;
1772 redo A;
1773 } else {
1774 !!!cp (169);
1775 }
1776 } else {
1777 !!!cp (170);
1778 }
1779 } else {
1780 !!!cp (171);
1781 }
1782 } else {
1783 !!!cp (172);
1784 }
1785 } else {
1786 !!!cp (173);
1787 }
1788
1789 #
1790 } elsif ($self->{next_char} == 0x0053 or # S
1791 $self->{next_char} == 0x0073) { # s
1792 !!!next-input-character;
1793 if ($self->{next_char} == 0x0059 or # Y
1794 $self->{next_char} == 0x0079) { # y
1795 !!!next-input-character;
1796 if ($self->{next_char} == 0x0053 or # S
1797 $self->{next_char} == 0x0073) { # s
1798 !!!next-input-character;
1799 if ($self->{next_char} == 0x0054 or # T
1800 $self->{next_char} == 0x0074) { # t
1801 !!!next-input-character;
1802 if ($self->{next_char} == 0x0045 or # E
1803 $self->{next_char} == 0x0065) { # e
1804 !!!next-input-character;
1805 if ($self->{next_char} == 0x004D or # M
1806 $self->{next_char} == 0x006D) { # m
1807 !!!cp (174);
1808 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1809 !!!next-input-character;
1810 redo A;
1811 } else {
1812 !!!cp (175);
1813 }
1814 } else {
1815 !!!cp (176);
1816 }
1817 } else {
1818 !!!cp (177);
1819 }
1820 } else {
1821 !!!cp (178);
1822 }
1823 } else {
1824 !!!cp (179);
1825 }
1826
1827 #
1828 } else {
1829 !!!cp (180);
1830 !!!next-input-character;
1831 #
1832 }
1833
1834 !!!parse-error (type => 'string after DOCTYPE name');
1835 $self->{current_token}->{quirks} = 1;
1836
1837 $self->{state} = BOGUS_DOCTYPE_STATE;
1838 # next-input-character is already done
1839 redo A;
1840 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1841 if ({
1842 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1843 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1844 }->{$self->{next_char}}) {
1845 !!!cp (181);
1846 ## Stay in the state
1847 !!!next-input-character;
1848 redo A;
1849 } elsif ($self->{next_char} eq 0x0022) { # "
1850 !!!cp (182);
1851 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1852 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1853 !!!next-input-character;
1854 redo A;
1855 } elsif ($self->{next_char} eq 0x0027) { # '
1856 !!!cp (183);
1857 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1858 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1859 !!!next-input-character;
1860 redo A;
1861 } elsif ($self->{next_char} eq 0x003E) { # >
1862 !!!cp (184);
1863 !!!parse-error (type => 'no PUBLIC literal');
1864
1865 $self->{state} = DATA_STATE;
1866 !!!next-input-character;
1867
1868 $self->{current_token}->{quirks} = 1;
1869 !!!emit ($self->{current_token}); # DOCTYPE
1870
1871 redo A;
1872 } elsif ($self->{next_char} == -1) {
1873 !!!cp (185);
1874 !!!parse-error (type => 'unclosed DOCTYPE');
1875
1876 $self->{state} = DATA_STATE;
1877 ## reconsume
1878
1879 $self->{current_token}->{quirks} = 1;
1880 !!!emit ($self->{current_token}); # DOCTYPE
1881
1882 redo A;
1883 } else {
1884 !!!cp (186);
1885 !!!parse-error (type => 'string after PUBLIC');
1886 $self->{current_token}->{quirks} = 1;
1887
1888 $self->{state} = BOGUS_DOCTYPE_STATE;
1889 !!!next-input-character;
1890 redo A;
1891 }
1892 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1893 if ($self->{next_char} == 0x0022) { # "
1894 !!!cp (187);
1895 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1896 !!!next-input-character;
1897 redo A;
1898 } elsif ($self->{next_char} == 0x003E) { # >
1899 !!!cp (188);
1900 !!!parse-error (type => 'unclosed PUBLIC literal');
1901
1902 $self->{state} = DATA_STATE;
1903 !!!next-input-character;
1904
1905 $self->{current_token}->{quirks} = 1;
1906 !!!emit ($self->{current_token}); # DOCTYPE
1907
1908 redo A;
1909 } elsif ($self->{next_char} == -1) {
1910 !!!cp (189);
1911 !!!parse-error (type => 'unclosed PUBLIC literal');
1912
1913 $self->{state} = DATA_STATE;
1914 ## reconsume
1915
1916 $self->{current_token}->{quirks} = 1;
1917 !!!emit ($self->{current_token}); # DOCTYPE
1918
1919 redo A;
1920 } else {
1921 !!!cp (190);
1922 $self->{current_token}->{public_identifier} # DOCTYPE
1923 .= chr $self->{next_char};
1924 ## Stay in the state
1925 !!!next-input-character;
1926 redo A;
1927 }
1928 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1929 if ($self->{next_char} == 0x0027) { # '
1930 !!!cp (191);
1931 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1932 !!!next-input-character;
1933 redo A;
1934 } elsif ($self->{next_char} == 0x003E) { # >
1935 !!!cp (192);
1936 !!!parse-error (type => 'unclosed PUBLIC literal');
1937
1938 $self->{state} = DATA_STATE;
1939 !!!next-input-character;
1940
1941 $self->{current_token}->{quirks} = 1;
1942 !!!emit ($self->{current_token}); # DOCTYPE
1943
1944 redo A;
1945 } elsif ($self->{next_char} == -1) {
1946 !!!cp (193);
1947 !!!parse-error (type => 'unclosed PUBLIC literal');
1948
1949 $self->{state} = DATA_STATE;
1950 ## reconsume
1951
1952 $self->{current_token}->{quirks} = 1;
1953 !!!emit ($self->{current_token}); # DOCTYPE
1954
1955 redo A;
1956 } else {
1957 !!!cp (194);
1958 $self->{current_token}->{public_identifier} # DOCTYPE
1959 .= chr $self->{next_char};
1960 ## Stay in the state
1961 !!!next-input-character;
1962 redo A;
1963 }
1964 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1965 if ({
1966 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1967 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1968 }->{$self->{next_char}}) {
1969 !!!cp (195);
1970 ## Stay in the state
1971 !!!next-input-character;
1972 redo A;
1973 } elsif ($self->{next_char} == 0x0022) { # "
1974 !!!cp (196);
1975 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1976 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1977 !!!next-input-character;
1978 redo A;
1979 } elsif ($self->{next_char} == 0x0027) { # '
1980 !!!cp (197);
1981 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1982 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1983 !!!next-input-character;
1984 redo A;
1985 } elsif ($self->{next_char} == 0x003E) { # >
1986 !!!cp (198);
1987 $self->{state} = DATA_STATE;
1988 !!!next-input-character;
1989
1990 !!!emit ($self->{current_token}); # DOCTYPE
1991
1992 redo A;
1993 } elsif ($self->{next_char} == -1) {
1994 !!!cp (199);
1995 !!!parse-error (type => 'unclosed DOCTYPE');
1996
1997 $self->{state} = DATA_STATE;
1998 ## reconsume
1999
2000 $self->{current_token}->{quirks} = 1;
2001 !!!emit ($self->{current_token}); # DOCTYPE
2002
2003 redo A;
2004 } else {
2005 !!!cp (200);
2006 !!!parse-error (type => 'string after PUBLIC literal');
2007 $self->{current_token}->{quirks} = 1;
2008
2009 $self->{state} = BOGUS_DOCTYPE_STATE;
2010 !!!next-input-character;
2011 redo A;
2012 }
2013 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2014 if ({
2015 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2016 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2017 }->{$self->{next_char}}) {
2018 !!!cp (201);
2019 ## Stay in the state
2020 !!!next-input-character;
2021 redo A;
2022 } elsif ($self->{next_char} == 0x0022) { # "
2023 !!!cp (202);
2024 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2025 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2026 !!!next-input-character;
2027 redo A;
2028 } elsif ($self->{next_char} == 0x0027) { # '
2029 !!!cp (203);
2030 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2031 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2032 !!!next-input-character;
2033 redo A;
2034 } elsif ($self->{next_char} == 0x003E) { # >
2035 !!!cp (204);
2036 !!!parse-error (type => 'no SYSTEM literal');
2037 $self->{state} = DATA_STATE;
2038 !!!next-input-character;
2039
2040 $self->{current_token}->{quirks} = 1;
2041 !!!emit ($self->{current_token}); # DOCTYPE
2042
2043 redo A;
2044 } elsif ($self->{next_char} == -1) {
2045 !!!cp (205);
2046 !!!parse-error (type => 'unclosed DOCTYPE');
2047
2048 $self->{state} = DATA_STATE;
2049 ## reconsume
2050
2051 $self->{current_token}->{quirks} = 1;
2052 !!!emit ($self->{current_token}); # DOCTYPE
2053
2054 redo A;
2055 } else {
2056 !!!cp (206);
2057 !!!parse-error (type => 'string after SYSTEM');
2058 $self->{current_token}->{quirks} = 1;
2059
2060 $self->{state} = BOGUS_DOCTYPE_STATE;
2061 !!!next-input-character;
2062 redo A;
2063 }
2064 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2065 if ($self->{next_char} == 0x0022) { # "
2066 !!!cp (207);
2067 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2068 !!!next-input-character;
2069 redo A;
2070 } elsif ($self->{next_char} == 0x003E) { # >
2071 !!!cp (208);
2072 !!!parse-error (type => 'unclosed PUBLIC literal');
2073
2074 $self->{state} = DATA_STATE;
2075 !!!next-input-character;
2076
2077 $self->{current_token}->{quirks} = 1;
2078 !!!emit ($self->{current_token}); # DOCTYPE
2079
2080 redo A;
2081 } elsif ($self->{next_char} == -1) {
2082 !!!cp (209);
2083 !!!parse-error (type => 'unclosed SYSTEM literal');
2084
2085 $self->{state} = DATA_STATE;
2086 ## reconsume
2087
2088 $self->{current_token}->{quirks} = 1;
2089 !!!emit ($self->{current_token}); # DOCTYPE
2090
2091 redo A;
2092 } else {
2093 !!!cp (210);
2094 $self->{current_token}->{system_identifier} # DOCTYPE
2095 .= chr $self->{next_char};
2096 ## Stay in the state
2097 !!!next-input-character;
2098 redo A;
2099 }
2100 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2101 if ($self->{next_char} == 0x0027) { # '
2102 !!!cp (211);
2103 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2104 !!!next-input-character;
2105 redo A;
2106 } elsif ($self->{next_char} == 0x003E) { # >
2107 !!!cp (212);
2108 !!!parse-error (type => 'unclosed PUBLIC literal');
2109
2110 $self->{state} = DATA_STATE;
2111 !!!next-input-character;
2112
2113 $self->{current_token}->{quirks} = 1;
2114 !!!emit ($self->{current_token}); # DOCTYPE
2115
2116 redo A;
2117 } elsif ($self->{next_char} == -1) {
2118 !!!cp (213);
2119 !!!parse-error (type => 'unclosed SYSTEM literal');
2120
2121 $self->{state} = DATA_STATE;
2122 ## reconsume
2123
2124 $self->{current_token}->{quirks} = 1;
2125 !!!emit ($self->{current_token}); # DOCTYPE
2126
2127 redo A;
2128 } else {
2129 !!!cp (214);
2130 $self->{current_token}->{system_identifier} # DOCTYPE
2131 .= chr $self->{next_char};
2132 ## Stay in the state
2133 !!!next-input-character;
2134 redo A;
2135 }
2136 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2137 if ({
2138 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2139 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2140 }->{$self->{next_char}}) {
2141 !!!cp (215);
2142 ## Stay in the state
2143 !!!next-input-character;
2144 redo A;
2145 } elsif ($self->{next_char} == 0x003E) { # >
2146 !!!cp (216);
2147 $self->{state} = DATA_STATE;
2148 !!!next-input-character;
2149
2150 !!!emit ($self->{current_token}); # DOCTYPE
2151
2152 redo A;
2153 } elsif ($self->{next_char} == -1) {
2154 !!!cp (217);
2155 !!!parse-error (type => 'unclosed DOCTYPE');
2156
2157 $self->{state} = DATA_STATE;
2158 ## reconsume
2159
2160 $self->{current_token}->{quirks} = 1;
2161 !!!emit ($self->{current_token}); # DOCTYPE
2162
2163 redo A;
2164 } else {
2165 !!!cp (218);
2166 !!!parse-error (type => 'string after SYSTEM literal');
2167 #$self->{current_token}->{quirks} = 1;
2168
2169 $self->{state} = BOGUS_DOCTYPE_STATE;
2170 !!!next-input-character;
2171 redo A;
2172 }
2173 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2174 if ($self->{next_char} == 0x003E) { # >
2175 !!!cp (219);
2176 $self->{state} = DATA_STATE;
2177 !!!next-input-character;
2178
2179 !!!emit ($self->{current_token}); # DOCTYPE
2180
2181 redo A;
2182 } elsif ($self->{next_char} == -1) {
2183 !!!cp (220);
2184 !!!parse-error (type => 'unclosed DOCTYPE');
2185 $self->{state} = DATA_STATE;
2186 ## reconsume
2187
2188 !!!emit ($self->{current_token}); # DOCTYPE
2189
2190 redo A;
2191 } else {
2192 !!!cp (221);
2193 ## Stay in the state
2194 !!!next-input-character;
2195 redo A;
2196 }
2197 } else {
2198 die "$0: $self->{state}: Unknown state";
2199 }
2200 } # A
2201
2202 die "$0: _get_next_token: unexpected case";
2203 } # _get_next_token
2204
## Tries to consume a character reference starting at |$self->{next_char}|
## (presumably the character right after an "&" -- TODO confirm against
## the call sites, which are outside this part of the file).
##
## Arguments:
##   $self       - the parser object; the method reads and advances
##                 |$self->{next_char}|.
##   $in_attr    - when true, a named reference that matched without ";"
##                 and was followed by further name characters is returned
##                 as literal text instead of being expanded (cp 1024).
##   $additional - one extra code point that, like HT, LF, VT, FF, SP,
##                 "<", "&" and -1, makes the method give up immediately.
##
## Returns:
##   - undef when nothing is returned as a token: one of the terminator
##     characters above (no error), "&#" / "&#x" without digits (errors
##     'bare nero' / 'bare hcro', the consumed characters are pushed back
##     and |next_char| is reset to "#"), or "&" followed by a character
##     that is not an ASCII letter, "#" or a terminator (error 'bare ero').
##   - a CHARACTER_TOKEN hash reference otherwise.  |has_reference| is set
##     only when the data is the expansion of a reference; the two
##     literal-text returns ('&'.$entity_name and '&'.$value) do not set it.
##
## NOTE(review): the |!!!...| lines are macros, presumably expanded by the
## build step that turns this .src file into HTML.pm -- confirm.
2205 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2206 my ($self, $in_attr, $additional) = @_;
2207
2208 if ({
2209 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2210 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
2211 $additional => 1,
2212 }->{$self->{next_char}}) {
2213 !!!cp (1001);
2214 ## Don't consume
2215 ## No error
2216 return undef;
2217 } elsif ($self->{next_char} == 0x0023) { # #
## Numeric character reference: "&#x..." (hexadecimal) or "&#..." (decimal).
2218 !!!next-input-character;
2219 if ($self->{next_char} == 0x0078 or # x
2220 $self->{next_char} == 0x0058) { # X
## $code stays undef until the first hexadecimal digit has been seen;
## that is how the "no hexadecimal digit" branch below is detected.
2221 my $code;
2222 X: {
## On the first pass $x_char is the "x"/"X" itself, kept so that it can
## be pushed back if no digit follows.
2223 my $x_char = $self->{next_char};
2224 !!!next-input-character;
2225 if (0x0030 <= $self->{next_char} and
2226 $self->{next_char} <= 0x0039) { # 0..9
2227 !!!cp (1002);
2228 $code ||= 0;
2229 $code *= 0x10;
2230 $code += $self->{next_char} - 0x0030;
2231 redo X;
2232 } elsif (0x0061 <= $self->{next_char} and
2233 $self->{next_char} <= 0x0066) { # a..f
2234 !!!cp (1003);
2235 $code ||= 0;
2236 $code *= 0x10;
## "a" (0x61) - 0x60 + 9 == 10, ..., "f" == 15.
2237 $code += $self->{next_char} - 0x0060 + 9;
2238 redo X;
2239 } elsif (0x0041 <= $self->{next_char} and
2240 $self->{next_char} <= 0x0046) { # A..F
2241 !!!cp (1004);
2242 $code ||= 0;
2243 $code *= 0x10;
## "A" (0x41) - 0x40 + 9 == 10, ..., "F" == 15.
2244 $code += $self->{next_char} - 0x0040 + 9;
2245 redo X;
2246 } elsif (not defined $code) { # no hexadecimal digit
2247 !!!cp (1005);
2248 !!!parse-error (type => 'bare hcro');
2249 !!!back-next-input-character ($x_char, $self->{next_char});
2250 $self->{next_char} = 0x0023; # #
2251 return undef;
2252 } elsif ($self->{next_char} == 0x003B) { # ;
2253 !!!cp (1006);
2254 !!!next-input-character;
2255 } else {
2256 !!!cp (1007);
2257 !!!parse-error (type => 'no refc');
2258 }
2259
## Replace code points that must not be produced by a reference:
## U+0000 and surrogates -> U+FFFD, values above U+10FFFF -> U+FFFD,
## CR -> LF, 0x80..0x9F -> the entry in $c1_entity_char.  Each case
## reports a parse error.
2260 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2261 !!!cp (1008);
2262 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
2263 $code = 0xFFFD;
2264 } elsif ($code > 0x10FFFF) {
2265 !!!cp (1009);
2266 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
2267 $code = 0xFFFD;
2268 } elsif ($code == 0x000D) {
2269 !!!cp (1010);
2270 !!!parse-error (type => 'CR character reference');
2271 $code = 0x000A;
2272 } elsif (0x80 <= $code and $code <= 0x9F) {
2273 !!!cp (1011);
2274 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
2275 $code = $c1_entity_char->{$code};
2276 }
2277
2278 return {type => CHARACTER_TOKEN, data => chr $code,
2279 has_reference => 1};
2280 } # X
2281 } elsif (0x0030 <= $self->{next_char} and
2282 $self->{next_char} <= 0x0039) { # 0..9
## Decimal form: the first digit has already been checked, so $code is
## always defined here.
2283 my $code = $self->{next_char} - 0x0030;
2284 !!!next-input-character;
2285
2286 while (0x0030 <= $self->{next_char} and
2287 $self->{next_char} <= 0x0039) { # 0..9
2288 !!!cp (1012);
2289 $code *= 10;
2290 $code += $self->{next_char} - 0x0030;
2291
2292 !!!next-input-character;
2293 }
2294
2295 if ($self->{next_char} == 0x003B) { # ;
2296 !!!cp (1013);
2297 !!!next-input-character;
2298 } else {
2299 !!!cp (1014);
2300 !!!parse-error (type => 'no refc');
2301 }
2302
## Same replacement rules as in the hexadecimal branch above.
2303 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2304 !!!cp (1015);
2305 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
2306 $code = 0xFFFD;
2307 } elsif ($code > 0x10FFFF) {
2308 !!!cp (1016);
2309 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
2310 $code = 0xFFFD;
2311 } elsif ($code == 0x000D) {
2312 !!!cp (1017);
2313 !!!parse-error (type => 'CR character reference');
2314 $code = 0x000A;
2315 } elsif (0x80 <= $code and $code <= 0x9F) {
2316 !!!cp (1018);
2317 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
2318 $code = $c1_entity_char->{$code};
2319 }
2320
2321 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
2322 } else {
2323 !!!cp (1019);
2324 !!!parse-error (type => 'bare nero');
2325 !!!back-next-input-character ($self->{next_char});
2326 $self->{next_char} = 0x0023; # #
2327 return undef;
2328 }
2329 } elsif ((0x0041 <= $self->{next_char} and
2330 $self->{next_char} <= 0x005A) or
2331 (0x0061 <= $self->{next_char} and
2332 $self->{next_char} <= 0x007A)) {
## Named character reference: starts with an ASCII letter.
2333 my $entity_name = chr $self->{next_char};
2334 !!!next-input-character;
2335
## $entity_name - every character consumed so far (including a final ";").
## $value       - text to emit: the raw characters until a name matches,
##                then the matched replacement plus any characters
##                consumed after that match.
## $match       - 0: nothing matched yet; 1: matched a name ending in ";"
##                (loop stops); -1: matched a name without ";"; each later
##                non-matching character doubles it, so a value below -1
##                means characters followed the last ";"-less match.
2336 my $value = $entity_name;
2337 my $match = 0;
2338 require Whatpm::_NamedEntityList;
2339 our $EntityChar;
2340
2341 while (length $entity_name < 10 and
2342 ## NOTE: Some number greater than the maximum length of entity name
2343 ((0x0041 <= $self->{next_char} and # a
2344 $self->{next_char} <= 0x005A) or # x
2345 (0x0061 <= $self->{next_char} and # a
2346 $self->{next_char} <= 0x007A) or # z
2347 (0x0030 <= $self->{next_char} and # 0
2348 $self->{next_char} <= 0x0039) or # 9
2349 $self->{next_char} == 0x003B)) { # ;
2350 $entity_name .= chr $self->{next_char};
2351 if (defined $EntityChar->{$entity_name}) {
2352 if ($self->{next_char} == 0x003B) { # ;
2353 !!!cp (1020);
2354 $value = $EntityChar->{$entity_name};
2355 $match = 1;
2356 !!!next-input-character;
2357 last;
2358 } else {
2359 !!!cp (1021);
2360 $value = $EntityChar->{$entity_name};
2361 $match = -1;
2362 !!!next-input-character;
2363 }
2364 } else {
2365 !!!cp (1022);
2366 $value .= chr $self->{next_char};
2367 $match *= 2;
2368 !!!next-input-character;
2369 }
2370 }
2371
2372 if ($match > 0) {
2373 !!!cp (1023);
2374 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
2375 } elsif ($match < 0) {
2376 !!!parse-error (type => 'no refc');
## In an attribute value, a ";"-less match followed by more name
## characters is not expanded; the consumed text is returned verbatim.
2377 if ($in_attr and $match < -1) {
2378 !!!cp (1024);
2379 return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
2380 } else {
2381 !!!cp (1025);
2382 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
2383 }
2384 } else {
2385 !!!cp (1026);
2386 !!!parse-error (type => 'bare ero');
2387 ## NOTE: "No characters are consumed" in the spec.
## Here the characters have been consumed, so they are handed back as
## literal text ("&" plus everything read) rather than pushed back.
2388 return {type => CHARACTER_TOKEN, data => '&'.$value};
2389 }
2390 } else {
2391 !!!cp (1027);
2392 ## no characters are consumed
2393 !!!parse-error (type => 'bare ero');
2394 return undef;
2395 }
2396 } # _tokenize_attempt_to_consume_an_entity
2397
## Puts |$self->{document}| into the state used while the tree is being
## constructed: |strict_error_checking| is set to 0 and the document is
## marked with |manakai_is_html (1)|.
##
## NOTE: |$self->{document}| MUST be specified before this method is called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2406
## Counterpart of |_initialize_tree_constructor|: sets the document's
## |strict_error_checking| back to 1 once tree construction is over.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  ## TODO: Turn mutation events on
  $self->{document}->strict_error_checking (1);
} # _terminate_tree_constructor
2412
2413 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2414
2415 { # tree construction stage
2416 my $token;
2417
## Entry point of the tree construction stage.  Fetches the first token,
## clears the per-parse state kept on |$self| and then runs the
## "initial", "before html" and remaining insertion modes in that order.
sub _construct_tree ($) {
  my $self = shift;

  ## It is not defined when an interactive UA makes |$self->{document}|
  ## available to the user, nor when it begins accepting user input.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is the concatenation
  ## of all those characters. # MUST

  !!!next-token;

  ## Reset the state that must not leak from a previous parse.
  $self->{open_elements} = [];
  $self->{$_} = undef for qw(form_element head_element inner_html_node);

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree
2446
## Implements the "initial" insertion mode.
##
## Works on the file-scoped current token |$token|:
##   - DOCTYPE token: reports 'not HTML5' for anything but a bare
##     |<!DOCTYPE HTML>|, appends a document type node to
##     |$self->{document}|, selects the compatibility mode ('quirks' /
##     'limited quirks') from the token's quirks flag, name, public
##     identifier and system identifier, fetches the next token and returns.
##   - start tag, end tag, end-of-file, or character data that is not
##     entirely white space: reports 'no DOCTYPE', selects 'quirks' mode
##     and returns WITHOUT consuming the token (it is reprocessed by the
##     "before html" insertion mode).
##   - white-space-only character data is skipped; comments are appended
##     to the document; in both cases processing continues in this mode.
##
## Dies on an unknown token type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared ASCII case-insensitively, so
        ## the list below is written in upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## Spelling fixed: this key used to read "EXPERIMETNAL", so the
          ## real identifier could never match.
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }

      ## The system identifier is checked independently of the public
      ## identifier and may override the mode chosen above.
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is stripped from the token in place; only
      ## what remains (if anything) counts as content before a DOCTYPE.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
2648
## Implements the "before html" insertion mode.
##
## Works on the file-scoped current token |$token|:
##   - DOCTYPE tokens are reported ('in html:#DOCTYPE') and ignored;
##     comments are appended to the document; white-space-only character
##     data is skipped.  Processing continues in this mode.
##   - an |html| start tag creates the root element from the token's
##     attributes, calls |$self->{application_cache_selection}| with the
##     |manifest| attribute value (or undef if there is none), fetches
##     the next token and returns.
##   - anything else creates an attribute-less |html| root element, calls
##     |$self->{application_cache_selection}| with undef exactly once and
##     returns WITHOUT consuming the token, so that it is reprocessed.
##
## In both returning cases the root element is appended to
## |$self->{document}| and pushed onto |$self->{open_elements}|.
## Dies on an unknown token type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is stripped from the token in place.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is NOT invoked
      ## here; the shared "anything else" code below invokes it, and it
      ## must run only once per document.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $token->{tag_name}, $token->{attributes});
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}}, [$root_element, 'html'];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: No relative reference resolution?
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": synthesize the root element.
    my $root_element; !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
2735
## "Reset the insertion mode appropriately": walks
## |$self->{open_elements}| from the most recently pushed entry towards
## the first one and sets |$self->{insertion_mode}| from the first element
## name that determines a mode.
##
## Each entry of |open_elements| is used as |[$node_object, $tag_name]|.
## When the walk reaches the first entry and |$self->{inner_html_node}|
## is defined, that node replaces the first entry for the lookup unless
## it is a |td| or |th|.
##
## Returns nothing; the result is the new value of
## |$self->{insertion_mode}|.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          !!!cp ('t27');
          #
        } else {
          !!!cp ('t28');
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    my $new_mode = {
      select => IN_SELECT_IM,
      ## NOTE: |option| and |optgroup| do not set
      ## insertion mode to "in select" by themselves.
      td => IN_CELL_IM,
      th => IN_CELL_IM,
      tr => IN_ROW_IM,
      tbody => IN_TABLE_BODY_IM,
      thead => IN_TABLE_BODY_IM,
      tfoot => IN_TABLE_BODY_IM,
      caption => IN_CAPTION_IM,
      colgroup => IN_COLUMN_GROUP_IM,
      table => IN_TABLE_IM,
      head => IN_BODY_IM, # not in head!
      body => IN_BODY_IM,
      frameset => IN_FRAMESET_IM,
    }->{$node->[1]};
    ## Assign and return as two statements: the former
    ## "assignment and return" form only returned when the assigned
    ## constant was a true value.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
2810
2811 sub _tree_construction_main ($) {
2812 my $self = shift;
2813
2814 my $active_formatting_elements = [];
2815
2816 my $reconstruct_active_formatting_elements = sub { # MUST
2817 my $insert = shift;
2818
2819 ## Step 1
2820 return unless @$active_formatting_elements;
2821
2822 ## Step 3
2823 my $i = -1;
2824 my $entry = $active_formatting_elements->[$i];
2825
2826 ## Step 2
2827 return if $entry->[0] eq '#marker';
2828 for (@{$self->{open_elements}}) {
2829 if ($entry->[0] eq $_->[0]) {
2830 !!!cp ('t32');
2831 return;
2832 }
2833 }
2834
2835 S4: {
2836 ## Step 4
2837 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2838
2839 ## Step 5
2840 $i--;
2841 $entry = $active_formatting_elements->[$i];
2842
2843 ## Step 6
2844 if ($entry->[0] eq '#marker') {
2845 !!!cp ('t33_1');
2846 #
2847 } else {
2848 my $in_open_elements;
2849 OE: for (@{$self->{open_elements}}) {
2850 if ($entry->[0] eq $_->[0]) {
2851 !!!cp ('t33');
2852 $in_open_elements = 1;
2853 last OE;
2854 }
2855 }
2856 if ($in_open_elements) {
2857 !!!cp ('t34');
2858 #
2859 } else {
2860 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
2861 !!!cp ('t35');
2862 redo S4;
2863 }
2864 }
2865
2866 ## Step 7
2867 $i++;
2868 $entry = $active_formatting_elements->[$i];
2869 } # S4
2870
2871 S7: {
2872 ## Step 8
2873 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2874
2875 ## Step 9
2876 $insert->($clone->[0]);
2877 push @{$self->{open_elements}}, $clone;
2878
2879 ## Step 10
2880 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2881
2882 ## Step 11
2883 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2884 !!!cp ('t36');
2885 ## Step 7'
2886 $i++;
2887 $entry = $active_formatting_elements->[$i];
2888
2889 redo S7;
2890 }
2891
2892 !!!cp ('t37');
2893 } # S7
2894 }; # $reconstruct_active_formatting_elements
2895
2896 my $clear_up_to_marker = sub {
2897 for (reverse 0..$#$active_formatting_elements) {
2898 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
2899 !!!cp ('t38');
2900 splice @$active_formatting_elements, $_;
2901 return;
2902 }
2903 }
2904
2905 !!!cp ('t39');
2906 }; # $clear_up_to_marker
2907
2908 my $insert;
2909
2910 my $parse_rcdata = sub ($) {
2911 my ($content_model_flag) = @_;
2912
2913 ## Step 1
2914 my $start_tag_name = $token->{tag_name};
2915 my $el;
2916 !!!create-element ($el, $start_tag_name, $token->{attributes});
2917
2918 ## Step 2
2919 $insert->($el);
2920
2921 ## Step 3
2922 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2923 delete $self->{escape}; # MUST
2924
2925 ## Step 4
2926 my $text = '';
2927 !!!next-token;
2928 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2929 !!!cp ('t40');
2930 $text .= $token->{data};
2931 !!!next-token;
2932 }
2933
2934 ## Step 5
2935 if (length $text) {
2936 !!!cp ('t41');
2937 my $text = $self->{document}->create_text_node ($text);
2938 $el->append_child ($text);
2939 }
2940
2941 ## Step 6
2942 $self->{content_model} = PCDATA_CONTENT_MODEL;
2943
2944 ## Step 7
2945 if ($token->{type} == END_TAG_TOKEN and
2946 $token->{tag_name} eq $start_tag_name) {
2947 !!!cp ('t42');
2948 ## Ignore the token
2949 } else {
2950 ## NOTE: An end-of-file token.
2951 if ($content_model_flag == CDATA_CONTENT_MODEL) {
2952 !!!cp ('t43');
2953 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2954 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2955 !!!cp ('t44');
2956 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2957 } else {
2958 die "$0: $content_model_flag in parse_rcdata";
2959 }
2960 }
2961 !!!next-token;
2962 }; # $parse_rcdata
2963
2964 my $script_start_tag = sub () {
2965 my $script_el;
2966 !!!create-element ($script_el, 'script', $token->{attributes});
2967 ## TODO: mark as "parser-inserted"
2968
2969 $self->{content_model} = CDATA_CONTENT_MODEL;
2970 delete $self->{escape}; # MUST
2971
2972 my $text = '';
2973 !!!next-token;
2974 while ($token->{type} == CHARACTER_TOKEN) {
2975 !!!cp ('t45');
2976 $text .= $token->{data};
2977 !!!next-token;
2978 } # stop if non-character token or tokenizer stops tokenising
2979 if (length $text) {
2980 !!!cp ('t46');
2981 $script_el->manakai_append_text ($text);
2982 }
2983
2984 $self->{content_model} = PCDATA_CONTENT_MODEL;
2985
2986 if ($token->{type} == END_TAG_TOKEN and
2987 $token->{tag_name} eq 'script') {
2988 !!!cp ('t47');
2989 ## Ignore the token
2990 } else {
2991 !!!cp ('t48');
2992 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2993 ## ISSUE: And ignore?
2994 ## TODO: mark as "already executed"
2995 }
2996
2997 if (defined $self->{inner_html_node}) {
2998 !!!cp ('t49');
2999 ## TODO: mark as "already executed"
3000 } else {
3001 !!!cp ('t50');
3002 ## TODO: $old_insertion_point = current insertion point
3003 ## TODO: insertion point = just before the next input character
3004
3005 $insert->($script_el);
3006
3007 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3008
3009 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3010 }
3011
3012 !!!next-token;
3013 }; # $script_start_tag
3014
3015 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3016 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3017 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3018
## Runs the adoption agency algorithm (AAA) for a formatting element.
##
## Argument:
##   $tag_name - tag name of the formatting element being closed.
##
## Works on the closed-over parser state: |$self->{open_elements}| and
## |$active_formatting_elements| (entries are |[node, tag name]| pairs; a
## marker entry has |'#marker'| in its node slot), the current |$token|,
## |$open_tables|, and the |$formatting_category|, |$special_category| and
## |$scoping_category| tables.
##
## The algorithm is repeated (|redo FET|) until one of the exits is
## taken.  Every exit first invokes the next-token macro and then returns
## no value:
##   - no matching entry in the list of active formatting elements
##     (parse error, token ignored);
##   - the element is in the stack of open elements but not in scope
##     (parse error, token ignored);
##   - the element is in the list but not in the stack of open elements
##     (parse error, element removed from the list);
##   - there is no furthest block (the stack is cut at the element and
##     the element is removed from the list).
my $formatting_end_tag = sub {
  my $tag_name = shift;

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    ## Find the last entry with the given tag name in the list of active
    ## formatting elements, not looking beyond the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                applet => 1, table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Remove the formatting element itself from the list.  It is the
      ## last entry with a matching tag name, which is not necessarily
      ## the last entry of the list, so |pop| would remove the wrong one.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## Scanning from the current node towards the formatting element,
    ## the last qualifying entry seen (the one nearest to the formatting
    ## element in the stack) becomes the furthest block.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      ## Cut the stack at the formatting element and drop it from the list.
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE(review): when the formatting element is at index 0 of the
    ## list, the index -1 below selects the last entry of the list
    ## rather than "no previous entry" - confirm this is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## An entry that is not in the list of active formatting elements
      ## is removed from the stack and the walk continues.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    ## Foster parenting is used when the common ancestor is a table
    ## structure element.
    if ({
         table => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1,
        }->{$common_ancestor_node->[1]}) {
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] eq 'table') {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## The child list is copied first because the children are moved
    ## while iterating.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list and insert the clone
    ## after the bookmark entry; |$i| is adjusted when the removal
    ## happens below the bookmark.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## NOTE(review): the length argument is 1 here but 0 in Step 12, so
    ## an entry that follows the furthest block in the stack is replaced
    ## by the clone rather than kept after it - confirm this is intended.
    splice @{$self->{open_elements}}, $i + 1, 1, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3249
## Appends the given node as the last child of the current node (the node
## of the last entry of the stack of open elements).  The same code
## reference is also stored in |$insert| at this point.
$insert = my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
3253
## Inserts the given node, using foster parenting when the current node is
## a |table|, |tbody|, |tfoot|, |thead| or |tr| element.
##
## Argument:
##   $child - the node to insert.
##
## In the foster-parenting case the nearest |table| entry in the stack of
## open elements is looked up.  If its node has an element parent
## (|node_type| 1), the child is inserted into that parent just before the
## table; otherwise it is appended to the node of the stack entry that
## precedes the table.  If no |table| entry exists, the node of the first
## stack entry is used.  The current table record is then marked as
## tainted.  In every other case the child is simply appended to the
## current node.
my $insert_to_foster = sub {
  my $child = shift;
  my $open = $self->{open_elements};

  unless ($open->[-1]->[1] =~ /\A(?:table|tbody|tfoot|thead|tr)\z/) {
    !!!cp ('t72');
    return $open->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent, $before);
  my $i = $#$open;
  OE: while ($i >= 0) {
    my $entry = $open->[$i];
    if ($entry->[1] eq 'table') {
      my $table_parent = $entry->[0]->parent_node;
      if (defined $table_parent and $table_parent->node_type == 1) {
        !!!cp ('t70');
        $foster_parent = $table_parent;
        $before = $entry->[0];
      } else {
        !!!cp ('t71');
        $foster_parent = $open->[$i - 1]->[0];
      }
      last OE;
    }
    $i--;
  } # OE
  $foster_parent = $open->[0]->[0] unless defined $foster_parent;
  $foster_parent->insert_before ($child, $before);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
3287
3288 B: {
3289 if ($token->{type} == DOCTYPE_TOKEN) {
3290 !!!cp ('t73');
3291 !!!parse-error (type => 'DOCTYPE in the middle');
3292 ## Ignore the token
3293 ## Stay in the phase
3294 !!!next-token;
3295 redo B;
3296 } elsif ($token->{type} == START_TAG_TOKEN and
3297 $token->{tag_name} eq 'html') {
3298 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3299 !!!cp ('t79');
3300 !!!parse-error (type => 'after html:html');
3301 $self->{insertion_mode} = AFTER_BODY_IM;
3302 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3303 !!!cp ('t80');
3304 !!!parse-error (type => 'after html:html');
3305 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3306 } else {
3307 !!!cp ('t81');
3308 }
3309
3310 !!!cp ('t82');
3311 !!!parse-error (type => 'not first start tag');
3312 my $top_el = $self->{open_elements}->[0]->[0];
3313 for my $attr_name (keys %{$token->{attributes}}) {
3314 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3315 !!!cp ('t84');
3316 $top_el->set_attribute_ns
3317 (undef, [undef, $attr_name],
3318 $token->{attributes}->{$attr_name}->{value});
3319 }
3320 }
3321 !!!next-token;
3322 redo B;
3323 } elsif ($token->{type} == COMMENT_TOKEN) {
3324 my $comment = $self->{document}->create_comment ($token->{data});
3325 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3326 !!!cp ('t85');
3327 $self->{document}->append_child ($comment);
3328 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3329 !!!cp ('t86');
3330 $self->{open_elements}->[0]->[0]->append_child ($comment);
3331 } else {
3332 !!!cp ('t87');
3333 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3334 }
3335 !!!next-token;
3336 redo B;
3337 } elsif ($self->{insertion_mode} & HEAD_IMS) {
3338 if ($token->{type} == CHARACTER_TOKEN) {
3339 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3340 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3341 !!!cp ('t88.2');
3342 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3343 } else {
3344 !!!cp ('t88.1');
3345 ## Ignore the token.
3346 !!!next-token;
3347 redo B;
3348 }
3349 unless (length $token->{data}) {
3350 !!!cp ('t88');
3351 !!!next-token;
3352 redo B;
3353 }
3354 }
3355
3356 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3357 !!!cp ('t89');
3358 ## As if <head>
3359 !!!create-element ($self->{head_element}, 'head');
3360 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3361 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3362
3363 ## Reprocess in the "in head" insertion mode...
3364 pop @{$self->{open_elements}};
3365
3366 ## Reprocess in the "after head" insertion mode...
3367 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3368 !!!cp ('t90');
3369 ## As if </noscript>
3370 pop @{$self->{open_elements}};
3371 !!!parse-error (type => 'in noscript:#character');
3372
3373 ## Reprocess in the "in head" insertion mode...
3374 ## As if </head>
3375 pop @{$self->{open_elements}};
3376
3377 ## Reprocess in the "after head" insertion mode...
3378 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3379 !!!cp ('t91');
3380 pop @{$self->{open_elements}};
3381
3382 ## Reprocess in the "after head" insertion mode...
3383 } else {
3384 !!!cp ('t92');
3385 }
3386
3387 ## "after head" insertion mode
3388 ## As if <body>
3389 !!!insert-element ('body');
3390 $self->{insertion_mode} = IN_BODY_IM;
3391 ## reprocess
3392 redo B;
3393 } elsif ($token->{type} == START_TAG_TOKEN) {
3394 if ($token->{tag_name} eq 'head') {
3395 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3396 !!!cp ('t93');
3397 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
3398 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3399 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
3400 $self->{insertion_mode} = IN_HEAD_IM;
3401 !!!next-token;
3402 redo B;
3403 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3404 !!!cp ('t94');
3405 #
3406 } else {
3407 !!!cp ('t95');
3408 !!!parse-error (type => 'in head:head'); # or in head noscript
3409 ## Ignore the token
3410 !!!next-token;
3411 redo B;
3412 }
3413 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3414 !!!cp ('t96');
3415 ## As if <head>
3416 !!!create-element ($self->{head_element}, 'head');
3417 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3418 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3419
3420 $self->{insertion_mode} = IN_HEAD_IM;
3421 ## Reprocess in the "in head" insertion mode...
3422 } else {
3423 !!!cp ('t97');
3424 }
3425
3426 if ($token->{tag_name} eq 'base') {
3427 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3428 !!!cp ('t98');
3429 ## As if </noscript>
3430 pop @{$self->{open_elements}};
3431 !!!parse-error (type => 'in noscript:base');
3432
3433 $self->{insertion_mode} = IN_HEAD_IM;
3434 ## Reprocess in the "in head" insertion mode...
3435 } else {
3436 !!!cp ('t99');
3437 }
3438
3439 ## NOTE: There is a "as if in head" code clone.
3440 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3441 !!!cp ('t100');
3442 !!!parse-error (type => 'after head:'.$token->{tag_name});
3443 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3444 } else {
3445 !!!cp ('t101');
3446 }
3447 !!!insert-element ($token->{tag_name}, $token->{attributes});
3448 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3449 pop @{$self->{open_elements}} # <head>
3450 if $self->{insertion_mode} == AFTER_HEAD_IM;
3451 !!!next-token;
3452 redo B;
3453 } elsif ($token->{tag_name} eq 'link') {
3454 ## NOTE: There is a "as if in head" code clone.
3455 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3456 !!!cp ('t102');
3457 !!!parse-error (type => 'after head:'.$token->{tag_name});
3458 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3459 } else {
3460 !!!cp ('t103');
3461 }
3462 !!!insert-element ($token->{tag_name}, $token->{attributes});
3463 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3464 pop @{$self->{open_elements}} # <head>
3465 if $self->{insertion_mode} == AFTER_HEAD_IM;
3466 !!!next-token;
3467 redo B;
3468 } elsif ($token->{tag_name} eq 'meta') {
3469 ## NOTE: There is a "as if in head" code clone.
3470 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3471 !!!cp ('t104');
3472 !!!parse-error (type => 'after head:'.$token->{tag_name});
3473 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3474 } else {
3475 !!!cp ('t105');
3476 }
3477 !!!insert-element ($token->{tag_name}, $token->{attributes});
3478 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3479
3480 unless ($self->{confident}) {
3481 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3482 !!!cp ('t106');
3483 $self->{change_encoding}
3484 ->($self, $token->{attributes}->{charset}->{value});
3485
3486 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3487 ->set_user_data (manakai_has_reference =>
3488 $token->{attributes}->{charset}
3489 ->{has_reference});
3490 } elsif ($token->{attributes}->{content}) {
3491 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3492 if ($token->{attributes}->{content}->{value}
3493 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3494 [\x09-\x0D\x20]*=
3495 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3496 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3497 !!!cp ('t107');
3498 $self->{change_encoding}
3499 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
3500 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3501 ->set_user_data (manakai_has_reference =>
3502 $token->{attributes}->{content}
3503 ->{has_reference});
3504 } else {
3505 !!!cp ('t108');
3506 }
3507 }
3508 } else {
3509 if ($token->{attributes}->{charset}) {
3510 !!!cp ('t109');
3511 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3512 ->set_user_data (manakai_has_reference =>
3513 $token->{attributes}->{charset}
3514 ->{has_reference});
3515 }
3516 if ($token->{attributes}->{content}) {
3517 !!!cp ('t110');
3518 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3519 ->set_user_data (manakai_has_reference =>
3520 $token->{attributes}->{content}
3521 ->{has_reference});
3522 }
3523 }
3524
3525 pop @{$self->{open_elements}} # <head>
3526 if $self->{insertion_mode} == AFTER_HEAD_IM;
3527 !!!next-token;
3528 redo B;
3529 } elsif ($token->{tag_name} eq 'title') {
3530 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3531 !!!cp ('t111');
3532 ## As if </noscript>
3533 pop @{$self->{open_elements}};
3534 !!!parse-error (type => 'in noscript:title');
3535
3536 $self->{insertion_mode} = IN_HEAD_IM;
3537 ## Reprocess in the "in head" insertion mode...
3538 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3539 !!!cp ('t112');
3540 !!!parse-error (type => 'after head:'.$token->{tag_name});
3541 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3542 } else {
3543 !!!cp ('t113');
3544 }
3545
3546 ## NOTE: There is a "as if in head" code clone.
3547 my $parent = defined $self->{head_element} ? $self->{head_element}
3548 : $self->{open_elements}->[-1]->[0];
3549 $parse_rcdata->(RCDATA_CONTENT_MODEL);
3550 pop @{$self->{open_elements}} # <head>
3551 if $self->{insertion_mode} == AFTER_HEAD_IM;
3552 redo B;
3553 } elsif ($token->{tag_name} eq 'style') {
3554 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3555 ## insertion mode IN_HEAD_IM)
3556 ## NOTE: There is a "as if in head" code clone.
3557 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3558 !!!cp ('t114');
3559 !!!parse-error (type => 'after head:'.$token->{tag_name});
3560 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3561 } else {
3562 !!!cp ('t115');
3563 }
3564 $parse_rcdata->(CDATA_CONTENT_MODEL);
3565 pop @{$self->{open_elements}} # <head>
3566 if $self->{insertion_mode} == AFTER_HEAD_IM;
3567 redo B;
3568 } elsif ($token->{tag_name} eq 'noscript') {
3569 if ($self->{insertion_mode} == IN_HEAD_IM) {
3570 !!!cp ('t116');
3571 ## NOTE: and scripting is disabled
3572 !!!insert-element ($token->{tag_name}, $token->{attributes});
3573 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
3574 !!!next-token;
3575 redo B;
3576 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3577 !!!cp ('t117');
3578 !!!parse-error (type => 'in noscript:noscript');
3579 ## Ignore the token
3580 !!!next-token;
3581 redo B;
3582 } else {
3583 !!!cp ('t118');
3584 #
3585 }
3586 } elsif ($token->{tag_name} eq 'script') {
3587 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3588 !!!cp ('t119');
3589 ## As if </noscript>
3590 pop @{$self->{open_elements}};
3591 !!!parse-error (type => 'in noscript:script');
3592
3593 $self->{insertion_mode} = IN_HEAD_IM;
3594 ## Reprocess in the "in head" insertion mode...
3595 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3596 !!!cp ('t120');
3597 !!!parse-error (type => 'after head:'.$token->{tag_name});
3598 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3599 } else {
3600 !!!cp ('t121');
3601 }
3602
3603 ## NOTE: There is a "as if in head" code clone.
3604 $script_start_tag->();
3605 pop @{$self->{open_elements}} # <head>
3606 if $self->{insertion_mode} == AFTER_HEAD_IM;
3607 redo B;
3608 } elsif ($token->{tag_name} eq 'body' or
3609 $token->{tag_name} eq 'frameset') {
3610 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3611 !!!cp ('t122');
3612 ## As if </noscript>
3613 pop @{$self->{open_elements}};
3614 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
3615
3616 ## Reprocess in the "in head" insertion mode...
3617 ## As if </head>
3618 pop @{$self->{open_elements}};
3619
3620 ## Reprocess in the "after head" insertion mode...
3621 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3622 !!!cp ('t124');
3623 pop @{$self->{open_elements}};
3624
3625 ## Reprocess in the "after head" insertion mode...
3626 } else {
3627 !!!cp ('t125');
3628 }
3629
3630 ## "after head" insertion mode
3631 !!!insert-element ($token->{tag_name}, $token->{attributes});
3632 if ($token->{tag_name} eq 'body') {
3633 !!!cp ('t126');
3634 $self->{insertion_mode} = IN_BODY_IM;
3635 } elsif ($token->{tag_name} eq 'frameset') {
3636 !!!cp ('t127');
3637 $self->{insertion_mode} = IN_FRAMESET_IM;
3638 } else {
3639 die "$0: tag name: $self->{tag_name}";
3640 }
3641 !!!next-token;
3642 redo B;
3643 } else {
3644 !!!cp ('t128');
3645 #
3646 }
3647
3648 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3649 !!!cp ('t129');
3650 ## As if </noscript>
3651 pop @{$self->{open_elements}};
3652 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3653
3654 ## Reprocess in the "in head" insertion mode...
3655 ## As if </head>
3656 pop @{$self->{open_elements}};
3657
3658 ## Reprocess in the "after head" insertion mode...
3659 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3660 !!!cp ('t130');
3661 ## As if </head>
3662 pop @{$self->{open_elements}};
3663
3664 ## Reprocess in the "after head" insertion mode...
3665 } else {
3666 !!!cp ('t131');
3667 }
3668
3669 ## "after head" insertion mode
3670 ## As if <body>
3671 !!!insert-element ('body');
3672 $self->{insertion_mode} = IN_BODY_IM;
3673 ## reprocess
3674 redo B;
3675 } elsif ($token->{type} == END_TAG_TOKEN) {
3676 if ($token->{tag_name} eq 'head') {
3677 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3678 !!!cp ('t132');
3679 ## As if <head>
3680 !!!create-element ($self->{head_element}, 'head');
3681 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3682 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3683
3684 ## Reprocess in the "in head" insertion mode...
3685 pop @{$self->{open_elements}};
3686 $self->{insertion_mode} = AFTER_HEAD_IM;
3687 !!!next-token;
3688 redo B;
3689 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3690 !!!cp ('t133');
3691 ## As if </noscript>
3692 pop @{$self->{open_elements}};
3693 !!!parse-error (type => 'in noscript:/head');
3694
3695 ## Reprocess in the "in head" insertion mode...
3696 pop @{$self->{open_elements}};
3697 $self->{insertion_mode} = AFTER_HEAD_IM;
3698 !!!next-token;
3699 redo B;
3700 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3701 !!!cp ('t134');
3702 pop @{$self->{open_elements}};
3703 $self->{insertion_mode} = AFTER_HEAD_IM;
3704 !!!next-token;
3705 redo B;
3706 } else {
3707 !!!cp ('t135');
3708 #
3709 }
3710 } elsif ($token->{tag_name} eq 'noscript') {
3711 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3712 !!!cp ('t136');
3713 pop @{$self->{open_elements}};
3714 $self->{insertion_mode} = IN_HEAD_IM;
3715 !!!next-token;
3716 redo B;
3717 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3718 !!!cp ('t137');
3719 !!!parse-error (type => 'unmatched end tag:noscript');
3720 ## Ignore the token ## ISSUE: An issue in the spec.
3721 !!!next-token;
3722 redo B;
3723 } else {
3724 !!!cp ('t138');
3725 #
3726 }
3727 } elsif ({
3728 body => 1, html => 1,
3729 }->{$token->{tag_name}}) {
3730 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3731 !!!cp ('t139');
3732 ## As if <head>
3733 !!!create-element ($self->{head_element}, 'head');
3734 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3735 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3736
3737 $self->{insertion_mode} = IN_HEAD_IM;
3738 ## Reprocess in the "in head" insertion mode...
3739 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3740 !!!cp ('t140');
3741 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3742 ## Ignore the token
3743 !!!next-token;
3744 redo B;
3745 } else {
3746 !!!cp ('t141');
3747 }
3748
3749 #
3750 } elsif ({
3751 p => 1, br => 1,
3752 }->{$token->{tag_name}}) {
3753 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3754 !!!cp ('t142');
3755 ## As if <head>
3756 !!!create-element ($self->{head_element}, 'head');
3757 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3758 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3759
3760 $self->{insertion_mode} = IN_HEAD_IM;
3761 ## Reprocess in the "in head" insertion mode...
3762 } else {
3763 !!!cp ('t143');
3764 }
3765
3766 #
3767 } else {
3768 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3769 !!!cp ('t144');
3770 #
3771 } else {
3772 !!!cp ('t145');
3773 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3774 ## Ignore the token
3775 !!!next-token;
3776 redo B;
3777 }
3778 }
3779
3780 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3781 !!!cp ('t146');
3782 ## As if </noscript>
3783 pop @{$self->{open_elements}};
3784 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3785
3786 ## Reprocess in the "in head" insertion mode...
3787 ## As if </head>
3788 pop @{$self->{open_elements}};
3789
3790 ## Reprocess in the "after head" insertion mode...
3791 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3792 !!!cp ('t147');
3793 ## As if </head>
3794 pop @{$self->{open_elements}};
3795
3796 ## Reprocess in the "after head" insertion mode...
3797 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3798 ## ISSUE: This case cannot be reached?
3799 !!!cp ('t148');
3800 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3801 ## Ignore the token ## ISSUE: An issue in the spec.
3802 !!!next-token;
3803 redo B;
3804 } else {
3805 !!!cp ('t149');
3806 }
3807
3808 ## "after head" insertion mode
3809 ## As if <body>
3810 !!!insert-element ('body');
3811 $self->{insertion_mode} = IN_BODY_IM;
3812 ## reprocess
3813 redo B;
3814 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
3815 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3816 !!!cp ('t149.1');
3817
3818 ## NOTE: As if <head>
3819 !!!create-element ($self->{head_element}, 'head');
3820 $self->{open_elements}->[-1]->[0]->append_child
3821 ($self->{head_element});
3822 #push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3823 #$self->{insertion_mode} = IN_HEAD_IM;
3824 ## NOTE: Reprocess.
3825
3826 ## NOTE: As if </head>
3827 #pop @{$self->{open_elements}};
3828 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
3829 ## NOTE: Reprocess.
3830
3831 #
3832 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3833 !!!cp ('t149.2');
3834
3835 ## NOTE: As if </head>
3836 pop @{$self->{open_elements}};
3837 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
3838 ## NOTE: Reprocess.
3839
3840 #
3841 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3842 !!!cp ('t149.3');
3843
3844 !!!parse-error (type => 'in noscript:#eof');
3845
3846 ## As if </noscript>
3847 pop @{$self->{open_elements}};
3848 #$self->{insertion_mode} = IN_HEAD_IM;
3849 ## NOTE: Reprocess.
3850
3851 ## NOTE: As if </head>
3852 pop @{$self->{open_elements}};
3853 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
3854 ## NOTE: Reprocess.
3855
3856 #
3857 } else {
3858 !!!cp ('t149.4');
3859 #
3860 }
3861
3862 ## NOTE: As if <body>
3863 !!!insert-element ('body');
3864 $self->{insertion_mode} = IN_BODY_IM;
3865 ## NOTE: Reprocess.
3866 redo B;
3867 } else {
3868 die "$0: $token->{type}: Unknown token type";
3869 }
3870
3871 ## ISSUE: An issue in the spec.
3872 } elsif ($self->{insertion_mode} & BODY_IMS) {
3873 if ($token->{type} == CHARACTER_TOKEN) {
3874 !!!cp ('t150');
3875 ## NOTE: There is a code clone of "character in body".
3876 $reconstruct_active_formatting_elements->($insert_to_current);
3877
3878 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3879
3880 !!!next-token;
3881 redo B;
3882 } elsif ($token->{type} == START_TAG_TOKEN) {
3883 if ({
3884 caption => 1, col => 1, colgroup => 1, tbody => 1,
3885 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3886 }->{$token->{tag_name}}) {
3887 if ($self->{insertion_mode} == IN_CELL_IM) {
3888 ## have an element in table scope
3889 for (reverse 0..$#{$self->{open_elements}}) {
3890 my $node = $self->{open_elements}->[$_];
3891 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3892 !!!cp ('t151');
3893
3894 ## Close the cell
3895 !!!back-token; # <?>
3896 $token = {type => END_TAG_TOKEN, tag_name => $node->[1]};
3897 redo B;
3898 } elsif ({
3899 table => 1, html => 1,
3900 }->{$node->[1]}) {
3901 !!!cp ('t152');
3902 ## ISSUE: This case can never be reached, maybe.
3903 last;
3904 }
3905 }
3906
3907 !!!cp ('t153');
3908 !!!parse-error (type => 'start tag not allowed',
3909 value => $token->{tag_name});
3910 ## Ignore the token
3911 !!!next-token;
3912 redo B;
3913 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3914 !!!parse-error (type => 'not closed:caption');
3915
3916 ## NOTE: As if </caption>.
3917 ## have a table element in table scope
3918 my $i;
3919 INSCOPE: {
3920 for (reverse 0..$#{$self->{open_elements}}) {
3921 my $node = $self->{open_elements}->[$_];
3922 if ($node->[1] eq 'caption') {
3923 !!!cp ('t155');
3924 $i = $_;
3925 last INSCOPE;
3926 } elsif ({
3927 table => 1, html => 1,
3928 }->{$node->[1]}) {
3929 !!!cp ('t156');
3930 last;
3931 }
3932 }
3933
3934 !!!cp ('t157');
3935 !!!parse-error (type => 'start tag not allowed',
3936 value => $token->{tag_name});
3937 ## Ignore the token
3938 !!!next-token;
3939 redo B;
3940 } # INSCOPE
3941
3942 ## generate implied end tags
3943 while ({
3944 dd => 1, dt => 1, li => 1, p => 1,
3945 }->{$self->{open_elements}->[-1]->[1]}) {
3946 !!!cp ('t158');
3947 pop @{$self->{open_elements}};
3948 }
3949
3950 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3951 !!!cp ('t159');
3952 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3953 } else {
3954 !!!cp ('t160');
3955 }
3956
3957 splice @{$self->{open_elements}}, $i;
3958
3959 $clear_up_to_marker->();
3960
3961 $self->{insertion_mode} = IN_TABLE_IM;
3962
3963 ## reprocess
3964 redo B;
3965 } else {
3966 !!!cp ('t161');
3967 #
3968 }
3969 } else {
3970 !!!cp ('t162');
3971 #
3972 }
3973 } elsif ($token->{type} == END_TAG_TOKEN) {
3974 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3975 if ($self->{insertion_mode} == IN_CELL_IM) {
3976 ## have an element in table scope
3977 my $i;
3978 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3979 my $node = $self->{open_elements}->[$_];
3980 if ($node->[1] eq $token->{tag_name}) {
3981 !!!cp ('t163');
3982 $i = $_;
3983 last INSCOPE;
3984 } elsif ({
3985 table => 1, html => 1,
3986 }->{$node->[1]}) {
3987 !!!cp ('t164');
3988 last INSCOPE;
3989 }
3990 } # INSCOPE
3991 unless (defined $i) {
3992 !!!cp ('t165');
3993 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3994 ## Ignore the token
3995 !!!next-token;
3996 redo B;
3997 }
3998
3999 ## generate implied end tags
4000 while ({
4001 dd => 1, dt => 1, li => 1, p => 1,
4002 }->{$self->{open_elements}->[-1]->[1]}) {
4003 !!!cp ('t166');
4004 pop @{$self->{open_elements}};
4005 }
4006
4007 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4008 !!!cp ('t167');
4009 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4010 } else {
4011 !!!cp ('t168');
4012 }
4013
4014 splice @{$self->{open_elements}}, $i;
4015
4016 $clear_up_to_marker->();
4017
4018 $self->{insertion_mode} = IN_ROW_IM;
4019
4020 !!!next-token;
4021 redo B;
4022 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4023 !!!cp ('t169');
4024 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4025 ## Ignore the token
4026 !!!next-token;
4027 redo B;
4028 } else {
4029 !!!cp ('t170');
4030 #
4031 }
4032 } elsif ($token->{tag_name} eq 'caption') {
4033 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4034 ## have a table element in table scope
4035 my $i;
4036 INSCOPE: {
4037 for (reverse 0..$#{$self->{open_elements}}) {
4038 my $node = $self->{open_elements}->[$_];
4039 if ($node->[1] eq $token->{tag_name}) {
4040 !!!cp ('t171');
4041 $i = $_;
4042 last INSCOPE;
4043 } elsif ({
4044 table => 1, html => 1,
4045 }->{$node->[1]}) {
4046 !!!cp ('t172');
4047 last;
4048 }
4049 }
4050
4051 !!!cp ('t173');
4052 !!!parse-error (type => 'unmatched end tag',
4053 value => $token->{tag_name});
4054 ## Ignore the token
4055 !!!next-token;
4056 redo B;
4057 } # INSCOPE
4058
4059 ## generate implied end tags
4060 while ({
4061 dd => 1, dt => 1, li => 1, p => 1,
4062 }->{$self->{open_elements}->[-1]->[1]}) {
4063 !!!cp ('t174');
4064 pop @{$self->{open_elements}};
4065 }
4066
4067 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4068 !!!cp ('t175');
4069 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4070 } else {
4071 !!!cp ('t176');
4072 }
4073
4074 splice @{$self->{open_elements}}, $i;
4075
4076 $clear_up_to_marker->();
4077
4078 $self->{insertion_mode} = IN_TABLE_IM;
4079
4080 !!!next-token;
4081 redo B;
4082 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4083 !!!cp ('t177');
4084 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4085 ## Ignore the token
4086 !!!next-token;
4087 redo B;
4088 } else {
4089 !!!cp ('t178');
4090 #
4091 }
4092 } elsif ({
4093 table => 1, tbody => 1, tfoot => 1,
4094 thead => 1, tr => 1,
4095 }->{$token->{tag_name}} and
4096 $self->{insertion_mode} == IN_CELL_IM) {
4097 ## have an element in table scope
4098 my $i;
4099 my $tn;
4100 INSCOPE: {
4101 for (reverse 0..$#{$self->{open_elements}}) {
4102 my $node = $self->{open_elements}->[$_];
4103 if ($node->[1] eq $token->{tag_name}) {
4104 !!!cp ('t179');
4105 $i = $_;
4106
4107 ## Close the cell
4108 !!!back-token; # </?>
4109 $token = {type => END_TAG_TOKEN, tag_name => $tn};
4110 redo B;
4111 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4112 !!!cp ('t180');
4113 $tn = $node->[1];
4114 ## NOTE: There is exactly one |td| or |th| element
4115 ## in scope in the stack of open elements by definition.
4116 } elsif ({
4117 table => 1, html => 1,
4118 }->{$node->[1]}) {
4119 ## ISSUE: Can this be reached?
4120 !!!cp ('t181');
4121 last;
4122 }
4123 }
4124
4125 !!!cp ('t182');
4126 !!!parse-error (type => 'unmatched end tag',
4127 value => $token->{tag_name});
4128 ## Ignore the token
4129 !!!next-token;
4130 redo B;
4131 } # INSCOPE
4132 } elsif ($token->{tag_name} eq 'table' and
4133 $self->{insertion_mode} == IN_CAPTION_IM) {
4134 !!!parse-error (type => 'not closed:caption');
4135
4136 ## As if </caption>
4137 ## have a table element in table scope
4138 my $i;
4139 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4140 my $node = $self->{open_elements}->[$_];
4141 if ($node->[1] eq 'caption') {
4142 !!!cp ('t184');
4143 $i = $_;
4144 last INSCOPE;
4145 } elsif ({
4146 table => 1, html => 1,
4147 }->{$node->[1]}) {
4148 !!!cp ('t185');
4149 last INSCOPE;
4150 }
4151 } # INSCOPE
4152 unless (defined $i) {
4153 !!!cp ('t186');
4154 !!!parse-error (type => 'unmatched end tag:caption');
4155 ## Ignore the token
4156 !!!next-token;
4157 redo B;
4158 }
4159
4160 ## generate implied end tags
4161 while ({
4162 dd => 1, dt => 1, li => 1, p => 1,
4163 }->{$self->{open_elements}->[-1]->[1]}) {
4164 !!!cp ('t187');
4165 pop @{$self->{open_elements}};
4166 }
4167
4168 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4169 !!!cp ('t188');
4170 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4171 } else {
4172 !!!cp ('t189');
4173 }
4174
4175 splice @{$self->{open_elements}}, $i;
4176
4177 $clear_up_to_marker->();
4178
4179 $self->{insertion_mode} = IN_TABLE_IM;
4180
4181 ## reprocess
4182 redo B;
4183 } elsif ({
4184 body => 1, col => 1, colgroup => 1, html => 1,
4185 }->{$token->{tag_name}}) {
4186 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
4187 !!!cp ('t190');
4188 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4189 ## Ignore the token
4190 !!!next-token;
4191 redo B;
4192 } else {
4193 !!!cp ('t191');
4194 #
4195 }
4196 } elsif ({
4197 tbody => 1, tfoot => 1,
4198 thead => 1, tr => 1,
4199 }->{$token->{tag_name}} and
4200 $self->{insertion_mode} == IN_CAPTION_IM) {
4201 !!!cp ('t192');
4202 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4203 ## Ignore the token
4204 !!!next-token;
4205 redo B;
4206 } else {
4207 !!!cp ('t193');
4208 #
4209 }
4210 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4211 for my $entry (@{$self->{open_elements}}) {
4212 if (not {
4213 dd => 1, dt => 1, li => 1, p => 1, tbody => 1, td => 1, tfoot => 1,
4214 th => 1, thead => 1, tr => 1, body => 1, html => 1,
4215 }->{$entry->[1]}) {
4216 !!!cp ('t75');
4217 !!!parse-error (type => 'in body:#eof');
4218 last;
4219 }
4220 }
4221
4222 ## Stop parsing.
4223 last B;
4224 } else {
4225 die "$0: $token->{type}: Unknown token type";
4226 }
4227
4228 $insert = $insert_to_current;
4229 #
4230 } elsif ($self->{insertion_mode} & TABLE_IMS) {
4231 if ($token->{type} == CHARACTER_TOKEN) {
4232 if (not $open_tables->[-1]->[1] and # tainted
4233 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4234 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4235
4236 unless (length $token->{data}) {
4237 !!!cp ('t194');
4238 !!!next-token;
4239 redo B;
4240 } else {
4241 !!!cp ('t195');
4242 }
4243 }
4244
4245 !!!parse-error (type => 'in table:#character');
4246
4247 ## As if in body, but insert into foster parent element
4248 ## ISSUE: Spec says that "whenever a node would be inserted
4249 ## into the current node" while characters might not
4250 ## result in a new Text node.
4251 $reconstruct_active_formatting_elements->($insert_to_foster);
4252
4253 if ({
4254 table => 1, tbody => 1, tfoot => 1,
4255 thead => 1, tr => 1,
4256 }->{$self->{open_elements}->[-1]->[1]}) {
4257 # MUST
4258 my $foster_parent_element;
4259 my $next_sibling;
4260 my $prev_sibling;
4261 OE: for (reverse 0..$#{$self->{open_elements}}) {
4262 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4263 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4264 if (defined $parent and $parent->node_type == 1) {
4265 !!!cp ('t196');
4266 $foster_parent_element = $parent;
4267 $next_sibling = $self->{open_elements}->[$_]->[0];
4268 $prev_sibling = $next_sibling->previous_sibling;
4269 } else {
4270 !!!cp ('t197');
4271 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4272 $prev_sibling = $foster_parent_element->last_child;
4273 }
4274 last OE;
4275 }
4276 } # OE
4277 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4278 $prev_sibling = $foster_parent_element->last_child
4279 unless defined $foster_parent_element;
4280 if (defined $prev_sibling and
4281 $prev_sibling->node_type == 3) {
4282 !!!cp ('t198');
4283 $prev_sibling->manakai_append_text ($token->{data});
4284 } else {
4285 !!!cp ('t199');
4286 $foster_parent_element->insert_before
4287 ($self->{document}->create_text_node ($token->{data}),
4288 $next_sibling);
4289 }
4290 $open_tables->[-1]->[1] = 1; # tainted
4291 } else {
4292 !!!cp ('t200');
4293 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4294 }
4295
4296 !!!next-token;
4297 redo B;
4298 } elsif ($token->{type} == START_TAG_TOKEN) {
4299 if ({
4300 tr => ($self->{insertion_mode} != IN_ROW_IM),
4301 th => 1, td => 1,
4302 }->{$token->{tag_name}}) {
4303 if ($self->{insertion_mode} == IN_TABLE_IM) {
4304 ## Clear back to table context
4305 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4306 $self->{open_elements}->[-1]->[1] ne 'html') {
4307 !!!cp ('t201');
4308 pop @{$self->{open_elements}};
4309 }
4310
4311 !!!insert-element ('tbody');
4312 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4313 ## reprocess in the "in table body" insertion mode...
4314 }
4315
4316 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4317 unless ($token->{tag_name} eq 'tr') {
4318 !!!cp ('t202');
4319 !!!parse-error (type => 'missing start tag:tr');
4320 }
4321
4322 ## Clear back to table body context
4323 while (not {
4324 tbody => 1, tfoot => 1, thead => 1, html => 1,
4325 }->{$self->{open_elements}->[-1]->[1]}) {
4326 !!!cp ('t203');
4327 ## ISSUE: Can this case be reached?
4328 pop @{$self->{open_elements}};
4329 }
4330
4331 $self->{insertion_mode} = IN_ROW_IM;
4332 if ($token->{tag_name} eq 'tr') {
4333 !!!cp ('t204');
4334 !!!insert-element ($token->{tag_name}, $token->{attributes});
4335 !!!next-token;
4336 redo B;
4337 } else {
4338 !!!cp ('t205');
4339 !!!insert-element ('tr');
4340 ## reprocess in the "in row" insertion mode
4341 }
4342 } else {
4343 !!!cp ('t206');
4344 }
4345
4346 ## Clear back to table row context
4347 while (not {
4348 tr => 1, html => 1,
4349 }->{$self->{open_elements}->[-1]->[1]}) {
4350 !!!cp ('t207');
4351 pop @{$self->{open_elements}};
4352 }
4353
4354 !!!insert-element ($token->{tag_name}, $token->{attributes});
4355 $self->{insertion_mode} = IN_CELL_IM;
4356
4357 push @$active_formatting_elements, ['#marker', ''];
4358
4359 !!!next-token;
4360 redo B;
4361 } elsif ({
4362 caption => 1, col => 1, colgroup => 1,
4363 tbody => 1, tfoot => 1, thead => 1,
4364 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4365 }->{$token->{tag_name}}) {
4366 if ($self->{insertion_mode} == IN_ROW_IM) {
4367 ## As if </tr>
4368 ## have an element in table scope
4369 my $i;
4370 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4371 my $node = $self->{open_elements}->[$_];
4372 if ($node->[1] eq 'tr') {
4373 !!!cp ('t208');
4374 $i = $_;
4375 last INSCOPE;
4376 } elsif ({
4377 html => 1,
4378
4379 ## NOTE: This element does not appear here, maybe.
4380 table => 1,
4381 }->{$node->[1]}) {
4382 !!!cp ('t209');
4383 last INSCOPE;
4384 }
4385 } # INSCOPE
4386 unless (defined $i) {
4387 !!!cp ('t210');
4388 ## TODO: This type is wrong.
4389 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
4390 ## Ignore the token
4391 !!!next-token;
4392 redo B;
4393 }
4394
4395 ## Clear back to table row context
4396 while (not {
4397 tr => 1, html => 1,
4398 }->{$self->{open_elements}->[-1]->[1]}) {
4399 !!!cp ('t211');
4400 ## ISSUE: Can this case be reached?
4401 pop @{$self->{open_elements}};
4402 }
4403
4404 pop @{$self->{open_elements}}; # tr
4405 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4406 if ($token->{tag_name} eq 'tr') {
4407 !!!cp ('t212');
4408 ## reprocess
4409 redo B;
4410 } else {
4411 !!!cp ('t213');
4412 ## reprocess in the "in table body" insertion mode...
4413 }
4414 }
4415
4416 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4417 ## have an element in table scope
4418 my $i;
4419 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4420 my $node = $self->{open_elements}->[$_];
4421 if ({
4422 tbody => 1, thead => 1, tfoot => 1,
4423 }->{$node->[1]}) {
4424 !!!cp ('t214');
4425 $i = $_;
4426 last INSCOPE;
4427 } elsif ({
4428 table => 1, html => 1,
4429 }->{$node->[1]}) {
4430 !!!cp ('t215');
4431 last INSCOPE;
4432 }
4433 } # INSCOPE
4434 unless (defined $i) {
4435 !!!cp ('t216');
4436 ## TODO: This error type is wrong.
4437 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4438 ## Ignore the token
4439 !!!next-token;
4440 redo B;
4441 }
4442
4443 ## Clear back to table body context
4444 while (not {
4445 tbody => 1, tfoot => 1, thead => 1, html => 1,
4446 }->{$self->{open_elements}->[-1]->[1]}) {
4447 !!!cp ('t217');
4448 ## ISSUE: Can this state be reached?
4449 pop @{$self->{open_elements}};
4450 }
4451
4452 ## As if <{current node}>
4453 ## have an element in table scope
4454 ## true by definition
4455
4456 ## Clear back to table body context
4457 ## nop by definition
4458
4459 pop @{$self->{open_elements}};
4460 $self->{insertion_mode} = IN_TABLE_IM;
4461 ## reprocess in "in table" insertion mode...
4462 } else {
4463 !!!cp ('t218');
4464 }
4465
4466 if ($token->{tag_name} eq 'col') {
4467 ## Clear back to table context
4468 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4469 $self->{open_elements}->[-1]->[1] ne 'html') {
4470 !!!cp ('t219');
4471 ## ISSUE: Can this state be reached?
4472 pop @{$self->{open_elements}};
4473 }
4474
4475 !!!insert-element ('colgroup');
4476 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
4477 ## reprocess
4478 redo B;
4479 } elsif ({
4480 caption => 1,
4481 colgroup => 1,
4482 tbody => 1, tfoot => 1, thead => 1,
4483 }->{$token->{tag_name}}) {
4484 ## Clear back to table context
4485 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4486 $self->{open_elements}->[-1]->[1] ne 'html') {
4487 !!!cp ('t220');
4488 ## ISSUE: Can this state be reached?
4489 pop @{$self->{open_elements}};
4490 }
4491
4492 push @$active_formatting_elements, ['#marker', '']
4493 if $token->{tag_name} eq 'caption';
4494
4495 !!!insert-element ($token->{tag_name}, $token->{attributes});
4496 $self->{insertion_mode} = {
4497 caption => IN_CAPTION_IM,
4498 colgroup => IN_COLUMN_GROUP_IM,
4499 tbody => IN_TABLE_BODY_IM,
4500 tfoot => IN_TABLE_BODY_IM,
4501 thead => IN_TABLE_BODY_IM,
4502 }->{$token->{tag_name}};
4503 !!!next-token;
4504 redo B;
4505 } else {
4506 die "$0: in table: <>: $token->{tag_name}";
4507 }
4508 } elsif ($token->{tag_name} eq 'table') {
4509 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4510
4511 ## As if </table>
4512 ## have a table element in table scope
4513 my $i;
4514 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4515 my $node = $self->{open_elements}->[$_];
4516 if ($node->[1] eq 'table') {
4517 !!!cp ('t221');
4518 $i = $_;
4519 last INSCOPE;
4520 } elsif ({
4521 #table => 1,
4522 html => 1,
4523 }->{$node->[1]}) {
4524 !!!cp ('t222');
4525 last INSCOPE;
4526 }
4527 } # INSCOPE
4528 unless (defined $i) {
4529 !!!cp ('t223');
4530 ## TODO: The following is wrong, maybe.
4531 !!!parse-error (type => 'unmatched end tag:table');
4532 ## Ignore tokens </table><table>
4533 !!!next-token;
4534 redo B;
4535 }
4536
4537 ## TODO: The following steps have been removed from the latest spec.
4538 ## generate implied end tags
4539 while ({
4540 dd => 1, dt => 1, li => 1, p => 1,
4541 }->{$self->{open_elements}->[-1]->[1]}) {
4542 !!!cp ('t224');
4543 pop @{$self->{open_elements}};
4544 }
4545
4546 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4547 !!!cp ('t225');
4548 ## ISSUE: Can this case be reached?
4549 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4550 } else {
4551 !!!cp ('t226');
4552 }
4553
4554 splice @{$self->{open_elements}}, $i;
4555 pop @{$open_tables};
4556
4557 $self->_reset_insertion_mode;
4558
4559 ## reprocess
4560 redo B;
4561 } elsif ($token->{tag_name} eq 'style') {
4562 if (not $open_tables->[-1]->[1]) { # tainted
4563 !!!cp ('t227.8');
4564 ## NOTE: This is a "as if in head" code clone.
4565 $parse_rcdata->(CDATA_CONTENT_MODEL);
4566 redo B;
4567 } else {
4568 !!!cp ('t227.7');
4569 #
4570 }
4571 } elsif ($token->{tag_name} eq 'script') {
4572 if (not $open_tables->[-1]->[1]) { # tainted
4573 !!!cp ('t227.6');
4574 ## NOTE: This is a "as if in head" code clone.
4575 $script_start_tag->();
4576 redo B;
4577 } else {
4578 !!!cp ('t227.5');
4579 #
4580 }
4581 } elsif ($token->{tag_name} eq 'input') {
4582 if (not $open_tables->[-1]->[1]) { # tainted
4583 if ($token->{attributes}->{type}) { ## TODO: case
4584 my $type = lc $token->{attributes}->{type}->{value};
4585 if ($type eq 'hidden') {
4586 !!!cp ('t227.3');
4587 !!!parse-error (type => 'in table:'.$token->{tag_name});
4588
4589 !!!insert-element ($token->{tag_name}, $token->{attributes});
4590
4591 ## TODO: form element pointer
4592
4593 pop @{$self->{open_elements}};
4594
4595 !!!next-token;
4596 redo B;
4597 } else {
4598 !!!cp ('t227.2');
4599 #
4600 }
4601 } else {
4602 !!!cp ('t227.1');
4603 #
4604 }
4605 } else {
4606 !!!cp ('t227.4');
4607 #
4608 }
4609 } else {
4610 !!!cp ('t227');
4611 #
4612 }
4613
4614 !!!parse-error (type => 'in table:'.$token->{tag_name});
4615
4616 $insert = $insert_to_foster;
4617 #
4618 } elsif ($token->{type} == END_TAG_TOKEN) {
4619 if ($token->{tag_name} eq 'tr' and
4620 $self->{insertion_mode} == IN_ROW_IM) {
4621 ## have an element in table scope
4622 my $i;
4623 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4624 my $node = $self->{open_elements}->[$_];
4625 if ($node->[1] eq $token->{tag_name}) {
4626 !!!cp ('t228');
4627 $i = $_;
4628 last INSCOPE;
4629 } elsif ({
4630 table => 1, html => 1,
4631 }->{$node->[1]}) {
4632 !!!cp ('t229');
4633 last INSCOPE;
4634 }
4635 } # INSCOPE
4636 unless (defined $i) {
4637 !!!cp ('t230');
4638 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4639 ## Ignore the token
4640 !!!next-token;
4641 redo B;
4642 } else {
4643 !!!cp ('t232');
4644 }
4645
4646 ## Clear back to table row context
4647 while (not {
4648 tr => 1, html => 1,
4649 }->{$self->{open_elements}->[-1]->[1]}) {
4650 !!!cp ('t231');
4651 ## ISSUE: Can this state be reached?
4652 pop @{$self->{open_elements}};
4653 }
4654
4655 pop @{$self->{open_elements}}; # tr
4656 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4657 !!!next-token;
4658 redo B;
4659 } elsif ($token->{tag_name} eq 'table') {
4660 if ($self->{insertion_mode} == IN_ROW_IM) {
4661 ## As if </tr>
4662 ## have an element in table scope
4663 my $i;
4664 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4665 my $node = $self->{open_elements}->[$_];
4666 if ($node->[1] eq 'tr') {
4667 !!!cp ('t233');
4668 $i = $_;
4669 last INSCOPE;
4670 } elsif ({
4671 table => 1, html => 1,
4672 }->{$node->[1]}) {
4673 !!!cp ('t234');
4674 last INSCOPE;
4675 }
4676 } # INSCOPE
4677 unless (defined $i) {
4678 !!!cp ('t235');
4679 ## TODO: The following is wrong.
4680 !!!parse-error (type => 'unmatched end tag:'.$token->{type});
4681 ## Ignore the token
4682 !!!next-token;
4683 redo B;
4684 }
4685
4686 ## Clear back to table row context
4687 while (not {
4688 tr => 1, html => 1,
4689 }->{$self->{open_elements}->[-1]->[1]}) {
4690 !!!cp ('t236');
4691 ## ISSUE: Can this state be reached?
4692 pop @{$self->{open_elements}};
4693 }
4694
4695 pop @{$self->{open_elements}}; # tr
4696 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4697 ## reprocess in the "in table body" insertion mode...
4698 }
4699
4700 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4701 ## have an element in table scope
4702 my $i;
4703 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4704 my $node = $self->{open_elements}->[$_];
4705 if ({
4706 tbody => 1, thead => 1, tfoot => 1,
4707 }->{$node->[1]}) {
4708 !!!cp ('t237');
4709 $i = $_;
4710 last INSCOPE;
4711 } elsif ({
4712 table => 1, html => 1,
4713 }->{$node->[1]}) {
4714 !!!cp ('t238');
4715 last INSCOPE;
4716 }
4717 } # INSCOPE
4718 unless (defined $i) {
4719 !!!cp ('t239');
4720 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4721 ## Ignore the token
4722 !!!next-token;
4723 redo B;
4724 }
4725
4726 ## Clear back to table body context
4727 while (not {
4728 tbody => 1, tfoot => 1, thead => 1, html => 1,
4729 }->{$self->{open_elements}->[-1]->[1]}) {
4730 !!!cp ('t240');
4731 pop @{$self->{open_elements}};
4732 }
4733
4734 ## As if <{current node}>
4735 ## have an element in table scope
4736 ## true by definition
4737
4738 ## Clear back to table body context
4739 ## nop by definition
4740
4741 pop @{$self->{open_elements}};
4742 $self->{insertion_mode} = IN_TABLE_IM;
4743 ## reprocess in the "in table" insertion mode...
4744 }
4745
4746 ## NOTE: </table> in the "in table" insertion mode.
4747 ## When you edit the code fragment below, please ensure that
4748 ## the code for <table> in the "in table" insertion mode
4749 ## is synced with it.
4750
4751 ## have a table element in table scope
4752 my $i;
4753 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4754 my $node = $self->{open_elements}->[$_];
4755 if ($node->[1] eq $token->{tag_name}) {
4756 !!!cp ('t241');
4757 $i = $_;
4758 last INSCOPE;
4759 } elsif ({
4760 table => 1, html => 1,
4761 }->{$node->[1]}) {
4762 !!!cp ('t242');
4763 last INSCOPE;
4764 }
4765 } # INSCOPE
4766 unless (defined $i) {
4767 !!!cp ('t243');
4768 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4769 ## Ignore the token
4770 !!!next-token;
4771 redo B;
4772 }
4773
4774 splice @{$self->{open_elements}}, $i;
4775 pop @{$open_tables};
4776
4777 $self->_reset_insertion_mode;
4778
4779 !!!next-token;
4780 redo B;
4781 } elsif ({
4782 tbody => 1, tfoot => 1, thead => 1,
4783 }->{$token->{tag_name}} and
4784 $self->{insertion_mode} & ROW_IMS) {
4785 if ($self->{insertion_mode} == IN_ROW_IM) {
4786 ## have an element in table scope
4787 my $i;
4788 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4789 my $node = $self->{open_elements}->[$_];
4790 if ($node->[1] eq $token->{tag_name}) {
4791 !!!cp ('t247');
4792 $i = $_;
4793 last INSCOPE;
4794 } elsif ({
4795 table => 1, html => 1,
4796 }->{$node->[1]}) {
4797 !!!cp ('t248');
4798 last INSCOPE;
4799 }
4800 } # INSCOPE
4801 unless (defined $i) {
4802 !!!cp ('t249');
4803 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4804 ## Ignore the token
4805 !!!next-token;
4806 redo B;
4807 }
4808
4809 ## As if </tr>
4810 ## have an element in table scope
4811 my $i;
4812 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4813 my $node = $self->{open_elements}->[$_];
4814 if ($node->[1] eq 'tr') {
4815 !!!cp ('t250');
4816 $i = $_;
4817 last INSCOPE;
4818 } elsif ({
4819 table => 1, html => 1,
4820 }->{$node->[1]}) {
4821 !!!cp ('t251');
4822 last INSCOPE;
4823 }
4824 } # INSCOPE
4825 unless (defined $i) {
4826 !!!cp ('t252');
4827 !!!parse-error (type => 'unmatched end tag:tr');
4828 ## Ignore the token
4829 !!!next-token;
4830 redo B;
4831 }
4832
4833 ## Clear back to table row context
4834 while (not {
4835 tr => 1, html => 1,
4836 }->{$self->{open_elements}->[-1]->[1]}) {
4837 !!!cp ('t253');
4838 ## ISSUE: Can this case be reached?
4839 pop @{$self->{open_elements}};
4840 }
4841
4842 pop @{$self->{open_elements}}; # tr
4843 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4844 ## reprocess in the "in table body" insertion mode...
4845 }
4846
4847 ## have an element in table scope
4848 my $i;
4849 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4850 my $node = $self->{open_elements}->[$_];
4851 if ($node->[1] eq $token->{tag_name}) {
4852 !!!cp ('t254');
4853 $i = $_;
4854 last INSCOPE;
4855 } elsif ({
4856 table => 1, html => 1,
4857 }->{$node->[1]}) {
4858 !!!cp ('t255');
4859 last INSCOPE;
4860 }
4861 } # INSCOPE
4862 unless (defined $i) {
4863 !!!cp ('t256');
4864 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4865 ## Ignore the token
4866 !!!next-token;
4867 redo B;
4868 }
4869
4870 ## Clear back to table body context
4871 while (not {
4872 tbody => 1, tfoot => 1, thead => 1, html => 1,
4873 }->{$self->{open_elements}->[-1]->[1]}) {
4874 !!!cp ('t257');
4875 ## ISSUE: Can this case be reached?
4876 pop @{$self->{open_elements}};
4877 }
4878
4879 pop @{$self->{open_elements}};
4880 $self->{insertion_mode} = IN_TABLE_IM;
4881 !!!next-token;
4882 redo B;
4883 } elsif ({
4884 body => 1, caption => 1, col => 1, colgroup => 1,
4885 html => 1, td => 1, th => 1,
4886 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4887 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
4888 }->{$token->{tag_name}}) {
4889 !!!cp ('t258');
4890 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4891 ## Ignore the token
4892 !!!next-token;
4893 redo B;
4894 } else {
4895 !!!cp ('t259');
4896 !!!parse-error (type => 'in table:/'.$token->{tag_name});
4897
4898 $insert = $insert_to_foster;
4899 #
4900 }
4901 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4902 unless ($self->{open_elements}->[-1]->[1] eq 'html' and
4903 @{$self->{open_elements}} == 1) { # redundant, maybe
4904 !!!parse-error (type => 'in body:#eof');
4905 !!!cp ('t259.1');
4906 #
4907 } else {
4908 !!!cp ('t259.2');
4909 #
4910 }
4911
4912 ## Stop parsing
4913 last B;
4914 } else {
4915 die "$0: $token->{type}: Unknown token type";
4916 }
4917 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4918 if ($token->{type} == CHARACTER_TOKEN) {
4919 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4920 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4921 unless (length $token->{data}) {
4922 !!!cp ('t260');
4923 !!!next-token;
4924 redo B;
4925 }
4926 }
4927
4928 !!!cp ('t261');
4929 #
4930 } elsif ($token->{type} == START_TAG_TOKEN) {
4931 if ($token->{tag_name} eq 'col') {
4932 !!!cp ('t262');
4933 !!!insert-element ($token->{tag_name}, $token->{attributes});
4934 pop @{$self->{open_elements}};
4935 !!!next-token;
4936 redo B;
4937 } else {
4938 !!!cp ('t263');
4939 #
4940 }
4941 } elsif ($token->{type} == END_TAG_TOKEN) {
4942 if ($token->{tag_name} eq 'colgroup') {
4943 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4944 !!!cp ('t264');
4945 !!!parse-error (type => 'unmatched end tag:colgroup');
4946 ## Ignore the token
4947 !!!next-token;
4948 redo B;
4949 } else {
4950 !!!cp ('t265');
4951 pop @{$self->{open_elements}}; # colgroup
4952 $self->{insertion_mode} = IN_TABLE_IM;
4953 !!!next-token;
4954 redo B;
4955 }
4956 } elsif ($token->{tag_name} eq 'col') {
4957 !!!cp ('t266');
4958 !!!parse-error (type => 'unmatched end tag:col');
4959 ## Ignore the token
4960 !!!next-token;
4961 redo B;
4962 } else {
4963 !!!cp ('t267');
4964 #
4965 }
4966 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4967 if ($self->{open_elements}->[-1]->[1] eq 'html' or
4968 @{$self->{open_elements}} == 1) { # redundant, maybe
4969 !!!cp ('t270.2');
4970 ## Stop parsing.
4971 last B;
4972 } else {
4973 ## NOTE: As if </colgroup>.
4974 !!!cp ('t270.1');
4975 pop @{$self->{open_elements}}; # colgroup
4976 $self->{insertion_mode} = IN_TABLE_IM;
4977 ## Reprocess.
4978 redo B;
4979 }
4980 } else {
4981 die "$0: $token->{type}: Unknown token type";
4982 }
4983
4984 ## As if </colgroup>
4985 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4986 !!!cp ('t269');
4987 ## TODO: Wrong error type?
4988 !!!parse-error (type => 'unmatched end tag:colgroup');
4989 ## Ignore the token
4990 !!!next-token;
4991 redo B;
4992 } else {
4993 !!!cp ('t270');
4994 pop @{$self->{open_elements}}; # colgroup
4995 $self->{insertion_mode} = IN_TABLE_IM;
4996 ## reprocess
4997 redo B;
4998 }
4999 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5000 if ($token->{type} == CHARACTER_TOKEN) {
5001 !!!cp ('t271');
5002 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5003 !!!next-token;
5004 redo B;
5005 } elsif ($token->{type} == START_TAG_TOKEN) {
5006 if ($token->{tag_name} eq 'option') {
5007 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5008 !!!cp ('t272');
5009 ## As if </option>
5010 pop @{$self->{open_elements}};
5011 } else {
5012 !!!cp ('t273');
5013 }
5014
5015 !!!insert-element ($token->{tag_name}, $token->{attributes});
5016 !!!next-token;
5017 redo B;
5018 } elsif ($token->{tag_name} eq 'optgroup') {
5019 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5020 !!!cp ('t274');
5021 ## As if </option>
5022 pop @{$self->{open_elements}};
5023 } else {
5024 !!!cp ('t275');
5025 }
5026
5027 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
5028 !!!cp ('t276');
5029 ## As if </optgroup>
5030 pop @{$self->{open_elements}};
5031 } else {
5032 !!!cp ('t277');
5033 }
5034
5035 !!!insert-element ($token->{tag_name}, $token->{attributes});
5036 !!!next-token;
5037 redo B;
5038 } elsif ($token->{tag_name} eq 'select' or
5039 $token->{tag_name} eq 'input' or
5040 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5041 {
5042 caption => 1, table => 1,
5043 tbody => 1, tfoot => 1, thead => 1,
5044 tr => 1, td => 1, th => 1,
5045 }->{$token->{tag_name}})) {
5046 ## TODO: The type below is not good - <select> is replaced by </select>
5047 !!!parse-error (type => 'not closed:select');
5048 ## NOTE: As if the token were </select> (<select> case) or
5049 ## as if there were </select> (otherwise).
5050 ## have an element in table scope
5051 my $i;
5052 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5053 my $node = $self->{open_elements}->[$_];
5054 if ($node->[1] eq 'select') {
5055 !!!cp ('t278');
5056 $i = $_;
5057 last INSCOPE;
5058 } elsif ({
5059 table => 1, html => 1,
5060 }->{$node->[1]}) {
5061 !!!cp ('t279');
5062 last INSCOPE;
5063 }
5064 } # INSCOPE
5065 unless (defined $i) {
5066 !!!cp ('t280');
5067 !!!parse-error (type => 'unmatched end tag:select');
5068 ## Ignore the token
5069 !!!next-token;
5070 redo B;
5071 }
5072
5073 !!!cp ('t281');
5074 splice @{$self->{open_elements}}, $i;
5075
5076 $self->_reset_insertion_mode;
5077
5078 if ($token->{tag_name} eq 'select') {
5079 !!!cp ('t281.2');
5080 !!!next-token;
5081 redo B;
5082 } else {
5083 !!!cp ('t281.1');
5084 ## Reprocess the token.
5085 redo B;
5086 }
5087 } else {
5088 !!!cp ('t282');
5089 !!!parse-error (type => 'in select:'.$token->{tag_name});
5090 ## Ignore the token
5091 !!!next-token;
5092 redo B;
5093 }
5094 } elsif ($token->{type} == END_TAG_TOKEN) {
5095 if ($token->{tag_name} eq 'optgroup') {
5096 if ($self->{open_elements}->[-1]->[1] eq 'option' and
5097 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
5098 !!!cp ('t283');
5099 ## As if </option>
5100 splice @{$self->{open_elements}}, -2;
5101 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
5102 !!!cp ('t284');
5103 pop @{$self->{open_elements}};
5104 } else {
5105 !!!cp ('t285');
5106 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5107 ## Ignore the token
5108 }
5109 !!!next-token;
5110 redo B;
5111 } elsif ($token->{tag_name} eq 'option') {
5112 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5113 !!!cp ('t286');
5114 pop @{$self->{open_elements}};
5115 } else {
5116 !!!cp ('t287');
5117 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5118 ## Ignore the token
5119 }
5120 !!!next-token;
5121 redo B;
5122 } elsif ($token->{tag_name} eq 'select') {
5123 ## have an element in table scope
5124 my $i;
5125 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5126 my $node = $self->{open_elements}->[$_];
5127 if ($node->[1] eq $token->{tag_name}) {
5128 !!!cp ('t288');
5129 $i = $_;
5130 last INSCOPE;
5131 } elsif ({
5132 table => 1, html => 1,
5133 }->{$node->[1]}) {
5134 !!!cp ('t289');
5135 last INSCOPE;
5136 }
5137 } # INSCOPE
5138 unless (defined $i) {
5139 !!!cp ('t290');
5140 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5141 ## Ignore the token
5142 !!!next-token;
5143 redo B;
5144 }
5145
5146 !!!cp ('t291');
5147 splice @{$self->{open_elements}}, $i;
5148
5149 $self->_reset_insertion_mode;
5150
5151 !!!next-token;
5152 redo B;
5153 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5154 {
5155 caption => 1, table => 1, tbody => 1,
5156 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5157 }->{$token->{tag_name}}) {
5158 ## TODO: The following is wrong?
5159 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5160
5161 ## have an element in table scope
5162 my $i;
5163 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5164 my $node = $self->{open_elements}->[$_];
5165 if ($node->[1] eq $token->{tag_name}) {
5166 !!!cp ('t292');
5167 $i = $_;
5168 last INSCOPE;
5169 } elsif ({
5170 table => 1, html => 1,
5171 }->{$node->[1]}) {
5172 !!!cp ('t293');
5173 last INSCOPE;
5174 }
5175 } # INSCOPE
5176 unless (defined $i) {
5177 !!!cp ('t294');
5178 ## Ignore the token
5179 !!!next-token;
5180 redo B;
5181 }
5182
5183 ## As if </select>
5184 ## have an element in table scope
5185 undef $i;
5186 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5187 my $node = $self->{open_elements}->[$_];
5188 if ($node->[1] eq 'select') {
5189 !!!cp ('t295');
5190 $i = $_;
5191 last INSCOPE;
5192 } elsif ({
5193 table => 1, html => 1,
5194 }->{$node->[1]}) {
5195 ## ISSUE: Can this state be reached?
5196 !!!cp ('t296');
5197 last INSCOPE;
5198 }
5199 } # INSCOPE
5200 unless (defined $i) {
5201 !!!cp ('t297');
5202 ## TODO: Is the following error type correct?
5203 !!!parse-error (type => 'unmatched end tag:select');
5204 ## Ignore the </select> token
5205 !!!next-token; ## TODO: ok?
5206 redo B;
5207 }
5208
5209 !!!cp ('t298');
5210 splice @{$self->{open_elements}}, $i;
5211
5212 $self->_reset_insertion_mode;
5213
5214 ## reprocess
5215 redo B;
5216 } else {
5217 !!!cp ('t299');
5218 !!!parse-error (type => 'in select:/'.$token->{tag_name});
5219 ## Ignore the token
5220 !!!next-token;
5221 redo B;
5222 }
5223 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5224 unless ($self->{open_elements}->[-1]->[1] eq 'html' and
5225 @{$self->{open_elements}} == 1) { # redundant, maybe
5226 !!!cp ('t299.1');
5227 !!!parse-error (type => 'in body:#eof');
5228 } else {
5229 !!!cp ('t299.2');
5230 }
5231
5232 ## Stop parsing.
5233 last B;
5234 } else {
5235 die "$0: $token->{type}: Unknown token type";
5236 }
5237 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
5238 if ($token->{type} == CHARACTER_TOKEN) {
5239 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5240 my $data = $1;
5241 ## As if in body
5242 $reconstruct_active_formatting_elements->($insert_to_current);
5243
5244 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5245
5246 unless (length $token->{data}) {
5247 !!!cp ('t300');
5248 !!!next-token;
5249 redo B;
5250 }
5251 }
5252
5253 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5254 !!!cp ('t301');
5255 !!!parse-error (type => 'after html:#character');
5256
5257 ## Reprocess in the "after body" insertion mode.
5258 } else {
5259 !!!cp ('t302');
5260 }
5261
5262 ## "after body" insertion mode
5263 !!!parse-error (type => 'after body:#character');
5264
5265 $self->{insertion_mode} = IN_BODY_IM;
5266 ## reprocess
5267 redo B;
5268 } elsif ($token->{type} == START_TAG_TOKEN) {
5269 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5270 !!!cp ('t303');
5271 !!!parse-error (type => 'after html:'.$token->{tag_name});
5272
5273 ## Reprocess in the "after body" insertion mode.
5274 } else {
5275 !!!cp ('t304');
5276 }
5277
5278 ## "after body" insertion mode
5279 !!!parse-error (type => 'after body:'.$token->{tag_name});
5280
5281 $self->{insertion_mode} = IN_BODY_IM;
5282 ## reprocess
5283 redo B;
5284 } elsif ($token->{type} == END_TAG_TOKEN) {
5285 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5286 !!!cp ('t305');
5287 !!!parse-error (type => 'after html:/'.$token->{tag_name});
5288
5289 $self->{insertion_mode} = AFTER_BODY_IM;
5290 ## Reprocess in the "after body" insertion mode.
5291 } else {
5292 !!!cp ('t306');
5293 }
5294
5295 ## "after body" insertion mode
5296 if ($token->{tag_name} eq 'html') {
5297 if (defined $self->{inner_html_node}) {
5298 !!!cp ('t307');
5299 !!!parse-error (type => 'unmatched end tag:html');
5300 ## Ignore the token
5301 !!!next-token;
5302 redo B;
5303 } else {
5304 !!!cp ('t308');
5305 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
5306 !!!next-token;
5307 redo B;
5308 }
5309 } else {
5310 !!!cp ('t309');
5311 !!!parse-error (type => 'after body:/'.$token->{tag_name});
5312
5313 $self->{insertion_mode} = IN_BODY_IM;
5314 ## reprocess
5315 redo B;
5316 }
5317 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5318 !!!cp ('t309.2');
5319 ## Stop parsing
5320 last B;
5321 } else {
5322 die "$0: $token->{type}: Unknown token type";
5323 }
5324 } elsif ($self->{insertion_mode} & FRAME_IMS) {
5325 if ($token->{type} == CHARACTER_TOKEN) {
5326 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5327 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5328
5329 unless (length $token->{data}) {
5330 !!!cp ('t310');
5331 !!!next-token;
5332 redo B;
5333 }
5334 }
5335
5336 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5337 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5338 !!!cp ('t311');
5339 !!!parse-error (type => 'in frameset:#character');
5340 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
5341 !!!cp ('t312');
5342 !!!parse-error (type => 'after frameset:#character');
5343 } else { # "after html frameset"
5344 !!!cp ('t313');
5345 !!!parse-error (type => 'after html:#character');
5346
5347 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5348 ## Reprocess in the "after frameset" insertion mode.
5349 !!!parse-error (type => 'after frameset:#character');
5350 }
5351
5352 ## Ignore the token.
5353 if (length $token->{data}) {
5354 !!!cp ('t314');
5355 ## reprocess the rest of characters
5356 } else {
5357 !!!cp ('t315');
5358 !!!next-token;
5359 }
5360 redo B;
5361 }
5362
5363 die qq[$0: Character "$token->{data}"];
5364 } elsif ($token->{type} == START_TAG_TOKEN) {
5365 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5366 !!!cp ('t316');
5367 !!!parse-error (type => 'after html:'.$token->{tag_name});
5368
5369 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5370 ## Process in the "after frameset" insertion mode.
5371 } else {
5372 !!!cp ('t317');
5373 }
5374
5375 if ($token->{tag_name} eq 'frameset' and
5376 $self->{insertion_mode} == IN_FRAMESET_IM) {
5377 !!!cp ('t318');
5378 !!!insert-element ($token->{tag_name}, $token->{attributes});
5379 !!!next-token;
5380 redo B;
5381 } elsif ($token->{tag_name} eq 'frame' and
5382 $self->{insertion_mode} == IN_FRAMESET_IM) {
5383 !!!cp ('t319');
5384 !!!insert-element ($token->{tag_name}, $token->{attributes});
5385 pop @{$self->{open_elements}};
5386 !!!next-token;
5387 redo B;
5388 } elsif ($token->{tag_name} eq 'noframes') {
5389 !!!cp ('t320');
5390 ## NOTE: As if in body.
5391 $parse_rcdata->(CDATA_CONTENT_MODEL);
5392 redo B;
5393 } else {
5394 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5395 !!!cp ('t321');
5396 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
5397 } else {
5398 !!!cp ('t322');
5399 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
5400 }
5401 ## Ignore the token
5402 !!!next-token;
5403 redo B;
5404 }
5405 } elsif ($token->{type} == END_TAG_TOKEN) {
5406 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5407 !!!cp ('t323');
5408 !!!parse-error (type => 'after html:/'.$token->{tag_name});
5409
5410 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5411 ## Process in the "after frameset" insertion mode.
5412 } else {
5413 !!!cp ('t324');
5414 }
5415
5416 if ($token->{tag_name} eq 'frameset' and
5417 $self->{insertion_mode} == IN_FRAMESET_IM) {
5418 if ($self->{open_elements}->[-1]->[1] eq 'html' and
5419 @{$self->{open_elements}} == 1) {
5420 !!!cp ('t325');
5421 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5422 ## Ignore the token
5423 !!!next-token;
5424 } else {
5425 !!!cp ('t326');
5426 pop @{$self->{open_elements}};
5427 !!!next-token;
5428 }
5429
5430 if (not defined $self->{inner_html_node} and
5431 $self->{open_elements}->[-1]->[1] ne 'frameset') {
5432 !!!cp ('t327');
5433 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5434 } else {
5435 !!!cp ('t328');
5436 }
5437 redo B;
5438 } elsif ($token->{tag_name} eq 'html' and
5439 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
5440 !!!cp ('t329');
5441 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
5442 !!!next-token;
5443 redo B;
5444 } else {
5445 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5446 !!!cp ('t330');
5447 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
5448 } else {
5449 !!!cp ('t331');
5450 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
5451 }
5452 ## Ignore the token
5453 !!!next-token;
5454 redo B;
5455 }
5456 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5457 unless ($self->{open_elements}->[-1]->[1] eq 'html' and
5458 @{$self->{open_elements}} == 1) { # redundant, maybe
5459 !!!cp ('t331.1');
5460 !!!parse-error (type => 'in body:#eof');
5461 } else {
5462 !!!cp ('t331.2');
5463 }
5464
5465 ## Stop parsing
5466 last B;
5467 } else {
5468 die "$0: $token->{type}: Unknown token type";
5469 }
5470
5471 ## ISSUE: An issue in spec here
5472 } else {
5473 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5474 }
5475
5476 ## "in body" insertion mode
5477 if ($token->{type} == START_TAG_TOKEN) {
5478 if ($token->{tag_name} eq 'script') {
5479 !!!cp ('t332');
5480 ## NOTE: This is an "as if in head" code clone
5481 $script_start_tag->();
5482 redo B;
5483 } elsif ($token->{tag_name} eq 'style') {
5484 !!!cp ('t333');
5485 ## NOTE: This is an "as if in head" code clone
5486 $parse_rcdata->(CDATA_CONTENT_MODEL);
5487 redo B;
5488 } elsif ({
5489 base => 1, link => 1,
5490 }->{$token->{tag_name}}) {
5491 !!!cp ('t334');
5492 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5493 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5494 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5495 !!!next-token;
5496 redo B;
5497 } elsif ($token->{tag_name} eq 'meta') {
5498 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5499 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5500 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5501
5502 unless ($self->{confident}) {
5503 if ($token->{attributes}->{charset}) { ## TODO: And if supported
5504 !!!cp ('t335');
5505 $self->{change_encoding}
5506 ->($self, $token->{attributes}->{charset}->{value});
5507
5508 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5509 ->set_user_data (manakai_has_reference =>
5510 $token->{attributes}->{charset}
5511 ->{has_reference});
5512 } elsif ($token->{attributes}->{content}) {
5513 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
5514 if ($token->{attributes}->{content}->{value}
5515 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
5516 [\x09-\x0D\x20]*=
5517 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
5518 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
5519 !!!cp ('t336');
5520 $self->{change_encoding}
5521 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
5522 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5523 ->set_user_data (manakai_has_reference =>
5524 $token->{attributes}->{content}
5525 ->{has_reference});
5526 }
5527 }
5528 } else {
5529 if ($token->{attributes}->{charset}) {
5530 !!!cp ('t337');
5531 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5532 ->set_user_data (manakai_has_reference =>
5533 $token->{attributes}->{charset}
5534 ->{has_reference});
5535 }
5536 if ($token->{attributes}->{content}) {
5537 !!!cp ('t338');
5538 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5539 ->set_user_data (manakai_has_reference =>
5540 $token->{attributes}->{content}
5541 ->{has_reference});
5542 }
5543 }
5544
5545 !!!next-token;
5546 redo B;
5547 } elsif ($token->{tag_name} eq 'title') {
5548 !!!cp ('t341');
5549 ## NOTE: This is an "as if in head" code clone
5550 $parse_rcdata->(RCDATA_CONTENT_MODEL);
5551 redo B;
5552 } elsif ($token->{tag_name} eq 'body') {
5553 !!!parse-error (type => 'in body:body');
5554
5555 if (@{$self->{open_elements}} == 1 or
5556 $self->{open_elements}->[1]->[1] ne 'body') {
5557 !!!cp ('t342');
5558 ## Ignore the token
5559 } else {
5560 my $body_el = $self->{open_elements}->[1]->[0];
5561 for my $attr_name (keys %{$token->{attributes}}) {
5562 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
5563 !!!cp ('t343');
5564 $body_el->set_attribute_ns
5565 (undef, [undef, $attr_name],
5566 $token->{attributes}->{$attr_name}->{value});
5567 }
5568 }
5569 }
5570 !!!next-token;
5571 redo B;
5572 } elsif ({
5573 address => 1, blockquote => 1, center => 1, dir => 1,
5574 div => 1, dl => 1, fieldset => 1,
5575 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5576 menu => 1, ol => 1, p => 1, ul => 1,
5577 pre => 1, listing => 1,
5578 form => 1,
5579 table => 1,
5580 hr => 1,
5581 }->{$token->{tag_name}}) {
5582 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
5583 !!!cp ('t350');
5584 !!!parse-error (type => 'in form:form');
5585 ## Ignore the token
5586 !!!next-token;
5587 redo B;
5588 }
5589
5590 ## has a p element in scope
5591 INSCOPE: for (reverse @{$self->{open_elements}}) {
5592 if ($_->[1] eq 'p') {
5593 !!!cp ('t344');
5594 !!!back-token;
5595 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5596 redo B;
5597 } elsif ({
5598 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5599 button => 1, marquee => 1, object => 1, html => 1,
5600 }->{$_->[1]}) {
5601 !!!cp ('t345');
5602 last INSCOPE;
5603 }
5604 } # INSCOPE
5605
5606 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5607 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
5608 !!!next-token;
5609 if ($token->{type} == CHARACTER_TOKEN) {
5610 $token->{data} =~ s/^\x0A//;
5611 unless (length $token->{data}) {
5612 !!!cp ('t346');
5613 !!!next-token;
5614 } else {
5615 !!!cp ('t349');
5616 }
5617 } else {
5618 !!!cp ('t348');
5619 }
5620 } elsif ($token->{tag_name} eq 'form') {
5621 !!!cp ('t347.1');
5622 $self->{form_element} = $self->{open_elements}->[-1]->[0];
5623
5624 !!!next-token;
5625 } elsif ($token->{tag_name} eq 'table') {
5626 !!!cp ('t382');
5627 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
5628
5629 $self->{insertion_mode} = IN_TABLE_IM;
5630
5631 !!!next-token;
5632 } elsif ($token->{tag_name} eq 'hr') {
5633 !!!cp ('t386');
5634 pop @{$self->{open_elements}};
5635
5636 !!!next-token;
5637 } else {
5638 !!!cp ('t347');
5639 !!!next-token;
5640 }
5641 redo B;
5642 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
5643 ## has a p element in scope
5644 INSCOPE: for (reverse @{$self->{open_elements}}) {
5645 if ($_->[1] eq 'p') {
5646 !!!cp ('t353');
5647 !!!back-token;
5648 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5649 redo B;
5650 } elsif ({
5651 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5652 button => 1, marquee => 1, object => 1, html => 1,
5653 }->{$_->[1]}) {
5654 !!!cp ('t354');
5655 last INSCOPE;
5656 }
5657 } # INSCOPE
5658
5659 ## Step 1
5660 my $i = -1;
5661 my $node = $self->{open_elements}->[$i];
5662 my $li_or_dtdd = {li => {li => 1},
5663 dt => {dt => 1, dd => 1},
5664 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
5665 LI: {
5666 ## Step 2
5667 if ($li_or_dtdd->{$node->[1]}) {
5668 if ($i != -1) {
5669 !!!cp ('t355');
5670 !!!parse-error (type => 'end tag missing:'.
5671 $self->{open_elements}->[-1]->[1]);
5672 } else {
5673 !!!cp ('t356');
5674 }
5675 splice @{$self->{open_elements}}, $i;
5676 last LI;
5677 } else {
5678 !!!cp ('t357');
5679 }
5680
5681 ## Step 3
5682 if (not $formatting_category->{$node->[1]} and
5683 #not $phrasing_category->{$node->[1]} and
5684 ($special_category->{$node->[1]} or
5685 $scoping_category->{$node->[1]}) and
5686 $node->[1] ne 'address' and $node->[1] ne 'div') {
5687 !!!cp ('t358');
5688 last LI;
5689 }
5690
5691 !!!cp ('t359');
5692 ## Step 4
5693 $i--;
5694 $node = $self->{open_elements}->[$i];
5695 redo LI;
5696 } # LI
5697
5698 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5699 !!!next-token;
5700 redo B;
5701 } elsif ($token->{tag_name} eq 'plaintext') {
5702 ## has a p element in scope
5703 INSCOPE: for (reverse @{$self->{open_elements}}) {
5704 if ($_->[1] eq 'p') {
5705 !!!cp ('t367');
5706 !!!back-token;
5707 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5708 redo B;
5709 } elsif ({
5710 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5711 button => 1, marquee => 1, object => 1, html => 1,
5712 }->{$_->[1]}) {
5713 !!!cp ('t368');
5714 last INSCOPE;
5715 }
5716 } # INSCOPE
5717
5718 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5719
5720 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
5721
5722 !!!next-token;
5723 redo B;
5724 } elsif ($token->{tag_name} eq 'a') {
5725 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
5726 my $node = $active_formatting_elements->[$i];
5727 if ($node->[1] eq 'a') {
5728 !!!cp ('t371');
5729 !!!parse-error (type => 'in a:a');
5730
5731 !!!back-token;
5732 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
5733 $formatting_end_tag->($token->{tag_name});
5734
5735 AFE2: for (reverse 0..$#$active_formatting_elements) {
5736 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
5737 !!!cp ('t372');
5738 splice @$active_formatting_elements, $_, 1;
5739 last AFE2;
5740 }
5741 } # AFE2
5742 OE: for (reverse 0..$#{$self->{open_elements}}) {
5743 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
5744 !!!cp ('t373');
5745 splice @{$self->{open_elements}}, $_, 1;
5746 last OE;
5747 }
5748 } # OE
5749 last AFE;
5750 } elsif ($node->[0] eq '#marker') {
5751 !!!cp ('t374');
5752 last AFE;
5753 }
5754 } # AFE
5755
5756 $reconstruct_active_formatting_elements->($insert_to_current);
5757
5758 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5759 push @$active_formatting_elements, $self->{open_elements}->[-1];
5760
5761 !!!next-token;
5762 redo B;
5763 } elsif ($token->{tag_name} eq 'nobr') {
5764 $reconstruct_active_formatting_elements->($insert_to_current);
5765
5766 ## has a |nobr| element in scope
5767 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5768 my $node = $self->{open_elements}->[$_];
5769 if ($node->[1] eq 'nobr') {
5770 !!!cp ('t376');
5771 !!!parse-error (type => 'in nobr:nobr');
5772 !!!back-token;
5773 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
5774 redo B;
5775 } elsif ({
5776 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5777 button => 1, marquee => 1, object => 1, html => 1,
5778 }->{$node->[1]}) {
5779 !!!cp ('t377');
5780 last INSCOPE;
5781 }
5782 } # INSCOPE
5783
5784 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5785 push @$active_formatting_elements, $self->{open_elements}->[-1];
5786
5787 !!!next-token;
5788 redo B;
5789 } elsif ($token->{tag_name} eq 'button') {
5790 ## has a button element in scope
5791 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5792 my $node = $self->{open_elements}->[$_];
5793 if ($node->[1] eq 'button') {
5794 !!!cp ('t378');
5795 !!!parse-error (type => 'in button:button');
5796 !!!back-token;
5797 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
5798 redo B;
5799 } elsif ({
5800 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5801 button => 1, marquee => 1, object => 1, html => 1,
5802 }->{$node->[1]}) {
5803 !!!cp ('t379');
5804 last INSCOPE;
5805 }
5806 } # INSCOPE
5807
5808 $reconstruct_active_formatting_elements->($insert_to_current);
5809
5810 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5811
5812 ## TODO: associate with $self->{form_element} if defined
5813
5814 push @$active_formatting_elements, ['#marker', ''];
5815
5816 !!!next-token;
5817 redo B;
5818 } elsif ({
5819 xmp => 1,
5820 iframe => 1,
5821 noembed => 1,
5822 noframes => 1,
5823 noscript => 0, ## TODO: 1 if scripting is enabled
5824 }->{$token->{tag_name}}) {
5825 if ($token->{tag_name} eq 'xmp') {
5826 !!!cp ('t381');
5827 $reconstruct_active_formatting_elements->($insert_to_current);
5828 } else {
5829 !!!cp ('t399');
5830 }
5831 ## NOTE: There is an "as if in body" code clone.
5832 $parse_rcdata->(CDATA_CONTENT_MODEL);
5833 redo B;
5834 } elsif ($token->{tag_name} eq 'isindex') {
5835 !!!parse-error (type => 'isindex');
5836
5837 if (defined $self->{form_element}) {
5838 !!!cp ('t389');
5839 ## Ignore the token
5840 !!!next-token;
5841 redo B;
5842 } else {
5843 my $at = $token->{attributes};
5844 my $form_attrs;
5845 $form_attrs->{action} = $at->{action} if $at->{action};
5846 my $prompt_attr = $at->{prompt};
5847 $at->{name} = {name => 'name', value => 'isindex'};
5848 delete $at->{action};
5849 delete $at->{prompt};
5850 my @tokens = (
5851 {type => START_TAG_TOKEN, tag_name => 'form',
5852 attributes => $form_attrs},
5853 {type => START_TAG_TOKEN, tag_name => 'hr'},
5854 {type => START_TAG_TOKEN, tag_name => 'p'},
5855 {type => START_TAG_TOKEN, tag_name => 'label'},
5856 );
5857 if ($prompt_attr) {
5858 !!!cp ('t390');
5859 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
5860 } else {
5861 !!!cp ('t391');
5862 push @tokens, {type => CHARACTER_TOKEN,
5863 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
5864 ## TODO: make this configurable
5865 }
5866 push @tokens,
5867 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
5868 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
5869 {type => END_TAG_TOKEN, tag_name => 'label'},
5870 {type => END_TAG_TOKEN, tag_name => 'p'},
5871 {type => START_TAG_TOKEN, tag_name => 'hr'},
5872 {type => END_TAG_TOKEN, tag_name => 'form'};
5873 $token = shift @tokens;
5874 !!!back-token (@tokens);
5875 redo B;
5876 }
5877 } elsif ($token->{tag_name} eq 'textarea') {
5878 my $tag_name = $token->{tag_name};
5879 my $el;
5880 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
5881
5882 ## TODO: $self->{form_element} if defined
5883 $self->{content_model} = RCDATA_CONTENT_MODEL;
5884 delete $self->{escape}; # MUST
5885
5886 $insert->($el);
5887
5888 my $text = '';
5889 !!!next-token;
5890 if ($token->{type} == CHARACTER_TOKEN) {
5891 $token->{data} =~ s/^\x0A//;
5892 unless (length $token->{data}) {
5893 !!!cp ('t392');
5894 !!!next-token;
5895 } else {
5896 !!!cp ('t393');
5897 }
5898 } else {
5899 !!!cp ('t394');
5900 }
5901 while ($token->{type} == CHARACTER_TOKEN) {
5902 !!!cp ('t395');
5903 $text .= $token->{data};
5904 !!!next-token;
5905 }
5906 if (length $text) {
5907 !!!cp ('t396');
5908 $el->manakai_append_text ($text);
5909 }
5910
5911 $self->{content_model} = PCDATA_CONTENT_MODEL;
5912
5913 if ($token->{type} == END_TAG_TOKEN and
5914 $token->{tag_name} eq $tag_name) {
5915 !!!cp ('t397');
5916 ## Ignore the token
5917 } else {
5918 !!!cp ('t398');
5919 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
5920 }
5921 !!!next-token;
5922 redo B;
5923 } elsif ({
5924 caption => 1, col => 1, colgroup => 1, frame => 1,
5925 frameset => 1, head => 1, option => 1, optgroup => 1,
5926 tbody => 1, td => 1, tfoot => 1, th => 1,
5927 thead => 1, tr => 1,
5928 }->{$token->{tag_name}}) {
5929 !!!cp ('t401');
5930 !!!parse-error (type => 'in body:'.$token->{tag_name});
5931 ## Ignore the token
5932 !!!next-token;
5933 redo B;
5934
5935 ## ISSUE: An issue on HTML5 new elements in the spec.
5936 } else {
5937 if ($token->{tag_name} eq 'image') {
5938 !!!cp ('t384');
5939 !!!parse-error (type => 'image');
5940 $token->{tag_name} = 'img';
5941 } else {
5942 !!!cp ('t385');
5943 }
5944
5945 ## NOTE: There is an "as if <br>" code clone.
5946 $reconstruct_active_formatting_elements->($insert_to_current);
5947
5948 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5949
5950 if ({
5951 applet => 1, marquee => 1, object => 1,
5952 }->{$token->{tag_name}}) {
5953 !!!cp ('t380');
5954 push @$active_formatting_elements, ['#marker', ''];
5955 } elsif ({
5956 b => 1, big => 1, em => 1, font => 1, i => 1,
5957 s => 1, small => 1, strile => 1,
5958 strong => 1, tt => 1, u => 1,
5959 }->{$token->{tag_name}}) {
5960 !!!cp ('t375');
5961 push @$active_formatting_elements, $self->{open_elements}->[-1];
5962 } elsif ($token->{tag_name} eq 'input') {
5963 !!!cp ('t388');
5964 ## TODO: associate with $self->{form_element} if defined
5965 pop @{$self->{open_elements}};
5966 } elsif ({
5967 area => 1, basefont => 1, bgsound => 1, br => 1,
5968 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
5969 #image => 1,
5970 }->{$token->{tag_name}}) {
5971 !!!cp ('t388.1');
5972 pop @{$self->{open_elements}};
5973 } elsif ($token->{tag_name} eq 'select') {
5974 ## TODO: associate with $self->{form_element} if defined
5975
5976 if ($self->{insertion_mode} & TABLE_IMS or
5977 $self->{insertion_mode} & BODY_TABLE_IMS or
5978 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5979 !!!cp ('t400.1');
5980 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
5981 } else {
5982 !!!cp ('t400.2');
5983 $self->{insertion_mode} = IN_SELECT_IM;
5984 }
5985 } else {
5986 !!!cp ('t402');
5987 }
5988
5989 !!!next-token;
5990 redo B;
5991 }
5992 } elsif ($token->{type} == END_TAG_TOKEN) {
5993 if ($token->{tag_name} eq 'body') {
5994 ## has a |body| element in scope
5995 my $i;
5996 INSCOPE: for (reverse @{$self->{open_elements}}) {
5997 if ($_->[1] eq 'body') {
5998 !!!cp ('t405');
5999 $i = $_;
6000 last INSCOPE;
6001 } elsif ({
6002 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6003 button => 1, marquee => 1, object => 1, html => 1,
6004 }->{$_->[1]}) {
6005 !!!cp ('t405.1');
6006 last INSCOPE;
6007 }
6008 } # INSCOPE
6009 unless (defined $i) {
6010 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6011 ## NOTE: Ignore the token.
6012 !!!next-token;
6013 redo B;
6014 }
6015
6016 for (@{$self->{open_elements}}) {
6017 unless ({
6018 dd => 1, dt => 1, li => 1, p => 1, td => 1,
6019 th => 1, tr => 1, body => 1, html => 1,
6020 tbody => 1, tfoot => 1, thead => 1,
6021 }->{$_->[1]}) {
6022 !!!cp ('t403');
6023 !!!parse-error (type => 'not closed:'.$_->[1]);
6024 last;
6025 } else {
6026 !!!cp ('t404');
6027 }
6028 }
6029
6030 $self->{insertion_mode} = AFTER_BODY_IM;
6031 !!!next-token;
6032 redo B;
6033 } elsif ($token->{tag_name} eq 'html') {
6034 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
6035 ## ISSUE: There is an issue in the spec.
6036 if ($self->{open_elements}->[-1]->[1] ne 'body') {
6037 !!!cp ('t406');
6038 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
6039 } else {
6040 !!!cp ('t407');
6041 }
6042 $self->{insertion_mode} = AFTER_BODY_IM;
6043 ## reprocess
6044 redo B;
6045 } else {
6046 !!!cp ('t408');
6047 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6048 ## Ignore the token
6049 !!!next-token;
6050 redo B;
6051 }
6052 } elsif ({
6053 address => 1, blockquote => 1, center => 1, dir => 1,
6054 div => 1, dl => 1, fieldset => 1, listing => 1,
6055 menu => 1, ol => 1, pre => 1, ul => 1,
6056 dd => 1, dt => 1, li => 1,
6057 applet => 1, button => 1, marquee => 1, object => 1,
6058 }->{$token->{tag_name}}) {
6059 ## has an element in scope
6060 my $i;
6061 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6062 my $node = $self->{open_elements}->[$_];
6063 if ($node->[1] eq $token->{tag_name}) {
6064 !!!cp ('t410');
6065 $i = $_;
6066 last INSCOPE;
6067 } elsif ({
6068 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6069 button => 1, marquee => 1, object => 1, html => 1,
6070 }->{$node->[1]}) {
6071 !!!cp ('t411');
6072 last INSCOPE;
6073 }
6074 } # INSCOPE
6075
6076 unless (defined $i) { # has an element in scope
6077 !!!cp ('t413');
6078 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6079 } else {
6080 ## Step 1. generate implied end tags
6081 while ({
6082 dd => ($token->{tag_name} ne 'dd'),
6083 dt => ($token->{tag_name} ne 'dt'),
6084 li => ($token->{tag_name} ne 'li'),
6085 p => 1,
6086 }->{$self->{open_elements}->[-1]->[1]}) {
6087 !!!cp ('t409');
6088 pop @{$self->{open_elements}};
6089 }
6090
6091 ## Step 2.
6092 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6093 !!!cp ('t412');
6094 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6095 } else {
6096 !!!cp ('t414');
6097 }
6098
6099 ## Step 3.
6100 splice @{$self->{open_elements}}, $i;
6101
6102 ## Step 4.
6103 $clear_up_to_marker->()
6104 if {
6105 applet => 1, button => 1, marquee => 1, object => 1,
6106 }->{$token->{tag_name}};
6107 }
6108 !!!next-token;
6109 redo B;
6110 } elsif ($token->{tag_name} eq 'form') {
6111 undef $self->{form_element};
6112
6113 ## has an element in scope
6114 my $i;
6115 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6116 my $node = $self->{open_elements}->[$_];
6117 if ($node->[1] eq $token->{tag_name}) {
6118 !!!cp ('t418');
6119 $i = $_;
6120 last INSCOPE;
6121 } elsif ({
6122 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6123 button => 1, marquee => 1, object => 1, html => 1,
6124 }->{$node->[1]}) {
6125 !!!cp ('t419');
6126 last INSCOPE;
6127 }
6128 } # INSCOPE
6129
6130 unless (defined $i) { # has an element in scope
6131 !!!cp ('t421');
6132 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6133 } else {
6134 ## Step 1. generate implied end tags
6135 while ({
6136 dd => 1, dt => 1, li => 1, p => 1,
6137 }->{$self->{open_elements}->[-1]->[1]}) {
6138 !!!cp ('t417');
6139 pop @{$self->{open_elements}};
6140 }
6141
6142 ## Step 2.
6143 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6144 !!!cp ('t417.1');
6145 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6146 } else {
6147 !!!cp ('t420');
6148 }
6149
6150 ## Step 3.
6151 splice @{$self->{open_elements}}, $i;
6152 }
6153
6154 !!!next-token;
6155 redo B;
6156 } elsif ({
6157 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6158 }->{$token->{tag_name}}) {
6159 ## has an element in scope
6160 my $i;
6161 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6162 my $node = $self->{open_elements}->[$_];
6163 if ({
6164 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6165 }->{$node->[1]}) {
6166 !!!cp ('t423');
6167 $i = $_;
6168 last INSCOPE;
6169 } elsif ({
6170 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6171 button => 1, marquee => 1, object => 1, html => 1,
6172 }->{$node->[1]}) {
6173 !!!cp ('t424');
6174 last INSCOPE;
6175 }
6176 } # INSCOPE
6177
6178 unless (defined $i) { # has an element in scope
6179 !!!cp ('t425.1');
6180 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6181 } else {
6182 ## Step 1. generate implied end tags
6183 while ({
6184 dd => 1, dt => 1, li => 1, p => 1,
6185 }->{$self->{open_elements}->[-1]->[1]}) {
6186 !!!cp ('t422');
6187 pop @{$self->{open_elements}};
6188 }
6189
6190 ## Step 2.
6191 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6192 !!!cp ('t425');
6193 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6194 } else {
6195 !!!cp ('t426');
6196 }
6197
6198 ## Step 3.
6199 splice @{$self->{open_elements}}, $i;
6200 }
6201
6202 !!!next-token;
6203 redo B;
6204 } elsif ($token->{tag_name} eq 'p') {
6205 ## has an element in scope
6206 my $i;
6207 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6208 my $node = $self->{open_elements}->[$_];
6209 if ($node->[1] eq $token->{tag_name}) {
6210 !!!cp ('t410.1');
6211 $i = $_;
6212 last INSCOPE;
6213 } elsif ({
6214 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6215 button => 1, marquee => 1, object => 1, html => 1,
6216 }->{$node->[1]}) {
6217 !!!cp ('t411.1');
6218 last INSCOPE;
6219 }
6220 } # INSCOPE
6221
6222 if (defined $i) {
6223 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6224 !!!cp ('t412.1');
6225 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6226 } else {
6227 !!!cp ('t414.1');
6228 }
6229
6230 splice @{$self->{open_elements}}, $i;
6231 } else {
6232 !!!cp ('t413.1');
6233 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6234
6235 !!!cp ('t415.1');
6236 ## As if <p>, then reprocess the current token
6237 my $el;
6238 !!!create-element ($el, 'p');
6239 $insert->($el);
6240 ## NOTE: Not inserted into |$self->{open_elements}|.
6241 }
6242
6243 !!!next-token;
6244 redo B;
6245 } elsif ({
6246 a => 1,
6247 b => 1, big => 1, em => 1, font => 1, i => 1,
6248 nobr => 1, s => 1, small => 1, strile => 1,
6249 strong => 1, tt => 1, u => 1,
6250 }->{$token->{tag_name}}) {
6251 !!!cp ('t427');
6252 $formatting_end_tag->($token->{tag_name});
6253 redo B;
6254 } elsif ($token->{tag_name} eq 'br') {
6255 !!!cp ('t428');
6256 !!!parse-error (type => 'unmatched end tag:br');
6257
6258 ## As if <br>
6259 $reconstruct_active_formatting_elements->($insert_to_current);
6260
6261 my $el;
6262 !!!create-element ($el, 'br');
6263 $insert->($el);
6264
6265 ## Ignore the token.
6266 !!!next-token;
6267 redo B;
6268 } elsif ({
6269 caption => 1, col => 1, colgroup => 1, frame => 1,
6270 frameset => 1, head => 1, option => 1, optgroup => 1,
6271 tbody => 1, td => 1, tfoot => 1, th => 1,
6272 thead => 1, tr => 1,
6273 area => 1, basefont => 1, bgsound => 1,
6274 embed => 1, hr => 1, iframe => 1, image => 1,
6275 img => 1, input => 1, isindex => 1, noembed => 1,
6276 noframes => 1, param => 1, select => 1, spacer => 1,
6277 table => 1, textarea => 1, wbr => 1,
6278 noscript => 0, ## TODO: if scripting is enabled
6279 }->{$token->{tag_name}}) {
6280 !!!cp ('t429');
6281 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6282 ## Ignore the token
6283 !!!next-token;
6284 redo B;
6285
6286 ## ISSUE: Issue on HTML5 new elements in spec
6287
6288 } else {
6289 ## Step 1
6290 my $node_i = -1;
6291 my $node = $self->{open_elements}->[$node_i];
6292
6293 ## Step 2
6294 S2: {
6295 if ($node->[1] eq $token->{tag_name}) {
6296 ## Step 1
6297 ## generate implied end tags
6298 while ({
6299 dd => 1, dt => 1, li => 1, p => 1,
6300 }->{$self->{open_elements}->[-1]->[1]}) {
6301 !!!cp ('t430');
6302 ## ISSUE: Can this case be reached?
6303 pop @{$self->{open_elements}};
6304 }
6305
6306 ## Step 2
6307 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
6308 !!!cp ('t431');
6309 ## NOTE: <x><y></x>
6310 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
6311 } else {
6312 !!!cp ('t432');
6313 }
6314
6315 ## Step 3
6316 splice @{$self->{open_elements}}, $node_i;
6317
6318 !!!next-token;
6319 last S2;
6320 } else {
6321 ## Step 3
6322 if (not $formatting_category->{$node->[1]} and
6323 #not $phrasing_category->{$node->[1]} and
6324 ($special_category->{$node->[1]} or
6325 $scoping_category->{$node->[1]})) {
6326 !!!cp ('t433');
6327 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
6328 ## Ignore the token
6329 !!!next-token;
6330 last S2;
6331 }
6332
6333 !!!cp ('t434');
6334 }
6335
6336 ## Step 4
6337 $node_i--;
6338 $node = $self->{open_elements}->[$node_i];
6339
6340 ## Step 5;
6341 redo S2;
6342 } # S2
6343 redo B;
6344 }
6345 }
6346 redo B;
6347 } # B
6348
6349 ## Stop parsing # MUST
6350
6351 ## TODO: script stuffs
6352 } # _tree_construct_main
6353
## |set_inner_html| -- Replace the children of a node by the result of
## parsing a string as HTML (the HTML5 |innerHTML| setter algorithms;
## the "Step" comments below follow the numbering used by the spec).
##
## Invoked as a class method:
##
##   Whatpm::HTML->set_inner_html ($node, $string, $onerror);
##
##   $node    - The context node.  Node type 9 (document) and node
##              type 1 (element) are supported; any other node type
##              is fatal.
##   $string  - The markup to parse.  It is accessed through a
##              reference to the argument, so it is not copied here.
##   $onerror - Optional code reference invoked for parse errors.  In
##              the element case it receives the name/value pairs of
##              the error followed by |line| and |column|; if it is
##              omitted, errors are reported with |warn|.
##
## The value of the last statement executed is returned: the result of
## |parse_string| for a document, that of |_terminate_tree_constructor|
## for an element.
##
## NOTE: Lines beginning with |!!!| are macros expanded by the source
## preprocessor.  Their expansions are not visible here; they presumably
## refer to a lexical named |$self|, so each macro is kept on a line of
## its own with |$self| in scope.
sub set_inner_html ($$$) {
  my ($class, $node) = @_[0, 1];
  my $s = \$_[2];
  my $onerror = $_[3];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;

  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Iterate over a copy of the child list while the children are
    ## being removed.
    my @old_children = @{$node->child_nodes};
    foreach my $child (@old_children) {
      $node->remove_child ($child);
    }

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$s => $node, $onerror);
  }

  unless ($nt == 1) {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }

  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## The fragment is parsed into a scratch document created from the
  ## implementation of the context node's owner document.
  my $this_doc = $node->owner_document;
  my $doc = $this_doc->implementation->create_document;
  $doc->manakai_is_html (1);
  my $p = $class->new;
  $p->{document} = $doc;

  ## Step 8 # MUST
  ## Input stream state shared by the two closures below.
  my $offset = 0;   # index of the next unread character of |$$s|
  my $line = 1;
  my $column = 0;
  $p->{set_next_char} = sub {
    my $self = shift;

    ## Shift the three-character history window
    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    ## End of input is represented by -1
    if ($offset >= length $$s) {
      $self->{next_char} = -1;
      return;
    }

    $self->{next_char} = ord substr ($$s, $offset, 1);
    $offset++;
    $column++;

    my $char = $self->{next_char};
    if ($char == 0x000A) { # LF
      $line++;
      $column = 0;
      !!!cp ('i1');
    } elsif ($char == 0x000D) { # CR
      ## A CR LF pair is consumed as one newline
      if (substr ($$s, $offset, 1) eq "\x0A") {
        $offset++;
      }
      $self->{next_char} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
      !!!cp ('i2');
    } elsif ($char > 0x10FFFF) {
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      !!!cp ('i3');
    } elsif ($char == 0x0000) { # NULL
      !!!cp ('i4');
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $p->{prev_char} = [-1, -1, -1];
  $p->{next_char} = -1;

  ## Error reporting: use the caller's handler if there is one, a
  ## |warn|ing handler otherwise, and always append the current
  ## position to the arguments.
  my $ponerror = $onerror ? $onerror : sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $p->{parse_error} = sub {
    $ponerror->(@_, line => $line, column => $column);
  };

  $p->_initialize_tokenizer;
  $p->_initialize_tree_constructor;

  ## Step 2
  ## The initial content model is chosen by the local name of the
  ## context element; PCDATA is used for any name not listed.
  my $node_ln = $node->manakai_local_name;
  my %content_model_for = (
    title => RCDATA_CONTENT_MODEL,
    textarea => RCDATA_CONTENT_MODEL,
    style => CDATA_CONTENT_MODEL,
    script => CDATA_CONTENT_MODEL,
    xmp => CDATA_CONTENT_MODEL,
    iframe => CDATA_CONTENT_MODEL,
    noembed => CDATA_CONTENT_MODEL,
    noframes => CDATA_CONTENT_MODEL,
    noscript => CDATA_CONTENT_MODEL,
    plaintext => PLAINTEXT_CONTENT_MODEL,
  );
  my $content_model = $content_model_for{$node_ln};
  $p->{content_model} = defined $content_model
      ? $content_model : PCDATA_CONTENT_MODEL;
  ## ISSUE: What is "the name of the element"? local name?

  $p->{inner_html_node} = [$node, $node_ln];

  ## Step 3
  my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 4 # MUST
  $doc->append_child ($root);

  ## Step 5 # MUST
  push @{$p->{open_elements}}, [$root, 'html'];

  $p->{head_element} = undef;

  ## Step 6 # MUST
  $p->_reset_insertion_mode;

  ## Step 7 # MUST
  ## The nearest XHTML-namespace |form| element, starting at the context
  ## node itself and walking up its ancestors, becomes the form element.
  for (my $ancestor = $node;
       defined $ancestor;
       $ancestor = $ancestor->parent_node) {
    next unless $ancestor->node_type == 1;
    my $nsuri = $ancestor->namespace_uri;
    next unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    next unless $ancestor->manakai_local_name eq 'form';
    !!!cp ('i5');
    $p->{form_element} = $ancestor;
    last;
  }

  ## Step 9 # MUST
  {
    ## The macro is run with |$self| bound to the new parser
    my $self = $p;
    !!!next-token;
  }
  $p->_tree_construction_main;

  ## Step 10 # MUST
  my @old_children = @{$node->child_nodes};
  foreach my $child (@old_children) {
    $node->remove_child ($child);
  }
  ## ISSUE: mutation events? read-only?

  ## Step 11 # MUST
  ## Move the parsed nodes from the scratch root element to the context
  ## node, adopting each of them into the context node's document first.
  my @new_children = @{$root->child_nodes};
  foreach my $child (@new_children) {
    $this_doc->adopt_node ($child);
    $node->append_child ($child);
  }
  ## ISSUE: mutation events?

  return $p->_terminate_tree_constructor;
} # set_inner_html
6515
6516 } # tree construction stage
6517
## |Whatpm::HTML::RestartParser| -- A class whose only content is its
## inheritance from |Error|.  Judging by its name it is presumably the
## exception used to request that parsing be restarted; confirm against
## the code that throws it.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

## True value returned to whatever loads this module
1;
6522 # $Date: 2008/03/11 01:15:38 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24