/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.117 - (show annotations) (download) (as text)
Wed Mar 19 23:43:47 2008 UTC (17 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.116: +2 -32 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	19 Mar 2008 23:42:08 -0000
2008-03-20  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src (_get_next_token): Remove |first_start_tag|
	flag, which is no longer used.

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.116 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12 ## TODO: 1252 parse error (revision 1264)
13 ## TODO: 8859-11 = 874 (revision 1271)
14
15 my $permitted_slash_tag_name = { ## start tags on which a "/" right before ">" is a permitted slash (no |nestc| parse error)
16 base => 1,
17 link => 1,
18 meta => 1,
19 hr => 1,
20 br => 1,
21 img => 1,
22 embed => 1,
23 param => 1,
24 area => 1,
25 col => 1,
26 input => 1,
27 };
28
29 my $c1_entity_char = { ## maps code points 0x80..0x9F to replacement code points (U+FFFD where there is none); NOTE(review): values look like the windows-1252 table, presumably applied to numeric character references - confirm at the use site
30 0x80 => 0x20AC,
31 0x81 => 0xFFFD,
32 0x82 => 0x201A,
33 0x83 => 0x0192,
34 0x84 => 0x201E,
35 0x85 => 0x2026,
36 0x86 => 0x2020,
37 0x87 => 0x2021,
38 0x88 => 0x02C6,
39 0x89 => 0x2030,
40 0x8A => 0x0160,
41 0x8B => 0x2039,
42 0x8C => 0x0152,
43 0x8D => 0xFFFD,
44 0x8E => 0x017D,
45 0x8F => 0xFFFD,
46 0x90 => 0xFFFD,
47 0x91 => 0x2018,
48 0x92 => 0x2019,
49 0x93 => 0x201C,
50 0x94 => 0x201D,
51 0x95 => 0x2022,
52 0x96 => 0x2013,
53 0x97 => 0x2014,
54 0x98 => 0x02DC,
55 0x99 => 0x2122,
56 0x9A => 0x0161,
57 0x9B => 0x203A,
58 0x9C => 0x0153,
59 0x9D => 0xFFFD,
60 0x9E => 0x017E,
61 0x9F => 0x0178,
62 }; # $c1_entity_char
63
64 my $special_category = { ## element names in the "special" category; NOTE(review): not read by the tokenizer code nearby - presumably used by the tree constructor, confirm
65 address => 1, area => 1, base => 1, basefont => 1, bgsound => 1,
66 blockquote => 1, body => 1, br => 1, center => 1, col => 1, colgroup => 1,
67 dd => 1, dir => 1, div => 1, dl => 1, dt => 1, embed => 1, fieldset => 1,
68 form => 1, frame => 1, frameset => 1, h1 => 1, h2 => 1, h3 => 1,
69 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, iframe => 1, image => 1,
70 img => 1, input => 1, isindex => 1, li => 1, link => 1, listing => 1,
71 menu => 1, meta => 1, noembed => 1, noframes => 1, noscript => 1,
72 ol => 1, optgroup => 1, option => 1, p => 1, param => 1, plaintext => 1,
73 pre => 1, script => 1, select => 1, spacer => 1, style => 1, tbody => 1,
74 textarea => 1, tfoot => 1, thead => 1, title => 1, tr => 1, ul => 1, wbr => 1,
75 };
76 my $scoping_category = { ## element names in the "scoping" category; NOTE(review): presumably used by the tree constructor - confirm
77 applet => 1, button => 1, caption => 1, html => 1, marquee => 1, object => 1,
78 table => 1, td => 1, th => 1,
79 };
80 my $formatting_category = { ## element names in the "formatting" category
81 a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
82 s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1, ## |strike| was misspelled |strile|, so <strike> was never matched
83 };
84 # $phrasing_category: all other elements
85
86 sub parse_byte_string ($$$$;$) { ## args: charset name or undef, byte string or reference to it, then what |parse_char_string| takes after the string; returns its result
87 my $self = ref $_[0] ? shift : shift->new;
88 my $charset = shift;
89 my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
90 my $s;
91 require Encode; ## loaded up front: the given-charset, sniffed-charset and restart paths all call |Encode::decode|
92 if (defined $charset) {
93 ## TODO: decode(utf8) don't delete BOM
94 $s = \ (Encode::decode ($charset, $$bytes_s));
95 $self->{input_encoding} = lc $charset; ## TODO: normalize name
96 $self->{confident} = 1;
97 } else {
98 ## TODO: Implement HTML5 detection algorithm
99 require Whatpm::Charset::UniversalCharDet;
100 $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
101 (substr ($$bytes_s, 0, 1024));
102 $charset ||= 'windows-1252';
103 $s = \ (Encode::decode ($charset, $$bytes_s));
104 $self->{input_encoding} = lc $charset; ## lowercased like the branch above, because |change_encoding| compares it with a lowercased label
105 $self->{confident} = 0;
106 }
107 ## Callback implementing the "change the encoding" algorithm; called with ($self, $charset, $token)
108 $self->{change_encoding} = sub {
109 my $self = shift;
110 my $charset = lc shift;
111 my $token = shift;
112 ## TODO: if $charset is supported
113 ## TODO: normalize charset name
114
115 ## "Change the encoding" algorithm:
116
117 ## Step 1
118 if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
119 $charset = 'utf-8';
120 }
121
122 ## Step 2
123 if (defined $self->{input_encoding} and
124 $self->{input_encoding} eq $charset) {
125 $self->{confident} = 1;
126 return;
127 }
128
129 !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
130 ':'.$charset, level => 'w', token => $token);
131
132 ## Step 3
133 # if (can) {
134 ## change the encoding on the fly.
135 #$self->{confident} = 1;
136 #return;
137 # }
138
139 ## Step 4
140 throw Whatpm::HTML::RestartParser (charset => $charset);
141 }; # $self->{change_encoding}
142 ## Parse once; if the callback above throws |RestartParser|, decode the same bytes with the new charset and parse again
143 my @args = @_; shift @args; # $s
144 my $return;
145 try {
146 $return = $self->parse_char_string ($s, @args);
147 } catch Whatpm::HTML::RestartParser with {
148 my $charset = shift->{charset};
149 $s = \ (Encode::decode ($charset, $$bytes_s));
150 $self->{input_encoding} = $charset; ## TODO: normalize
151 $self->{confident} = 1;
152 $return = $self->parse_char_string ($s, @args);
153 };
154 return $return;
155 } # parse_byte_string
156
157 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
158 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
159 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
160 ## because the core part of our HTML parser expects a string of characters,
161 ## not a string of bytes or code units or anything which might contain a BOM.
162 ## Therefore, any parser interface that accepts a string of bytes,
163 ## such as |parse_byte_string| in this module, must ensure that it does
164 ## strip the BOM and never strips any ZWNBSP.
165
166 *parse_char_string = \&parse_string; ## |parse_char_string| is another name for |parse_string|
167
168 sub parse_string ($$$;$) { ## args: character string or reference to it, document object, optional error handler; returns the document
169 my $self = ref $_[0] ? shift : shift->new;
170 my $s = ref $_[0] ? $_[0] : \($_[0]);
171 $self->{document} = $_[1];
172 @{$self->{document}->child_nodes} = (); ## empty the document's child list before building the tree
173
174 ## NOTE: |set_inner_html| copies most of this method's code
175
176 $self->{confident} = 1 unless exists $self->{confident};
177 $self->{document}->input_encoding ($self->{input_encoding})
178 if defined $self->{input_encoding};
179 ## Input reader: $i is the offset of the next unread character in $$s
180 my $i = 0;
181 $self->{line_prev} = $self->{line} = 1;
182 $self->{column_prev} = $self->{column} = 0;
183 $self->{set_next_char} = sub {
184 my $self = shift;
185 ## |prev_char| holds the three most recent characters, newest first
186 pop @{$self->{prev_char}};
187 unshift @{$self->{prev_char}}, $self->{next_char};
188
189 $self->{next_char} = -1 and return if $i >= length $$s; ## -1 stands for end of input
190 $self->{next_char} = ord substr $$s, $i++, 1;
191
192 ($self->{line_prev}, $self->{column_prev})
193 = ($self->{line}, $self->{column});
194 $self->{column}++;
195
196 if ($self->{next_char} == 0x000A) { # LF
197 $self->{line}++;
198 $self->{column} = 0;
199 } elsif ($self->{next_char} == 0x000D) { # CR
200 $i++ if substr ($$s, $i, 1) eq "\x0A"; ## CR LF is consumed as one line break
201 $self->{next_char} = 0x000A; # LF # MUST
202 $self->{line}++;
203 $self->{column} = 0;
204 } elsif ($self->{next_char} > 0x10FFFF) {
205 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
206 } elsif ($self->{next_char} == 0x0000) { # NULL
207 !!!parse-error (type => 'NULL');
208 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
209 }
210 };
211 $self->{prev_char} = [-1, -1, -1];
212 $self->{next_char} = -1;
213 ## Default error handler: |warn| with the position taken from the token when one is given, otherwise from line/column
214 my $onerror = $_[2] || sub {
215 my (%opt) = @_;
216 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
217 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
218 warn "Parse error ($opt{type}) at line $line column $column\n";
219 };
220 $self->{parse_error} = sub {
221 $onerror->(line => $self->{line}, column => $self->{column}, @_); ## current position first, then the reporter's own arguments
222 };
223
224 $self->_initialize_tokenizer;
225 $self->_initialize_tree_constructor;
226 $self->_construct_tree;
227 $self->_terminate_tree_constructor;
228
229 delete $self->{parse_error}; # remove loop
230
231 return $self->{document};
232 } # parse_string
233
234 sub new ($) { ## constructor; default callbacks: |set_next_char| reports end of input (-1), the others do nothing
235 my $class = shift;
236 my $self = bless {}, $class;
237 $self->{set_next_char} = sub {
238 $_[0]->{next_char} = -1; ## act on the parser passed in, as the |parse_string| closure does; capturing $self here made a reference cycle
239 };
240 $self->{parse_error} = sub {
241 #
242 };
243 $self->{change_encoding} = sub {
244 # if ($_[0] is a supported encoding) {
245 # run "change the encoding" algorithm;
246 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
247 # }
248 };
249 $self->{application_cache_selection} = sub {
250 #
251 };
252 return $self;
253 } # new
254
255 sub CM_ENTITY () { 0b001 } # & markup in data
256 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
257 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
258
259 sub PLAINTEXT_CONTENT_MODEL () { 0 } ## content models are sets of the CM_* bits above; $self->{content_model} is tested against them with &
260 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
261 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
262 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
263
264 sub DATA_STATE () { 0 } ## tokenizer state numbers; the current one is kept in $self->{state}
265 sub ENTITY_DATA_STATE () { 1 }
266 sub TAG_OPEN_STATE () { 2 }
267 sub CLOSE_TAG_OPEN_STATE () { 3 }
268 sub TAG_NAME_STATE () { 4 }
269 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
270 sub ATTRIBUTE_NAME_STATE () { 6 }
271 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
272 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
273 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
274 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
275 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
276 sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
277 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
278 sub COMMENT_START_STATE () { 14 }
279 sub COMMENT_START_DASH_STATE () { 15 }
280 sub COMMENT_STATE () { 16 }
281 sub COMMENT_END_STATE () { 17 }
282 sub COMMENT_END_DASH_STATE () { 18 }
283 sub BOGUS_COMMENT_STATE () { 19 }
284 sub DOCTYPE_STATE () { 20 }
285 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
286 sub DOCTYPE_NAME_STATE () { 22 }
287 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
288 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
289 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
290 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
291 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
292 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
293 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
294 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
295 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
296 sub BOGUS_DOCTYPE_STATE () { 32 }
297 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
298
299 sub DOCTYPE_TOKEN () { 1 } ## token type numbers, stored in a token's ->{type}
300 sub COMMENT_TOKEN () { 2 }
301 sub START_TAG_TOKEN () { 3 }
302 sub END_TAG_TOKEN () { 4 }
303 sub END_OF_FILE_TOKEN () { 5 }
304 sub CHARACTER_TOKEN () { 6 }
305
306 sub AFTER_HTML_IMS () { 0b100 } ## *_IMS: one bit per family of insertion modes; the *_IM constants below OR these with low bits 0b01/0b10/0b11 to tell members of a family apart
307 sub HEAD_IMS () { 0b1000 }
308 sub BODY_IMS () { 0b10000 }
309 sub BODY_TABLE_IMS () { 0b100000 }
310 sub TABLE_IMS () { 0b1000000 }
311 sub ROW_IMS () { 0b10000000 }
312 sub BODY_AFTER_IMS () { 0b100000000 }
313 sub FRAME_IMS () { 0b1000000000 }
314 sub SELECT_IMS () { 0b10000000000 }
315
316 ## NOTE: "initial" and "before html" insertion modes have no constants.
317
318 ## NOTE: "after after body" insertion mode.
319 sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }
320
321 ## NOTE: "after after frameset" insertion mode.
322 sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
323
324 sub IN_HEAD_IM () { HEAD_IMS | 0b00 }
325 sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 }
326 sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
327 sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
328 sub IN_BODY_IM () { BODY_IMS }
329 sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
330 sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
331 sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 }
332 sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
333 sub IN_TABLE_IM () { TABLE_IMS }
334 sub AFTER_BODY_IM () { BODY_AFTER_IMS }
335 sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
336 sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 }
337 sub IN_SELECT_IM () { SELECT_IMS | 0b01 }
338 sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 }
339 sub IN_COLUMN_GROUP_IM () { 0b10 } ## the only mode here with no family bit set
340
341 ## Implementations MUST act as if they used the state machine in the spec
342
343 sub _initialize_tokenizer ($) { ## reset the tokenizer fields of the parser to their starting values
344 my $self = shift;
345 $self->{state} = DATA_STATE; # MUST
346 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
347 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
348 undef $self->{current_attribute};
349 undef $self->{last_emitted_start_tag_name};
350 undef $self->{last_attribute_value_state};
351 $self->{char} = [];
352 # $self->{next_char}
353 !!!next-input-character; ## preprocessor macro; presumably loads the first input character via |set_next_char| - confirm in the macro definition
354 $self->{token} = []; ## queue of tokens already produced; |_get_next_token| returns from it before tokenizing further
355 # $self->{escape}
356 } # _initialize_tokenizer
357
358 ## A token has:
359 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
360 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
361 ## ->{name} (DOCTYPE_TOKEN)
362 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
363 ## ->{public_identifier} (DOCTYPE_TOKEN)
364 ## ->{system_identifier} (DOCTYPE_TOKEN)
365 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
366 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
367 ## ->{name}
368 ## ->{value}
369 ## ->{has_reference} == 1 or 0
370 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
371
372 ## Emitted token MUST immediately be handled by the tree construction state.
373
374 ## Before each step, UA MAY check to see if either one of the scripts in
375 ## "list of scripts that will execute as soon as possible" or the first
376 ## script in the "list of scripts that will execute asynchronously",
377 ## has completed loading. If one has, then it MUST be executed
378 ## and removed from the list.
379
380 ## NOTE: HTML5 "Writing HTML documents" section, applied to
381 ## documents and not to user agents and conformance checkers,
382 ## contains some requirements that are not detected by the
383 ## parsing algorithm:
384 ## - Some requirements on character encoding declarations. ## TODO
385 ## - "Elements MUST NOT contain content that their content model disallows."
386 ## ... Some are parse error, some are not (will be reported by c.c.).
387 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
388 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
389 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
390
391 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
392 ## be detected by the HTML5 parsing algorithm:
393 ## - Text,
394
395 sub _get_next_token ($) {
396 my $self = shift;
397 if (@{$self->{token}}) {
398 return shift @{$self->{token}};
399 }
400
401 A: {
402 if ($self->{state} == DATA_STATE) {
403 if ($self->{next_char} == 0x0026) { # &
404 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
405 not $self->{escape}) {
406 !!!cp (1);
407 $self->{state} = ENTITY_DATA_STATE;
408 !!!next-input-character;
409 redo A;
410 } else {
411 !!!cp (2);
412 #
413 }
414 } elsif ($self->{next_char} == 0x002D) { # -
415 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
416 unless ($self->{escape}) {
417 if ($self->{prev_char}->[0] == 0x002D and # -
418 $self->{prev_char}->[1] == 0x0021 and # !
419 $self->{prev_char}->[2] == 0x003C) { # <
420 !!!cp (3);
421 $self->{escape} = 1;
422 } else {
423 !!!cp (4);
424 }
425 } else {
426 !!!cp (5);
427 }
428 }
429
430 #
431 } elsif ($self->{next_char} == 0x003C) { # <
432 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
433 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
434 not $self->{escape})) {
435 !!!cp (6);
436 $self->{state} = TAG_OPEN_STATE;
437 !!!next-input-character;
438 redo A;
439 } else {
440 !!!cp (7);
441 #
442 }
443 } elsif ($self->{next_char} == 0x003E) { # >
444 if ($self->{escape} and
445 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
446 if ($self->{prev_char}->[0] == 0x002D and # -
447 $self->{prev_char}->[1] == 0x002D) { # -
448 !!!cp (8);
449 delete $self->{escape};
450 } else {
451 !!!cp (9);
452 }
453 } else {
454 !!!cp (10);
455 }
456
457 #
458 } elsif ($self->{next_char} == -1) {
459 !!!cp (11);
460 !!!emit ({type => END_OF_FILE_TOKEN,
461 line => $self->{line}, column => $self->{column}});
462 last A; ## TODO: ok?
463 } else {
464 !!!cp (12);
465 }
466 # Anything else
467 my $token = {type => CHARACTER_TOKEN,
468 data => chr $self->{next_char},
469 line => $self->{line}, column => $self->{column}};
470 ## Stay in the data state
471 !!!next-input-character;
472
473 !!!emit ($token);
474
475 redo A;
476 } elsif ($self->{state} == ENTITY_DATA_STATE) {
477 ## (cannot happen in CDATA state)
478
479 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
480
481 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
482
483 $self->{state} = DATA_STATE;
484 # next-input-character is already done
485
486 unless (defined $token) {
487 !!!cp (13);
488 !!!emit ({type => CHARACTER_TOKEN, data => '&',
489 line => $l, column => $c});
490 } else {
491 !!!cp (14);
492 !!!emit ($token);
493 }
494
495 redo A;
496 } elsif ($self->{state} == TAG_OPEN_STATE) {
497 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
498 if ($self->{next_char} == 0x002F) { # /
499 !!!cp (15);
500 !!!next-input-character;
501 $self->{state} = CLOSE_TAG_OPEN_STATE;
502 redo A;
503 } else {
504 !!!cp (16);
505 ## reconsume
506 $self->{state} = DATA_STATE;
507
508 !!!emit ({type => CHARACTER_TOKEN, data => '<',
509 line => $self->{line_prev},
510 column => $self->{column_prev}});
511
512 redo A;
513 }
514 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
515 if ($self->{next_char} == 0x0021) { # !
516 !!!cp (17);
517 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
518 !!!next-input-character;
519 redo A;
520 } elsif ($self->{next_char} == 0x002F) { # /
521 !!!cp (18);
522 $self->{state} = CLOSE_TAG_OPEN_STATE;
523 !!!next-input-character;
524 redo A;
525 } elsif (0x0041 <= $self->{next_char} and
526 $self->{next_char} <= 0x005A) { # A..Z
527 !!!cp (19);
528 $self->{current_token}
529 = {type => START_TAG_TOKEN,
530 tag_name => chr ($self->{next_char} + 0x0020),
531 line => $self->{line_prev},
532 column => $self->{column_prev}};
533 $self->{state} = TAG_NAME_STATE;
534 !!!next-input-character;
535 redo A;
536 } elsif (0x0061 <= $self->{next_char} and
537 $self->{next_char} <= 0x007A) { # a..z
538 !!!cp (20);
539 $self->{current_token} = {type => START_TAG_TOKEN,
540 tag_name => chr ($self->{next_char}),
541 line => $self->{line_prev},
542 column => $self->{column_prev}};
543 $self->{state} = TAG_NAME_STATE;
544 !!!next-input-character;
545 redo A;
546 } elsif ($self->{next_char} == 0x003E) { # >
547 !!!cp (21);
548 !!!parse-error (type => 'empty start tag',
549 line => $self->{line_prev},
550 column => $self->{column_prev});
551 $self->{state} = DATA_STATE;
552 !!!next-input-character;
553
554 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
555 line => $self->{line_prev},
556 column => $self->{column_prev}});
557
558 redo A;
559 } elsif ($self->{next_char} == 0x003F) { # ?
560 !!!cp (22);
561 !!!parse-error (type => 'pio',
562 line => $self->{line_prev},
563 column => $self->{column_prev});
564 $self->{state} = BOGUS_COMMENT_STATE;
565 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
566 line => $self->{line_prev},
567 column => $self->{column_prev}};
568 ## $self->{next_char} is intentionally left as is
569 redo A;
570 } else {
571 !!!cp (23);
572 !!!parse-error (type => 'bare stago');
573 $self->{state} = DATA_STATE;
574 ## reconsume
575
576 !!!emit ({type => CHARACTER_TOKEN, data => '<',
577 line => $self->{line_prev},
578 column => $self->{column_prev}});
579
580 redo A;
581 }
582 } else {
583 die "$0: $self->{content_model} in tag open";
584 }
585 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
586 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
587 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
588 if (defined $self->{last_emitted_start_tag_name}) {
589
590 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
591 my @next_char;
592 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
593 push @next_char, $self->{next_char};
594 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
595 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
596 if ($self->{next_char} == $c or $self->{next_char} == $C) {
597 !!!cp (24);
598 !!!next-input-character;
599 next TAGNAME;
600 } else {
601 !!!cp (25);
602 $self->{next_char} = shift @next_char; # reconsume
603 !!!back-next-input-character (@next_char);
604 $self->{state} = DATA_STATE;
605
606 !!!emit ({type => CHARACTER_TOKEN, data => '</',
607 line => $l, column => $c});
608
609 redo A;
610 }
611 }
612 push @next_char, $self->{next_char};
613
614 unless ($self->{next_char} == 0x0009 or # HT
615 $self->{next_char} == 0x000A or # LF
616 $self->{next_char} == 0x000B or # VT
617 $self->{next_char} == 0x000C or # FF
618 $self->{next_char} == 0x0020 or # SP
619 $self->{next_char} == 0x003E or # >
620 $self->{next_char} == 0x002F or # /
621 $self->{next_char} == -1) {
622 !!!cp (26);
623 $self->{next_char} = shift @next_char; # reconsume
624 !!!back-next-input-character (@next_char);
625 $self->{state} = DATA_STATE;
626 !!!emit ({type => CHARACTER_TOKEN, data => '</',
627 line => $l, column => $c});
628 redo A;
629 } else {
630 !!!cp (27);
631 $self->{next_char} = shift @next_char;
632 !!!back-next-input-character (@next_char);
633 # and consume...
634 }
635 } else {
636 ## No start tag token has ever been emitted
637 !!!cp (28);
638 # next-input-character is already done
639 $self->{state} = DATA_STATE;
640 !!!emit ({type => CHARACTER_TOKEN, data => '</',
641 line => $l, column => $c});
642 redo A;
643 }
644 }
645
646 if (0x0041 <= $self->{next_char} and
647 $self->{next_char} <= 0x005A) { # A..Z
648 !!!cp (29);
649 $self->{current_token}
650 = {type => END_TAG_TOKEN,
651 tag_name => chr ($self->{next_char} + 0x0020),
652 line => $l, column => $c};
653 $self->{state} = TAG_NAME_STATE;
654 !!!next-input-character;
655 redo A;
656 } elsif (0x0061 <= $self->{next_char} and
657 $self->{next_char} <= 0x007A) { # a..z
658 !!!cp (30);
659 $self->{current_token} = {type => END_TAG_TOKEN,
660 tag_name => chr ($self->{next_char}),
661 line => $l, column => $c};
662 $self->{state} = TAG_NAME_STATE;
663 !!!next-input-character;
664 redo A;
665 } elsif ($self->{next_char} == 0x003E) { # >
666 !!!cp (31);
667 !!!parse-error (type => 'empty end tag',
668 line => $self->{line_prev}, ## "<" in "</>"
669 column => $self->{column_prev} - 1);
670 $self->{state} = DATA_STATE;
671 !!!next-input-character;
672 redo A;
673 } elsif ($self->{next_char} == -1) {
674 !!!cp (32);
675 !!!parse-error (type => 'bare etago');
676 $self->{state} = DATA_STATE;
677 # reconsume
678
679 !!!emit ({type => CHARACTER_TOKEN, data => '</',
680 line => $l, column => $c});
681
682 redo A;
683 } else {
684 !!!cp (33);
685 !!!parse-error (type => 'bogus end tag');
686 $self->{state} = BOGUS_COMMENT_STATE;
687 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
688 line => $self->{line_prev}, # "<" of "</"
689 column => $self->{column_prev} - 1};
690 ## $self->{next_char} is intentionally left as is
691 redo A;
692 }
693 } elsif ($self->{state} == TAG_NAME_STATE) {
694 if ($self->{next_char} == 0x0009 or # HT
695 $self->{next_char} == 0x000A or # LF
696 $self->{next_char} == 0x000B or # VT
697 $self->{next_char} == 0x000C or # FF
698 $self->{next_char} == 0x0020) { # SP
699 !!!cp (34);
700 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
701 !!!next-input-character;
702 redo A;
703 } elsif ($self->{next_char} == 0x003E) { # >
704 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
705 !!!cp (35);
706 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
707 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
708 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
709 #if ($self->{current_token}->{attributes}) {
710 # ## NOTE: This should never be reached.
711 # !!! cp (36);
712 # !!! parse-error (type => 'end tag attribute');
713 #} else {
714 !!!cp (37);
715 #}
716 } else {
717 die "$0: $self->{current_token}->{type}: Unknown token type";
718 }
719 $self->{state} = DATA_STATE;
720 !!!next-input-character;
721
722 !!!emit ($self->{current_token}); # start tag or end tag
723
724 redo A;
725 } elsif (0x0041 <= $self->{next_char} and
726 $self->{next_char} <= 0x005A) { # A..Z
727 !!!cp (38);
728 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
729 # start tag or end tag
730 ## Stay in this state
731 !!!next-input-character;
732 redo A;
733 } elsif ($self->{next_char} == -1) {
734 !!!parse-error (type => 'unclosed tag');
735 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
736 !!!cp (39);
737 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
738 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
739 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
740 #if ($self->{current_token}->{attributes}) {
741 # ## NOTE: This state should never be reached.
742 # !!! cp (40);
743 # !!! parse-error (type => 'end tag attribute');
744 #} else {
745 !!!cp (41);
746 #}
747 } else {
748 die "$0: $self->{current_token}->{type}: Unknown token type";
749 }
750 $self->{state} = DATA_STATE;
751 # reconsume
752
753 !!!emit ($self->{current_token}); # start tag or end tag
754
755 redo A;
756 } elsif ($self->{next_char} == 0x002F) { # /
757 !!!next-input-character;
758 if ($self->{next_char} == 0x003E and # >
759 $self->{current_token}->{type} == START_TAG_TOKEN and
760 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
761 # permitted slash
762 !!!cp (42);
763 #
764 } else {
765 !!!cp (43);
766 !!!parse-error (type => 'nestc');
767 }
768 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
769 # next-input-character is already done
770 redo A;
771 } else {
772 !!!cp (44);
773 $self->{current_token}->{tag_name} .= chr $self->{next_char};
774 # start tag or end tag
775 ## Stay in the state
776 !!!next-input-character;
777 redo A;
778 }
779 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
780 if ($self->{next_char} == 0x0009 or # HT
781 $self->{next_char} == 0x000A or # LF
782 $self->{next_char} == 0x000B or # VT
783 $self->{next_char} == 0x000C or # FF
784 $self->{next_char} == 0x0020) { # SP
785 !!!cp (45);
786 ## Stay in the state
787 !!!next-input-character;
788 redo A;
789 } elsif ($self->{next_char} == 0x003E) { # >
790 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
791 !!!cp (46);
792 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
793 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
794 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
795 if ($self->{current_token}->{attributes}) {
796 !!!cp (47);
797 !!!parse-error (type => 'end tag attribute');
798 } else {
799 !!!cp (48);
800 }
801 } else {
802 die "$0: $self->{current_token}->{type}: Unknown token type";
803 }
804 $self->{state} = DATA_STATE;
805 !!!next-input-character;
806
807 !!!emit ($self->{current_token}); # start tag or end tag
808
809 redo A;
810 } elsif (0x0041 <= $self->{next_char} and
811 $self->{next_char} <= 0x005A) { # A..Z
812 !!!cp (49);
813 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
814 value => ''};
815 $self->{state} = ATTRIBUTE_NAME_STATE;
816 !!!next-input-character;
817 redo A;
818 } elsif ($self->{next_char} == 0x002F) { # /
819 !!!next-input-character;
820 if ($self->{next_char} == 0x003E and # >
821 $self->{current_token}->{type} == START_TAG_TOKEN and
822 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
823 # permitted slash
824 !!!cp (50);
825 #
826 } else {
827 !!!cp (51);
828 !!!parse-error (type => 'nestc');
829 }
830 ## Stay in the state
831 # next-input-character is already done
832 redo A;
833 } elsif ($self->{next_char} == -1) {
834 !!!parse-error (type => 'unclosed tag');
835 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
836 !!!cp (52);
837 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
838 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
839 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
840 if ($self->{current_token}->{attributes}) {
841 !!!cp (53);
842 !!!parse-error (type => 'end tag attribute');
843 } else {
844 !!!cp (54);
845 }
846 } else {
847 die "$0: $self->{current_token}->{type}: Unknown token type";
848 }
849 $self->{state} = DATA_STATE;
850 # reconsume
851
852 !!!emit ($self->{current_token}); # start tag or end tag
853
854 redo A;
855 } else {
856 if ({
857 0x0022 => 1, # "
858 0x0027 => 1, # '
859 0x003D => 1, # =
860 }->{$self->{next_char}}) {
861 !!!cp (55);
862 !!!parse-error (type => 'bad attribute name');
863 } else {
864 !!!cp (56);
865 }
866 $self->{current_attribute} = {name => chr ($self->{next_char}),
867 value => ''};
868 $self->{state} = ATTRIBUTE_NAME_STATE;
869 !!!next-input-character;
870 redo A;
871 }
872 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
873 my $before_leave = sub {
874 if (exists $self->{current_token}->{attributes} # start tag or end tag
875 ->{$self->{current_attribute}->{name}}) { # MUST
876 !!!cp (57);
877 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
878 ## Discard $self->{current_attribute} # MUST
879 } else {
880 !!!cp (58);
881 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
882 = $self->{current_attribute};
883 }
884 }; # $before_leave
885
886 if ($self->{next_char} == 0x0009 or # HT
887 $self->{next_char} == 0x000A or # LF
888 $self->{next_char} == 0x000B or # VT
889 $self->{next_char} == 0x000C or # FF
890 $self->{next_char} == 0x0020) { # SP
891 !!!cp (59);
892 $before_leave->();
893 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
894 !!!next-input-character;
895 redo A;
896 } elsif ($self->{next_char} == 0x003D) { # =
897 !!!cp (60);
898 $before_leave->();
899 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
900 !!!next-input-character;
901 redo A;
902 } elsif ($self->{next_char} == 0x003E) { # >
903 $before_leave->();
904 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
905 !!!cp (61);
906 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
907 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
908 !!!cp (62);
909 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
910 if ($self->{current_token}->{attributes}) {
911 !!!parse-error (type => 'end tag attribute');
912 }
913 } else {
914 die "$0: $self->{current_token}->{type}: Unknown token type";
915 }
916 $self->{state} = DATA_STATE;
917 !!!next-input-character;
918
919 !!!emit ($self->{current_token}); # start tag or end tag
920
921 redo A;
922 } elsif (0x0041 <= $self->{next_char} and
923 $self->{next_char} <= 0x005A) { # A..Z
924 !!!cp (63);
925 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
926 ## Stay in the state
927 !!!next-input-character;
928 redo A;
929 } elsif ($self->{next_char} == 0x002F) { # /
930 $before_leave->();
931 !!!next-input-character;
932 if ($self->{next_char} == 0x003E and # >
933 $self->{current_token}->{type} == START_TAG_TOKEN and
934 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
935 # permitted slash
936 !!!cp (64);
937 #
938 } else {
939 !!!cp (65);
940 !!!parse-error (type => 'nestc');
941 }
942 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
943 # next-input-character is already done
944 redo A;
945 } elsif ($self->{next_char} == -1) {
946 !!!parse-error (type => 'unclosed tag');
947 $before_leave->();
948 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
949 !!!cp (66);
950 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
951 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
952 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
953 if ($self->{current_token}->{attributes}) {
954 !!!cp (67);
955 !!!parse-error (type => 'end tag attribute');
956 } else {
957 ## NOTE: This state should never be reached.
958 !!!cp (68);
959 }
960 } else {
961 die "$0: $self->{current_token}->{type}: Unknown token type";
962 }
963 $self->{state} = DATA_STATE;
964 # reconsume
965
966 !!!emit ($self->{current_token}); # start tag or end tag
967
968 redo A;
969 } else {
970 if ($self->{next_char} == 0x0022 or # "
971 $self->{next_char} == 0x0027) { # '
972 !!!cp (69);
973 !!!parse-error (type => 'bad attribute name');
974 } else {
975 !!!cp (70);
976 }
977 $self->{current_attribute}->{name} .= chr ($self->{next_char});
978 ## Stay in the state
979 !!!next-input-character;
980 redo A;
981 }
982 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
983 if ($self->{next_char} == 0x0009 or # HT
984 $self->{next_char} == 0x000A or # LF
985 $self->{next_char} == 0x000B or # VT
986 $self->{next_char} == 0x000C or # FF
987 $self->{next_char} == 0x0020) { # SP
988 !!!cp (71);
989 ## Stay in the state
990 !!!next-input-character;
991 redo A;
992 } elsif ($self->{next_char} == 0x003D) { # =
993 !!!cp (72);
994 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{next_char} == 0x003E) { # >
998 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
999 !!!cp (73);
1000 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1001 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1002 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1003 if ($self->{current_token}->{attributes}) {
1004 !!!cp (74);
1005 !!!parse-error (type => 'end tag attribute');
1006 } else {
1007 ## NOTE: This state should never be reached.
1008 !!!cp (75);
1009 }
1010 } else {
1011 die "$0: $self->{current_token}->{type}: Unknown token type";
1012 }
1013 $self->{state} = DATA_STATE;
1014 !!!next-input-character;
1015
1016 !!!emit ($self->{current_token}); # start tag or end tag
1017
1018 redo A;
1019 } elsif (0x0041 <= $self->{next_char} and
1020 $self->{next_char} <= 0x005A) { # A..Z
1021 !!!cp (76);
1022 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
1023 value => ''};
1024 $self->{state} = ATTRIBUTE_NAME_STATE;
1025 !!!next-input-character;
1026 redo A;
1027 } elsif ($self->{next_char} == 0x002F) { # /
1028 !!!next-input-character;
1029 if ($self->{next_char} == 0x003E and # >
1030 $self->{current_token}->{type} == START_TAG_TOKEN and
1031 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1032 # permitted slash
1033 !!!cp (77);
1034 #
1035 } else {
1036 !!!cp (78);
1037 !!!parse-error (type => 'nestc');
1038 ## TODO: Different error type for <aa / bb> than <aa/>
1039 }
1040 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1041 # next-input-character is already done
1042 redo A;
1043 } elsif ($self->{next_char} == -1) {
1044 !!!parse-error (type => 'unclosed tag');
1045 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1046 !!!cp (79);
1047 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1048 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1049 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1050 if ($self->{current_token}->{attributes}) {
1051 !!!cp (80);
1052 !!!parse-error (type => 'end tag attribute');
1053 } else {
1054 ## NOTE: This state should never be reached.
1055 !!!cp (81);
1056 }
1057 } else {
1058 die "$0: $self->{current_token}->{type}: Unknown token type";
1059 }
1060 $self->{state} = DATA_STATE;
1061 # reconsume
1062
1063 !!!emit ($self->{current_token}); # start tag or end tag
1064
1065 redo A;
1066 } else {
1067 !!!cp (82);
1068 $self->{current_attribute} = {name => chr ($self->{next_char}),
1069 value => ''};
1070 $self->{state} = ATTRIBUTE_NAME_STATE;
1071 !!!next-input-character;
1072 redo A;
1073 }
1074 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1075 if ($self->{next_char} == 0x0009 or # HT
1076 $self->{next_char} == 0x000A or # LF
1077 $self->{next_char} == 0x000B or # VT
1078 $self->{next_char} == 0x000C or # FF
1079 $self->{next_char} == 0x0020) { # SP
1080 !!!cp (83);
1081 ## Stay in the state
1082 !!!next-input-character;
1083 redo A;
1084 } elsif ($self->{next_char} == 0x0022) { # "
1085 !!!cp (84);
1086 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1087 !!!next-input-character;
1088 redo A;
1089 } elsif ($self->{next_char} == 0x0026) { # &
1090 !!!cp (85);
1091 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1092 ## reconsume
1093 redo A;
1094 } elsif ($self->{next_char} == 0x0027) { # '
1095 !!!cp (86);
1096 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1097 !!!next-input-character;
1098 redo A;
1099 } elsif ($self->{next_char} == 0x003E) { # >
1100 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1101 !!!cp (87);
1102 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1103 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1104 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1105 if ($self->{current_token}->{attributes}) {
1106 !!!cp (88);
1107 !!!parse-error (type => 'end tag attribute');
1108 } else {
1109 ## NOTE: This state should never be reached.
1110 !!!cp (89);
1111 }
1112 } else {
1113 die "$0: $self->{current_token}->{type}: Unknown token type";
1114 }
1115 $self->{state} = DATA_STATE;
1116 !!!next-input-character;
1117
1118 !!!emit ($self->{current_token}); # start tag or end tag
1119
1120 redo A;
1121 } elsif ($self->{next_char} == -1) {
1122 !!!parse-error (type => 'unclosed tag');
1123 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1124 !!!cp (90);
1125 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1126 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1127 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1128 if ($self->{current_token}->{attributes}) {
1129 !!!cp (91);
1130 !!!parse-error (type => 'end tag attribute');
1131 } else {
1132 ## NOTE: This state should never be reached.
1133 !!!cp (92);
1134 }
1135 } else {
1136 die "$0: $self->{current_token}->{type}: Unknown token type";
1137 }
1138 $self->{state} = DATA_STATE;
1139 ## reconsume
1140
1141 !!!emit ($self->{current_token}); # start tag or end tag
1142
1143 redo A;
1144 } else {
1145 if ($self->{next_char} == 0x003D) { # =
1146 !!!cp (93);
1147 !!!parse-error (type => 'bad attribute value');
1148 } else {
1149 !!!cp (94);
1150 }
1151 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1152 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1153 !!!next-input-character;
1154 redo A;
1155 }
1156 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1157 if ($self->{next_char} == 0x0022) { # "
1158 !!!cp (95);
1159 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1160 !!!next-input-character;
1161 redo A;
1162 } elsif ($self->{next_char} == 0x0026) { # &
1163 !!!cp (96);
1164 $self->{last_attribute_value_state} = $self->{state};
1165 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1166 !!!next-input-character;
1167 redo A;
1168 } elsif ($self->{next_char} == -1) {
1169 !!!parse-error (type => 'unclosed attribute value');
1170 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1171 !!!cp (97);
1172 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1173 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1174 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1175 if ($self->{current_token}->{attributes}) {
1176 !!!cp (98);
1177 !!!parse-error (type => 'end tag attribute');
1178 } else {
1179 ## NOTE: This state should never be reached.
1180 !!!cp (99);
1181 }
1182 } else {
1183 die "$0: $self->{current_token}->{type}: Unknown token type";
1184 }
1185 $self->{state} = DATA_STATE;
1186 ## reconsume
1187
1188 !!!emit ($self->{current_token}); # start tag or end tag
1189
1190 redo A;
1191 } else {
1192 !!!cp (100);
1193 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1194 ## Stay in the state
1195 !!!next-input-character;
1196 redo A;
1197 }
1198 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1199 if ($self->{next_char} == 0x0027) { # '
1200 !!!cp (101);
1201 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1202 !!!next-input-character;
1203 redo A;
1204 } elsif ($self->{next_char} == 0x0026) { # &
1205 !!!cp (102);
1206 $self->{last_attribute_value_state} = $self->{state};
1207 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1208 !!!next-input-character;
1209 redo A;
1210 } elsif ($self->{next_char} == -1) {
1211 !!!parse-error (type => 'unclosed attribute value');
1212 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1213 !!!cp (103);
1214 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1215 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1216 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1217 if ($self->{current_token}->{attributes}) {
1218 !!!cp (104);
1219 !!!parse-error (type => 'end tag attribute');
1220 } else {
1221 ## NOTE: This state should never be reached.
1222 !!!cp (105);
1223 }
1224 } else {
1225 die "$0: $self->{current_token}->{type}: Unknown token type";
1226 }
1227 $self->{state} = DATA_STATE;
1228 ## reconsume
1229
1230 !!!emit ($self->{current_token}); # start tag or end tag
1231
1232 redo A;
1233 } else {
1234 !!!cp (106);
1235 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1236 ## Stay in the state
1237 !!!next-input-character;
1238 redo A;
1239 }
1240 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1241 if ($self->{next_char} == 0x0009 or # HT
1242 $self->{next_char} == 0x000A or # LF
1243 $self->{next_char} == 0x000B or # VT
1244 $self->{next_char} == 0x000C or # FF
1245 $self->{next_char} == 0x0020) { # SP
1246 !!!cp (107);
1247 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1248 !!!next-input-character;
1249 redo A;
1250 } elsif ($self->{next_char} == 0x0026) { # &
1251 !!!cp (108);
1252 $self->{last_attribute_value_state} = $self->{state};
1253 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1254 !!!next-input-character;
1255 redo A;
1256 } elsif ($self->{next_char} == 0x003E) { # >
1257 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1258 !!!cp (109);
1259 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1260 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1261 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1262 if ($self->{current_token}->{attributes}) {
1263 !!!cp (110);
1264 !!!parse-error (type => 'end tag attribute');
1265 } else {
1266 ## NOTE: This state should never be reached.
1267 !!!cp (111);
1268 }
1269 } else {
1270 die "$0: $self->{current_token}->{type}: Unknown token type";
1271 }
1272 $self->{state} = DATA_STATE;
1273 !!!next-input-character;
1274
1275 !!!emit ($self->{current_token}); # start tag or end tag
1276
1277 redo A;
1278 } elsif ($self->{next_char} == -1) {
1279 !!!parse-error (type => 'unclosed tag');
1280 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1281 !!!cp (112);
1282 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1283 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1284 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1285 if ($self->{current_token}->{attributes}) {
1286 !!!cp (113);
1287 !!!parse-error (type => 'end tag attribute');
1288 } else {
1289 ## NOTE: This state should never be reached.
1290 !!!cp (114);
1291 }
1292 } else {
1293 die "$0: $self->{current_token}->{type}: Unknown token type";
1294 }
1295 $self->{state} = DATA_STATE;
1296 ## reconsume
1297
1298 !!!emit ($self->{current_token}); # start tag or end tag
1299
1300 redo A;
1301 } else {
1302 if ({
1303 0x0022 => 1, # "
1304 0x0027 => 1, # '
1305 0x003D => 1, # =
1306 }->{$self->{next_char}}) {
1307 !!!cp (115);
1308 !!!parse-error (type => 'bad attribute value');
1309 } else {
1310 !!!cp (116);
1311 }
1312 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1313 ## Stay in the state
1314 !!!next-input-character;
1315 redo A;
1316 }
1317 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1318 my $token = $self->_tokenize_attempt_to_consume_an_entity
1319 (1,
1320 $self->{last_attribute_value_state}
1321 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1322 $self->{last_attribute_value_state}
1323 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1324 -1);
1325
1326 unless (defined $token) {
1327 !!!cp (117);
1328 $self->{current_attribute}->{value} .= '&';
1329 } else {
1330 !!!cp (118);
1331 $self->{current_attribute}->{value} .= $token->{data};
1332 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1333 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1334 }
1335
1336 $self->{state} = $self->{last_attribute_value_state};
1337 # next-input-character is already done
1338 redo A;
1339 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1340 if ($self->{next_char} == 0x0009 or # HT
1341 $self->{next_char} == 0x000A or # LF
1342 $self->{next_char} == 0x000B or # VT
1343 $self->{next_char} == 0x000C or # FF
1344 $self->{next_char} == 0x0020) { # SP
1345 !!!cp (118);
1346 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1347 !!!next-input-character;
1348 redo A;
1349 } elsif ($self->{next_char} == 0x003E) { # >
1350 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1351 !!!cp (119);
1352 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1353 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1354 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1355 if ($self->{current_token}->{attributes}) {
1356 !!!cp (120);
1357 !!!parse-error (type => 'end tag attribute');
1358 } else {
1359 ## NOTE: This state should never be reached.
1360 !!!cp (121);
1361 }
1362 } else {
1363 die "$0: $self->{current_token}->{type}: Unknown token type";
1364 }
1365 $self->{state} = DATA_STATE;
1366 !!!next-input-character;
1367
1368 !!!emit ($self->{current_token}); # start tag or end tag
1369
1370 redo A;
1371 } elsif ($self->{next_char} == 0x002F) { # /
1372 !!!next-input-character;
1373 if ($self->{next_char} == 0x003E and # >
1374 $self->{current_token}->{type} == START_TAG_TOKEN and
1375 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1376 # permitted slash
1377 !!!cp (122);
1378 #
1379 } else {
1380 !!!cp (123);
1381 !!!parse-error (type => 'nestc');
1382 }
1383 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1384 # next-input-character is already done
1385 redo A;
1386 } else {
1387 !!!cp (124);
1388 !!!parse-error (type => 'no space between attributes');
1389 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1390 ## reconsume
1391 redo A;
1392 }
1393 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1394 ## (only happens in the PCDATA state)
1395
1396 ## NOTE: Set by the previous state
1397 #my $token = {type => COMMENT_TOKEN, data => ''};
1398
1399 BC: {
1400 if ($self->{next_char} == 0x003E) { # >
1401 !!!cp (124);
1402 $self->{state} = DATA_STATE;
1403 !!!next-input-character;
1404
1405 !!!emit ($self->{current_token}); # comment
1406
1407 redo A;
1408 } elsif ($self->{next_char} == -1) {
1409 !!!cp (125);
1410 $self->{state} = DATA_STATE;
1411 ## reconsume
1412
1413 !!!emit ($self->{current_token}); # comment
1414
1415 redo A;
1416 } else {
1417 !!!cp (126);
1418 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1419 !!!next-input-character;
1420 redo BC;
1421 }
1422 } # BC
1423
1424 die "$0: _get_next_token: unexpected case [BC]";
1425 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1426 ## (only happens in the PCDATA state)
1427
1428 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1429
1430 my @next_char;
1431 push @next_char, $self->{next_char};
1432
1433 if ($self->{next_char} == 0x002D) { # -
1434 !!!next-input-character;
1435 push @next_char, $self->{next_char};
1436 if ($self->{next_char} == 0x002D) { # -
1437 !!!cp (127);
1438 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1439 line => $l, column => $c};
1440 $self->{state} = COMMENT_START_STATE;
1441 !!!next-input-character;
1442 redo A;
1443 } else {
1444 !!!cp (128);
1445 }
1446 } elsif ($self->{next_char} == 0x0044 or # D
1447 $self->{next_char} == 0x0064) { # d
1448 !!!next-input-character;
1449 push @next_char, $self->{next_char};
1450 if ($self->{next_char} == 0x004F or # O
1451 $self->{next_char} == 0x006F) { # o
1452 !!!next-input-character;
1453 push @next_char, $self->{next_char};
1454 if ($self->{next_char} == 0x0043 or # C
1455 $self->{next_char} == 0x0063) { # c
1456 !!!next-input-character;
1457 push @next_char, $self->{next_char};
1458 if ($self->{next_char} == 0x0054 or # T
1459 $self->{next_char} == 0x0074) { # t
1460 !!!next-input-character;
1461 push @next_char, $self->{next_char};
1462 if ($self->{next_char} == 0x0059 or # Y
1463 $self->{next_char} == 0x0079) { # y
1464 !!!next-input-character;
1465 push @next_char, $self->{next_char};
1466 if ($self->{next_char} == 0x0050 or # P
1467 $self->{next_char} == 0x0070) { # p
1468 !!!next-input-character;
1469 push @next_char, $self->{next_char};
1470 if ($self->{next_char} == 0x0045 or # E
1471 $self->{next_char} == 0x0065) { # e
1472 !!!cp (129);
1473 ## TODO: What a stupid code this is!
1474 $self->{state} = DOCTYPE_STATE;
1475 $self->{current_token} = {type => DOCTYPE_TOKEN,
1476 quirks => 1,
1477 line => $l, column => $c};
1478 !!!next-input-character;
1479 redo A;
1480 } else {
1481 !!!cp (130);
1482 }
1483 } else {
1484 !!!cp (131);
1485 }
1486 } else {
1487 !!!cp (132);
1488 }
1489 } else {
1490 !!!cp (133);
1491 }
1492 } else {
1493 !!!cp (134);
1494 }
1495 } else {
1496 !!!cp (135);
1497 }
1498 } else {
1499 !!!cp (136);
1500 }
1501
1502 !!!parse-error (type => 'bogus comment');
1503 $self->{next_char} = shift @next_char;
1504 !!!back-next-input-character (@next_char);
1505 $self->{state} = BOGUS_COMMENT_STATE;
1506 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1507 line => $l, column => $c};
1508 redo A;
1509
1510 ## ISSUE: typos in spec: chacacters, is is a parse error
1511 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1512 } elsif ($self->{state} == COMMENT_START_STATE) {
1513 if ($self->{next_char} == 0x002D) { # -
1514 !!!cp (137);
1515 $self->{state} = COMMENT_START_DASH_STATE;
1516 !!!next-input-character;
1517 redo A;
1518 } elsif ($self->{next_char} == 0x003E) { # >
1519 !!!cp (138);
1520 !!!parse-error (type => 'bogus comment');
1521 $self->{state} = DATA_STATE;
1522 !!!next-input-character;
1523
1524 !!!emit ($self->{current_token}); # comment
1525
1526 redo A;
1527 } elsif ($self->{next_char} == -1) {
1528 !!!cp (139);
1529 !!!parse-error (type => 'unclosed comment');
1530 $self->{state} = DATA_STATE;
1531 ## reconsume
1532
1533 !!!emit ($self->{current_token}); # comment
1534
1535 redo A;
1536 } else {
1537 !!!cp (140);
1538 $self->{current_token}->{data} # comment
1539 .= chr ($self->{next_char});
1540 $self->{state} = COMMENT_STATE;
1541 !!!next-input-character;
1542 redo A;
1543 }
1544 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1545 if ($self->{next_char} == 0x002D) { # -
1546 !!!cp (141);
1547 $self->{state} = COMMENT_END_STATE;
1548 !!!next-input-character;
1549 redo A;
1550 } elsif ($self->{next_char} == 0x003E) { # >
1551 !!!cp (142);
1552 !!!parse-error (type => 'bogus comment');
1553 $self->{state} = DATA_STATE;
1554 !!!next-input-character;
1555
1556 !!!emit ($self->{current_token}); # comment
1557
1558 redo A;
1559 } elsif ($self->{next_char} == -1) {
1560 !!!cp (143);
1561 !!!parse-error (type => 'unclosed comment');
1562 $self->{state} = DATA_STATE;
1563 ## reconsume
1564
1565 !!!emit ($self->{current_token}); # comment
1566
1567 redo A;
1568 } else {
1569 !!!cp (144);
1570 $self->{current_token}->{data} # comment
1571 .= '-' . chr ($self->{next_char});
1572 $self->{state} = COMMENT_STATE;
1573 !!!next-input-character;
1574 redo A;
1575 }
1576 } elsif ($self->{state} == COMMENT_STATE) {
1577 if ($self->{next_char} == 0x002D) { # -
1578 !!!cp (145);
1579 $self->{state} = COMMENT_END_DASH_STATE;
1580 !!!next-input-character;
1581 redo A;
1582 } elsif ($self->{next_char} == -1) {
1583 !!!cp (146);
1584 !!!parse-error (type => 'unclosed comment');
1585 $self->{state} = DATA_STATE;
1586 ## reconsume
1587
1588 !!!emit ($self->{current_token}); # comment
1589
1590 redo A;
1591 } else {
1592 !!!cp (147);
1593 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1594 ## Stay in the state
1595 !!!next-input-character;
1596 redo A;
1597 }
1598 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1599 if ($self->{next_char} == 0x002D) { # -
1600 !!!cp (148);
1601 $self->{state} = COMMENT_END_STATE;
1602 !!!next-input-character;
1603 redo A;
1604 } elsif ($self->{next_char} == -1) {
1605 !!!cp (149);
1606 !!!parse-error (type => 'unclosed comment');
1607 $self->{state} = DATA_STATE;
1608 ## reconsume
1609
1610 !!!emit ($self->{current_token}); # comment
1611
1612 redo A;
1613 } else {
1614 !!!cp (150);
1615 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
1616 $self->{state} = COMMENT_STATE;
1617 !!!next-input-character;
1618 redo A;
1619 }
1620 } elsif ($self->{state} == COMMENT_END_STATE) {
1621 if ($self->{next_char} == 0x003E) { # >
1622 !!!cp (151);
1623 $self->{state} = DATA_STATE;
1624 !!!next-input-character;
1625
1626 !!!emit ($self->{current_token}); # comment
1627
1628 redo A;
1629 } elsif ($self->{next_char} == 0x002D) { # -
1630 !!!cp (152);
1631 !!!parse-error (type => 'dash in comment',
1632 line => $self->{line_prev},
1633 column => $self->{column_prev});
1634 $self->{current_token}->{data} .= '-'; # comment
1635 ## Stay in the state
1636 !!!next-input-character;
1637 redo A;
1638 } elsif ($self->{next_char} == -1) {
1639 !!!cp (153);
1640 !!!parse-error (type => 'unclosed comment');
1641 $self->{state} = DATA_STATE;
1642 ## reconsume
1643
1644 !!!emit ($self->{current_token}); # comment
1645
1646 redo A;
1647 } else {
1648 !!!cp (154);
1649 !!!parse-error (type => 'dash in comment',
1650 line => $self->{line_prev},
1651 column => $self->{column_prev});
1652 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
1653 $self->{state} = COMMENT_STATE;
1654 !!!next-input-character;
1655 redo A;
1656 }
1657 } elsif ($self->{state} == DOCTYPE_STATE) {
1658 if ($self->{next_char} == 0x0009 or # HT
1659 $self->{next_char} == 0x000A or # LF
1660 $self->{next_char} == 0x000B or # VT
1661 $self->{next_char} == 0x000C or # FF
1662 $self->{next_char} == 0x0020) { # SP
1663 !!!cp (155);
1664 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1665 !!!next-input-character;
1666 redo A;
1667 } else {
1668 !!!cp (156);
1669 !!!parse-error (type => 'no space before DOCTYPE name');
1670 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1671 ## reconsume
1672 redo A;
1673 }
1674 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1675 if ($self->{next_char} == 0x0009 or # HT
1676 $self->{next_char} == 0x000A or # LF
1677 $self->{next_char} == 0x000B or # VT
1678 $self->{next_char} == 0x000C or # FF
1679 $self->{next_char} == 0x0020) { # SP
1680 !!!cp (157);
1681 ## Stay in the state
1682 !!!next-input-character;
1683 redo A;
1684 } elsif ($self->{next_char} == 0x003E) { # >
1685 !!!cp (158);
1686 !!!parse-error (type => 'no DOCTYPE name');
1687 $self->{state} = DATA_STATE;
1688 !!!next-input-character;
1689
1690 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
1691
1692 redo A;
1693 } elsif ($self->{next_char} == -1) {
1694 !!!cp (159);
1695 !!!parse-error (type => 'no DOCTYPE name');
1696 $self->{state} = DATA_STATE;
1697 ## reconsume
1698
1699 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
1700
1701 redo A;
1702 } else {
1703 !!!cp (160);
1704 $self->{current_token}->{name} = chr $self->{next_char};
1705 delete $self->{current_token}->{quirks};
1706 ## ISSUE: "Set the token's name name to the" in the spec
1707 $self->{state} = DOCTYPE_NAME_STATE;
1708 !!!next-input-character;
1709 redo A;
1710 }
1711 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1712 ## ISSUE: Redundant "First," in the spec.
1713 if ($self->{next_char} == 0x0009 or # HT
1714 $self->{next_char} == 0x000A or # LF
1715 $self->{next_char} == 0x000B or # VT
1716 $self->{next_char} == 0x000C or # FF
1717 $self->{next_char} == 0x0020) { # SP
1718 !!!cp (161);
1719 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1720 !!!next-input-character;
1721 redo A;
1722 } elsif ($self->{next_char} == 0x003E) { # >
1723 !!!cp (162);
1724 $self->{state} = DATA_STATE;
1725 !!!next-input-character;
1726
1727 !!!emit ($self->{current_token}); # DOCTYPE
1728
1729 redo A;
1730 } elsif ($self->{next_char} == -1) {
1731 !!!cp (163);
1732 !!!parse-error (type => 'unclosed DOCTYPE');
1733 $self->{state} = DATA_STATE;
1734 ## reconsume
1735
1736 $self->{current_token}->{quirks} = 1;
1737 !!!emit ($self->{current_token}); # DOCTYPE
1738
1739 redo A;
1740 } else {
1741 !!!cp (164);
1742 $self->{current_token}->{name}
1743 .= chr ($self->{next_char}); # DOCTYPE
1744 ## Stay in the state
1745 !!!next-input-character;
1746 redo A;
1747 }
1748 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1749 if ($self->{next_char} == 0x0009 or # HT
1750 $self->{next_char} == 0x000A or # LF
1751 $self->{next_char} == 0x000B or # VT
1752 $self->{next_char} == 0x000C or # FF
1753 $self->{next_char} == 0x0020) { # SP
1754 !!!cp (165);
1755 ## Stay in the state
1756 !!!next-input-character;
1757 redo A;
1758 } elsif ($self->{next_char} == 0x003E) { # >
1759 !!!cp (166);
1760 $self->{state} = DATA_STATE;
1761 !!!next-input-character;
1762
1763 !!!emit ($self->{current_token}); # DOCTYPE
1764
1765 redo A;
1766 } elsif ($self->{next_char} == -1) {
1767 !!!cp (167);
1768 !!!parse-error (type => 'unclosed DOCTYPE');
1769 $self->{state} = DATA_STATE;
1770 ## reconsume
1771
1772 $self->{current_token}->{quirks} = 1;
1773 !!!emit ($self->{current_token}); # DOCTYPE
1774
1775 redo A;
1776 } elsif ($self->{next_char} == 0x0050 or # P
1777 $self->{next_char} == 0x0070) { # p
1778 !!!next-input-character;
1779 if ($self->{next_char} == 0x0055 or # U
1780 $self->{next_char} == 0x0075) { # u
1781 !!!next-input-character;
1782 if ($self->{next_char} == 0x0042 or # B
1783 $self->{next_char} == 0x0062) { # b
1784 !!!next-input-character;
1785 if ($self->{next_char} == 0x004C or # L
1786 $self->{next_char} == 0x006C) { # l
1787 !!!next-input-character;
1788 if ($self->{next_char} == 0x0049 or # I
1789 $self->{next_char} == 0x0069) { # i
1790 !!!next-input-character;
1791 if ($self->{next_char} == 0x0043 or # C
1792 $self->{next_char} == 0x0063) { # c
1793 !!!cp (168);
1794 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1795 !!!next-input-character;
1796 redo A;
1797 } else {
1798 !!!cp (169);
1799 }
1800 } else {
1801 !!!cp (170);
1802 }
1803 } else {
1804 !!!cp (171);
1805 }
1806 } else {
1807 !!!cp (172);
1808 }
1809 } else {
1810 !!!cp (173);
1811 }
1812
1813 #
1814 } elsif ($self->{next_char} == 0x0053 or # S
1815 $self->{next_char} == 0x0073) { # s
1816 !!!next-input-character;
1817 if ($self->{next_char} == 0x0059 or # Y
1818 $self->{next_char} == 0x0079) { # y
1819 !!!next-input-character;
1820 if ($self->{next_char} == 0x0053 or # S
1821 $self->{next_char} == 0x0073) { # s
1822 !!!next-input-character;
1823 if ($self->{next_char} == 0x0054 or # T
1824 $self->{next_char} == 0x0074) { # t
1825 !!!next-input-character;
1826 if ($self->{next_char} == 0x0045 or # E
1827 $self->{next_char} == 0x0065) { # e
1828 !!!next-input-character;
1829 if ($self->{next_char} == 0x004D or # M
1830 $self->{next_char} == 0x006D) { # m
1831 !!!cp (174);
1832 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1833 !!!next-input-character;
1834 redo A;
1835 } else {
1836 !!!cp (175);
1837 }
1838 } else {
1839 !!!cp (176);
1840 }
1841 } else {
1842 !!!cp (177);
1843 }
1844 } else {
1845 !!!cp (178);
1846 }
1847 } else {
1848 !!!cp (179);
1849 }
1850
1851 #
1852 } else {
1853 !!!cp (180);
1854 !!!next-input-character;
1855 #
1856 }
1857
1858 !!!parse-error (type => 'string after DOCTYPE name');
1859 $self->{current_token}->{quirks} = 1;
1860
1861 $self->{state} = BOGUS_DOCTYPE_STATE;
1862 # next-input-character is already done
1863 redo A;
1864 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1865 if ({
1866 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1867 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1868 }->{$self->{next_char}}) {
1869 !!!cp (181);
1870 ## Stay in the state
1871 !!!next-input-character;
1872 redo A;
1873 } elsif ($self->{next_char} eq 0x0022) { # "
1874 !!!cp (182);
1875 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1876 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1877 !!!next-input-character;
1878 redo A;
1879 } elsif ($self->{next_char} eq 0x0027) { # '
1880 !!!cp (183);
1881 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1882 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1883 !!!next-input-character;
1884 redo A;
1885 } elsif ($self->{next_char} eq 0x003E) { # >
1886 !!!cp (184);
1887 !!!parse-error (type => 'no PUBLIC literal');
1888
1889 $self->{state} = DATA_STATE;
1890 !!!next-input-character;
1891
1892 $self->{current_token}->{quirks} = 1;
1893 !!!emit ($self->{current_token}); # DOCTYPE
1894
1895 redo A;
1896 } elsif ($self->{next_char} == -1) {
1897 !!!cp (185);
1898 !!!parse-error (type => 'unclosed DOCTYPE');
1899
1900 $self->{state} = DATA_STATE;
1901 ## reconsume
1902
1903 $self->{current_token}->{quirks} = 1;
1904 !!!emit ($self->{current_token}); # DOCTYPE
1905
1906 redo A;
1907 } else {
1908 !!!cp (186);
1909 !!!parse-error (type => 'string after PUBLIC');
1910 $self->{current_token}->{quirks} = 1;
1911
1912 $self->{state} = BOGUS_DOCTYPE_STATE;
1913 !!!next-input-character;
1914 redo A;
1915 }
1916 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1917 if ($self->{next_char} == 0x0022) { # "
1918 !!!cp (187);
1919 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1920 !!!next-input-character;
1921 redo A;
1922 } elsif ($self->{next_char} == 0x003E) { # >
1923 !!!cp (188);
1924 !!!parse-error (type => 'unclosed PUBLIC literal');
1925
1926 $self->{state} = DATA_STATE;
1927 !!!next-input-character;
1928
1929 $self->{current_token}->{quirks} = 1;
1930 !!!emit ($self->{current_token}); # DOCTYPE
1931
1932 redo A;
1933 } elsif ($self->{next_char} == -1) {
1934 !!!cp (189);
1935 !!!parse-error (type => 'unclosed PUBLIC literal');
1936
1937 $self->{state} = DATA_STATE;
1938 ## reconsume
1939
1940 $self->{current_token}->{quirks} = 1;
1941 !!!emit ($self->{current_token}); # DOCTYPE
1942
1943 redo A;
1944 } else {
1945 !!!cp (190);
1946 $self->{current_token}->{public_identifier} # DOCTYPE
1947 .= chr $self->{next_char};
1948 ## Stay in the state
1949 !!!next-input-character;
1950 redo A;
1951 }
1952 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1953 if ($self->{next_char} == 0x0027) { # '
1954 !!!cp (191);
1955 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1956 !!!next-input-character;
1957 redo A;
1958 } elsif ($self->{next_char} == 0x003E) { # >
1959 !!!cp (192);
1960 !!!parse-error (type => 'unclosed PUBLIC literal');
1961
1962 $self->{state} = DATA_STATE;
1963 !!!next-input-character;
1964
1965 $self->{current_token}->{quirks} = 1;
1966 !!!emit ($self->{current_token}); # DOCTYPE
1967
1968 redo A;
1969 } elsif ($self->{next_char} == -1) {
1970 !!!cp (193);
1971 !!!parse-error (type => 'unclosed PUBLIC literal');
1972
1973 $self->{state} = DATA_STATE;
1974 ## reconsume
1975
1976 $self->{current_token}->{quirks} = 1;
1977 !!!emit ($self->{current_token}); # DOCTYPE
1978
1979 redo A;
1980 } else {
1981 !!!cp (194);
1982 $self->{current_token}->{public_identifier} # DOCTYPE
1983 .= chr $self->{next_char};
1984 ## Stay in the state
1985 !!!next-input-character;
1986 redo A;
1987 }
1988 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1989 if ({
1990 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1991 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1992 }->{$self->{next_char}}) {
1993 !!!cp (195);
1994 ## Stay in the state
1995 !!!next-input-character;
1996 redo A;
1997 } elsif ($self->{next_char} == 0x0022) { # "
1998 !!!cp (196);
1999 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2000 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2001 !!!next-input-character;
2002 redo A;
2003 } elsif ($self->{next_char} == 0x0027) { # '
2004 !!!cp (197);
2005 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2006 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2007 !!!next-input-character;
2008 redo A;
2009 } elsif ($self->{next_char} == 0x003E) { # >
2010 !!!cp (198);
2011 $self->{state} = DATA_STATE;
2012 !!!next-input-character;
2013
2014 !!!emit ($self->{current_token}); # DOCTYPE
2015
2016 redo A;
2017 } elsif ($self->{next_char} == -1) {
2018 !!!cp (199);
2019 !!!parse-error (type => 'unclosed DOCTYPE');
2020
2021 $self->{state} = DATA_STATE;
2022 ## reconsume
2023
2024 $self->{current_token}->{quirks} = 1;
2025 !!!emit ($self->{current_token}); # DOCTYPE
2026
2027 redo A;
2028 } else {
2029 !!!cp (200);
2030 !!!parse-error (type => 'string after PUBLIC literal');
2031 $self->{current_token}->{quirks} = 1;
2032
2033 $self->{state} = BOGUS_DOCTYPE_STATE;
2034 !!!next-input-character;
2035 redo A;
2036 }
2037 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2038 if ({
2039 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2040 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2041 }->{$self->{next_char}}) {
2042 !!!cp (201);
2043 ## Stay in the state
2044 !!!next-input-character;
2045 redo A;
2046 } elsif ($self->{next_char} == 0x0022) { # "
2047 !!!cp (202);
2048 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2049 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2050 !!!next-input-character;
2051 redo A;
2052 } elsif ($self->{next_char} == 0x0027) { # '
2053 !!!cp (203);
2054 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2055 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2056 !!!next-input-character;
2057 redo A;
2058 } elsif ($self->{next_char} == 0x003E) { # >
2059 !!!cp (204);
2060 !!!parse-error (type => 'no SYSTEM literal');
2061 $self->{state} = DATA_STATE;
2062 !!!next-input-character;
2063
2064 $self->{current_token}->{quirks} = 1;
2065 !!!emit ($self->{current_token}); # DOCTYPE
2066
2067 redo A;
2068 } elsif ($self->{next_char} == -1) {
2069 !!!cp (205);
2070 !!!parse-error (type => 'unclosed DOCTYPE');
2071
2072 $self->{state} = DATA_STATE;
2073 ## reconsume
2074
2075 $self->{current_token}->{quirks} = 1;
2076 !!!emit ($self->{current_token}); # DOCTYPE
2077
2078 redo A;
2079 } else {
2080 !!!cp (206);
2081 !!!parse-error (type => 'string after SYSTEM');
2082 $self->{current_token}->{quirks} = 1;
2083
2084 $self->{state} = BOGUS_DOCTYPE_STATE;
2085 !!!next-input-character;
2086 redo A;
2087 }
2088 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2089 if ($self->{next_char} == 0x0022) { # "
2090 !!!cp (207);
2091 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2092 !!!next-input-character;
2093 redo A;
2094 } elsif ($self->{next_char} == 0x003E) { # >
2095 !!!cp (208);
2096 !!!parse-error (type => 'unclosed PUBLIC literal');
2097
2098 $self->{state} = DATA_STATE;
2099 !!!next-input-character;
2100
2101 $self->{current_token}->{quirks} = 1;
2102 !!!emit ($self->{current_token}); # DOCTYPE
2103
2104 redo A;
2105 } elsif ($self->{next_char} == -1) {
2106 !!!cp (209);
2107 !!!parse-error (type => 'unclosed SYSTEM literal');
2108
2109 $self->{state} = DATA_STATE;
2110 ## reconsume
2111
2112 $self->{current_token}->{quirks} = 1;
2113 !!!emit ($self->{current_token}); # DOCTYPE
2114
2115 redo A;
2116 } else {
2117 !!!cp (210);
2118 $self->{current_token}->{system_identifier} # DOCTYPE
2119 .= chr $self->{next_char};
2120 ## Stay in the state
2121 !!!next-input-character;
2122 redo A;
2123 }
2124 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2125 if ($self->{next_char} == 0x0027) { # '
2126 !!!cp (211);
2127 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2128 !!!next-input-character;
2129 redo A;
2130 } elsif ($self->{next_char} == 0x003E) { # >
2131 !!!cp (212);
2132 !!!parse-error (type => 'unclosed PUBLIC literal');
2133
2134 $self->{state} = DATA_STATE;
2135 !!!next-input-character;
2136
2137 $self->{current_token}->{quirks} = 1;
2138 !!!emit ($self->{current_token}); # DOCTYPE
2139
2140 redo A;
2141 } elsif ($self->{next_char} == -1) {
2142 !!!cp (213);
2143 !!!parse-error (type => 'unclosed SYSTEM literal');
2144
2145 $self->{state} = DATA_STATE;
2146 ## reconsume
2147
2148 $self->{current_token}->{quirks} = 1;
2149 !!!emit ($self->{current_token}); # DOCTYPE
2150
2151 redo A;
2152 } else {
2153 !!!cp (214);
2154 $self->{current_token}->{system_identifier} # DOCTYPE
2155 .= chr $self->{next_char};
2156 ## Stay in the state
2157 !!!next-input-character;
2158 redo A;
2159 }
2160 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2161 if ({
2162 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2163 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2164 }->{$self->{next_char}}) {
2165 !!!cp (215);
2166 ## Stay in the state
2167 !!!next-input-character;
2168 redo A;
2169 } elsif ($self->{next_char} == 0x003E) { # >
2170 !!!cp (216);
2171 $self->{state} = DATA_STATE;
2172 !!!next-input-character;
2173
2174 !!!emit ($self->{current_token}); # DOCTYPE
2175
2176 redo A;
2177 } elsif ($self->{next_char} == -1) {
2178 !!!cp (217);
2179 !!!parse-error (type => 'unclosed DOCTYPE');
2180
2181 $self->{state} = DATA_STATE;
2182 ## reconsume
2183
2184 $self->{current_token}->{quirks} = 1;
2185 !!!emit ($self->{current_token}); # DOCTYPE
2186
2187 redo A;
2188 } else {
2189 !!!cp (218);
2190 !!!parse-error (type => 'string after SYSTEM literal');
2191 #$self->{current_token}->{quirks} = 1;
2192
2193 $self->{state} = BOGUS_DOCTYPE_STATE;
2194 !!!next-input-character;
2195 redo A;
2196 }
2197 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2198 if ($self->{next_char} == 0x003E) { # >
2199 !!!cp (219);
2200 $self->{state} = DATA_STATE;
2201 !!!next-input-character;
2202
2203 !!!emit ($self->{current_token}); # DOCTYPE
2204
2205 redo A;
2206 } elsif ($self->{next_char} == -1) {
2207 !!!cp (220);
2208 !!!parse-error (type => 'unclosed DOCTYPE');
2209 $self->{state} = DATA_STATE;
2210 ## reconsume
2211
2212 !!!emit ($self->{current_token}); # DOCTYPE
2213
2214 redo A;
2215 } else {
2216 !!!cp (221);
2217 ## Stay in the state
2218 !!!next-input-character;
2219 redo A;
2220 }
2221 } else {
2222 die "$0: $self->{state}: Unknown state";
2223 }
2224 } # A
2225
2226 die "$0: _get_next_token: unexpected case";
2227 } # _get_next_token
2228
## _tokenize_attempt_to_consume_an_entity ($self, $in_attr, $additional)
##
## Tries to read a character reference, starting at the character held
## in |$self->{next_char}| (i.e. the character after the "&").
##
##   $in_attr    - If true, a named reference that matched a known name
##                 without ";" and was then followed by further name
##                 characters is returned as literal text ("&" plus the
##                 characters read) instead of being replaced.
##   $additional - Code point of one extra character that, like white
##                 space, "<", "&" and end of input (-1), means that
##                 nothing is consumed and no error is reported.
##
## Returns a CHARACTER_TOKEN hash reference (|has_reference| is set when
## the reference was actually replaced by a character), or |undef| in
## the cases where no token is produced.
2229 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2230 my ($self, $in_attr, $additional) = @_;
2231
## Position used for the parse errors reported below and recorded in
## the returned tokens.
2232 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2233
2234 if ({
2235 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2236 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
2237 $additional => 1,
2238 }->{$self->{next_char}}) {
2239 !!!cp (1001);
2240 ## Don't consume
2241 ## No error
2242 return undef;
2243 } elsif ($self->{next_char} == 0x0023) { # #
2244 !!!next-input-character;
2245 if ($self->{next_char} == 0x0078 or # x
2246 $self->{next_char} == 0x0058) { # X
## Hexadecimal reference.  |$code| stays undefined until the first
## hexadecimal digit has been read; each |redo X| handles one digit.
2247 my $code;
2248 X: {
## On the first pass this is the "x"/"X" itself, which is the value
## handed to |!!!back-next-input-character| at cp 1005 (presumably to
## push it back onto the input -- the macro is defined elsewhere).
2249 my $x_char = $self->{next_char};
2250 !!!next-input-character;
2251 if (0x0030 <= $self->{next_char} and
2252 $self->{next_char} <= 0x0039) { # 0..9
2253 !!!cp (1002);
2254 $code ||= 0;
2255 $code *= 0x10;
2256 $code += $self->{next_char} - 0x0030;
2257 redo X;
2258 } elsif (0x0061 <= $self->{next_char} and
2259 $self->{next_char} <= 0x0066) { # a..f
2260 !!!cp (1003);
2261 $code ||= 0;
2262 $code *= 0x10;
## "a" (0x61) - 0x60 + 9 == 10, ..., "f" == 15.
2263 $code += $self->{next_char} - 0x0060 + 9;
2264 redo X;
2265 } elsif (0x0041 <= $self->{next_char} and
2266 $self->{next_char} <= 0x0046) { # A..F
2267 !!!cp (1004);
2268 $code ||= 0;
2269 $code *= 0x10;
## "A" (0x41) - 0x40 + 9 == 10, ..., "F" == 15.
2270 $code += $self->{next_char} - 0x0040 + 9;
2271 redo X;
2272 } elsif (not defined $code) { # no hexadecimal digit
2273 !!!cp (1005);
2274 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
2275 !!!back-next-input-character ($x_char, $self->{next_char});
2276 $self->{next_char} = 0x0023; # #
2277 return undef;
2278 } elsif ($self->{next_char} == 0x003B) { # ;
2279 !!!cp (1006);
2280 !!!next-input-character;
2281 } else {
2282 !!!cp (1007);
2283 !!!parse-error (type => 'no refc', line => $l, column => $c);
2284 }
2285
## Replace code points that are not emitted as-is: zero and surrogates
## and values above U+10FFFF become U+FFFD, CR becomes LF, and
## 0x80..0x9F are looked up in |$c1_entity_char|.
2286 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2287 !!!cp (1008);
2288 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2289 $code = 0xFFFD;
2290 } elsif ($code > 0x10FFFF) {
2291 !!!cp (1009);
2292 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2293 $code = 0xFFFD;
2294 } elsif ($code == 0x000D) {
2295 !!!cp (1010);
2296 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2297 $code = 0x000A;
2298 } elsif (0x80 <= $code and $code <= 0x9F) {
2299 !!!cp (1011);
2300 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2301 $code = $c1_entity_char->{$code};
2302 }
2303
2304 return {type => CHARACTER_TOKEN, data => chr $code,
2305 has_reference => 1, line => $l, column => $c};
2306 } # X
2307 } elsif (0x0030 <= $self->{next_char} and
2308 $self->{next_char} <= 0x0039) { # 0..9
## Decimal reference: accumulate all consecutive decimal digits.
2309 my $code = $self->{next_char} - 0x0030;
2310 !!!next-input-character;
2311
2312 while (0x0030 <= $self->{next_char} and
2313 $self->{next_char} <= 0x0039) { # 0..9
2314 !!!cp (1012);
2315 $code *= 10;
2316 $code += $self->{next_char} - 0x0030;
2317
2318 !!!next-input-character;
2319 }
2320
2321 if ($self->{next_char} == 0x003B) { # ;
2322 !!!cp (1013);
2323 !!!next-input-character;
2324 } else {
2325 !!!cp (1014);
2326 !!!parse-error (type => 'no refc', line => $l, column => $c);
2327 }
2328
## Same replacements as in the hexadecimal branch above.
2329 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2330 !!!cp (1015);
2331 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2332 $code = 0xFFFD;
2333 } elsif ($code > 0x10FFFF) {
2334 !!!cp (1016);
2335 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2336 $code = 0xFFFD;
2337 } elsif ($code == 0x000D) {
2338 !!!cp (1017);
2339 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2340 $code = 0x000A;
2341 } elsif (0x80 <= $code and $code <= 0x9F) {
2342 !!!cp (1018);
2343 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2344 $code = $c1_entity_char->{$code};
2345 }
2346
2347 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
2348 line => $l, column => $c};
2349 } else {
## "&#" followed by neither "x"/"X" nor a decimal digit.
2350 !!!cp (1019);
2351 !!!parse-error (type => 'bare nero', line => $l, column => $c);
2352 !!!back-next-input-character ($self->{next_char});
2353 $self->{next_char} = 0x0023; # #
2354 return undef;
2355 }
2356 } elsif ((0x0041 <= $self->{next_char} and
2357 $self->{next_char} <= 0x005A) or
2358 (0x0061 <= $self->{next_char} and
2359 $self->{next_char} <= 0x007A)) {
## Named reference: the first character is an ASCII letter.
2360 my $entity_name = chr $self->{next_char};
2361 !!!next-input-character;
2362
## |$value| is the text to emit: the raw characters read so far, or the
## replacement of the last known name plus any characters read after it.
##
## |$match| records the state of the search:
##     1  - a known name ending with ";" was read (the loop stops);
##    -1  - a known name without ";" was the last thing read;
##   < -1 - such a name was followed by characters that did not form a
##          known name (each such character doubles the value);
##     0  - no known name has been seen.
2363 my $value = $entity_name;
2364 my $match = 0;
2365 require Whatpm::_NamedEntityList;
2366 our $EntityChar;
2367
2368 while (length $entity_name < 10 and
2369 ## NOTE: Some number greater than the maximum length of entity name
2370 ((0x0041 <= $self->{next_char} and # A
2371 $self->{next_char} <= 0x005A) or # Z
2372 (0x0061 <= $self->{next_char} and # a
2373 $self->{next_char} <= 0x007A) or # z
2374 (0x0030 <= $self->{next_char} and # 0
2375 $self->{next_char} <= 0x0039) or # 9
2376 $self->{next_char} == 0x003B)) { # ;
2377 $entity_name .= chr $self->{next_char};
2378 if (defined $EntityChar->{$entity_name}) {
2379 if ($self->{next_char} == 0x003B) { # ;
2380 !!!cp (1020);
2381 $value = $EntityChar->{$entity_name};
2382 $match = 1;
2383 !!!next-input-character;
2384 last;
2385 } else {
2386 !!!cp (1021);
2387 $value = $EntityChar->{$entity_name};
2388 $match = -1;
2389 !!!next-input-character;
2390 }
2391 } else {
2392 !!!cp (1022);
2393 $value .= chr $self->{next_char};
2394 $match *= 2;
2395 !!!next-input-character;
2396 }
2397 }
2398
2399 if ($match > 0) {
2400 !!!cp (1023);
2401 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2402 line => $l, column => $c};
2403 } elsif ($match < 0) {
2404 !!!parse-error (type => 'no refc', line => $l, column => $c);
2405 if ($in_attr and $match < -1) {
## In an attribute value, a known name followed by further name
## characters is left unreplaced.
2406 !!!cp (1024);
2407 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
2408 line => $l, column => $c};
2409 } else {
2410 !!!cp (1025);
2411 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2412 line => $l, column => $c};
2413 }
2414 } else {
2415 !!!cp (1026);
2416 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2417 ## NOTE: "No characters are consumed" in the spec.
## The characters have nevertheless been read here, so they are
## returned as literal text after the "&".
2418 return {type => CHARACTER_TOKEN, data => '&'.$value,
2419 line => $l, column => $c};
2420 }
2421 } else {
2422 !!!cp (1027);
2423 ## no characters are consumed
2424 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2425 return undef;
2426 }
2427 } # _tokenize_attempt_to_consume_an_entity
2428
## Prepares the document for the tree construction stage: strict error
## checking is switched off and the document is flagged as HTML.
##
## NOTE: |$self->{document}| MUST be set before this method is called.
##
## Returns whatever the |manakai_is_html| call returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  $document->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  return $document->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2437
## Ends the tree construction stage by switching the document's strict
## error checking back on.
##
## Returns whatever the |strict_error_checking| call returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events on

  return $document->strict_error_checking (1);
} # _terminate_tree_constructor
2443
2444 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2445
2446 { # tree construction stage
2447 my $token;
2448
## Entry point of the tree construction stage.  Reads the first token,
## resets the per-parse state kept on |$self|, and then runs the
## insertion modes in order: "initial", "before html", and finally the
## main loop starting at "before head".
##
## Returns whatever |_tree_construction_main| returns.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: When an interactive UA makes |$self->{document}| available
  ## to the user, and when it begins accepting user input, is not
  ## defined.

  ## NOTE: Appending a character means collecting it together with all
  ## subsequent consecutive characters and inserting a single Text node
  ## whose data is the concatenation of those characters. # MUST

  !!!next-token;

  ## Reset the parser state used during tree construction.
  $self->{open_elements} = [];
  for my $key (qw(form_element head_element inner_html_node)) {
    $self->{$key} = undef;
  }

  ## The "initial" insertion mode. # MUST
  $self->_tree_construction_initial;

  ## The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## The "before head" insertion mode and everything after it.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  return $self->_tree_construction_main;
} # _construct_tree
2477
## Implements the "initial" insertion mode.
##
## Works on the file-scoped |$token|.  Leading white space character
## tokens are dropped and comments are appended to the document.  A
## DOCTYPE token creates the document type node, selects the document's
## compatibility mode ("quirks" / "limited quirks") from the token's
## |quirks| flag, name, public identifier and system identifier, and
## then advances to the next token.  Any other token marks the document
## as "quirks" (no DOCTYPE) and is left in |$token| for reprocessing.
##
## Returns nothing meaningful; dies on an unknown token type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5', token => $token);
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared ASCII case-insensitively by
        ## upper-casing it first.
        ## NOTE: The replacement range used to be written |A-z|; that
        ## produced the same mapping only because the surplus
        ## replacement characters are ignored by |tr|.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }

      ## The system identifier is checked independently of the public
      ## identifier and can override the mode selected above.
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is stripped from the token in place.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
2679
## Implements the "before html" insertion mode.
##
## Works on the file-scoped |$token|.  DOCTYPE tokens are reported and
## ignored, comments are appended to the document, and leading white
## space in character tokens is dropped.  An |html| start tag creates
## the root element from that tag; any other token creates an implied
## |html| element and is left in |$token| for reprocessing.
##
## In both cases the root element is appended to the document and
## pushed onto |$self->{open_elements}|, and the code reference in
## |$self->{application_cache_selection}| is invoked exactly once: with
## the value of the |manifest| attribute if the start tag has one, and
## with |undef| otherwise.
##
## Returns nothing meaningful; dies on an unknown token type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is stripped from the token in place.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is NOT invoked
      ## here; the common code below invokes it once, after the implied
      ## |html| element has been created.  (It used to be invoked in
      ## both places, i.e. twice for a character token.)

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}}, [$root_element, 'html'];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: No relative reference resolution?
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Anything else: create the implied |html| root element.
    my $root_element; !!!create-element ($root_element, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
2766
## Sets |$self->{insertion_mode}| from the stack of open elements
## ("reset the insertion mode appropriately").
##
## Walks |$self->{open_elements}| from the last entry towards the
## first.  Each entry is an array reference of the form
## |[node, element name]|.  When the first entry is reached and
## |$self->{inner_html_node}| is defined, that entry is used in place of
## the stack entry unless its name is |td| or |th|.  The first name
## found in the mode table selects the insertion mode; |html| selects
## "before head" or "after head" depending on whether
## |$self->{head_element}| is defined; if the first entry is reached
## without a decision the mode becomes "in body".
##
## Returns nothing meaningful.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          !!!cp ('t27');
          #
        } else {
          !!!cp ('t28');
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    my $new_mode = {
      select => IN_SELECT_IM,
      ## NOTE: |option| and |optgroup| do not set
      ## insertion mode to "in select" by themselves.
      td => IN_CELL_IM,
      th => IN_CELL_IM,
      tr => IN_ROW_IM,
      tbody => IN_TABLE_BODY_IM,
      thead => IN_TABLE_BODY_IM,
      tfoot => IN_TABLE_BODY_IM,
      caption => IN_CAPTION_IM,
      colgroup => IN_COLUMN_GROUP_IM,
      table => IN_TABLE_IM,
      head => IN_BODY_IM, # not in head!
      body => IN_BODY_IM,
      frameset => IN_FRAMESET_IM,
    }->{$node->[1]};
    ## NOTE: Assignment and |return| are separate statements so that the
    ## sub returns even if the mode constant is a false value; the
    ## former |... = $new_mode and return| returned only when the
    ## assigned value was true.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
2841
2842 sub _tree_construction_main ($) {
2843 my $self = shift;
2844
2845 my $active_formatting_elements = [];
2846
2847 my $reconstruct_active_formatting_elements = sub { # MUST
2848 my $insert = shift;
2849
2850 ## Step 1
2851 return unless @$active_formatting_elements;
2852
2853 ## Step 3
2854 my $i = -1;
2855 my $entry = $active_formatting_elements->[$i];
2856
2857 ## Step 2
2858 return if $entry->[0] eq '#marker';
2859 for (@{$self->{open_elements}}) {
2860 if ($entry->[0] eq $_->[0]) {
2861 !!!cp ('t32');
2862 return;
2863 }
2864 }
2865
2866 S4: {
2867 ## Step 4
2868 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2869
2870 ## Step 5
2871 $i--;
2872 $entry = $active_formatting_elements->[$i];
2873
2874 ## Step 6
2875 if ($entry->[0] eq '#marker') {
2876 !!!cp ('t33_1');
2877 #
2878 } else {
2879 my $in_open_elements;
2880 OE: for (@{$self->{open_elements}}) {
2881 if ($entry->[0] eq $_->[0]) {
2882 !!!cp ('t33');
2883 $in_open_elements = 1;
2884 last OE;
2885 }
2886 }
2887 if ($in_open_elements) {
2888 !!!cp ('t34');
2889 #
2890 } else {
2891 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
2892 !!!cp ('t35');
2893 redo S4;
2894 }
2895 }
2896
2897 ## Step 7
2898 $i++;
2899 $entry = $active_formatting_elements->[$i];
2900 } # S4
2901
2902 S7: {
2903 ## Step 8
2904 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2905
2906 ## Step 9
2907 $insert->($clone->[0]);
2908 push @{$self->{open_elements}}, $clone;
2909
2910 ## Step 10
2911 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2912
2913 ## Step 11
2914 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2915 !!!cp ('t36');
2916 ## Step 7'
2917 $i++;
2918 $entry = $active_formatting_elements->[$i];
2919
2920 redo S7;
2921 }
2922
2923 !!!cp ('t37');
2924 } # S7
2925 }; # $reconstruct_active_formatting_elements
2926
2927 my $clear_up_to_marker = sub {
2928 for (reverse 0..$#$active_formatting_elements) {
2929 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
2930 !!!cp ('t38');
2931 splice @$active_formatting_elements, $_;
2932 return;
2933 }
2934 }
2935
2936 !!!cp ('t39');
2937 }; # $clear_up_to_marker
2938
2939 my $insert;
2940
2941 my $parse_rcdata = sub ($) {
2942 my ($content_model_flag) = @_;
2943
2944 ## Step 1
2945 my $start_tag_name = $token->{tag_name};
2946 my $el;
2947 !!!create-element ($el, $start_tag_name, $token->{attributes}, $token);
2948
2949 ## Step 2
2950 $insert->($el);
2951
2952 ## Step 3
2953 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2954 delete $self->{escape}; # MUST
2955
2956 ## Step 4
2957 my $text = '';
2958 !!!next-token;
2959 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2960 !!!cp ('t40');
2961 $text .= $token->{data};
2962 !!!next-token;
2963 }
2964
2965 ## Step 5
2966 if (length $text) {
2967 !!!cp ('t41');
2968 my $text = $self->{document}->create_text_node ($text);
2969 $el->append_child ($text);
2970 }
2971
2972 ## Step 6
2973 $self->{content_model} = PCDATA_CONTENT_MODEL;
2974
2975 ## Step 7
2976 if ($token->{type} == END_TAG_TOKEN and
2977 $token->{tag_name} eq $start_tag_name) {
2978 !!!cp ('t42');
2979 ## Ignore the token
2980 } else {
2981 ## NOTE: An end-of-file token.
2982 if ($content_model_flag == CDATA_CONTENT_MODEL) {
2983 !!!cp ('t43');
2984 !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
2985 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2986 !!!cp ('t44');
2987 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
2988 } else {
2989 die "$0: $content_model_flag in parse_rcdata";
2990 }
2991 }
2992 !!!next-token;
2993 }; # $parse_rcdata
2994
## $script_start_tag->()
##
## Handles a <script> start tag held in the closure variable $token.
##
##   - Creates the script element from the token's attributes.
##   - Sets $self->{content_model} to CDATA_CONTENT_MODEL, concatenates
##     the data of the following run of character tokens and, if the
##     result is not empty, appends it to the element with
##     manakai_append_text.
##   - Resets $self->{content_model} to PCDATA_CONTENT_MODEL and reports
##     a parse error unless the token that ended the run is a </script>
##     end tag.
##   - Hands the element to $insert only when $self->{inner_html_node}
##     is not defined; otherwise this closure does not insert it.
##   - Advances to the next token before returning.
##
## The TODO notes mark script-related bookkeeping ("parser-inserted",
## "already executed", insertion point) that is not implemented here.
2995 my $script_start_tag = sub () {
2996 my $script_el;
2997 !!!create-element ($script_el, 'script', $token->{attributes}, $token);
2998 ## TODO: mark as "parser-inserted"
2999
3000 $self->{content_model} = CDATA_CONTENT_MODEL;
3001 delete $self->{escape}; # MUST
3002
3003 my $text = ''; # accumulated character data of the script
3004 !!!next-token;
3005 while ($token->{type} == CHARACTER_TOKEN) {
3006 !!!cp ('t45');
3007 $text .= $token->{data};
3008 !!!next-token;
3009 } # stop if non-character token or tokenizer stops tokenising
3010 if (length $text) {
3011 !!!cp ('t46');
3012 $script_el->manakai_append_text ($text);
3013 }
3014
3015 $self->{content_model} = PCDATA_CONTENT_MODEL;
3016
3017 if ($token->{type} == END_TAG_TOKEN and
3018 $token->{tag_name} eq 'script') {
3019 !!!cp ('t47');
3020 ## Ignore the token
3021 } else {
3022 !!!cp ('t48');
3023 !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
3024 ## ISSUE: And ignore?
3025 ## TODO: mark as "already executed"
3026 }
3027
3028 if (defined $self->{inner_html_node}) {
3029 !!!cp ('t49');
3030 ## TODO: mark as "already executed"
3031 } else {
3032 !!!cp ('t50');
3033 ## TODO: $old_insertion_point = current insertion point
3034 ## TODO: insertion point = just before the next input character
3035
3036 $insert->($script_el);
3037
3038 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3039
3040 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3041 }
3042
3043 !!!next-token;
3044 }; # $script_start_tag
3045
3046 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3047 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
## The list starts with a single entry that holds the node of the first
## entry on the stack of open elements and has no "tainted" slot yet.
3048 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3049
## $formatting_end_tag->($end_tag_token)
##
## Runs the "adoption agency algorithm" (AAA) for the end tag of a
## formatting element.  $end_tag_token is the end tag token; its
## {tag_name} selects the entry of @$active_formatting_elements that is
## to be closed.  Entries of that list and of @{$self->{open_elements}}
## are array references whose [0] is the node (or the string '#marker'
## in the active list) and whose [1] is the tag name.
##
## Returns nothing.  The steps are repeated (redo FET) until one of the
## exits in Step 1 or Step 3 is taken; every exit advances to the next
## token first.  Parse errors are reported against $end_tag_token.
3050 my $formatting_end_tag = sub {
3051 my $end_tag_token = shift;
3052 my $tag_name = $end_tag_token->{tag_name};
3053
3054 ## NOTE: The adoption agency algorithm (AAA).
3055
3056 FET: {
3057 ## Step 1: find the formatting element, searching the list of active
## formatting elements from its end back to the last marker
3058 my $formatting_element;
3059 my $formatting_element_i_in_active;
3060 AFE: for (reverse 0..$#$active_formatting_elements) {
3061 if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
3062 !!!cp ('t51');
3063 $formatting_element = $active_formatting_elements->[$_];
3064 $formatting_element_i_in_active = $_;
3065 last AFE;
3066 } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
3067 !!!cp ('t52');
3068 last AFE;
3069 }
3070 } # AFE
3071 unless (defined $formatting_element) {
3072 !!!cp ('t53');
3073 !!!parse-error (type => 'unmatched end tag:'.$tag_name, token => $end_tag_token);
3074 ## Ignore the token
3075 !!!next-token;
3076 return;
3077 }
3078 ## has an element in scope
3079 my $in_scope = 1;
3080 my $formatting_element_i_in_open;
3081 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3082 my $node = $self->{open_elements}->[$_];
3083 if ($node->[0] eq $formatting_element->[0]) {
3084 if ($in_scope) {
3085 !!!cp ('t54');
3086 $formatting_element_i_in_open = $_;
3087 last INSCOPE;
3088 } else { # in open elements but not in scope
3089 !!!cp ('t55');
3090 !!!parse-error (type => 'unmatched end tag:'.$tag_name,
3091 token => $end_tag_token);
3092 ## Ignore the token
3093 !!!next-token;
3094 return;
3095 }
3096 } elsif ({
3097 applet => 1, table => 1, caption => 1, td => 1, th => 1,
3098 button => 1, marquee => 1, object => 1, html => 1,
3099 }->{$node->[1]}) {
3100 !!!cp ('t56');
3101 $in_scope = 0;
3102 }
3103 } # INSCOPE
3104 unless (defined $formatting_element_i_in_open) {
3105 !!!cp ('t57');
3106 !!!parse-error (type => 'unmatched end tag:'.$tag_name,
3107 token => $end_tag_token);
## Remove the formatting element itself; it is not necessarily the last
## entry of the list (other entries may follow it).
3108 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
3109 !!!next-token; ## TODO: ok?
3110 return;
3111 }
3112 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
3113 !!!cp ('t58');
3114 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1],
3115 token => $end_tag_token);
3116 }
3117
3118 ## Step 2: find the furthest block.  The stack is walked from the
## current node up to the formatting element and every match overwrites
## the previous one, so the match nearest to the formatting element wins.
3119 my $furthest_block;
3120 my $furthest_block_i_in_open;
3121 OE: for (reverse 0..$#{$self->{open_elements}}) {
3122 my $node = $self->{open_elements}->[$_];
3123 if (not $formatting_category->{$node->[1]} and
3124 #not $phrasing_category->{$node->[1]} and
3125 ($special_category->{$node->[1]} or
3126 $scoping_category->{$node->[1]})) { ## Scoping is redundant, maybe
3127 !!!cp ('t59');
3128 $furthest_block = $node;
3129 $furthest_block_i_in_open = $_;
3130 } elsif ($node->[0] eq $formatting_element->[0]) {
3131 !!!cp ('t60');
3132 last OE;
3133 }
3134 } # OE
3135
3136 ## Step 3: no furthest block - pop the stack up to and including the
## formatting element, drop it from the active list, and finish
3137 unless (defined $furthest_block) { # MUST
3138 !!!cp ('t61');
3139 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
3140 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
3141 !!!next-token;
3142 return;
3143 }
3144
3145 ## Step 4: the common ancestor is the stack entry just before the
## formatting element
3146 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
3147
3148 ## Step 5: detach the furthest block from its parent, if it has one
3149 my $furthest_block_parent = $furthest_block->[0]->parent_node;
3150 if (defined $furthest_block_parent) {
3151 !!!cp ('t62');
3152 $furthest_block_parent->remove_child ($furthest_block->[0]);
3153 }
3154
3155 ## Step 6: bookmark = the active-list entry preceding the formatting
## element.
## NOTE(review): when the formatting element is the first entry, the
## index -1 selects the last entry of the list - confirm this is intended.
3156 my $bookmark_prev_el
3157 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
3158 ->[0];
3159
3160 ## Step 7: walk up the stack from the furthest block towards the
## formatting element, re-parenting as we go
3161 my $node = $furthest_block;
3162 my $node_i_in_open = $furthest_block_i_in_open;
3163 my $last_node = $furthest_block;
3164 S7: {
3165 ## Step 1
3166 $node_i_in_open--;
3167 $node = $self->{open_elements}->[$node_i_in_open];
3168
3169 ## Step 2: a node that is not in the active list is removed from the
## stack and the walk restarts with the entry before it
3170 my $node_i_in_active;
3171 S7S2: {
3172 for (reverse 0..$#$active_formatting_elements) {
3173 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
3174 !!!cp ('t63');
3175 $node_i_in_active = $_;
3176 last S7S2;
3177 }
3178 }
3179 splice @{$self->{open_elements}}, $node_i_in_open, 1;
3180 redo S7;
3181 } # S7S2
3182
3183 ## Step 3: stop once the formatting element is reached
3184 last S7 if $node->[0] eq $formatting_element->[0];
3185
3186 ## Step 4: on the first pass, move the bookmark to this node
3187 if ($last_node->[0] eq $furthest_block->[0]) {
3188 !!!cp ('t64');
3189 $bookmark_prev_el = $node->[0];
3190 }
3191
3192 ## Step 5: a node that already has children is replaced, in both
## lists, by a clone made with clone_node (0)
3193 if ($node->[0]->has_child_nodes ()) {
3194 !!!cp ('t65');
3195 my $clone = [$node->[0]->clone_node (0), $node->[1]];
3196 $active_formatting_elements->[$node_i_in_active] = $clone;
3197 $self->{open_elements}->[$node_i_in_open] = $clone;
3198 $node = $clone;
3199 }
3200
3201 ## Step 6
3202 $node->[0]->append_child ($last_node->[0]);
3203
3204 ## Step 7
3205 $last_node = $node;
3206
3207 ## Step 8
3208 redo S7;
3209 } # S7
3210
3211 ## Step 8: attach the last node to the common ancestor; when the
## common ancestor is a table, tbody, tfoot, thead or tr element the
## node is foster-parented relative to the innermost open table instead
3212 if ({
3213 table => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1,
3214 }->{$common_ancestor_node->[1]}) {
3215 my $foster_parent_element;
3216 my $next_sibling;
3217 OE: for (reverse 0..$#{$self->{open_elements}}) {
3218 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3219 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3220 if (defined $parent and $parent->node_type == 1) {
3221 !!!cp ('t65.1');
3222 $foster_parent_element = $parent;
3223 $next_sibling = $self->{open_elements}->[$_]->[0];
3224 } else {
3225 !!!cp ('t65.2');
3226 $foster_parent_element
3227 = $self->{open_elements}->[$_ - 1]->[0];
3228 }
3229 last OE;
3230 }
3231 } # OE
3232 $foster_parent_element = $self->{open_elements}->[0]->[0]
3233 unless defined $foster_parent_element;
3234 $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
3235 $open_tables->[-1]->[1] = 1; # tainted
3236 } else {
3237 !!!cp ('t65.3');
3238 $common_ancestor_node->[0]->append_child ($last_node->[0]);
3239 }
3240
3241 ## Step 9: clone the formatting element
3242 my $clone = [$formatting_element->[0]->clone_node (0),
3243 $formatting_element->[1]];
3244
3245 ## Step 10: move the furthest block's children into the clone (the
## child list is copied first because appending changes it)
3246 my @cn = @{$furthest_block->[0]->child_nodes};
3247 $clone->[0]->append_child ($_) for @cn;
3248
3249 ## Step 11
3250 $furthest_block->[0]->append_child ($clone->[0]);
3251
3252 ## Step 12: in the active list, drop the formatting element and put
## the clone right after the bookmark entry
3253 my $i;
3254 AFE: for (reverse 0..$#$active_formatting_elements) {
3255 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
3256 !!!cp ('t66');
3257 splice @$active_formatting_elements, $_, 1;
3258 $i-- and last AFE if defined $i; # bookmark moved one slot down
3259 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
3260 !!!cp ('t67');
3261 $i = $_;
3262 }
3263 } # AFE
3264 splice @$active_formatting_elements, $i + 1, 0, $clone;
3265
3266 ## Step 13: on the stack, drop the formatting element and put the
## clone right after the furthest block
3267 undef $i;
3268 OE: for (reverse 0..$#{$self->{open_elements}}) {
3269 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
3270 !!!cp ('t68');
3271 splice @{$self->{open_elements}}, $_, 1;
3272 $i-- and last OE if defined $i; # furthest block moved one slot down
3273 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
3274 !!!cp ('t69');
3275 $i = $_;
3276 }
3277 } # OE
## NOTE(review): LENGTH is 1 here but 0 in Step 12, so an entry that
## follows the furthest block on the stack is replaced by the clone
## rather than kept after it - confirm whether an insertion was meant.
3278 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
3279
3280 ## Step 14: start over with the same end tag
3281 redo FET;
3282 } # FET
3283 }; # $formatting_end_tag
3284
## $insert_to_current->($child)
##
## Appends $child to the current node, i.e. the node of the last entry
## on the stack of open elements, and returns what append_child returns.
## The same code reference becomes the initial value of $insert.
3285 $insert = my $insert_to_current = sub {
3286 my $current_node = $self->{open_elements}->[-1]->[0];
return $current_node->append_child ($_[0]);
3287 }; # $insert_to_current
3288
## $insert_to_foster->($child)
##
## Inserts $child with foster parenting.  When the current node (last
## entry on the stack of open elements) is a table, tbody, tfoot, thead
## or tr element, $child is not appended to it.  Instead the stack is
## searched from its end for the innermost 'table' entry and:
##   - if that table has a parent whose node_type is 1 (ELEMENT_NODE in
##     DOM), $child is inserted into the parent, before the table;
##   - otherwise $child goes into the node of the stack entry just
##     before the table;
##   - if no table entry is found, $child goes into the node of the
##     first stack entry.
## In the last two cases insert_before is called with an undefined
## reference node (presumably an append, as with DOM insertBefore -
## confirm).  The innermost $open_tables entry is then flagged as
## tainted.  For any other current node $child is appended to it.
3289 my $insert_to_foster = sub {
3290 my $child = shift;
3291 if ({
3292 table => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1,
3293 }->{$self->{open_elements}->[-1]->[1]}) {
3294 # MUST
3295 my $foster_parent_element;
3296 my $next_sibling; # stays undefined unless the table has an element parent
3297 OE: for (reverse 0..$#{$self->{open_elements}}) {
3298 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3299 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3300 if (defined $parent and $parent->node_type == 1) {
3301 !!!cp ('t70');
3302 $foster_parent_element = $parent;
3303 $next_sibling = $self->{open_elements}->[$_]->[0];
3304 } else {
3305 !!!cp ('t71');
3306 $foster_parent_element
3307 = $self->{open_elements}->[$_ - 1]->[0];
3308 }
3309 last OE; # only the innermost table is considered
3310 }
3311 } # OE
3312 $foster_parent_element = $self->{open_elements}->[0]->[0]
3313 unless defined $foster_parent_element;
3314 $foster_parent_element->insert_before
3315 ($child, $next_sibling);
3316 $open_tables->[-1]->[1] = 1; # tainted
3317 } else {
3318 !!!cp ('t72');
3319 $self->{open_elements}->[-1]->[0]->append_child ($child);
3320 }
3321 }; # $insert_to_foster
3322
3323 B: {
3324 if ($token->{type} == DOCTYPE_TOKEN) {
3325 !!!cp ('t73');
3326 !!!parse-error (type => 'DOCTYPE in the middle', token => $token);
3327 ## Ignore the token
3328 ## Stay in the phase
3329 !!!next-token;
3330 redo B;
3331 } elsif ($token->{type} == START_TAG_TOKEN and
3332 $token->{tag_name} eq 'html') {
3333 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3334 !!!cp ('t79');
3335 !!!parse-error (type => 'after html:html', token => $token);
3336 $self->{insertion_mode} = AFTER_BODY_IM;
3337 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3338 !!!cp ('t80');
3339 !!!parse-error (type => 'after html:html', token => $token);
3340 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3341 } else {
3342 !!!cp ('t81');
3343 }
3344
3345 !!!cp ('t82');
3346 !!!parse-error (type => 'not first start tag', token => $token);
3347 my $top_el = $self->{open_elements}->[0]->[0];
3348 for my $attr_name (keys %{$token->{attributes}}) {
3349 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3350 !!!cp ('t84');
3351 $top_el->set_attribute_ns
3352 (undef, [undef, $attr_name],
3353 $token->{attributes}->{$attr_name}->{value});
3354 }
3355 }
3356 !!!next-token;
3357 redo B;
3358 } elsif ($token->{type} == COMMENT_TOKEN) {
3359 my $comment = $self->{document}->create_comment ($token->{data});
3360 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3361 !!!cp ('t85');
3362 $self->{document}->append_child ($comment);
3363 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3364 !!!cp ('t86');
3365 $self->{open_elements}->[0]->[0]->append_child ($comment);
3366 } else {
3367 !!!cp ('t87');
3368 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3369 }
3370 !!!next-token;
3371 redo B;
3372 } elsif ($self->{insertion_mode} & HEAD_IMS) {
3373 if ($token->{type} == CHARACTER_TOKEN) {
3374 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3375 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3376 !!!cp ('t88.2');
3377 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3378 } else {
3379 !!!cp ('t88.1');
3380 ## Ignore the token.
3381 !!!next-token;
3382 redo B;
3383 }
3384 unless (length $token->{data}) {
3385 !!!cp ('t88');
3386 !!!next-token;
3387 redo B;
3388 }
3389 }
3390
3391 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3392 !!!cp ('t89');
3393 ## As if <head>
3394 !!!create-element ($self->{head_element}, 'head',, $token);
3395 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3396 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3397
3398 ## Reprocess in the "in head" insertion mode...
3399 pop @{$self->{open_elements}};
3400
3401 ## Reprocess in the "after head" insertion mode...
3402 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3403 !!!cp ('t90');
3404 ## As if </noscript>
3405 pop @{$self->{open_elements}};
3406 !!!parse-error (type => 'in noscript:#character', token => $token);
3407
3408 ## Reprocess in the "in head" insertion mode...
3409 ## As if </head>
3410 pop @{$self->{open_elements}};
3411
3412 ## Reprocess in the "after head" insertion mode...
3413 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3414 !!!cp ('t91');
3415 pop @{$self->{open_elements}};
3416
3417 ## Reprocess in the "after head" insertion mode...
3418 } else {
3419 !!!cp ('t92');
3420 }
3421
3422 ## "after head" insertion mode
3423 ## As if <body>
3424 !!!insert-element ('body',, $token);
3425 $self->{insertion_mode} = IN_BODY_IM;
3426 ## reprocess
3427 redo B;
3428 } elsif ($token->{type} == START_TAG_TOKEN) {
3429 if ($token->{tag_name} eq 'head') {
3430 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3431 !!!cp ('t93');
3432 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes}, $token);
3433 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3434 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
3435 $self->{insertion_mode} = IN_HEAD_IM;
3436 !!!next-token;
3437 redo B;
3438 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3439 !!!cp ('t94');
3440 #
3441 } else {
3442 !!!cp ('t95');
3443 !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
3444 ## Ignore the token
3445 !!!next-token;
3446 redo B;
3447 }
3448 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3449 !!!cp ('t96');
3450 ## As if <head>
3451 !!!create-element ($self->{head_element}, 'head',, $token);
3452 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3453 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3454
3455 $self->{insertion_mode} = IN_HEAD_IM;
3456 ## Reprocess in the "in head" insertion mode...
3457 } else {
3458 !!!cp ('t97');
3459 }
3460
3461 if ($token->{tag_name} eq 'base') {
3462 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3463 !!!cp ('t98');
3464 ## As if </noscript>
3465 pop @{$self->{open_elements}};
3466 !!!parse-error (type => 'in noscript:base', token => $token);
3467
3468 $self->{insertion_mode} = IN_HEAD_IM;
3469 ## Reprocess in the "in head" insertion mode...
3470 } else {
3471 !!!cp ('t99');
3472 }
3473
3474 ## NOTE: There is a "as if in head" code clone.
3475 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3476 !!!cp ('t100');
3477 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3478 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3479 } else {
3480 !!!cp ('t101');
3481 }
3482 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3483 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3484 pop @{$self->{open_elements}} # <head>
3485 if $self->{insertion_mode} == AFTER_HEAD_IM;
3486 !!!next-token;
3487 redo B;
3488 } elsif ($token->{tag_name} eq 'link') {
3489 ## NOTE: There is a "as if in head" code clone.
3490 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3491 !!!cp ('t102');
3492 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3493 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3494 } else {
3495 !!!cp ('t103');
3496 }
3497 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3498 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3499 pop @{$self->{open_elements}} # <head>
3500 if $self->{insertion_mode} == AFTER_HEAD_IM;
3501 !!!next-token;
3502 redo B;
3503 } elsif ($token->{tag_name} eq 'meta') {
3504 ## NOTE: There is a "as if in head" code clone.
3505 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3506 !!!cp ('t104');
3507 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3508 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3509 } else {
3510 !!!cp ('t105');
3511 }
3512 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3513 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3514
3515 unless ($self->{confident}) {
3516 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3517 !!!cp ('t106');
3518 $self->{change_encoding}
3519 ->($self, $token->{attributes}->{charset}->{value},
3520 $token);
3521
3522 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3523 ->set_user_data (manakai_has_reference =>
3524 $token->{attributes}->{charset}
3525 ->{has_reference});
3526 } elsif ($token->{attributes}->{content}) {
3527 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3528 if ($token->{attributes}->{content}->{value}
3529 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3530 [\x09-\x0D\x20]*=
3531 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3532 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3533 !!!cp ('t107');
3534 $self->{change_encoding}
3535 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
3536 $token);
3537 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3538 ->set_user_data (manakai_has_reference =>
3539 $token->{attributes}->{content}
3540 ->{has_reference});
3541 } else {
3542 !!!cp ('t108');
3543 }
3544 }
3545 } else {
3546 if ($token->{attributes}->{charset}) {
3547 !!!cp ('t109');
3548 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3549 ->set_user_data (manakai_has_reference =>
3550 $token->{attributes}->{charset}
3551 ->{has_reference});
3552 }
3553 if ($token->{attributes}->{content}) {
3554 !!!cp ('t110');
3555 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3556 ->set_user_data (manakai_has_reference =>
3557 $token->{attributes}->{content}
3558 ->{has_reference});
3559 }
3560 }
3561
3562 pop @{$self->{open_elements}} # <head>
3563 if $self->{insertion_mode} == AFTER_HEAD_IM;
3564 !!!next-token;
3565 redo B;
3566 } elsif ($token->{tag_name} eq 'title') {
3567 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3568 !!!cp ('t111');
3569 ## As if </noscript>
3570 pop @{$self->{open_elements}};
3571 !!!parse-error (type => 'in noscript:title', token => $token);
3572
3573 $self->{insertion_mode} = IN_HEAD_IM;
3574 ## Reprocess in the "in head" insertion mode...
3575 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3576 !!!cp ('t112');
3577 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3578 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3579 } else {
3580 !!!cp ('t113');
3581 }
3582
3583 ## NOTE: There is a "as if in head" code clone.
3584 my $parent = defined $self->{head_element} ? $self->{head_element}
3585 : $self->{open_elements}->[-1]->[0];
3586 $parse_rcdata->(RCDATA_CONTENT_MODEL);
3587 pop @{$self->{open_elements}} # <head>
3588 if $self->{insertion_mode} == AFTER_HEAD_IM;
3589 redo B;
3590 } elsif ($token->{tag_name} eq 'style') {
3591 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3592 ## insertion mode IN_HEAD_IM)
3593 ## NOTE: There is a "as if in head" code clone.
3594 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3595 !!!cp ('t114');
3596 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3597 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3598 } else {
3599 !!!cp ('t115');
3600 }
3601 $parse_rcdata->(CDATA_CONTENT_MODEL);
3602 pop @{$self->{open_elements}} # <head>
3603 if $self->{insertion_mode} == AFTER_HEAD_IM;
3604 redo B;
3605 } elsif ($token->{tag_name} eq 'noscript') {
3606 if ($self->{insertion_mode} == IN_HEAD_IM) {
3607 !!!cp ('t116');
3608 ## NOTE: and scripting is disalbed
3609 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3610 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
3611 !!!next-token;
3612 redo B;
3613 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3614 !!!cp ('t117');
3615 !!!parse-error (type => 'in noscript:noscript', token => $token);
3616 ## Ignore the token
3617 !!!next-token;
3618 redo B;
3619 } else {
3620 !!!cp ('t118');
3621 #
3622 }
3623 } elsif ($token->{tag_name} eq 'script') {
3624 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3625 !!!cp ('t119');
3626 ## As if </noscript>
3627 pop @{$self->{open_elements}};
3628 !!!parse-error (type => 'in noscript:script', token => $token);
3629
3630 $self->{insertion_mode} = IN_HEAD_IM;
3631 ## Reprocess in the "in head" insertion mode...
3632 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3633 !!!cp ('t120');
3634 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3635 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3636 } else {
3637 !!!cp ('t121');
3638 }
3639
3640 ## NOTE: There is a "as if in head" code clone.
3641 $script_start_tag->();
3642 pop @{$self->{open_elements}} # <head>
3643 if $self->{insertion_mode} == AFTER_HEAD_IM;
3644 redo B;
3645 } elsif ($token->{tag_name} eq 'body' or
3646 $token->{tag_name} eq 'frameset') {
3647 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3648 !!!cp ('t122');
3649 ## As if </noscript>
3650 pop @{$self->{open_elements}};
3651 !!!parse-error (type => 'in noscript:'.$token->{tag_name}, token => $token);
3652
3653 ## Reprocess in the "in head" insertion mode...
3654 ## As if </head>
3655 pop @{$self->{open_elements}};
3656
3657 ## Reprocess in the "after head" insertion mode...
3658 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3659 !!!cp ('t124');
3660 pop @{$self->{open_elements}};
3661
3662 ## Reprocess in the "after head" insertion mode...
3663 } else {
3664 !!!cp ('t125');
3665 }
3666
3667 ## "after head" insertion mode
3668 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
## Select the insertion mode that follows the inserted element.  The
## enclosing branch is entered only for 'body' and 'frameset' start
## tags, so the final else is a sanity check; its message reports the
## tag name of the token that was tested above.
3669 if ($token->{tag_name} eq 'body') {
3670 !!!cp ('t126');
3671 $self->{insertion_mode} = IN_BODY_IM;
3672 } elsif ($token->{tag_name} eq 'frameset') {
3673 !!!cp ('t127');
3674 $self->{insertion_mode} = IN_FRAMESET_IM;
3675 } else {
3676 die "$0: tag name: $token->{tag_name}";
3677 }
3678 !!!next-token;
3679 redo B;
3680 } else {
3681 !!!cp ('t128');
3682 #
3683 }
3684
3685 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3686 !!!cp ('t129');
3687 ## As if </noscript>
3688 pop @{$self->{open_elements}};
3689 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
3690
3691 ## Reprocess in the "in head" insertion mode...
3692 ## As if </head>
3693 pop @{$self->{open_elements}};
3694
3695 ## Reprocess in the "after head" insertion mode...
3696 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3697 !!!cp ('t130');
3698 ## As if </head>
3699 pop @{$self->{open_elements}};
3700
3701 ## Reprocess in the "after head" insertion mode...
3702 } else {
3703 !!!cp ('t131');
3704 }
3705
3706 ## "after head" insertion mode
3707 ## As if <body>
3708 !!!insert-element ('body',, $token);
3709 $self->{insertion_mode} = IN_BODY_IM;
3710 ## reprocess
3711 redo B;
3712 } elsif ($token->{type} == END_TAG_TOKEN) {
3713 if ($token->{tag_name} eq 'head') {
3714 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3715 !!!cp ('t132');
3716 ## As if <head>
3717 !!!create-element ($self->{head_element}, 'head',, $token);
3718 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3719 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3720
3721 ## Reprocess in the "in head" insertion mode...
3722 pop @{$self->{open_elements}};
3723 $self->{insertion_mode} = AFTER_HEAD_IM;
3724 !!!next-token;
3725 redo B;
3726 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3727 !!!cp ('t133');
3728 ## As if </noscript>
3729 pop @{$self->{open_elements}};
3730 !!!parse-error (type => 'in noscript:/head', token => $token);
3731
3732 ## Reprocess in the "in head" insertion mode...
3733 pop @{$self->{open_elements}};
3734 $self->{insertion_mode} = AFTER_HEAD_IM;
3735 !!!next-token;
3736 redo B;
3737 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3738 !!!cp ('t134');
3739 pop @{$self->{open_elements}};
3740 $self->{insertion_mode} = AFTER_HEAD_IM;
3741 !!!next-token;
3742 redo B;
3743 } else {
3744 !!!cp ('t135');
3745 #
3746 }
3747 } elsif ($token->{tag_name} eq 'noscript') {
3748 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3749 !!!cp ('t136');
3750 pop @{$self->{open_elements}};
3751 $self->{insertion_mode} = IN_HEAD_IM;
3752 !!!next-token;
3753 redo B;
3754 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3755 !!!cp ('t137');
3756 !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
3757 ## Ignore the token ## ISSUE: An issue in the spec.
3758 !!!next-token;
3759 redo B;
3760 } else {
3761 !!!cp ('t138');
3762 #
3763 }
3764 } elsif ({
3765 body => 1, html => 1,
3766 }->{$token->{tag_name}}) {
3767 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3768 !!!cp ('t139');
3769 ## As if <head>
3770 !!!create-element ($self->{head_element}, 'head',, $token);
3771 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3772 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3773
3774 $self->{insertion_mode} = IN_HEAD_IM;
3775 ## Reprocess in the "in head" insertion mode...
3776 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3777 !!!cp ('t140');
3778 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
3779 ## Ignore the token
3780 !!!next-token;
3781 redo B;
3782 } else {
3783 !!!cp ('t141');
3784 }
3785
3786 #
3787 } elsif ({
3788 p => 1, br => 1,
3789 }->{$token->{tag_name}}) {
3790 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3791 !!!cp ('t142');
3792 ## As if <head>
3793 !!!create-element ($self->{head_element}, 'head',, $token);
3794 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3795 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3796
3797 $self->{insertion_mode} = IN_HEAD_IM;
3798 ## Reprocess in the "in head" insertion mode...
3799 } else {
3800 !!!cp ('t143');
3801 }
3802
3803 #
3804 } else {
3805 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3806 !!!cp ('t144');
3807 #
3808 } else {
3809 !!!cp ('t145');
3810 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
3811 ## Ignore the token
3812 !!!next-token;
3813 redo B;
3814 }
3815 }
3816
3817 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3818 !!!cp ('t146');
3819 ## As if </noscript>
3820 pop @{$self->{open_elements}};
3821 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
3822
3823 ## Reprocess in the "in head" insertion mode...
3824 ## As if </head>
3825 pop @{$self->{open_elements}};
3826
3827 ## Reprocess in the "after head" insertion mode...
3828 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3829 !!!cp ('t147');
3830 ## As if </head>
3831 pop @{$self->{open_elements}};
3832
3833 ## Reprocess in the "after head" insertion mode...
3834 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3835 ## ISSUE: This case cannot be reached?
3836 !!!cp ('t148');
3837 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
3838 ## Ignore the token ## ISSUE: An issue in the spec.
3839 !!!next-token;
3840 redo B;
3841 } else {
3842 !!!cp ('t149');
3843 }
3844
3845 ## "after head" insertion mode
3846 ## As if <body>
3847 !!!insert-element ('body',, $token);
3848 $self->{insertion_mode} = IN_BODY_IM;
3849 ## reprocess
3850 redo B;
3851 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
3852 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3853 !!!cp ('t149.1');
3854
3855 ## NOTE: As if <head>
3856 !!!create-element ($self->{head_element}, 'head',, $token);
3857 $self->{open_elements}->[-1]->[0]->append_child
3858 ($self->{head_element});
3859 #push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3860 #$self->{insertion_mode} = IN_HEAD_IM;
3861 ## NOTE: Reprocess.
3862
3863 ## NOTE: As if </head>
3864 #pop @{$self->{open_elements}};
3865 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
3866 ## NOTE: Reprocess.
3867
3868 #
3869 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3870 !!!cp ('t149.2');
3871
3872 ## NOTE: As if </head>
3873 pop @{$self->{open_elements}};
3874 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
3875 ## NOTE: Reprocess.
3876
3877 #
3878 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3879 !!!cp ('t149.3');
3880
3881 !!!parse-error (type => 'in noscript:#eof', token => $token);
3882
3883 ## As if </noscript>
3884 pop @{$self->{open_elements}};
3885 #$self->{insertion_mode} = IN_HEAD_IM;
3886 ## NOTE: Reprocess.
3887
3888 ## NOTE: As if </head>
3889 pop @{$self->{open_elements}};
3890 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
3891 ## NOTE: Reprocess.
3892
3893 #
3894 } else {
3895 !!!cp ('t149.4');
3896 #
3897 }
3898
3899 ## NOTE: As if <body>
3900 !!!insert-element ('body',, $token);
3901 $self->{insertion_mode} = IN_BODY_IM;
3902 ## NOTE: Reprocess.
3903 redo B;
3904 } else {
3905 die "$0: $token->{type}: Unknown token type";
3906 }
3907
3908 ## ISSUE: An issue in the spec.
3909 } elsif ($self->{insertion_mode} & BODY_IMS) {
3910 if ($token->{type} == CHARACTER_TOKEN) {
3911 !!!cp ('t150');
3912 ## NOTE: There is a code clone of "character in body".
3913 $reconstruct_active_formatting_elements->($insert_to_current);
3914
3915 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3916
3917 !!!next-token;
3918 redo B;
3919 } elsif ($token->{type} == START_TAG_TOKEN) {
3920 if ({
3921 caption => 1, col => 1, colgroup => 1, tbody => 1,
3922 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3923 }->{$token->{tag_name}}) {
3924 if ($self->{insertion_mode} == IN_CELL_IM) {
3925 ## have an element in table scope
3926 for (reverse 0..$#{$self->{open_elements}}) {
3927 my $node = $self->{open_elements}->[$_];
3928 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3929 !!!cp ('t151');
3930
3931 ## Close the cell
3932 !!!back-token; # <?>
3933 $token = {type => END_TAG_TOKEN, tag_name => $node->[1],
3934 line => $token->{line},
3935 column => $token->{column}};
3936 redo B;
3937 } elsif ({
3938 table => 1, html => 1,
3939 }->{$node->[1]}) {
3940 !!!cp ('t152');
3941 ## ISSUE: This case can never be reached, maybe.
3942 last;
3943 }
3944 }
3945
3946 !!!cp ('t153');
3947 !!!parse-error (type => 'start tag not allowed',
3948 value => $token->{tag_name}, token => $token);
3949 ## Ignore the token
3950 !!!next-token;
3951 redo B;
3952 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3953 !!!parse-error (type => 'not closed:caption', token => $token);
3954
3955 ## NOTE: As if </caption>.
3956 ## have a table element in table scope
3957 my $i;
3958 INSCOPE: {
3959 for (reverse 0..$#{$self->{open_elements}}) {
3960 my $node = $self->{open_elements}->[$_];
3961 if ($node->[1] eq 'caption') {
3962 !!!cp ('t155');
3963 $i = $_;
3964 last INSCOPE;
3965 } elsif ({
3966 table => 1, html => 1,
3967 }->{$node->[1]}) {
3968 !!!cp ('t156');
3969 last;
3970 }
3971 }
3972
3973 !!!cp ('t157');
3974 !!!parse-error (type => 'start tag not allowed',
3975 value => $token->{tag_name}, token => $token);
3976 ## Ignore the token
3977 !!!next-token;
3978 redo B;
3979 } # INSCOPE
3980
3981 ## generate implied end tags
3982 while ({
3983 dd => 1, dt => 1, li => 1, p => 1,
3984 }->{$self->{open_elements}->[-1]->[1]}) {
3985 !!!cp ('t158');
3986 pop @{$self->{open_elements}};
3987 }
3988
3989 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3990 !!!cp ('t159');
3991 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
3992 } else {
3993 !!!cp ('t160');
3994 }
3995
3996 splice @{$self->{open_elements}}, $i;
3997
3998 $clear_up_to_marker->();
3999
4000 $self->{insertion_mode} = IN_TABLE_IM;
4001
4002 ## reprocess
4003 redo B;
4004 } else {
4005 !!!cp ('t161');
4006 #
4007 }
4008 } else {
4009 !!!cp ('t162');
4010 #
4011 }
4012 } elsif ($token->{type} == END_TAG_TOKEN) {
4013 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4014 if ($self->{insertion_mode} == IN_CELL_IM) {
4015 ## have an element in table scope
4016 my $i;
4017 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4018 my $node = $self->{open_elements}->[$_];
4019 if ($node->[1] eq $token->{tag_name}) {
4020 !!!cp ('t163');
4021 $i = $_;
4022 last INSCOPE;
4023 } elsif ({
4024 table => 1, html => 1,
4025 }->{$node->[1]}) {
4026 !!!cp ('t164');
4027 last INSCOPE;
4028 }
4029 } # INSCOPE
4030 unless (defined $i) {
4031 !!!cp ('t165');
4032 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4033 ## Ignore the token
4034 !!!next-token;
4035 redo B;
4036 }
4037
4038 ## generate implied end tags
4039 while ({
4040 dd => 1, dt => 1, li => 1, p => 1,
4041 }->{$self->{open_elements}->[-1]->[1]}) {
4042 !!!cp ('t166');
4043 pop @{$self->{open_elements}};
4044 }
4045
4046 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4047 !!!cp ('t167');
4048 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
4049 } else {
4050 !!!cp ('t168');
4051 }
4052
4053 splice @{$self->{open_elements}}, $i;
4054
4055 $clear_up_to_marker->();
4056
4057 $self->{insertion_mode} = IN_ROW_IM;
4058
4059 !!!next-token;
4060 redo B;
4061 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4062 !!!cp ('t169');
4063 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4064 ## Ignore the token
4065 !!!next-token;
4066 redo B;
4067 } else {
4068 !!!cp ('t170');
4069 #
4070 }
4071 } elsif ($token->{tag_name} eq 'caption') {
4072 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4073 ## have a table element in table scope
4074 my $i;
4075 INSCOPE: {
4076 for (reverse 0..$#{$self->{open_elements}}) {
4077 my $node = $self->{open_elements}->[$_];
4078 if ($node->[1] eq $token->{tag_name}) {
4079 !!!cp ('t171');
4080 $i = $_;
4081 last INSCOPE;
4082 } elsif ({
4083 table => 1, html => 1,
4084 }->{$node->[1]}) {
4085 !!!cp ('t172');
4086 last;
4087 }
4088 }
4089
4090 !!!cp ('t173');
4091 !!!parse-error (type => 'unmatched end tag',
4092 value => $token->{tag_name}, token => $token);
4093 ## Ignore the token
4094 !!!next-token;
4095 redo B;
4096 } # INSCOPE
4097
4098 ## generate implied end tags
4099 while ({
4100 dd => 1, dt => 1, li => 1, p => 1,
4101 }->{$self->{open_elements}->[-1]->[1]}) {
4102 !!!cp ('t174');
4103 pop @{$self->{open_elements}};
4104 }
4105
4106 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4107 !!!cp ('t175');
4108 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
4109 } else {
4110 !!!cp ('t176');
4111 }
4112
4113 splice @{$self->{open_elements}}, $i;
4114
4115 $clear_up_to_marker->();
4116
4117 $self->{insertion_mode} = IN_TABLE_IM;
4118
4119 !!!next-token;
4120 redo B;
4121 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4122 !!!cp ('t177');
4123 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4124 ## Ignore the token
4125 !!!next-token;
4126 redo B;
4127 } else {
4128 !!!cp ('t178');
4129 #
4130 }
4131 } elsif ({
4132 table => 1, tbody => 1, tfoot => 1,
4133 thead => 1, tr => 1,
4134 }->{$token->{tag_name}} and
4135 $self->{insertion_mode} == IN_CELL_IM) {
4136 ## have an element in table scope
4137 my $i;
4138 my $tn;
4139 INSCOPE: {
4140 for (reverse 0..$#{$self->{open_elements}}) {
4141 my $node = $self->{open_elements}->[$_];
4142 if ($node->[1] eq $token->{tag_name}) {
4143 !!!cp ('t179');
4144 $i = $_;
4145
4146 ## Close the cell
4147 !!!back-token; # </?>
4148 $token = {type => END_TAG_TOKEN, tag_name => $tn,
4149 line => $token->{line},
4150 column => $token->{column}};
4151 redo B;
4152 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4153 !!!cp ('t180');
4154 $tn = $node->[1];
4155 ## NOTE: There is exactly one |td| or |th| element
4156 ## in scope in the stack of open elements by definition.
4157 } elsif ({
4158 table => 1, html => 1,
4159 }->{$node->[1]}) {
4160 ## ISSUE: Can this be reached?
4161 !!!cp ('t181');
4162 last;
4163 }
4164 }
4165
4166 !!!cp ('t182');
4167 !!!parse-error (type => 'unmatched end tag',
4168 value => $token->{tag_name}, token => $token);
4169 ## Ignore the token
4170 !!!next-token;
4171 redo B;
4172 } # INSCOPE
4173 } elsif ($token->{tag_name} eq 'table' and
4174 $self->{insertion_mode} == IN_CAPTION_IM) {
4175 !!!parse-error (type => 'not closed:caption', token => $token);
4176
4177 ## As if </caption>
4178 ## have a table element in table scope
4179 my $i;
4180 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4181 my $node = $self->{open_elements}->[$_];
4182 if ($node->[1] eq 'caption') {
4183 !!!cp ('t184');
4184 $i = $_;
4185 last INSCOPE;
4186 } elsif ({
4187 table => 1, html => 1,
4188 }->{$node->[1]}) {
4189 !!!cp ('t185');
4190 last INSCOPE;
4191 }
4192 } # INSCOPE
4193 unless (defined $i) {
4194 !!!cp ('t186');
4195 !!!parse-error (type => 'unmatched end tag:caption', token => $token);
4196 ## Ignore the token
4197 !!!next-token;
4198 redo B;
4199 }
4200
4201 ## generate implied end tags
4202 while ({
4203 dd => 1, dt => 1, li => 1, p => 1,
4204 }->{$self->{open_elements}->[-1]->[1]}) {
4205 !!!cp ('t187');
4206 pop @{$self->{open_elements}};
4207 }
4208
4209 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4210 !!!cp ('t188');
4211 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
4212 } else {
4213 !!!cp ('t189');
4214 }
4215
4216 splice @{$self->{open_elements}}, $i;
4217
4218 $clear_up_to_marker->();
4219
4220 $self->{insertion_mode} = IN_TABLE_IM;
4221
4222 ## reprocess
4223 redo B;
4224 } elsif ({
4225 body => 1, col => 1, colgroup => 1, html => 1,
4226 }->{$token->{tag_name}}) {
4227 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
4228 !!!cp ('t190');
4229 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4230 ## Ignore the token
4231 !!!next-token;
4232 redo B;
4233 } else {
4234 !!!cp ('t191');
4235 #
4236 }
4237 } elsif ({
4238 tbody => 1, tfoot => 1,
4239 thead => 1, tr => 1,
4240 }->{$token->{tag_name}} and
4241 $self->{insertion_mode} == IN_CAPTION_IM) {
4242 !!!cp ('t192');
4243 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4244 ## Ignore the token
4245 !!!next-token;
4246 redo B;
4247 } else {
4248 !!!cp ('t193');
4249 #
4250 }
4251 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4252 for my $entry (@{$self->{open_elements}}) {
4253 if (not {
4254 dd => 1, dt => 1, li => 1, p => 1, tbody => 1, td => 1, tfoot => 1,
4255 th => 1, thead => 1, tr => 1, body => 1, html => 1,
4256 }->{$entry->[1]}) {
4257 !!!cp ('t75');
4258 !!!parse-error (type => 'in body:#eof', token => $token);
4259 last;
4260 }
4261 }
4262
4263 ## Stop parsing.
4264 last B;
4265 } else {
4266 die "$0: $token->{type}: Unknown token type";
4267 }
4268
4269 $insert = $insert_to_current;
4270 #
4271 } elsif ($self->{insertion_mode} & TABLE_IMS) {
4272 if ($token->{type} == CHARACTER_TOKEN) {
4273 if (not $open_tables->[-1]->[1] and # tainted
4274 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4275 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4276
4277 unless (length $token->{data}) {
4278 !!!cp ('t194');
4279 !!!next-token;
4280 redo B;
4281 } else {
4282 !!!cp ('t195');
4283 }
4284 }
4285
4286 !!!parse-error (type => 'in table:#character', token => $token);
4287
4288 ## As if in body, but insert into foster parent element
4289 ## ISSUE: Spec says that "whenever a node would be inserted
4290 ## into the current node" while characters might not
4291 ## result in a new Text node.
4292 $reconstruct_active_formatting_elements->($insert_to_foster);
4293
4294 if ({
4295 table => 1, tbody => 1, tfoot => 1,
4296 thead => 1, tr => 1,
4297 }->{$self->{open_elements}->[-1]->[1]}) {
4298 # MUST
4299 my $foster_parent_element;
4300 my $next_sibling;
4301 my $prev_sibling;
4302 OE: for (reverse 0..$#{$self->{open_elements}}) {
4303 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4304 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4305 if (defined $parent and $parent->node_type == 1) {
4306 !!!cp ('t196');
4307 $foster_parent_element = $parent;
4308 $next_sibling = $self->{open_elements}->[$_]->[0];
4309 $prev_sibling = $next_sibling->previous_sibling;
4310 } else {
4311 !!!cp ('t197');
4312 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4313 $prev_sibling = $foster_parent_element->last_child;
4314 }
4315 last OE;
4316 }
4317 } # OE
4318 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4319 $prev_sibling = $foster_parent_element->last_child
4320 unless defined $foster_parent_element;
4321 if (defined $prev_sibling and
4322 $prev_sibling->node_type == 3) {
4323 !!!cp ('t198');
4324 $prev_sibling->manakai_append_text ($token->{data});
4325 } else {
4326 !!!cp ('t199');
4327 $foster_parent_element->insert_before
4328 ($self->{document}->create_text_node ($token->{data}),
4329 $next_sibling);
4330 }
4331 $open_tables->[-1]->[1] = 1; # tainted
4332 } else {
4333 !!!cp ('t200');
4334 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4335 }
4336
4337 !!!next-token;
4338 redo B;
4339 } elsif ($token->{type} == START_TAG_TOKEN) {
4340 if ({
4341 tr => ($self->{insertion_mode} != IN_ROW_IM),
4342 th => 1, td => 1,
4343 }->{$token->{tag_name}}) {
4344 if ($self->{insertion_mode} == IN_TABLE_IM) {
4345 ## Clear back to table context
4346 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4347 $self->{open_elements}->[-1]->[1] ne 'html') {
4348 !!!cp ('t201');
4349 pop @{$self->{open_elements}};
4350 }
4351
4352 !!!insert-element ('tbody',, $token);
4353 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4354 ## reprocess in the "in table body" insertion mode...
4355 }
4356
4357 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4358 unless ($token->{tag_name} eq 'tr') {
4359 !!!cp ('t202');
4360 !!!parse-error (type => 'missing start tag:tr', token => $token);
4361 }
4362
4363 ## Clear back to table body context
4364 while (not {
4365 tbody => 1, tfoot => 1, thead => 1, html => 1,
4366 }->{$self->{open_elements}->[-1]->[1]}) {
4367 !!!cp ('t203');
4368 ## ISSUE: Can this case be reached?
4369 pop @{$self->{open_elements}};
4370 }
4371
4372 $self->{insertion_mode} = IN_ROW_IM;
4373 if ($token->{tag_name} eq 'tr') {
4374 !!!cp ('t204');
4375 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4376 !!!next-token;
4377 redo B;
4378 } else {
4379 !!!cp ('t205');
4380 !!!insert-element ('tr',, $token);
4381 ## reprocess in the "in row" insertion mode
4382 }
4383 } else {
4384 !!!cp ('t206');
4385 }
4386
4387 ## Clear back to table row context
4388 while (not {
4389 tr => 1, html => 1,
4390 }->{$self->{open_elements}->[-1]->[1]}) {
4391 !!!cp ('t207');
4392 pop @{$self->{open_elements}};
4393 }
4394
4395 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4396 $self->{insertion_mode} = IN_CELL_IM;
4397
4398 push @$active_formatting_elements, ['#marker', ''];
4399
4400 !!!next-token;
4401 redo B;
4402 } elsif ({
4403 caption => 1, col => 1, colgroup => 1,
4404 tbody => 1, tfoot => 1, thead => 1,
4405 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4406 }->{$token->{tag_name}}) {
4407 if ($self->{insertion_mode} == IN_ROW_IM) {
4408 ## As if </tr>
4409 ## have an element in table scope
4410 my $i;
4411 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4412 my $node = $self->{open_elements}->[$_];
4413 if ($node->[1] eq 'tr') {
4414 !!!cp ('t208');
4415 $i = $_;
4416 last INSCOPE;
4417 } elsif ({
4418 html => 1,
4419
4420 ## NOTE: This element does not appear here, maybe.
4421 table => 1,
4422 }->{$node->[1]}) {
4423 !!!cp ('t209');
4424 last INSCOPE;
4425 }
4426 } # INSCOPE
4427 unless (defined $i) {
4428 !!!cp ('t210');
4429 ## TODO: This type is wrong.
4430 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name}, token => $token);
4431 ## Ignore the token
4432 !!!next-token;
4433 redo B;
4434 }
4435
4436 ## Clear back to table row context
4437 while (not {
4438 tr => 1, html => 1,
4439 }->{$self->{open_elements}->[-1]->[1]}) {
4440 !!!cp ('t211');
4441 ## ISSUE: Can this case be reached?
4442 pop @{$self->{open_elements}};
4443 }
4444
4445 pop @{$self->{open_elements}}; # tr
4446 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4447 if ($token->{tag_name} eq 'tr') {
4448 !!!cp ('t212');
4449 ## reprocess
4450 redo B;
4451 } else {
4452 !!!cp ('t213');
4453 ## reprocess in the "in table body" insertion mode...
4454 }
4455 }
4456
4457 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4458 ## have an element in table scope
4459 my $i;
4460 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4461 my $node = $self->{open_elements}->[$_];
4462 if ({
4463 tbody => 1, thead => 1, tfoot => 1,
4464 }->{$node->[1]}) {
4465 !!!cp ('t214');
4466 $i = $_;
4467 last INSCOPE;
4468 } elsif ({
4469 table => 1, html => 1,
4470 }->{$node->[1]}) {
4471 !!!cp ('t215');
4472 last INSCOPE;
4473 }
4474 } # INSCOPE
4475 unless (defined $i) {
4476 !!!cp ('t216');
4477 ## TODO: This error type is wrong.
4478 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4479 ## Ignore the token
4480 !!!next-token;
4481 redo B;
4482 }
4483
4484 ## Clear back to table body context
4485 while (not {
4486 tbody => 1, tfoot => 1, thead => 1, html => 1,
4487 }->{$self->{open_elements}->[-1]->[1]}) {
4488 !!!cp ('t217');
4489 ## ISSUE: Can this state be reached?
4490 pop @{$self->{open_elements}};
4491 }
4492
4493 ## As if <{current node}>
4494 ## have an element in table scope
4495 ## true by definition
4496
4497 ## Clear back to table body context
4498 ## nop by definition
4499
4500 pop @{$self->{open_elements}};
4501 $self->{insertion_mode} = IN_TABLE_IM;
4502 ## reprocess in "in table" insertion mode...
4503 } else {
4504 !!!cp ('t218');
4505 }
4506
4507 if ($token->{tag_name} eq 'col') {
4508 ## Clear back to table context
4509 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4510 $self->{open_elements}->[-1]->[1] ne 'html') {
4511 !!!cp ('t219');
4512 ## ISSUE: Can this state be reached?
4513 pop @{$self->{open_elements}};
4514 }
4515
4516 !!!insert-element ('colgroup',, $token);
4517 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
4518 ## reprocess
4519 redo B;
4520 } elsif ({
4521 caption => 1,
4522 colgroup => 1,
4523 tbody => 1, tfoot => 1, thead => 1,
4524 }->{$token->{tag_name}}) {
4525 ## Clear back to table context
4526 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4527 $self->{open_elements}->[-1]->[1] ne 'html') {
4528 !!!cp ('t220');
4529 ## ISSUE: Can this state be reached?
4530 pop @{$self->{open_elements}};
4531 }
4532
4533 push @$active_formatting_elements, ['#marker', '']
4534 if $token->{tag_name} eq 'caption';
4535
4536 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4537 $self->{insertion_mode} = {
4538 caption => IN_CAPTION_IM,
4539 colgroup => IN_COLUMN_GROUP_IM,
4540 tbody => IN_TABLE_BODY_IM,
4541 tfoot => IN_TABLE_BODY_IM,
4542 thead => IN_TABLE_BODY_IM,
4543 }->{$token->{tag_name}};
4544 !!!next-token;
4545 redo B;
4546 } else {
4547 die "$0: in table: <>: $token->{tag_name}";
4548 }
4549 } elsif ($token->{tag_name} eq 'table') {
4550 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
4551
4552 ## As if </table>
4553 ## have a table element in table scope
4554 my $i;
4555 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4556 my $node = $self->{open_elements}->[$_];
4557 if ($node->[1] eq 'table') {
4558 !!!cp ('t221');
4559 $i = $_;
4560 last INSCOPE;
4561 } elsif ({
4562 #table => 1,
4563 html => 1,
4564 }->{$node->[1]}) {
4565 !!!cp ('t222');
4566 last INSCOPE;
4567 }
4568 } # INSCOPE
4569 unless (defined $i) {
4570 !!!cp ('t223');
4571 ## TODO: The following is wrong, maybe.
4572 !!!parse-error (type => 'unmatched end tag:table', token => $token);
4573 ## Ignore tokens </table><table>
4574 !!!next-token;
4575 redo B;
4576 }
4577
4578 ## TODO: The following steps are removed from the latest spec.
4579 ## generate implied end tags
4580 while ({
4581 dd => 1, dt => 1, li => 1, p => 1,
4582 }->{$self->{open_elements}->[-1]->[1]}) {
4583 !!!cp ('t224');
4584 pop @{$self->{open_elements}};
4585 }
4586
4587 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4588 !!!cp ('t225');
4589 ## ISSUE: Can this case be reached?
4590 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
4591 } else {
4592 !!!cp ('t226');
4593 }
4594
4595 splice @{$self->{open_elements}}, $i;
4596 pop @{$open_tables};
4597
4598 $self->_reset_insertion_mode;
4599
4600 ## reprocess
4601 redo B;
4602 } elsif ($token->{tag_name} eq 'style') {
4603 if (not $open_tables->[-1]->[1]) { # tainted
4604 !!!cp ('t227.8');
4605 ## NOTE: This is a "as if in head" code clone.
4606 $parse_rcdata->(CDATA_CONTENT_MODEL);
4607 redo B;
4608 } else {
4609 !!!cp ('t227.7');
4610 #
4611 }
4612 } elsif ($token->{tag_name} eq 'script') {
4613 if (not $open_tables->[-1]->[1]) { # tainted
4614 !!!cp ('t227.6');
4615 ## NOTE: This is a "as if in head" code clone.
4616 $script_start_tag->();
4617 redo B;
4618 } else {
4619 !!!cp ('t227.5');
4620 #
4621 }
4622 } elsif ($token->{tag_name} eq 'input') {
4623 if (not $open_tables->[-1]->[1]) { # tainted
4624 if ($token->{attributes}->{type}) { ## TODO: case
4625 my $type = lc $token->{attributes}->{type}->{value};
4626 if ($type eq 'hidden') {
4627 !!!cp ('t227.3');
4628 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
4629
4630 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4631
4632 ## TODO: form element pointer
4633
4634 pop @{$self->{open_elements}};
4635
4636 !!!next-token;
4637 redo B;
4638 } else {
4639 !!!cp ('t227.2');
4640 #
4641 }
4642 } else {
4643 !!!cp ('t227.1');
4644 #
4645 }
4646 } else {
4647 !!!cp ('t227.4');
4648 #
4649 }
4650 } else {
4651 !!!cp ('t227');
4652 #
4653 }
4654
4655 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
4656
4657 $insert = $insert_to_foster;
4658 #
4659 } elsif ($token->{type} == END_TAG_TOKEN) {
4660 if ($token->{tag_name} eq 'tr' and
4661 $self->{insertion_mode} == IN_ROW_IM) {
4662 ## have an element in table scope
4663 my $i;
4664 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4665 my $node = $self->{open_elements}->[$_];
4666 if ($node->[1] eq $token->{tag_name}) {
4667 !!!cp ('t228');
4668 $i = $_;
4669 last INSCOPE;
4670 } elsif ({
4671 table => 1, html => 1,
4672 }->{$node->[1]}) {
4673 !!!cp ('t229');
4674 last INSCOPE;
4675 }
4676 } # INSCOPE
4677 unless (defined $i) {
4678 !!!cp ('t230');
4679 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4680 ## Ignore the token
4681 !!!next-token;
4682 redo B;
4683 } else {
4684 !!!cp ('t232');
4685 }
4686
4687 ## Clear back to table row context
4688 while (not {
4689 tr => 1, html => 1,
4690 }->{$self->{open_elements}->[-1]->[1]}) {
4691 !!!cp ('t231');
4692 ## ISSUE: Can this state be reached?
4693 pop @{$self->{open_elements}};
4694 }
4695
4696 pop @{$self->{open_elements}}; # tr
4697 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4698 !!!next-token;
4699 redo B;
4700 } elsif ($token->{tag_name} eq 'table') {
4701 if ($self->{insertion_mode} == IN_ROW_IM) {
4702 ## As if </tr>
4703 ## have an element in table scope
4704 my $i;
4705 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4706 my $node = $self->{open_elements}->[$_];
4707 if ($node->[1] eq 'tr') {
4708 !!!cp ('t233');
4709 $i = $_;
4710 last INSCOPE;
4711 } elsif ({
4712 table => 1, html => 1,
4713 }->{$node->[1]}) {
4714 !!!cp ('t234');
4715 last INSCOPE;
4716 }
4717 } # INSCOPE
4718 unless (defined $i) {
4719 !!!cp ('t235');
4720 ## TODO: The following is wrong.
4721 !!!parse-error (type => 'unmatched end tag:'.$token->{type}, token => $token);
4722 ## Ignore the token
4723 !!!next-token;
4724 redo B;
4725 }
4726
4727 ## Clear back to table row context
4728 while (not {
4729 tr => 1, html => 1,
4730 }->{$self->{open_elements}->[-1]->[1]}) {
4731 !!!cp ('t236');
4732 ## ISSUE: Can this state be reached?
4733 pop @{$self->{open_elements}};
4734 }
4735
4736 pop @{$self->{open_elements}}; # tr
4737 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4738 ## reprocess in the "in table body" insertion mode...
4739 }
4740
4741 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4742 ## have an element in table scope
4743 my $i;
4744 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4745 my $node = $self->{open_elements}->[$_];
4746 if ({
4747 tbody => 1, thead => 1, tfoot => 1,
4748 }->{$node->[1]}) {
4749 !!!cp ('t237');
4750 $i = $_;
4751 last INSCOPE;
4752 } elsif ({
4753 table => 1, html => 1,
4754 }->{$node->[1]}) {
4755 !!!cp ('t238');
4756 last INSCOPE;
4757 }
4758 } # INSCOPE
4759 unless (defined $i) {
4760 !!!cp ('t239');
4761 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4762 ## Ignore the token
4763 !!!next-token;
4764 redo B;
4765 }
4766
4767 ## Clear back to table body context
4768 while (not {
4769 tbody => 1, tfoot => 1, thead => 1, html => 1,
4770 }->{$self->{open_elements}->[-1]->[1]}) {
4771 !!!cp ('t240');
4772 pop @{$self->{open_elements}};
4773 }
4774
4775 ## As if <{current node}>
4776 ## have an element in table scope
4777 ## true by definition
4778
4779 ## Clear back to table body context
4780 ## nop by definition
4781
4782 pop @{$self->{open_elements}};
4783 $self->{insertion_mode} = IN_TABLE_IM;
4784 ## reprocess in the "in table" insertion mode...
4785 }
4786
4787 ## NOTE: </table> in the "in table" insertion mode.
4788 ## When you edit the code fragment below, please ensure that
4789 ## the code for <table> in the "in table" insertion mode
4790 ## is synced with it.
4791
4792 ## have a table element in table scope
4793 my $i;
4794 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4795 my $node = $self->{open_elements}->[$_];
4796 if ($node->[1] eq $token->{tag_name}) {
4797 !!!cp ('t241');
4798 $i = $_;
4799 last INSCOPE;
4800 } elsif ({
4801 table => 1, html => 1,
4802 }->{$node->[1]}) {
4803 !!!cp ('t242');
4804 last INSCOPE;
4805 }
4806 } # INSCOPE
4807 unless (defined $i) {
4808 !!!cp ('t243');
4809 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4810 ## Ignore the token
4811 !!!next-token;
4812 redo B;
4813 }
4814
4815 splice @{$self->{open_elements}}, $i;
4816 pop @{$open_tables};
4817
4818 $self->_reset_insertion_mode;
4819
4820 !!!next-token;
4821 redo B;
4822 } elsif ({
4823 tbody => 1, tfoot => 1, thead => 1,
4824 }->{$token->{tag_name}} and
4825 $self->{insertion_mode} & ROW_IMS) {
4826 if ($self->{insertion_mode} == IN_ROW_IM) {
4827 ## have an element in table scope
4828 my $i;
4829 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4830 my $node = $self->{open_elements}->[$_];
4831 if ($node->[1] eq $token->{tag_name}) {
4832 !!!cp ('t247');
4833 $i = $_;
4834 last INSCOPE;
4835 } elsif ({
4836 table => 1, html => 1,
4837 }->{$node->[1]}) {
4838 !!!cp ('t248');
4839 last INSCOPE;
4840 }
4841 } # INSCOPE
4842 unless (defined $i) {
4843 !!!cp ('t249');
4844 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4845 ## Ignore the token
4846 !!!next-token;
4847 redo B;
4848 }
4849
4850 ## As if </tr>
4851 ## have an element in table scope
4852 my $i;
4853 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4854 my $node = $self->{open_elements}->[$_];
4855 if ($node->[1] eq 'tr') {
4856 !!!cp ('t250');
4857 $i = $_;
4858 last INSCOPE;
4859 } elsif ({
4860 table => 1, html => 1,
4861 }->{$node->[1]}) {
4862 !!!cp ('t251');
4863 last INSCOPE;
4864 }
4865 } # INSCOPE
4866 unless (defined $i) {
4867 !!!cp ('t252');
4868 !!!parse-error (type => 'unmatched end tag:tr', token => $token);
4869 ## Ignore the token
4870 !!!next-token;
4871 redo B;
4872 }
4873
4874 ## Clear back to table row context
4875 while (not {
4876 tr => 1, html => 1,
4877 }->{$self->{open_elements}->[-1]->[1]}) {
4878 !!!cp ('t253');
4879 ## ISSUE: Can this case be reached?
4880 pop @{$self->{open_elements}};
4881 }
4882
4883 pop @{$self->{open_elements}}; # tr
4884 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4885 ## reprocess in the "in table body" insertion mode...
4886 }
4887
4888 ## have an element in table scope
4889 my $i;
4890 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4891 my $node = $self->{open_elements}->[$_];
4892 if ($node->[1] eq $token->{tag_name}) {
4893 !!!cp ('t254');
4894 $i = $_;
4895 last INSCOPE;
4896 } elsif ({
4897 table => 1, html => 1,
4898 }->{$node->[1]}) {
4899 !!!cp ('t255');
4900 last INSCOPE;
4901 }
4902 } # INSCOPE
4903 unless (defined $i) {
4904 !!!cp ('t256');
4905 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4906 ## Ignore the token
4907 !!!next-token;
4908 redo B;
4909 }
4910
4911 ## Clear back to table body context
4912 while (not {
4913 tbody => 1, tfoot => 1, thead => 1, html => 1,
4914 }->{$self->{open_elements}->[-1]->[1]}) {
4915 !!!cp ('t257');
4916 ## ISSUE: Can this case be reached?
4917 pop @{$self->{open_elements}};
4918 }
4919
4920 pop @{$self->{open_elements}};
4921 $self->{insertion_mode} = IN_TABLE_IM;
4922 !!!next-token;
4923 redo B;
4924 } elsif ({
4925 body => 1, caption => 1, col => 1, colgroup => 1,
4926 html => 1, td => 1, th => 1,
4927 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4928 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
4929 }->{$token->{tag_name}}) {
4930 !!!cp ('t258');
4931 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4932 ## Ignore the token
4933 !!!next-token;
4934 redo B;
4935 } else {
4936 !!!cp ('t259');
4937 !!!parse-error (type => 'in table:/'.$token->{tag_name}, token => $token);
4938
4939 $insert = $insert_to_foster;
4940 #
4941 }
4942 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4943 unless ($self->{open_elements}->[-1]->[1] eq 'html' and
4944 @{$self->{open_elements}} == 1) { # redundant, maybe
4945 !!!parse-error (type => 'in body:#eof', token => $token);
4946 !!!cp ('t259.1');
4947 #
4948 } else {
4949 !!!cp ('t259.2');
4950 #
4951 }
4952
4953 ## Stop parsing
4954 last B;
4955 } else {
4956 die "$0: $token->{type}: Unknown token type";
4957 }
4958 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4959 if ($token->{type} == CHARACTER_TOKEN) {
4960 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4961 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4962 unless (length $token->{data}) {
4963 !!!cp ('t260');
4964 !!!next-token;
4965 redo B;
4966 }
4967 }
4968
4969 !!!cp ('t261');
4970 #
4971 } elsif ($token->{type} == START_TAG_TOKEN) {
4972 if ($token->{tag_name} eq 'col') {
4973 !!!cp ('t262');
4974 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4975 pop @{$self->{open_elements}};
4976 !!!next-token;
4977 redo B;
4978 } else {
4979 !!!cp ('t263');
4980 #
4981 }
4982 } elsif ($token->{type} == END_TAG_TOKEN) {
4983 if ($token->{tag_name} eq 'colgroup') {
4984 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4985 !!!cp ('t264');
4986 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
4987 ## Ignore the token
4988 !!!next-token;
4989 redo B;
4990 } else {
4991 !!!cp ('t265');
4992 pop @{$self->{open_elements}}; # colgroup
4993 $self->{insertion_mode} = IN_TABLE_IM;
4994 !!!next-token;
4995 redo B;
4996 }
4997 } elsif ($token->{tag_name} eq 'col') {
4998 !!!cp ('t266');
4999 !!!parse-error (type => 'unmatched end tag:col', token => $token);
5000 ## Ignore the token
5001 !!!next-token;
5002 redo B;
5003 } else {
5004 !!!cp ('t267');
5005 #
5006 }
5007 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5008 if ($self->{open_elements}->[-1]->[1] eq 'html' or
5009 @{$self->{open_elements}} == 1) { # redundant, maybe
5010 !!!cp ('t270.2');
5011 ## Stop parsing.
5012 last B;
5013 } else {
5014 ## NOTE: As if </colgroup>.
5015 !!!cp ('t270.1');
5016 pop @{$self->{open_elements}}; # colgroup
5017 $self->{insertion_mode} = IN_TABLE_IM;
5018 ## Reprocess.
5019 redo B;
5020 }
5021 } else {
5022 die "$0: $token->{type}: Unknown token type";
5023 }
5024
5025 ## As if </colgroup>
5026 if ($self->{open_elements}->[-1]->[1] eq 'html') {
5027 !!!cp ('t269');
5028 ## TODO: Wrong error type?
5029 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5030 ## Ignore the token
5031 !!!next-token;
5032 redo B;
5033 } else {
5034 !!!cp ('t270');
5035 pop @{$self->{open_elements}}; # colgroup
5036 $self->{insertion_mode} = IN_TABLE_IM;
5037 ## reprocess
5038 redo B;
5039 }
5040 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5041 if ($token->{type} == CHARACTER_TOKEN) {
5042 !!!cp ('t271');
5043 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5044 !!!next-token;
5045 redo B;
5046 } elsif ($token->{type} == START_TAG_TOKEN) {
5047 if ($token->{tag_name} eq 'option') {
5048 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5049 !!!cp ('t272');
5050 ## As if </option>
5051 pop @{$self->{open_elements}};
5052 } else {
5053 !!!cp ('t273');
5054 }
5055
5056 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5057 !!!next-token;
5058 redo B;
5059 } elsif ($token->{tag_name} eq 'optgroup') {
5060 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5061 !!!cp ('t274');
5062 ## As if </option>
5063 pop @{$self->{open_elements}};
5064 } else {
5065 !!!cp ('t275');
5066 }
5067
5068 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
5069 !!!cp ('t276');
5070 ## As if </optgroup>
5071 pop @{$self->{open_elements}};
5072 } else {
5073 !!!cp ('t277');
5074 }
5075
5076 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5077 !!!next-token;
5078 redo B;
5079 } elsif ($token->{tag_name} eq 'select' or
5080 $token->{tag_name} eq 'input' or
5081 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5082 {
5083 caption => 1, table => 1,
5084 tbody => 1, tfoot => 1, thead => 1,
5085 tr => 1, td => 1, th => 1,
5086 }->{$token->{tag_name}})) {
5087 ## TODO: The type below is not good - <select> is replaced by </select>
5088 !!!parse-error (type => 'not closed:select', token => $token);
5089 ## NOTE: As if the token were </select> (<select> case) or
5090 ## as if there were </select> (otherwise).
5091 ## have an element in table scope
5092 my $i;
5093 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5094 my $node = $self->{open_elements}->[$_];
5095 if ($node->[1] eq 'select') {
5096 !!!cp ('t278');
5097 $i = $_;
5098 last INSCOPE;
5099 } elsif ({
5100 table => 1, html => 1,
5101 }->{$node->[1]}) {
5102 !!!cp ('t279');
5103 last INSCOPE;
5104 }
5105 } # INSCOPE
5106 unless (defined $i) {
5107 !!!cp ('t280');
5108 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5109 ## Ignore the token
5110 !!!next-token;
5111 redo B;
5112 }
5113
5114 !!!cp ('t281');
5115 splice @{$self->{open_elements}}, $i;
5116
5117 $self->_reset_insertion_mode;
5118
5119 if ($token->{tag_name} eq 'select') {
5120 !!!cp ('t281.2');
5121 !!!next-token;
5122 redo B;
5123 } else {
5124 !!!cp ('t281.1');
5125 ## Reprocess the token.
5126 redo B;
5127 }
5128 } else {
5129 !!!cp ('t282');
5130 !!!parse-error (type => 'in select:'.$token->{tag_name}, token => $token);
5131 ## Ignore the token
5132 !!!next-token;
5133 redo B;
5134 }
5135 } elsif ($token->{type} == END_TAG_TOKEN) {
5136 if ($token->{tag_name} eq 'optgroup') {
5137 if ($self->{open_elements}->[-1]->[1] eq 'option' and
5138 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
5139 !!!cp ('t283');
5140 ## As if </option>
5141 splice @{$self->{open_elements}}, -2;
5142 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
5143 !!!cp ('t284');
5144 pop @{$self->{open_elements}};
5145 } else {
5146 !!!cp ('t285');
5147 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5148 ## Ignore the token
5149 }
5150 !!!next-token;
5151 redo B;
5152 } elsif ($token->{tag_name} eq 'option') {
5153 if ($self->{open_elements}->[-1]->[1] eq 'option') {
5154 !!!cp ('t286');
5155 pop @{$self->{open_elements}};
5156 } else {
5157 !!!cp ('t287');
5158 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5159 ## Ignore the token
5160 }
5161 !!!next-token;
5162 redo B;
5163 } elsif ($token->{tag_name} eq 'select') {
5164 ## have an element in table scope
5165 my $i;
5166 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5167 my $node = $self->{open_elements}->[$_];
5168 if ($node->[1] eq $token->{tag_name}) {
5169 !!!cp ('t288');
5170 $i = $_;
5171 last INSCOPE;
5172 } elsif ({
5173 table => 1, html => 1,
5174 }->{$node->[1]}) {
5175 !!!cp ('t289');
5176 last INSCOPE;
5177 }
5178 } # INSCOPE
5179 unless (defined $i) {
5180 !!!cp ('t290');
5181 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5182 ## Ignore the token
5183 !!!next-token;
5184 redo B;
5185 }
5186
5187 !!!cp ('t291');
5188 splice @{$self->{open_elements}}, $i;
5189
5190 $self->_reset_insertion_mode;
5191
5192 !!!next-token;
5193 redo B;
5194 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5195 {
5196 caption => 1, table => 1, tbody => 1,
5197 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5198 }->{$token->{tag_name}}) {
5199 ## TODO: The following is wrong?
5200 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5201
5202 ## have an element in table scope
5203 my $i;
5204 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5205 my $node = $self->{open_elements}->[$_];
5206 if ($node->[1] eq $token->{tag_name}) {
5207 !!!cp ('t292');
5208 $i = $_;
5209 last INSCOPE;
5210 } elsif ({
5211 table => 1, html => 1,
5212 }->{$node->[1]}) {
5213 !!!cp ('t293');
5214 last INSCOPE;
5215 }
5216 } # INSCOPE
5217 unless (defined $i) {
5218 !!!cp ('t294');
5219 ## Ignore the token
5220 !!!next-token;
5221 redo B;
5222 }
5223
5224 ## As if </select>
5225 ## have an element in table scope
5226 undef $i;
5227 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5228 my $node = $self->{open_elements}->[$_];
5229 if ($node->[1] eq 'select') {
5230 !!!cp ('t295');
5231 $i = $_;
5232 last INSCOPE;
5233 } elsif ({
5234 table => 1, html => 1,
5235 }->{$node->[1]}) {
5236 ## ISSUE: Can this state be reached?
5237 !!!cp ('t296');
5238 last INSCOPE;
5239 }
5240 } # INSCOPE
5241 unless (defined $i) {
5242 !!!cp ('t297');
5243 ## TODO: The following error type is correct?
5244 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5245 ## Ignore the </select> token
5246 !!!next-token; ## TODO: ok?
5247 redo B;
5248 }
5249
5250 !!!cp ('t298');
5251 splice @{$self->{open_elements}}, $i;
5252
5253 $self->_reset_insertion_mode;
5254
5255 ## reprocess
5256 redo B;
5257 } else {
5258 !!!cp ('t299');
5259 !!!parse-error (type => 'in select:/'.$token->{tag_name}, token => $token);
5260 ## Ignore the token
5261 !!!next-token;
5262 redo B;
5263 }
5264 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5265 unless ($self->{open_elements}->[-1]->[1] eq 'html' and
5266 @{$self->{open_elements}} == 1) { # redundant, maybe
5267 !!!cp ('t299.1');
5268 !!!parse-error (type => 'in body:#eof', token => $token);
5269 } else {
5270 !!!cp ('t299.2');
5271 }
5272
5273 ## Stop parsing.
5274 last B;
5275 } else {
5276 die "$0: $token->{type}: Unknown token type";
5277 }
5278 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
5279 if ($token->{type} == CHARACTER_TOKEN) {
5280 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { ## Strip and capture the leading white space run.
5281 my $data = $1; ## Keep a private copy of the capture; it is what gets appended below.
5282 ## As if in body
5283 $reconstruct_active_formatting_elements->($insert_to_current);
5284
5285 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
5286
5287 unless (length $token->{data}) { ## Nothing but white space was in the token.
5288 !!!cp ('t300');
5289 !!!next-token;
5290 redo B;
5291 }
5292 }
5293
5294 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5295 !!!cp ('t301');
5296 !!!parse-error (type => 'after html:#character', token => $token);
5297
5298 ## Reprocess in the "after body" insertion mode.
5299 } else {
5300 !!!cp ('t302');
5301 }
5302
5303 ## "after body" insertion mode
5304 !!!parse-error (type => 'after body:#character', token => $token);
5305
5306 $self->{insertion_mode} = IN_BODY_IM;
5307 ## reprocess
5308 redo B;
5309 } elsif ($token->{type} == START_TAG_TOKEN) {
5310 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5311 !!!cp ('t303');
5312 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5313
5314 ## Reprocess in the "after body" insertion mode.
5315 } else {
5316 !!!cp ('t304');
5317 }
5318
5319 ## "after body" insertion mode
5320 !!!parse-error (type => 'after body:'.$token->{tag_name}, token => $token);
5321
5322 $self->{insertion_mode} = IN_BODY_IM;
5323 ## reprocess
5324 redo B;
5325 } elsif ($token->{type} == END_TAG_TOKEN) {
5326 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5327 !!!cp ('t305');
5328 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5329
5330 $self->{insertion_mode} = AFTER_BODY_IM;
5331 ## Reprocess in the "after body" insertion mode.
5332 } else {
5333 !!!cp ('t306');
5334 }
5335
5336 ## "after body" insertion mode
5337 if ($token->{tag_name} eq 'html') {
5338 if (defined $self->{inner_html_node}) {
5339 !!!cp ('t307');
5340 !!!parse-error (type => 'unmatched end tag:html', token => $token);
5341 ## Ignore the token
5342 !!!next-token;
5343 redo B;
5344 } else {
5345 !!!cp ('t308');
5346 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
5347 !!!next-token;
5348 redo B;
5349 }
5350 } else {
5351 !!!cp ('t309');
5352 !!!parse-error (type => 'after body:/'.$token->{tag_name}, token => $token);
5353
5354 $self->{insertion_mode} = IN_BODY_IM;
5355 ## reprocess
5356 redo B;
5357 }
5358 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5359 !!!cp ('t309.2');
5360 ## Stop parsing
5361 last B;
5362 } else {
5363 die "$0: $token->{type}: Unknown token type";
5364 }
5365 } elsif ($self->{insertion_mode} & FRAME_IMS) {
5366 if ($token->{type} == CHARACTER_TOKEN) {
5367 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5368 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5369
5370 unless (length $token->{data}) {
5371 !!!cp ('t310');
5372 !!!next-token;
5373 redo B;
5374 }
5375 }
5376
5377 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5378 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5379 !!!cp ('t311');
5380 !!!parse-error (type => 'in frameset:#character', token => $token);
5381 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
5382 !!!cp ('t312');
5383 !!!parse-error (type => 'after frameset:#character', token => $token);
5384 } else { # "after html frameset"
5385 !!!cp ('t313');
5386 !!!parse-error (type => 'after html:#character', token => $token);
5387
5388 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5389 ## Reprocess in the "after frameset" insertion mode.
5390 !!!parse-error (type => 'after frameset:#character', token => $token);
5391 }
5392
5393 ## Ignore the token.
5394 if (length $token->{data}) {
5395 !!!cp ('t314');
5396 ## reprocess the rest of characters
5397 } else {
5398 !!!cp ('t315');
5399 !!!next-token;
5400 }
5401 redo B;
5402 }
5403
5404 die qq[$0: Character "$token->{data}"];
5405 } elsif ($token->{type} == START_TAG_TOKEN) {
5406 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5407 !!!cp ('t316');
5408 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5409
5410 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5411 ## Process in the "after frameset" insertion mode.
5412 } else {
5413 !!!cp ('t317');
5414 }
5415
5416 if ($token->{tag_name} eq 'frameset' and
5417 $self->{insertion_mode} == IN_FRAMESET_IM) {
5418 !!!cp ('t318');
5419 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5420 !!!next-token;
5421 redo B;
5422 } elsif ($token->{tag_name} eq 'frame' and
5423 $self->{insertion_mode} == IN_FRAMESET_IM) {
5424 !!!cp ('t319');
5425 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5426 pop @{$self->{open_elements}};
5427 !!!next-token;
5428 redo B;
5429 } elsif ($token->{tag_name} eq 'noframes') {
5430 !!!cp ('t320');
5431 ## NOTE: As if in body.
5432 $parse_rcdata->(CDATA_CONTENT_MODEL);
5433 redo B;
5434 } else {
5435 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5436 !!!cp ('t321');
5437 !!!parse-error (type => 'in frameset:'.$token->{tag_name}, token => $token);
5438 } else {
5439 !!!cp ('t322');
5440 !!!parse-error (type => 'after frameset:'.$token->{tag_name}, token => $token);
5441 }
5442 ## Ignore the token
5443 !!!next-token;
5444 redo B;
5445 }
5446 } elsif ($token->{type} == END_TAG_TOKEN) {
5447 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5448 !!!cp ('t323');
5449 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5450
5451 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5452 ## Process in the "after frameset" insertion mode.
5453 } else {
5454 !!!cp ('t324');
5455 }
5456
5457 if ($token->{tag_name} eq 'frameset' and
5458 $self->{insertion_mode} == IN_FRAMESET_IM) {
5459 if ($self->{open_elements}->[-1]->[1] eq 'html' and
5460 @{$self->{open_elements}} == 1) {
5461 !!!cp ('t325');
5462 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5463 ## Ignore the token
5464 !!!next-token;
5465 } else {
5466 !!!cp ('t326');
5467 pop @{$self->{open_elements}};
5468 !!!next-token;
5469 }
5470
5471 if (not defined $self->{inner_html_node} and
5472 $self->{open_elements}->[-1]->[1] ne 'frameset') {
5473 !!!cp ('t327');
5474 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5475 } else {
5476 !!!cp ('t328');
5477 }
5478 redo B;
5479 } elsif ($token->{tag_name} eq 'html' and
5480 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
5481 !!!cp ('t329');
5482 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
5483 !!!next-token;
5484 redo B;
5485 } else {
5486 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5487 !!!cp ('t330');
5488 !!!parse-error (type => 'in frameset:/'.$token->{tag_name}, token => $token);
5489 } else {
5490 !!!cp ('t331');
5491 !!!parse-error (type => 'after frameset:/'.$token->{tag_name}, token => $token);
5492 }
5493 ## Ignore the token
5494 !!!next-token;
5495 redo B;
5496 }
5497 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5498 unless ($self->{open_elements}->[-1]->[1] eq 'html' and
5499 @{$self->{open_elements}} == 1) { # redundant, maybe
5500 !!!cp ('t331.1');
5501 !!!parse-error (type => 'in body:#eof', token => $token);
5502 } else {
5503 !!!cp ('t331.2');
5504 }
5505
5506 ## Stop parsing
5507 last B;
5508 } else {
5509 die "$0: $token->{type}: Unknown token type";
5510 }
5511
5512 ## ISSUE: An issue in spec here
5513 } else {
5514 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5515 }
5516
5517 ## "in body" insertion mode
5518 if ($token->{type} == START_TAG_TOKEN) {
5519 if ($token->{tag_name} eq 'script') {
5520 !!!cp ('t332');
5521 ## NOTE: This is an "as if in head" code clone
5522 $script_start_tag->();
5523 redo B;
5524 } elsif ($token->{tag_name} eq 'style') {
5525 !!!cp ('t333');
5526 ## NOTE: This is an "as if in head" code clone
5527 $parse_rcdata->(CDATA_CONTENT_MODEL);
5528 redo B;
5529 } elsif ({
5530 base => 1, link => 1,
5531 }->{$token->{tag_name}}) {
5532 !!!cp ('t334');
5533 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5534 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5535 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5536 !!!next-token;
5537 redo B;
5538 } elsif ($token->{tag_name} eq 'meta') {
5539 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5540 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5541 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5542
5543 unless ($self->{confident}) {
5544 if ($token->{attributes}->{charset}) { ## TODO: And if supported
5545 !!!cp ('t335');
5546 $self->{change_encoding}
5547 ->($self, $token->{attributes}->{charset}->{value}, $token);
5548
5549 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5550 ->set_user_data (manakai_has_reference =>
5551 $token->{attributes}->{charset}
5552 ->{has_reference});
5553 } elsif ($token->{attributes}->{content}) {
5554 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
5555 if ($token->{attributes}->{content}->{value}
5556 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
5557 [\x09-\x0D\x20]*=
5558 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
5559 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
5560 !!!cp ('t336');
5561 $self->{change_encoding}
5562 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
5563 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5564 ->set_user_data (manakai_has_reference =>
5565 $token->{attributes}->{content}
5566 ->{has_reference});
5567 }
5568 }
5569 } else {
5570 if ($token->{attributes}->{charset}) {
5571 !!!cp ('t337');
5572 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5573 ->set_user_data (manakai_has_reference =>
5574 $token->{attributes}->{charset}
5575 ->{has_reference});
5576 }
5577 if ($token->{attributes}->{content}) {
5578 !!!cp ('t338');
5579 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5580 ->set_user_data (manakai_has_reference =>
5581 $token->{attributes}->{content}
5582 ->{has_reference});
5583 }
5584 }
5585
5586 !!!next-token;
5587 redo B;
5588 } elsif ($token->{tag_name} eq 'title') {
5589 !!!cp ('t341');
5590 ## NOTE: This is an "as if in head" code clone
5591 $parse_rcdata->(RCDATA_CONTENT_MODEL);
5592 redo B;
5593 } elsif ($token->{tag_name} eq 'body') {
5594 !!!parse-error (type => 'in body:body', token => $token);
5595
5596 if (@{$self->{open_elements}} == 1 or
5597 $self->{open_elements}->[1]->[1] ne 'body') {
5598 !!!cp ('t342');
5599 ## Ignore the token
5600 } else {
5601 my $body_el = $self->{open_elements}->[1]->[0];
5602 for my $attr_name (keys %{$token->{attributes}}) {
5603 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
5604 !!!cp ('t343');
5605 $body_el->set_attribute_ns
5606 (undef, [undef, $attr_name],
5607 $token->{attributes}->{$attr_name}->{value});
5608 }
5609 }
5610 }
5611 !!!next-token;
5612 redo B;
5613 } elsif ({
5614 address => 1, blockquote => 1, center => 1, dir => 1,
5615 div => 1, dl => 1, fieldset => 1,
5616 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5617 menu => 1, ol => 1, p => 1, ul => 1,
5618 pre => 1, listing => 1,
5619 form => 1,
5620 table => 1,
5621 hr => 1,
5622 }->{$token->{tag_name}}) {
5623 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
5624 !!!cp ('t350');
5625 !!!parse-error (type => 'in form:form', token => $token);
5626 ## Ignore the token
5627 !!!next-token;
5628 redo B;
5629 }
5630
5631 ## has a p element in scope
5632 INSCOPE: for (reverse @{$self->{open_elements}}) {
5633 if ($_->[1] eq 'p') {
5634 !!!cp ('t344');
5635 !!!back-token;
5636 $token = {type => END_TAG_TOKEN, tag_name => 'p',
5637 line => $token->{line}, column => $token->{column}};
5638 redo B;
5639 } elsif ({
5640 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5641 button => 1, marquee => 1, object => 1, html => 1,
5642 }->{$_->[1]}) {
5643 !!!cp ('t345');
5644 last INSCOPE;
5645 }
5646 } # INSCOPE
5647
5648 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5649 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
5650 !!!next-token;
5651 if ($token->{type} == CHARACTER_TOKEN) {
5652 $token->{data} =~ s/^\x0A//;
5653 unless (length $token->{data}) {
5654 !!!cp ('t346');
5655 !!!next-token;
5656 } else {
5657 !!!cp ('t349');
5658 }
5659 } else {
5660 !!!cp ('t348');
5661 }
5662 } elsif ($token->{tag_name} eq 'form') {
5663 !!!cp ('t347.1');
5664 $self->{form_element} = $self->{open_elements}->[-1]->[0];
5665
5666 !!!next-token;
5667 } elsif ($token->{tag_name} eq 'table') {
5668 !!!cp ('t382');
5669 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
5670
5671 $self->{insertion_mode} = IN_TABLE_IM;
5672
5673 !!!next-token;
5674 } elsif ($token->{tag_name} eq 'hr') {
5675 !!!cp ('t386');
5676 pop @{$self->{open_elements}};
5677
5678 !!!next-token;
5679 } else {
5680 !!!cp ('t347');
5681 !!!next-token;
5682 }
5683 redo B;
5684 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
5685 ## has a p element in scope
5686 INSCOPE: for (reverse @{$self->{open_elements}}) {
5687 if ($_->[1] eq 'p') {
5688 !!!cp ('t353');
5689 !!!back-token;
5690 $token = {type => END_TAG_TOKEN, tag_name => 'p',
5691 line => $token->{line}, column => $token->{column}};
5692 redo B;
5693 } elsif ({
5694 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5695 button => 1, marquee => 1, object => 1, html => 1,
5696 }->{$_->[1]}) {
5697 !!!cp ('t354');
5698 last INSCOPE;
5699 }
5700 } # INSCOPE
5701
5702 ## Step 1
5703 my $i = -1;
5704 my $node = $self->{open_elements}->[$i];
5705 my $li_or_dtdd = {li => {li => 1},
5706 dt => {dt => 1, dd => 1},
5707 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
5708 LI: {
5709 ## Step 2
5710 if ($li_or_dtdd->{$node->[1]}) {
5711 if ($i != -1) {
5712 !!!cp ('t355');
5713 !!!parse-error (type => 'end tag missing:'.
5714 $self->{open_elements}->[-1]->[1], token => $token);
5715 } else {
5716 !!!cp ('t356');
5717 }
5718 splice @{$self->{open_elements}}, $i;
5719 last LI;
5720 } else {
5721 !!!cp ('t357');
5722 }
5723
5724 ## Step 3
5725 if (not $formatting_category->{$node->[1]} and
5726 #not $phrasing_category->{$node->[1]} and
5727 ($special_category->{$node->[1]} or
5728 $scoping_category->{$node->[1]}) and
5729 $node->[1] ne 'address' and $node->[1] ne 'div') {
5730 !!!cp ('t358');
5731 last LI;
5732 }
5733
5734 !!!cp ('t359');
5735 ## Step 4
5736 $i--;
5737 $node = $self->{open_elements}->[$i];
5738 redo LI;
5739 } # LI
5740
5741 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5742 !!!next-token;
5743 redo B;
5744 } elsif ($token->{tag_name} eq 'plaintext') {
5745 ## has a p element in scope
5746 INSCOPE: for (reverse @{$self->{open_elements}}) {
5747 if ($_->[1] eq 'p') {
5748 !!!cp ('t367');
5749 !!!back-token;
5750 $token = {type => END_TAG_TOKEN, tag_name => 'p',
5751 line => $token->{line}, column => $token->{column}};
5752 redo B;
5753 } elsif ({
5754 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5755 button => 1, marquee => 1, object => 1, html => 1,
5756 }->{$_->[1]}) {
5757 !!!cp ('t368');
5758 last INSCOPE;
5759 }
5760 } # INSCOPE
5761
5762 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5763
5764 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
5765
5766 !!!next-token;
5767 redo B;
5768 } elsif ($token->{tag_name} eq 'a') {
5769 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
5770 my $node = $active_formatting_elements->[$i];
5771 if ($node->[1] eq 'a') {
5772 !!!cp ('t371');
5773 !!!parse-error (type => 'in a:a', token => $token);
5774
5775 !!!back-token;
5776 $token = {type => END_TAG_TOKEN, tag_name => 'a',
5777 line => $token->{line}, column => $token->{column}};
5778 $formatting_end_tag->($token);
5779
5780 AFE2: for (reverse 0..$#$active_formatting_elements) {
5781 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
5782 !!!cp ('t372');
5783 splice @$active_formatting_elements, $_, 1;
5784 last AFE2;
5785 }
5786 } # AFE2
5787 OE: for (reverse 0..$#{$self->{open_elements}}) {
5788 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
5789 !!!cp ('t373');
5790 splice @{$self->{open_elements}}, $_, 1;
5791 last OE;
5792 }
5793 } # OE
5794 last AFE;
5795 } elsif ($node->[0] eq '#marker') {
5796 !!!cp ('t374');
5797 last AFE;
5798 }
5799 } # AFE
5800
5801 $reconstruct_active_formatting_elements->($insert_to_current);
5802
5803 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5804 push @$active_formatting_elements, $self->{open_elements}->[-1];
5805
5806 !!!next-token;
5807 redo B;
5808 } elsif ($token->{tag_name} eq 'nobr') {
5809 $reconstruct_active_formatting_elements->($insert_to_current);
5810
5811 ## has a |nobr| element in scope
5812 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5813 my $node = $self->{open_elements}->[$_];
5814 if ($node->[1] eq 'nobr') {
5815 !!!cp ('t376');
5816 !!!parse-error (type => 'in nobr:nobr', token => $token);
5817 !!!back-token;
5818 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
5819 line => $token->{line}, column => $token->{column}};
5820 redo B;
5821 } elsif ({
5822 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5823 button => 1, marquee => 1, object => 1, html => 1,
5824 }->{$node->[1]}) {
5825 !!!cp ('t377');
5826 last INSCOPE;
5827 }
5828 } # INSCOPE
5829
5830 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5831 push @$active_formatting_elements, $self->{open_elements}->[-1];
5832
5833 !!!next-token;
5834 redo B;
5835 } elsif ($token->{tag_name} eq 'button') {
5836 ## has a button element in scope
5837 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5838 my $node = $self->{open_elements}->[$_];
5839 if ($node->[1] eq 'button') {
5840 !!!cp ('t378');
5841 !!!parse-error (type => 'in button:button', token => $token);
5842 !!!back-token;
5843 $token = {type => END_TAG_TOKEN, tag_name => 'button',
5844 line => $token->{line}, column => $token->{column}};
5845 redo B;
5846 } elsif ({
5847 applet => 1, table => 1, caption => 1, td => 1, th => 1,
5848 button => 1, marquee => 1, object => 1, html => 1,
5849 }->{$node->[1]}) {
5850 !!!cp ('t379');
5851 last INSCOPE;
5852 }
5853 } # INSCOPE
5854
5855 $reconstruct_active_formatting_elements->($insert_to_current);
5856
5857 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5858
5859 ## TODO: associate with $self->{form_element} if defined
5860
5861 push @$active_formatting_elements, ['#marker', ''];
5862
5863 !!!next-token;
5864 redo B;
5865 } elsif ({
5866 xmp => 1,
5867 iframe => 1,
5868 noembed => 1,
5869 noframes => 1,
5870 noscript => 0, ## TODO: 1 if scripting is enabled
5871 }->{$token->{tag_name}}) {
5872 if ($token->{tag_name} eq 'xmp') {
5873 !!!cp ('t381');
5874 $reconstruct_active_formatting_elements->($insert_to_current);
5875 } else {
5876 !!!cp ('t399');
5877 }
5878 ## NOTE: There is an "as if in body" code clone.
5879 $parse_rcdata->(CDATA_CONTENT_MODEL);
5880 redo B;
5881 } elsif ($token->{tag_name} eq 'isindex') {
5882 !!!parse-error (type => 'isindex', token => $token);
5883
5884 if (defined $self->{form_element}) {
5885 !!!cp ('t389');
5886 ## Ignore the token
5887 !!!next-token;
5888 redo B;
5889 } else {
5890 my $at = $token->{attributes};
5891 my $form_attrs;
5892 $form_attrs->{action} = $at->{action} if $at->{action};
5893 my $prompt_attr = $at->{prompt};
5894 $at->{name} = {name => 'name', value => 'isindex'};
5895 delete $at->{action};
5896 delete $at->{prompt};
5897 my @tokens = (
5898 {type => START_TAG_TOKEN, tag_name => 'form',
5899 attributes => $form_attrs,
5900 line => $token->{line}, column => $token->{column}},
5901 {type => START_TAG_TOKEN, tag_name => 'hr',
5902 line => $token->{line}, column => $token->{column}},
5903 {type => START_TAG_TOKEN, tag_name => 'p',
5904 line => $token->{line}, column => $token->{column}},
5905 {type => START_TAG_TOKEN, tag_name => 'label',
5906 line => $token->{line}, column => $token->{column}},
5907 );
5908 if ($prompt_attr) {
5909 !!!cp ('t390');
5910 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
5911 line => $token->{line}, column => $token->{column}};
5912 } else {
5913 !!!cp ('t391');
5914 push @tokens, {type => CHARACTER_TOKEN,
5915 data => 'This is a searchable index. Insert your search keywords here: ',
5916 line => $token->{line}, column => $token->{column}}; # SHOULD
5917 ## TODO: make this configurable
5918 }
5919 push @tokens,
5920 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
5921 line => $token->{line}, column => $token->{column}},
5922 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
5923 {type => END_TAG_TOKEN, tag_name => 'label',
5924 line => $token->{line}, column => $token->{column}},
5925 {type => END_TAG_TOKEN, tag_name => 'p',
5926 line => $token->{line}, column => $token->{column}},
5927 {type => START_TAG_TOKEN, tag_name => 'hr',
5928 line => $token->{line}, column => $token->{column}},
5929 {type => END_TAG_TOKEN, tag_name => 'form',
5930 line => $token->{line}, column => $token->{column}};
5931 $token = shift @tokens;
5932 !!!back-token (@tokens);
5933 redo B;
5934 }
5935 } elsif ($token->{tag_name} eq 'textarea') {
5936 my $tag_name = $token->{tag_name};
5937 my $el;
5938 !!!create-element ($el, $token->{tag_name}, $token->{attributes}, $token);
5939
5940 ## TODO: $self->{form_element} if defined
5941 $self->{content_model} = RCDATA_CONTENT_MODEL;
5942 delete $self->{escape}; # MUST
5943
5944 $insert->($el);
5945
5946 my $text = '';
5947 !!!next-token;
5948 if ($token->{type} == CHARACTER_TOKEN) {
5949 $token->{data} =~ s/^\x0A//;
5950 unless (length $token->{data}) {
5951 !!!cp ('t392');
5952 !!!next-token;
5953 } else {
5954 !!!cp ('t393');
5955 }
5956 } else {
5957 !!!cp ('t394');
5958 }
5959 while ($token->{type} == CHARACTER_TOKEN) {
5960 !!!cp ('t395');
5961 $text .= $token->{data};
5962 !!!next-token;
5963 }
5964 if (length $text) {
5965 !!!cp ('t396');
5966 $el->manakai_append_text ($text);
5967 }
5968
5969 $self->{content_model} = PCDATA_CONTENT_MODEL;
5970
5971 if ($token->{type} == END_TAG_TOKEN and
5972 $token->{tag_name} eq $tag_name) {
5973 !!!cp ('t397');
5974 ## Ignore the token
5975 } else {
5976 !!!cp ('t398');
5977 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
5978 }
5979 !!!next-token;
5980 redo B;
5981 } elsif ({
5982 caption => 1, col => 1, colgroup => 1, frame => 1,
5983 frameset => 1, head => 1, option => 1, optgroup => 1,
5984 tbody => 1, td => 1, tfoot => 1, th => 1,
5985 thead => 1, tr => 1,
5986 }->{$token->{tag_name}}) {
5987 !!!cp ('t401');
5988 !!!parse-error (type => 'in body:'.$token->{tag_name}, token => $token);
5989 ## Ignore the token
5990 !!!next-token;
5991 redo B;
5992
5993 ## ISSUE: An issue on HTML5 new elements in the spec.
5994 } else {
5995 if ($token->{tag_name} eq 'image') {
5996 !!!cp ('t384');
5997 !!!parse-error (type => 'image', token => $token);
5998 $token->{tag_name} = 'img';
5999 } else {
6000 !!!cp ('t385');
6001 }
6002
6003 ## NOTE: There is an "as if <br>" code clone.
6004 $reconstruct_active_formatting_elements->($insert_to_current);
6005
6006 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6007
6008 if ({
6009 applet => 1, marquee => 1, object => 1,
6010 }->{$token->{tag_name}}) { ## These elements add a |#marker| entry to the active formatting list.
6011 !!!cp ('t380');
6012 push @$active_formatting_elements, ['#marker', ''];
6013 } elsif ({
6014 b => 1, big => 1, em => 1, font => 1, i => 1,
6015 s => 1, small => 1, strike => 1,
6016 strong => 1, tt => 1, u => 1,
6017 }->{$token->{tag_name}}) { ## Formatting elements: the just-inserted element is also recorded as active.
6018 !!!cp ('t375');
6019 push @$active_formatting_elements, $self->{open_elements}->[-1];
6020 } elsif ($token->{tag_name} eq 'input') {
6021 !!!cp ('t388');
6022 ## TODO: associate with $self->{form_element} if defined
6023 pop @{$self->{open_elements}};
6024 } elsif ({
6025 area => 1, basefont => 1, bgsound => 1, br => 1,
6026 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
6027 #image => 1,
6028 }->{$token->{tag_name}}) { ## These elements are popped again right after insertion.
6029 !!!cp ('t388.1');
6030 pop @{$self->{open_elements}};
6031 } elsif ($token->{tag_name} eq 'select') {
6032 ## TODO: associate with $self->{form_element} if defined
6033
6034 if ($self->{insertion_mode} & TABLE_IMS or
6035 $self->{insertion_mode} & BODY_TABLE_IMS or
6036 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) { ## Table-related modes select the "in select in table" mode.
6037 !!!cp ('t400.1');
6038 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
6039 } else {
6040 !!!cp ('t400.2');
6041 $self->{insertion_mode} = IN_SELECT_IM;
6042 }
6043 } else {
6044 !!!cp ('t402');
6045 }
6046
6047 !!!next-token;
6048 redo B;
6049 }
6050 } elsif ($token->{type} == END_TAG_TOKEN) {
6051 if ($token->{tag_name} eq 'body') {
6052 ## has a |body| element in scope
6053 my $i;
6054 INSCOPE: {
6055 for (reverse @{$self->{open_elements}}) {
6056 if ($_->[1] eq 'body') {
6057 !!!cp ('t405');
6058 $i = $_;
6059 last INSCOPE;
6060 } elsif ({
6061 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6062 button => 1, marquee => 1, object => 1, html => 1,
6063 }->{$_->[1]}) {
6064 !!!cp ('t405.1');
6065 last;
6066 }
6067 }
6068
6069 !!!parse-error (type => 'start tag not allowed',
6070 value => $token->{tag_name}, token => $token);
6071 ## NOTE: Ignore the token.
6072 !!!next-token;
6073 redo B;
6074 } # INSCOPE
6075
6076 for (@{$self->{open_elements}}) {
6077 unless ({
6078 dd => 1, dt => 1, li => 1, p => 1, td => 1,
6079 th => 1, tr => 1, body => 1, html => 1,
6080 tbody => 1, tfoot => 1, thead => 1,
6081 }->{$_->[1]}) {
6082 !!!cp ('t403');
6083 !!!parse-error (type => 'not closed:'.$_->[1], token => $token);
6084 last;
6085 } else {
6086 !!!cp ('t404');
6087 }
6088 }
6089
6090 $self->{insertion_mode} = AFTER_BODY_IM;
6091 !!!next-token;
6092 redo B;
6093 } elsif ($token->{tag_name} eq 'html') {
6094 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
6095 ## ISSUE: There is an issue in the spec.
6096 if ($self->{open_elements}->[-1]->[1] ne 'body') {
6097 !!!cp ('t406');
6098 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1], token => $token);
6099 } else {
6100 !!!cp ('t407');
6101 }
6102 $self->{insertion_mode} = AFTER_BODY_IM;
6103 ## reprocess
6104 redo B;
6105 } else {
6106 !!!cp ('t408');
6107 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6108 ## Ignore the token
6109 !!!next-token;
6110 redo B;
6111 }
6112 } elsif ({
6113 address => 1, blockquote => 1, center => 1, dir => 1,
6114 div => 1, dl => 1, fieldset => 1, listing => 1,
6115 menu => 1, ol => 1, pre => 1, ul => 1,
6116 dd => 1, dt => 1, li => 1,
6117 applet => 1, button => 1, marquee => 1, object => 1,
6118 }->{$token->{tag_name}}) {
6119 ## has an element in scope
6120 my $i;
6121 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6122 my $node = $self->{open_elements}->[$_];
6123 if ($node->[1] eq $token->{tag_name}) {
6124 !!!cp ('t410');
6125 $i = $_;
6126 last INSCOPE;
6127 } elsif ({
6128 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6129 button => 1, marquee => 1, object => 1, html => 1,
6130 }->{$node->[1]}) {
6131 !!!cp ('t411');
6132 last INSCOPE;
6133 }
6134 } # INSCOPE
6135
6136 unless (defined $i) { # has an element in scope
6137 !!!cp ('t413');
6138 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6139 } else {
6140 ## Step 1. generate implied end tags
6141 while ({
6142 dd => ($token->{tag_name} ne 'dd'),
6143 dt => ($token->{tag_name} ne 'dt'),
6144 li => ($token->{tag_name} ne 'li'),
6145 p => 1,
6146 }->{$self->{open_elements}->[-1]->[1]}) {
6147 !!!cp ('t409');
6148 pop @{$self->{open_elements}};
6149 }
6150
6151 ## Step 2.
6152 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6153 !!!cp ('t412');
6154 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
6155 } else {
6156 !!!cp ('t414');
6157 }
6158
6159 ## Step 3.
6160 splice @{$self->{open_elements}}, $i;
6161
6162 ## Step 4.
6163 $clear_up_to_marker->()
6164 if {
6165 applet => 1, button => 1, marquee => 1, object => 1,
6166 }->{$token->{tag_name}};
6167 }
6168 !!!next-token;
6169 redo B;
6170 } elsif ($token->{tag_name} eq 'form') {
6171 undef $self->{form_element};
6172
6173 ## has an element in scope
6174 my $i;
6175 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6176 my $node = $self->{open_elements}->[$_];
6177 if ($node->[1] eq $token->{tag_name}) {
6178 !!!cp ('t418');
6179 $i = $_;
6180 last INSCOPE;
6181 } elsif ({
6182 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6183 button => 1, marquee => 1, object => 1, html => 1,
6184 }->{$node->[1]}) {
6185 !!!cp ('t419');
6186 last INSCOPE;
6187 }
6188 } # INSCOPE
6189
6190 unless (defined $i) { # has an element in scope
6191 !!!cp ('t421');
6192 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6193 } else {
6194 ## Step 1. generate implied end tags
6195 while ({
6196 dd => 1, dt => 1, li => 1, p => 1,
6197 }->{$self->{open_elements}->[-1]->[1]}) {
6198 !!!cp ('t417');
6199 pop @{$self->{open_elements}};
6200 }
6201
6202 ## Step 2.
6203 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6204 !!!cp ('t417.1');
6205 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
6206 } else {
6207 !!!cp ('t420');
6208 }
6209
6210 ## Step 3.
6211 splice @{$self->{open_elements}}, $i;
6212 }
6213
6214 !!!next-token;
6215 redo B;
6216 } elsif ({
6217 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6218 }->{$token->{tag_name}}) {
6219 ## has an element in scope
6220 my $i;
6221 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6222 my $node = $self->{open_elements}->[$_];
6223 if ({
6224 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6225 }->{$node->[1]}) {
6226 !!!cp ('t423');
6227 $i = $_;
6228 last INSCOPE;
6229 } elsif ({
6230 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6231 button => 1, marquee => 1, object => 1, html => 1,
6232 }->{$node->[1]}) {
6233 !!!cp ('t424');
6234 last INSCOPE;
6235 }
6236 } # INSCOPE
6237
6238 unless (defined $i) { # has an element in scope
6239 !!!cp ('t425.1');
6240 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6241 } else {
6242 ## Step 1. generate implied end tags
6243 while ({
6244 dd => 1, dt => 1, li => 1, p => 1,
6245 }->{$self->{open_elements}->[-1]->[1]}) {
6246 !!!cp ('t422');
6247 pop @{$self->{open_elements}};
6248 }
6249
6250 ## Step 2.
6251 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6252 !!!cp ('t425');
6253 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6254 } else {
6255 !!!cp ('t426');
6256 }
6257
6258 ## Step 3.
6259 splice @{$self->{open_elements}}, $i;
6260 }
6261
6262 !!!next-token;
6263 redo B;
6264 } elsif ($token->{tag_name} eq 'p') {
6265 ## has an element in scope
6266 my $i;
6267 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6268 my $node = $self->{open_elements}->[$_];
6269 if ($node->[1] eq $token->{tag_name}) {
6270 !!!cp ('t410.1');
6271 $i = $_;
6272 last INSCOPE;
6273 } elsif ({
6274 applet => 1, table => 1, caption => 1, td => 1, th => 1,
6275 button => 1, marquee => 1, object => 1, html => 1,
6276 }->{$node->[1]}) {
6277 !!!cp ('t411.1');
6278 last INSCOPE;
6279 }
6280 } # INSCOPE
6281
6282 if (defined $i) {
6283 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
6284 !!!cp ('t412.1');
6285 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
6286 } else {
6287 !!!cp ('t414.1');
6288 }
6289
6290 splice @{$self->{open_elements}}, $i;
6291 } else {
6292 !!!cp ('t413.1');
6293 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6294
6295 !!!cp ('t415.1');
6296 ## As if <p>, then reprocess the current token
6297 my $el;
6298 !!!create-element ($el, 'p',, $token);
6299 $insert->($el);
6300 ## NOTE: Not inserted into |$self->{open_elements}|.
6301 }
6302
6303 !!!next-token;
6304 redo B;
6305 } elsif ({
6306 a => 1,
6307 b => 1, big => 1, em => 1, font => 1, i => 1,
6308 nobr => 1, s => 1, small => 1, strile => 1,
6309 strong => 1, tt => 1, u => 1,
6310 }->{$token->{tag_name}}) {
6311 !!!cp ('t427');
6312 $formatting_end_tag->($token);
6313 redo B;
6314 } elsif ($token->{tag_name} eq 'br') {
6315 !!!cp ('t428');
6316 !!!parse-error (type => 'unmatched end tag:br', token => $token);
6317
6318 ## As if <br>
6319 $reconstruct_active_formatting_elements->($insert_to_current);
6320
6321 my $el;
6322 !!!create-element ($el, 'br',, $token);
6323 $insert->($el);
6324
6325 ## Ignore the token.
6326 !!!next-token;
6327 redo B;
6328 } elsif ({
6329 caption => 1, col => 1, colgroup => 1, frame => 1,
6330 frameset => 1, head => 1, option => 1, optgroup => 1,
6331 tbody => 1, td => 1, tfoot => 1, th => 1,
6332 thead => 1, tr => 1,
6333 area => 1, basefont => 1, bgsound => 1,
6334 embed => 1, hr => 1, iframe => 1, image => 1,
6335 img => 1, input => 1, isindex => 1, noembed => 1,
6336 noframes => 1, param => 1, select => 1, spacer => 1,
6337 table => 1, textarea => 1, wbr => 1,
6338 noscript => 0, ## TODO: if scripting is enabled
6339 }->{$token->{tag_name}}) {
6340 !!!cp ('t429');
6341 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6342 ## Ignore the token
6343 !!!next-token;
6344 redo B;
6345
6346 ## ISSUE: Issue on HTML5 new elements in spec
6347
6348 } else {
6349 ## Step 1
6350 my $node_i = -1;
6351 my $node = $self->{open_elements}->[$node_i];
6352
6353 ## Step 2
6354 S2: {
6355 if ($node->[1] eq $token->{tag_name}) {
6356 ## Step 1
6357 ## generate implied end tags
6358 while ({
6359 dd => 1, dt => 1, li => 1, p => 1,
6360 }->{$self->{open_elements}->[-1]->[1]}) {
6361 !!!cp ('t430');
6362 ## ISSUE: Can this case be reached?
6363 pop @{$self->{open_elements}};
6364 }
6365
6366 ## Step 2
6367 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
6368 !!!cp ('t431');
6369 ## NOTE: <x><y></x>
6370 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1], token => $token);
6371 } else {
6372 !!!cp ('t432');
6373 }
6374
6375 ## Step 3
6376 splice @{$self->{open_elements}}, $node_i;
6377
6378 !!!next-token;
6379 last S2;
6380 } else {
6381 ## Step 3
6382 if (not $formatting_category->{$node->[1]} and
6383 #not $phrasing_category->{$node->[1]} and
6384 ($special_category->{$node->[1]} or
6385 $scoping_category->{$node->[1]})) {
6386 !!!cp ('t433');
6387 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6388 ## Ignore the token
6389 !!!next-token;
6390 last S2;
6391 }
6392
6393 !!!cp ('t434');
6394 }
6395
6396 ## Step 4
6397 $node_i--;
6398 $node = $self->{open_elements}->[$node_i];
6399
6400 ## Step 5;
6401 redo S2;
6402 } # S2
6403 redo B;
6404 }
6405 }
6406 redo B;
6407 } # B
6408
6409 ## Stop parsing # MUST
6410
6411 ## TODO: script stuffs
6412 } # _tree_construct_main
6413
## |set_inner_html| -- Replaces the children of |$node| with the result
## of parsing an HTML string (the setter half of |innerHTML|).
##
## Called as a class method.  Arguments after the invocant:
##   $node    - The node whose content is replaced.  If its |node_type|
##              is 9 (DOM document), its children are removed and the
##              string is parsed into it by |parse_string|.  If its
##              |node_type| is 1 (DOM element), the string is parsed as
##              a fragment with |$node| as the context element.
##   string   - The HTML source.  It is accessed through a reference
##              to the caller's argument (|$s|), so it is not copied.
##   $onerror - Optional error handler.  In the document case it is
##              passed to |parse_string| unchanged.  In the element
##              case, a false value selects a default handler that
##              reports each parse error with |warn|.
##
## Dies if |$node| has any other node type.
##
## NOTE: The "Step N" comments follow the numbering of the specification
## algorithm, so in the element case they do not appear in numerical
## order (Step 8 is set up before Step 2).
6414 sub set_inner_html ($$$) {
6415 my $class = shift;
6416 my $node = shift;
6417 my $s = \$_[0];
6418 my $onerror = $_[1];
6419
6420 ## ISSUE: Should {confident} be true?
6421
6422 my $nt = $node->node_type;
6423 if ($nt == 9) {
6424 # MUST
6425
6426 ## Step 1 # MUST
6427 ## TODO: If the document has an active parser, ...
6428 ## ISSUE: There is an issue in the spec.
6429
6430 ## Step 2 # MUST
## The child list is copied into |@cn| before removal.
## NOTE(review): presumably because |child_nodes| is a live list that
## would change during iteration -- confirm against the DOM binding.
6431 my @cn = @{$node->child_nodes};
6432 for (@cn) {
6433 $node->remove_child ($_);
6434 }
6435
6436 ## Step 3, 4, 5 # MUST
6437 $class->parse_string ($$s => $node, $onerror);
6438 } elsif ($nt == 1) {
6439 ## TODO: If non-html element
6440
6441 ## NOTE: Most of this code is copied from |parse_string|
6442
6443 ## Step 1 # MUST
## A fresh document and a fresh parser object are created; the parse
## result is built in |$doc| and moved into |$node| at the end.
6444 my $this_doc = $node->owner_document;
6445 my $doc = $this_doc->implementation->create_document;
6446 $doc->manakai_is_html (1);
6447 my $p = $class->new;
6448 $p->{document} = $doc;
6449
6450 ## Step 8 # MUST
## |$i| is the read offset into |$$s|; |$line| and |$column| track the
## position that is attached to parse error reports below.
6451 my $i = 0;
6452 my $line = 1;
6453 my $column = 0;
## Input callback.  Each call moves the previous |next_char| to the
## front of the |prev_char| history (dropping the oldest entry), then
## stores the code point of the next character of |$$s| in |next_char|.
6454 $p->{set_next_char} = sub {
6455 my $self = shift;
6456
6457 pop @{$self->{prev_char}};
6458 unshift @{$self->{prev_char}}, $self->{next_char};
6459
## End of input: |next_char| becomes -1.  Since -1 is a true value, the
## |and return| part always runs once the assignment has been made.
6460 $self->{next_char} = -1 and return if $i >= length $$s;
6461 $self->{next_char} = ord substr $$s, $i++, 1;
6462 $column++;
6463
6464 if ($self->{next_char} == 0x000A) { # LF
6465 $line++;
6466 $column = 0;
6467 !!!cp ('i1');
6468 } elsif ($self->{next_char} == 0x000D) { # CR
## CR and CR LF are both delivered as a single LF: a LF that directly
## follows the CR is skipped.
6469 $i++ if substr ($$s, $i, 1) eq "\x0A";
6470 $self->{next_char} = 0x000A; # LF # MUST
6471 $line++;
6472 $column = 0;
6473 !!!cp ('i2');
6474 } elsif ($self->{next_char} > 0x10FFFF) {
6475 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
6476 !!!cp ('i3');
6477 } elsif ($self->{next_char} == 0x0000) { # NULL
6478 !!!cp ('i4');
6479 !!!parse-error (type => 'NULL');
6480 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
6481 }
6482 };
## Initial state: a three-entry history and no current character.
6483 $p->{prev_char} = [-1, -1, -1];
6484 $p->{next_char} = -1;
6485
## Use the caller's handler if one was given, otherwise |warn|.  The
## parser's |parse_error| hook appends the current |line| and |column|
## to whatever arguments the error site supplies.
6486 my $ponerror = $onerror || sub {
6487 my (%opt) = @_;
6488 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
6489 };
6490 $p->{parse_error} = sub {
6491 $ponerror->(@_, line => $line, column => $column);
6492 };
6493
6494 $p->_initialize_tokenizer;
6495 $p->_initialize_tree_constructor;
6496
6497 ## Step 2
## The tokenizer's initial content model is selected by the local name
## of the context element; names not listed below fall back to PCDATA.
6498 my $node_ln = $node->manakai_local_name;
6499 $p->{content_model} = {
6500 title => RCDATA_CONTENT_MODEL,
6501 textarea => RCDATA_CONTENT_MODEL,
6502 style => CDATA_CONTENT_MODEL,
6503 script => CDATA_CONTENT_MODEL,
6504 xmp => CDATA_CONTENT_MODEL,
6505 iframe => CDATA_CONTENT_MODEL,
6506 noembed => CDATA_CONTENT_MODEL,
6507 noframes => CDATA_CONTENT_MODEL,
6508 noscript => CDATA_CONTENT_MODEL,
6509 plaintext => PLAINTEXT_CONTENT_MODEL,
6510 }->{$node_ln};
6511 $p->{content_model} = PCDATA_CONTENT_MODEL
6512 unless defined $p->{content_model};
6513 ## ISSUE: What is "the name of the element"? local name?
6514
## The context element and its local name are recorded on the parser.
6515 $p->{inner_html_node} = [$node, $node_ln];
6516
6517 ## Step 3
6518 my $root = $doc->create_element_ns
6519 ('http://www.w3.org/1999/xhtml', [undef, 'html']);
6520
6521 ## Step 4 # MUST
6522 $doc->append_child ($root);
6523
6524 ## Step 5 # MUST
6525 push @{$p->{open_elements}}, [$root, 'html'];
6526
6527 undef $p->{head_element};
6528
6529 ## Step 6 # MUST
6530 $p->_reset_insertion_mode;
6531
6532 ## Step 7 # MUST
## Starting at the context node itself and walking up through
## |parent_node|, the nearest element in the XHTML namespace whose local
## name is |form| becomes the parser's |form_element|.
6533 my $anode = $node;
6534 AN: while (defined $anode) {
6535 if ($anode->node_type == 1) {
6536 my $nsuri = $anode->namespace_uri;
6537 if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
6538 if ($anode->manakai_local_name eq 'form') {
6539 !!!cp ('i5');
6540 $p->{form_element} = $anode;
6541 last AN;
6542 }
6543 }
6544 }
6545 $anode = $anode->parent_node;
6546 } # AN
6547
6548 ## Step 9 # MUST
## NOTE(review): the |!!!next-token| macro presumably expands to code
## that refers to a lexical |$self|, which is why |$p| is aliased as
## |$self| in this small scope -- confirm against the macro definition.
6549 {
6550 my $self = $p;
6551 !!!next-token;
6552 }
6553 $p->_tree_construction_main;
6554
6555 ## Step 10 # MUST
## Remove the existing children of the context node.
6556 my @cn = @{$node->child_nodes};
6557 for (@cn) {
6558 $node->remove_child ($_);
6559 }
6560 ## ISSUE: mutation events? read-only?
6561
6562 ## Step 11 # MUST
## Move every child of the temporary root element into the context
## node, adopting each one into the context node's document first.
6563 @cn = @{$root->child_nodes};
6564 for (@cn) {
6565 $this_doc->adopt_node ($_);
6566 $node->append_child ($_);
6567 }
6568 ## ISSUE: mutation events?
6569
6570 $p->_terminate_tree_constructor;
6571 } else {
6572 die "$0: |set_inner_html| is not defined for node of type $nt";
6573 }
6574 } # set_inner_html
6575
6576 } # tree construction stage
6577
## |Whatpm::HTML::RestartParser| is declared as a subclass of |Error|
## by appending 'Error' to its |@ISA|; it defines no methods of its own.
## NOTE(review): presumably thrown to abort the current parse so that it
## can be restarted -- confirm against the throw and catch sites, which
## are outside this section.
6578 package Whatpm::HTML::RestartParser;
6579 push our @ISA, 'Error';
6580
## True value returned to |require| / |use| at the end of the module.
6581 1;
6582 # $Date: 2008/03/17 13:23:39 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24