/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.78 - (show annotations) (download) (as text)
Mon Mar 3 11:56:18 2008 UTC (17 years, 11 months ago) by wakaba
Branch: MAIN
Changes since 1.77: +51 -12 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	3 Mar 2008 11:49:36 -0000
	* tokenizer-test-1.test: New test data are added to cover
	all possible cases.

	* HTML-tree.t: Support for test coverage.

2008-03-03  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	3 Mar 2008 11:56:09 -0000
	* HTML.pm.src (_tokenize_attempt_to_consume_an_entity): Checkpoints
	are set.  Cases that are unlikely reached are noted as so.

2008-03-03  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.77 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12 ## TODO: 1252 parse error (revision 1264)
13 ## TODO: 8859-11 = 874 (revision 1271)
14
15 my $permitted_slash_tag_name = { # start tag names that may end in "/>" without a 'nestc' parse error
16 base => 1,
17 link => 1,
18 meta => 1,
19 hr => 1,
20 br => 1,
21 img => 1,
22 embed => 1,
23 param => 1,
24 area => 1,
25 col => 1,
26 input => 1,
27 }; # $permitted_slash_tag_name (looked up by tag name in the tokenizer's "/" branches)
28
29 my $c1_entity_char = { # code point 0x80..0x9F => substitute code point; presumably applied to numeric character references (use is outside this chunk - confirm)
30 0x80 => 0x20AC, # EURO SIGN
31 0x81 => 0xFFFD, # REPLACEMENT CHARACTER (no substitute defined)
32 0x82 => 0x201A, # SINGLE LOW-9 QUOTATION MARK
33 0x83 => 0x0192, # LATIN SMALL LETTER F WITH HOOK
34 0x84 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK
35 0x85 => 0x2026, # HORIZONTAL ELLIPSIS
36 0x86 => 0x2020, # DAGGER
37 0x87 => 0x2021, # DOUBLE DAGGER
38 0x88 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
39 0x89 => 0x2030, # PER MILLE SIGN
40 0x8A => 0x0160, # LATIN CAPITAL LETTER S WITH CARON
41 0x8B => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
42 0x8C => 0x0152, # LATIN CAPITAL LIGATURE OE
43 0x8D => 0xFFFD, # REPLACEMENT CHARACTER (no substitute defined)
44 0x8E => 0x017D, # LATIN CAPITAL LETTER Z WITH CARON
45 0x8F => 0xFFFD, # REPLACEMENT CHARACTER (no substitute defined)
46 0x90 => 0xFFFD, # REPLACEMENT CHARACTER (no substitute defined)
47 0x91 => 0x2018, # LEFT SINGLE QUOTATION MARK
48 0x92 => 0x2019, # RIGHT SINGLE QUOTATION MARK
49 0x93 => 0x201C, # LEFT DOUBLE QUOTATION MARK
50 0x94 => 0x201D, # RIGHT DOUBLE QUOTATION MARK
51 0x95 => 0x2022, # BULLET
52 0x96 => 0x2013, # EN DASH
53 0x97 => 0x2014, # EM DASH
54 0x98 => 0x02DC, # SMALL TILDE
55 0x99 => 0x2122, # TRADE MARK SIGN
56 0x9A => 0x0161, # LATIN SMALL LETTER S WITH CARON
57 0x9B => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
58 0x9C => 0x0153, # LATIN SMALL LIGATURE OE
59 0x9D => 0xFFFD, # REPLACEMENT CHARACTER (no substitute defined)
60 0x9E => 0x017E, # LATIN SMALL LETTER Z WITH CARON
61 0x9F => 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
62 }; # $c1_entity_char
63
64 my $special_category = { # element names in the "special" category; set membership only, the value is always 1
65 address => 1, area => 1, base => 1, basefont => 1, bgsound => 1,
66 blockquote => 1, body => 1, br => 1, center => 1, col => 1, colgroup => 1,
67 dd => 1, dir => 1, div => 1, dl => 1, dt => 1, embed => 1, fieldset => 1,
68 form => 1, frame => 1, frameset => 1, h1 => 1, h2 => 1, h3 => 1,
69 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, iframe => 1, image => 1,
70 img => 1, input => 1, isindex => 1, li => 1, link => 1, listing => 1,
71 menu => 1, meta => 1, noembed => 1, noframes => 1, noscript => 1,
72 ol => 1, optgroup => 1, option => 1, p => 1, param => 1, plaintext => 1,
73 pre => 1, script => 1, select => 1, spacer => 1, style => 1, tbody => 1,
74 textarea => 1, tfoot => 1, thead => 1, title => 1, tr => 1, ul => 1, wbr => 1,
75 }; # $special_category (consulted outside this chunk, presumably by the tree constructor - confirm)
76 my $scoping_category = { # element names in the "scoping" category; set membership only, the value is always 1
77 button => 1, caption => 1, html => 1, marquee => 1, object => 1,
78 table => 1, td => 1, th => 1,
79 }; # $scoping_category (consulted outside this chunk, presumably by the tree constructor - confirm)
80 my $formatting_category = { # element names in the "formatting" category; set membership only, the value is always 1
81 a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
82 s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1, # the key must be spelled "strike" so that <strike> is in the category
83 }; # $formatting_category
84 # $phrasing_category: all other elements
85
86 sub parse_byte_string ($$$$;$) { # args: parser or class, charset label or undef, bytes or ref to bytes, document, optional error handler
87 my $self = ref $_[0] ? shift : shift->new; # callable on an instance or on the class
88 my $charset = shift; # undef requests detection from the bytes
89 my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); # always work through a reference to the bytes
90 my $s;
91 require Encode; # needed by every decode path below, including the restart handler
92 if (defined $charset) {
93 ## TODO: decode(utf8) don't delete BOM
94 $s = \ (Encode::decode ($charset, $$bytes_s));
95 $self->{input_encoding} = lc $charset; ## TODO: normalize name
96 $self->{confident} = 1; # the caller supplied the encoding
97 } else {
98 ## TODO: Implement HTML5 detection algorithm
99 require Whatpm::Charset::UniversalCharDet;
100 $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
101 (substr ($$bytes_s, 0, 1024)); # only the first 1024 bytes are sniffed
102 $charset ||= 'windows-1252'; # fallback when detection yields nothing
103 $s = \ (Encode::decode ($charset, $$bytes_s));
104 $self->{input_encoding} = lc $charset; # lower-cased like the branch above, because change_encoding compares against a lower-cased label ## TODO: normalize name
105 $self->{confident} = 0; # a guess; a later charset declaration may override it
106 }
107
108 $self->{change_encoding} = sub { # called with (parser, charset label) when the document declares an encoding
109 my $self = shift;
110 my $charset = lc shift;
111 ## TODO: if $charset is supported
112 ## TODO: normalize charset name
113
114 ## "Change the encoding" algorithm:
115
116 ## Step 1
117 if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
118 $charset = 'utf-8';
119 }
120
121 ## Step 2
122 if (defined $self->{input_encoding} and
123 $self->{input_encoding} eq $charset) {
124 $self->{confident} = 1; # the declaration confirms the encoding in use
125 return;
126 }
127
128 !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
129 ':'.$charset, level => 'w');
130
131 ## Step 3
132 # if (can) {
133 ## change the encoding on the fly.
134 #$self->{confident} = 1;
135 #return;
136 # }
137
138 ## Step 4
139 throw Whatpm::HTML::RestartParser (charset => $charset); # caught below to re-decode and re-parse
140 }; # $self->{change_encoding}
141
142 my @args = @_; shift @args; # $s
143 my $return;
144 try {
145 $return = $self->parse_char_string ($s, @args);
146 } catch Whatpm::HTML::RestartParser with {
147 my $charset = shift->{charset}; # already lower-cased by change_encoding
148 $s = \ (Encode::decode ($charset, $$bytes_s)); # decode the original bytes again with the declared encoding
149 $self->{input_encoding} = $charset; ## TODO: normalize
150 $self->{confident} = 1;
151 $return = $self->parse_char_string ($s, @args); # second and final attempt
152 };
153 return $return; # whatever parse_char_string returned
154 } # parse_byte_string
155
156 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
157 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
158 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
159 ## because the core part of our HTML parser expects a string of characters,
160 ## not a string of bytes or code units or anything which might contain a BOM.
161 ## Therefore, any parser interface that accepts a string of bytes,
162 ## such as |parse_byte_string| in this module, must ensure that it does
163 ## strip the BOM and never strips any ZWNBSP.
164
165 *parse_char_string = \&parse_string; # alias: parse_byte_string calls the character-string parser under this name
166
167 sub parse_string ($$$;$) { # args: parser or class, string or ref to string, document, optional error handler
168 my $self = ref $_[0] ? shift : shift->new; # callable on an instance or on the class
169 my $s = ref $_[0] ? $_[0] : \($_[0]); # always work through a reference to the string
170 $self->{document} = $_[1];
171 @{$self->{document}->child_nodes} = (); # empty the document's child node list before parsing
172
173 ## NOTE: |set_inner_html| copies most of this method's code
174
175 $self->{confident} = 1 unless exists $self->{confident}; # keep the value parse_byte_string may have set
176 $self->{document}->input_encoding ($self->{input_encoding})
177 if defined $self->{input_encoding};
178
179 my $i = 0; # index of the next unread character in $$s
180 my $line = 1; # position reported with parse errors
181 my $column = 0;
182 $self->{set_next_char} = sub { # advances the input by one character; invoked with the parser as first argument
183 my $self = shift;
184
185 pop @{$self->{prev_char}}; # drop the oldest remembered character
186 unshift @{$self->{prev_char}}, $self->{next_char}; # most recent character goes first
187
188 $self->{next_char} = -1 and return if $i >= length $$s; # -1 marks end of input
189 $self->{next_char} = ord substr $$s, $i++, 1; # characters are handled as code points
190 $column++;
191
192 if ($self->{next_char} == 0x000A) { # LF
193 $line++;
194 $column = 0;
195 } elsif ($self->{next_char} == 0x000D) { # CR
196 $i++ if substr ($$s, $i, 1) eq "\x0A"; # swallow the LF of a CR LF pair
197 $self->{next_char} = 0x000A; # LF # MUST
198 $line++;
199 $column = 0;
200 } elsif ($self->{next_char} > 0x10FFFF) {
201 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
202 } elsif ($self->{next_char} == 0x0000) { # NULL
203 !!!parse-error (type => 'NULL');
204 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
205 }
206 };
207 $self->{prev_char} = [-1, -1, -1]; # the three previously read characters, none yet
208 $self->{next_char} = -1;
209
210 my $onerror = $_[2] || sub { # default handler: report the error with warn
211 my (%opt) = @_;
212 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
213 };
214 $self->{parse_error} = sub { # forwards to $onerror, appending the current input position
215 $onerror->(@_, line => $line, column => $column);
216 };
217
218 $self->_initialize_tokenizer;
219 $self->_initialize_tree_constructor;
220 $self->_construct_tree;
221 $self->_terminate_tree_constructor;
222
223 return $self->{document}; # the document object passed in as the third argument
224 } # parse_string
225
226 sub new ($) { # constructor: returns a parser object whose callbacks are inert defaults
227 my $class = shift;
228 my $self = bless {}, $class;
229 $self->{set_next_char} = sub { # default input source: always reports end of input
230 $_[0]->{next_char} = -1; # act on the parser passed as first argument (the convention parse_string's handler relies on); closing over $self here would form a reference cycle and keep the object alive
231 };
232 $self->{parse_error} = sub { # default: parse errors are ignored
233 #
234 };
235 $self->{change_encoding} = sub { # default: encoding declarations are ignored
236 # if ($_[0] is a supported encoding) {
237 # run "change the encoding" algorithm;
238 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
239 # }
240 };
241 $self->{application_cache_selection} = sub { # default: no-op
242 #
243 };
244 return $self;
245 } # new
246
247 sub CM_ENTITY () { 0b001 } # & markup in data
248 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
249 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
250
251 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no CM_* bit set: neither & nor < is markup
252 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
253 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup plus limited < markup
254 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup plus any < markup
255
256 sub DATA_STATE () { 0 } # tokenizer states: values stored in and compared against $self->{state}
257 sub ENTITY_DATA_STATE () { 1 }
258 sub TAG_OPEN_STATE () { 2 }
259 sub CLOSE_TAG_OPEN_STATE () { 3 }
260 sub TAG_NAME_STATE () { 4 }
261 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
262 sub ATTRIBUTE_NAME_STATE () { 6 }
263 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
264 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
265 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
266 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
267 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
268 sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
269 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
270 sub COMMENT_START_STATE () { 14 }
271 sub COMMENT_START_DASH_STATE () { 15 }
272 sub COMMENT_STATE () { 16 }
273 sub COMMENT_END_STATE () { 17 }
274 sub COMMENT_END_DASH_STATE () { 18 }
275 sub BOGUS_COMMENT_STATE () { 19 }
276 sub DOCTYPE_STATE () { 20 }
277 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
278 sub DOCTYPE_NAME_STATE () { 22 }
279 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
280 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
281 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
282 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
283 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
284 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
285 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
286 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
287 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
288 sub BOGUS_DOCTYPE_STATE () { 32 }
289 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 } # numbered last, out of the sequence of the other attribute states
290
291 sub DOCTYPE_TOKEN () { 1 } # token kinds: values of ->{type} in the token hashes the tokenizer emits
292 sub COMMENT_TOKEN () { 2 }
293 sub START_TAG_TOKEN () { 3 }
294 sub END_TAG_TOKEN () { 4 }
295 sub END_OF_FILE_TOKEN () { 5 }
296 sub CHARACTER_TOKEN () { 6 }
297
298 sub AFTER_HTML_IMS () { 0b100 } # *_IMS: one bit per group of modes (presumably tree-construction insertion modes - confirm), combined into the *_IM values below
299 sub HEAD_IMS () { 0b1000 }
300 sub BODY_IMS () { 0b10000 }
301 sub BODY_TABLE_IMS () { 0b100000 }
302 sub TABLE_IMS () { 0b1000000 }
303 sub ROW_IMS () { 0b10000000 }
304 sub BODY_AFTER_IMS () { 0b100000000 }
305 sub FRAME_IMS () { 0b1000000000 }
306
307 sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS } # *_IM: a single mode = group bits, optionally with the two lowest bits distinguishing modes that share a group
308 sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
309 sub IN_HEAD_IM () { HEAD_IMS | 0b00 }
310 sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 }
311 sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
312 sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
313 sub IN_BODY_IM () { BODY_IMS }
314 sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
315 sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
316 sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 }
317 sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
318 sub IN_TABLE_IM () { TABLE_IMS }
319 sub AFTER_BODY_IM () { BODY_AFTER_IMS }
320 sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
321 sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 }
322 sub IN_SELECT_IM () { 0b01 } # no group bit set
323 sub IN_COLUMN_GROUP_IM () { 0b10 } # no group bit set
324
325 ## Implementations MUST act as if they used the state machine described in the spec
326
327 sub _initialize_tokenizer ($) { # resets the tokenizer fields of the parser and reads the first input character
328 my $self = shift;
329 $self->{state} = DATA_STATE; # MUST start in the data state
330 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model: both & and any < markup are recognised
331 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
332 undef $self->{current_attribute}; # attribute currently being tokenized, none yet
333 undef $self->{last_emitted_start_tag_name}; # stays undef until a start tag has been emitted
334 undef $self->{last_attribute_value_state};
335 $self->{char} = [];
336 # $self->{next_char}
337 !!!next-input-character;
338 $self->{token} = []; # queue of tokens already produced; _get_next_token returns from it first
339 # $self->{escape}
340 } # _initialize_tokenizer
341
342 ## A token has:
343 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
344 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
345 ## ->{name} (DOCTYPE_TOKEN)
346 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
347 ## ->{public_identifier} (DOCTYPE_TOKEN)
348 ## ->{system_identifier} (DOCTYPE_TOKEN)
349 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
350 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
351 ## ->{name}
352 ## ->{value}
353 ## ->{has_reference} == 1 or 0
354 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
355
356 ## Emitted token MUST immediately be handled by the tree construction state.
357
358 ## Before each step, UA MAY check to see if either one of the scripts in
359 ## "list of scripts that will execute as soon as possible" or the first
360 ## script in the "list of scripts that will execute asynchronously",
361 ## has completed loading. If one has, then it MUST be executed
362 ## and removed from the list.
363
364 ## NOTE: HTML5 "Writing HTML documents" section, applied to
365 ## documents and not to user agents and conformance checkers,
366 ## contains some requirements that are not detected by the
367 ## parsing algorithm:
368 ## - Some requirements on character encoding declarations. ## TODO
369 ## - "Elements MUST NOT contain content that their content model disallows."
370 ## ... Some are parse error, some are not (will be reported by c.c.).
371 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
372 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
373 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
374
375 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
376 ## be detected by the HTML5 parsing algorithm:
377 ## - Text,
378
379 sub _get_next_token ($) {
380 my $self = shift;
381 if (@{$self->{token}}) {
382 return shift @{$self->{token}};
383 }
384
385 A: {
386 if ($self->{state} == DATA_STATE) {
387 if ($self->{next_char} == 0x0026) { # &
388 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
389 not $self->{escape}) {
390 !!!cp (1);
391 $self->{state} = ENTITY_DATA_STATE;
392 !!!next-input-character;
393 redo A;
394 } else {
395 !!!cp (2);
396 #
397 }
398 } elsif ($self->{next_char} == 0x002D) { # -
399 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
400 unless ($self->{escape}) {
401 if ($self->{prev_char}->[0] == 0x002D and # -
402 $self->{prev_char}->[1] == 0x0021 and # !
403 $self->{prev_char}->[2] == 0x003C) { # <
404 !!!cp (3);
405 $self->{escape} = 1;
406 } else {
407 !!!cp (4);
408 }
409 } else {
410 !!!cp (5);
411 }
412 }
413
414 #
415 } elsif ($self->{next_char} == 0x003C) { # <
416 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
417 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
418 not $self->{escape})) {
419 !!!cp (6);
420 $self->{state} = TAG_OPEN_STATE;
421 !!!next-input-character;
422 redo A;
423 } else {
424 !!!cp (7);
425 #
426 }
427 } elsif ($self->{next_char} == 0x003E) { # >
428 if ($self->{escape} and
429 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
430 if ($self->{prev_char}->[0] == 0x002D and # -
431 $self->{prev_char}->[1] == 0x002D) { # -
432 !!!cp (8);
433 delete $self->{escape};
434 } else {
435 !!!cp (9);
436 }
437 } else {
438 !!!cp (10);
439 }
440
441 #
442 } elsif ($self->{next_char} == -1) {
443 !!!cp (11);
444 !!!emit ({type => END_OF_FILE_TOKEN});
445 last A; ## TODO: ok?
446 } else {
447 !!!cp (12);
448 }
449 # Anything else
450 my $token = {type => CHARACTER_TOKEN,
451 data => chr $self->{next_char}};
452 ## Stay in the data state
453 !!!next-input-character;
454
455 !!!emit ($token);
456
457 redo A;
458 } elsif ($self->{state} == ENTITY_DATA_STATE) {
459 ## (cannot happen in CDATA state)
460
461 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
462
463 $self->{state} = DATA_STATE;
464 # next-input-character is already done
465
466 unless (defined $token) {
467 !!!cp (13);
468 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
469 } else {
470 !!!cp (14);
471 !!!emit ($token);
472 }
473
474 redo A;
475 } elsif ($self->{state} == TAG_OPEN_STATE) {
476 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
477 if ($self->{next_char} == 0x002F) { # /
478 !!!cp (15);
479 !!!next-input-character;
480 $self->{state} = CLOSE_TAG_OPEN_STATE;
481 redo A;
482 } else {
483 !!!cp (16);
484 ## reconsume
485 $self->{state} = DATA_STATE;
486
487 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
488
489 redo A;
490 }
491 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
492 if ($self->{next_char} == 0x0021) { # !
493 !!!cp (17);
494 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
495 !!!next-input-character;
496 redo A;
497 } elsif ($self->{next_char} == 0x002F) { # /
498 !!!cp (18);
499 $self->{state} = CLOSE_TAG_OPEN_STATE;
500 !!!next-input-character;
501 redo A;
502 } elsif (0x0041 <= $self->{next_char} and
503 $self->{next_char} <= 0x005A) { # A..Z
504 !!!cp (19);
505 $self->{current_token}
506 = {type => START_TAG_TOKEN,
507 tag_name => chr ($self->{next_char} + 0x0020)};
508 $self->{state} = TAG_NAME_STATE;
509 !!!next-input-character;
510 redo A;
511 } elsif (0x0061 <= $self->{next_char} and
512 $self->{next_char} <= 0x007A) { # a..z
513 !!!cp (20);
514 $self->{current_token} = {type => START_TAG_TOKEN,
515 tag_name => chr ($self->{next_char})};
516 $self->{state} = TAG_NAME_STATE;
517 !!!next-input-character;
518 redo A;
519 } elsif ($self->{next_char} == 0x003E) { # >
520 !!!cp (21);
521 !!!parse-error (type => 'empty start tag');
522 $self->{state} = DATA_STATE;
523 !!!next-input-character;
524
525 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
526
527 redo A;
528 } elsif ($self->{next_char} == 0x003F) { # ?
529 !!!cp (22);
530 !!!parse-error (type => 'pio');
531 $self->{state} = BOGUS_COMMENT_STATE;
532 ## $self->{next_char} is intentionally left as is
533 redo A;
534 } else {
535 !!!cp (23);
536 !!!parse-error (type => 'bare stago');
537 $self->{state} = DATA_STATE;
538 ## reconsume
539
540 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
541
542 redo A;
543 }
544 } else {
545 die "$0: $self->{content_model} in tag open";
546 }
547 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
548 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
549 if (defined $self->{last_emitted_start_tag_name}) {
550 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
551 my @next_char;
552 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
553 push @next_char, $self->{next_char};
554 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
555 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
556 if ($self->{next_char} == $c or $self->{next_char} == $C) {
557 !!!cp (24);
558 !!!next-input-character;
559 next TAGNAME;
560 } else {
561 !!!cp (25);
562 $self->{next_char} = shift @next_char; # reconsume
563 !!!back-next-input-character (@next_char);
564 $self->{state} = DATA_STATE;
565
566 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
567
568 redo A;
569 }
570 }
571 push @next_char, $self->{next_char};
572
573 unless ($self->{next_char} == 0x0009 or # HT
574 $self->{next_char} == 0x000A or # LF
575 $self->{next_char} == 0x000B or # VT
576 $self->{next_char} == 0x000C or # FF
577 $self->{next_char} == 0x0020 or # SP
578 $self->{next_char} == 0x003E or # >
579 $self->{next_char} == 0x002F or # /
580 $self->{next_char} == -1) {
581 !!!cp (26);
582 $self->{next_char} = shift @next_char; # reconsume
583 !!!back-next-input-character (@next_char);
584 $self->{state} = DATA_STATE;
585 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
586 redo A;
587 } else {
588 !!!cp (27);
589 $self->{next_char} = shift @next_char;
590 !!!back-next-input-character (@next_char);
591 # and consume...
592 }
593 } else {
594 ## No start tag token has ever been emitted
595 !!!cp (28);
596 # next-input-character is already done
597 $self->{state} = DATA_STATE;
598 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
599 redo A;
600 }
601 }
602
603 if (0x0041 <= $self->{next_char} and
604 $self->{next_char} <= 0x005A) { # A..Z
605 !!!cp (29);
606 $self->{current_token} = {type => END_TAG_TOKEN,
607 tag_name => chr ($self->{next_char} + 0x0020)};
608 $self->{state} = TAG_NAME_STATE;
609 !!!next-input-character;
610 redo A;
611 } elsif (0x0061 <= $self->{next_char} and
612 $self->{next_char} <= 0x007A) { # a..z
613 !!!cp (30);
614 $self->{current_token} = {type => END_TAG_TOKEN,
615 tag_name => chr ($self->{next_char})};
616 $self->{state} = TAG_NAME_STATE;
617 !!!next-input-character;
618 redo A;
619 } elsif ($self->{next_char} == 0x003E) { # >
620 !!!cp (31);
621 !!!parse-error (type => 'empty end tag');
622 $self->{state} = DATA_STATE;
623 !!!next-input-character;
624 redo A;
625 } elsif ($self->{next_char} == -1) {
626 !!!cp (32);
627 !!!parse-error (type => 'bare etago');
628 $self->{state} = DATA_STATE;
629 # reconsume
630
631 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
632
633 redo A;
634 } else {
635 !!!cp (33);
636 !!!parse-error (type => 'bogus end tag');
637 $self->{state} = BOGUS_COMMENT_STATE;
638 ## $self->{next_char} is intentionally left as is
639 redo A;
640 }
641 } elsif ($self->{state} == TAG_NAME_STATE) {
642 if ($self->{next_char} == 0x0009 or # HT
643 $self->{next_char} == 0x000A or # LF
644 $self->{next_char} == 0x000B or # VT
645 $self->{next_char} == 0x000C or # FF
646 $self->{next_char} == 0x0020) { # SP
647 !!!cp (34);
648 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
649 !!!next-input-character;
650 redo A;
651 } elsif ($self->{next_char} == 0x003E) { # >
652 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
653 !!!cp (35);
654 $self->{current_token}->{first_start_tag}
655 = not defined $self->{last_emitted_start_tag_name};
656 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
657 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
658 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
659 #if ($self->{current_token}->{attributes}) {
660 # ## NOTE: This should never be reached.
661 # !!! cp (36);
662 # !!! parse-error (type => 'end tag attribute');
663 #} else {
664 !!!cp (37);
665 #}
666 } else {
667 die "$0: $self->{current_token}->{type}: Unknown token type";
668 }
669 $self->{state} = DATA_STATE;
670 !!!next-input-character;
671
672 !!!emit ($self->{current_token}); # start tag or end tag
673
674 redo A;
675 } elsif (0x0041 <= $self->{next_char} and
676 $self->{next_char} <= 0x005A) { # A..Z
677 !!!cp (38);
678 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
679 # start tag or end tag
680 ## Stay in this state
681 !!!next-input-character;
682 redo A;
683 } elsif ($self->{next_char} == -1) {
684 !!!parse-error (type => 'unclosed tag');
685 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
686 !!!cp (39);
687 $self->{current_token}->{first_start_tag}
688 = not defined $self->{last_emitted_start_tag_name};
689 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
690 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
691 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
692 #if ($self->{current_token}->{attributes}) {
693 # ## NOTE: This state should never be reached.
694 # !!! cp (40);
695 # !!! parse-error (type => 'end tag attribute');
696 #} else {
697 !!!cp (41);
698 #}
699 } else {
700 die "$0: $self->{current_token}->{type}: Unknown token type";
701 }
702 $self->{state} = DATA_STATE;
703 # reconsume
704
705 !!!emit ($self->{current_token}); # start tag or end tag
706
707 redo A;
708 } elsif ($self->{next_char} == 0x002F) { # /
709 !!!next-input-character;
710 if ($self->{next_char} == 0x003E and # >
711 $self->{current_token}->{type} == START_TAG_TOKEN and
712 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
713 # permitted slash
714 !!!cp (42);
715 #
716 } else {
717 !!!cp (43);
718 !!!parse-error (type => 'nestc');
719 }
720 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
721 # next-input-character is already done
722 redo A;
723 } else {
724 !!!cp (44);
725 $self->{current_token}->{tag_name} .= chr $self->{next_char};
726 # start tag or end tag
727 ## Stay in the state
728 !!!next-input-character;
729 redo A;
730 }
731 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
732 if ($self->{next_char} == 0x0009 or # HT
733 $self->{next_char} == 0x000A or # LF
734 $self->{next_char} == 0x000B or # VT
735 $self->{next_char} == 0x000C or # FF
736 $self->{next_char} == 0x0020) { # SP
737 !!!cp (45);
738 ## Stay in the state
739 !!!next-input-character;
740 redo A;
741 } elsif ($self->{next_char} == 0x003E) { # >
742 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
743 !!!cp (46);
744 $self->{current_token}->{first_start_tag}
745 = not defined $self->{last_emitted_start_tag_name};
746 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
747 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
748 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
749 if ($self->{current_token}->{attributes}) {
750 !!!cp (47);
751 !!!parse-error (type => 'end tag attribute');
752 } else {
753 !!!cp (48);
754 }
755 } else {
756 die "$0: $self->{current_token}->{type}: Unknown token type";
757 }
758 $self->{state} = DATA_STATE;
759 !!!next-input-character;
760
761 !!!emit ($self->{current_token}); # start tag or end tag
762
763 redo A;
764 } elsif (0x0041 <= $self->{next_char} and
765 $self->{next_char} <= 0x005A) { # A..Z
766 !!!cp (49);
767 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
768 value => ''};
769 $self->{state} = ATTRIBUTE_NAME_STATE;
770 !!!next-input-character;
771 redo A;
772 } elsif ($self->{next_char} == 0x002F) { # /
773 !!!next-input-character;
774 if ($self->{next_char} == 0x003E and # >
775 $self->{current_token}->{type} == START_TAG_TOKEN and
776 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
777 # permitted slash
778 !!!cp (50);
779 #
780 } else {
781 !!!cp (51);
782 !!!parse-error (type => 'nestc');
783 }
784 ## Stay in the state
785 # next-input-character is already done
786 redo A;
787 } elsif ($self->{next_char} == -1) {
788 !!!parse-error (type => 'unclosed tag');
789 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
790 !!!cp (52);
791 $self->{current_token}->{first_start_tag}
792 = not defined $self->{last_emitted_start_tag_name};
793 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
794 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
795 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
796 if ($self->{current_token}->{attributes}) {
797 !!!cp (53);
798 !!!parse-error (type => 'end tag attribute');
799 } else {
800 !!!cp (54);
801 }
802 } else {
803 die "$0: $self->{current_token}->{type}: Unknown token type";
804 }
805 $self->{state} = DATA_STATE;
806 # reconsume
807
808 !!!emit ($self->{current_token}); # start tag or end tag
809
810 redo A;
811 } else {
812 if ({
813 0x0022 => 1, # "
814 0x0027 => 1, # '
815 0x003D => 1, # =
816 }->{$self->{next_char}}) {
817 !!!cp (55);
818 !!!parse-error (type => 'bad attribute name');
819 } else {
820 !!!cp (56);
821 }
822 $self->{current_attribute} = {name => chr ($self->{next_char}),
823 value => ''};
824 $self->{state} = ATTRIBUTE_NAME_STATE;
825 !!!next-input-character;
826 redo A;
827 }
828 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
829 my $before_leave = sub {
830 if (exists $self->{current_token}->{attributes} # start tag or end tag
831 ->{$self->{current_attribute}->{name}}) { # MUST
832 !!!cp (57);
833 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
834 ## Discard $self->{current_attribute} # MUST
835 } else {
836 !!!cp (58);
837 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
838 = $self->{current_attribute};
839 }
840 }; # $before_leave
841
842 if ($self->{next_char} == 0x0009 or # HT
843 $self->{next_char} == 0x000A or # LF
844 $self->{next_char} == 0x000B or # VT
845 $self->{next_char} == 0x000C or # FF
846 $self->{next_char} == 0x0020) { # SP
847 !!!cp (59);
848 $before_leave->();
849 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
850 !!!next-input-character;
851 redo A;
852 } elsif ($self->{next_char} == 0x003D) { # =
853 !!!cp (60);
854 $before_leave->();
855 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
856 !!!next-input-character;
857 redo A;
858 } elsif ($self->{next_char} == 0x003E) { # >
859 $before_leave->();
860 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
861 !!!cp (61);
862 $self->{current_token}->{first_start_tag}
863 = not defined $self->{last_emitted_start_tag_name};
864 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
865 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
866 !!!cp (62);
867 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
868 if ($self->{current_token}->{attributes}) {
869 !!!parse-error (type => 'end tag attribute');
870 }
871 } else {
872 die "$0: $self->{current_token}->{type}: Unknown token type";
873 }
874 $self->{state} = DATA_STATE;
875 !!!next-input-character;
876
877 !!!emit ($self->{current_token}); # start tag or end tag
878
879 redo A;
880 } elsif (0x0041 <= $self->{next_char} and
881 $self->{next_char} <= 0x005A) { # A..Z
882 !!!cp (63);
883 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
884 ## Stay in the state
885 !!!next-input-character;
886 redo A;
887 } elsif ($self->{next_char} == 0x002F) { # /
888 $before_leave->();
889 !!!next-input-character;
890 if ($self->{next_char} == 0x003E and # >
891 $self->{current_token}->{type} == START_TAG_TOKEN and
892 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
893 # permitted slash
894 !!!cp (64);
895 #
896 } else {
897 !!!cp (65);
898 !!!parse-error (type => 'nestc');
899 }
900 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
901 # next-input-character is already done
902 redo A;
903 } elsif ($self->{next_char} == -1) {
904 !!!parse-error (type => 'unclosed tag');
905 $before_leave->();
906 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
907 !!!cp (66);
908 $self->{current_token}->{first_start_tag}
909 = not defined $self->{last_emitted_start_tag_name};
910 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
911 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
912 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
913 if ($self->{current_token}->{attributes}) {
914 !!!cp (67);
915 !!!parse-error (type => 'end tag attribute');
916 } else {
917 ## NOTE: This state should never be reached.
918 !!!cp (68);
919 }
920 } else {
921 die "$0: $self->{current_token}->{type}: Unknown token type";
922 }
923 $self->{state} = DATA_STATE;
924 # reconsume
925
926 !!!emit ($self->{current_token}); # start tag or end tag
927
928 redo A;
929 } else {
930 if ($self->{next_char} == 0x0022 or # "
931 $self->{next_char} == 0x0027) { # '
932 !!!cp (69);
933 !!!parse-error (type => 'bad attribute name');
934 } else {
935 !!!cp (70);
936 }
937 $self->{current_attribute}->{name} .= chr ($self->{next_char});
938 ## Stay in the state
939 !!!next-input-character;
940 redo A;
941 }
942 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
943 if ($self->{next_char} == 0x0009 or # HT
944 $self->{next_char} == 0x000A or # LF
945 $self->{next_char} == 0x000B or # VT
946 $self->{next_char} == 0x000C or # FF
947 $self->{next_char} == 0x0020) { # SP
948 !!!cp (71);
949 ## Stay in the state
950 !!!next-input-character;
951 redo A;
952 } elsif ($self->{next_char} == 0x003D) { # =
953 !!!cp (72);
954 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
955 !!!next-input-character;
956 redo A;
957 } elsif ($self->{next_char} == 0x003E) { # >
958 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
959 !!!cp (73);
960 $self->{current_token}->{first_start_tag}
961 = not defined $self->{last_emitted_start_tag_name};
962 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
963 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
964 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
965 if ($self->{current_token}->{attributes}) {
966 !!!cp (74);
967 !!!parse-error (type => 'end tag attribute');
968 } else {
969 ## NOTE: This state should never be reached.
970 !!!cp (75);
971 }
972 } else {
973 die "$0: $self->{current_token}->{type}: Unknown token type";
974 }
975 $self->{state} = DATA_STATE;
976 !!!next-input-character;
977
978 !!!emit ($self->{current_token}); # start tag or end tag
979
980 redo A;
981 } elsif (0x0041 <= $self->{next_char} and
982 $self->{next_char} <= 0x005A) { # A..Z
983 !!!cp (76);
984 $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
985 value => ''};
986 $self->{state} = ATTRIBUTE_NAME_STATE;
987 !!!next-input-character;
988 redo A;
989 } elsif ($self->{next_char} == 0x002F) { # /
990 !!!next-input-character;
991 if ($self->{next_char} == 0x003E and # >
992 $self->{current_token}->{type} == START_TAG_TOKEN and
993 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
994 # permitted slash
995 !!!cp (77);
996 #
997 } else {
998 !!!cp (78);
999 !!!parse-error (type => 'nestc');
1000 ## TODO: Different error type for <aa / bb> than <aa/>
1001 }
1002 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1003 # next-input-character is already done
1004 redo A;
1005 } elsif ($self->{next_char} == -1) {
1006 !!!parse-error (type => 'unclosed tag');
1007 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1008 !!!cp (79);
1009 $self->{current_token}->{first_start_tag}
1010 = not defined $self->{last_emitted_start_tag_name};
1011 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1012 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1013 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1014 if ($self->{current_token}->{attributes}) {
1015 !!!cp (80);
1016 !!!parse-error (type => 'end tag attribute');
1017 } else {
1018 ## NOTE: This state should never be reached.
1019 !!!cp (81);
1020 }
1021 } else {
1022 die "$0: $self->{current_token}->{type}: Unknown token type";
1023 }
1024 $self->{state} = DATA_STATE;
1025 # reconsume
1026
1027 !!!emit ($self->{current_token}); # start tag or end tag
1028
1029 redo A;
1030 } else {
1031 !!!cp (82);
1032 $self->{current_attribute} = {name => chr ($self->{next_char}),
1033 value => ''};
1034 $self->{state} = ATTRIBUTE_NAME_STATE;
1035 !!!next-input-character;
1036 redo A;
1037 }
1038 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1039 if ($self->{next_char} == 0x0009 or # HT
1040 $self->{next_char} == 0x000A or # LF
1041 $self->{next_char} == 0x000B or # VT
1042 $self->{next_char} == 0x000C or # FF
1043 $self->{next_char} == 0x0020) { # SP
1044 !!!cp (83);
1045 ## Stay in the state
1046 !!!next-input-character;
1047 redo A;
1048 } elsif ($self->{next_char} == 0x0022) { # "
1049 !!!cp (84);
1050 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1051 !!!next-input-character;
1052 redo A;
1053 } elsif ($self->{next_char} == 0x0026) { # &
1054 !!!cp (85);
1055 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1056 ## reconsume
1057 redo A;
1058 } elsif ($self->{next_char} == 0x0027) { # '
1059 !!!cp (86);
1060 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1061 !!!next-input-character;
1062 redo A;
1063 } elsif ($self->{next_char} == 0x003E) { # >
1064 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1065 !!!cp (87);
1066 $self->{current_token}->{first_start_tag}
1067 = not defined $self->{last_emitted_start_tag_name};
1068 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1069 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1070 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1071 if ($self->{current_token}->{attributes}) {
1072 !!!cp (88);
1073 !!!parse-error (type => 'end tag attribute');
1074 } else {
1075 ## NOTE: This state should never be reached.
1076 !!!cp (89);
1077 }
1078 } else {
1079 die "$0: $self->{current_token}->{type}: Unknown token type";
1080 }
1081 $self->{state} = DATA_STATE;
1082 !!!next-input-character;
1083
1084 !!!emit ($self->{current_token}); # start tag or end tag
1085
1086 redo A;
1087 } elsif ($self->{next_char} == -1) {
1088 !!!parse-error (type => 'unclosed tag');
1089 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1090 !!!cp (90);
1091 $self->{current_token}->{first_start_tag}
1092 = not defined $self->{last_emitted_start_tag_name};
1093 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1094 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1095 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1096 if ($self->{current_token}->{attributes}) {
1097 !!!cp (91);
1098 !!!parse-error (type => 'end tag attribute');
1099 } else {
1100 ## NOTE: This state should never be reached.
1101 !!!cp (92);
1102 }
1103 } else {
1104 die "$0: $self->{current_token}->{type}: Unknown token type";
1105 }
1106 $self->{state} = DATA_STATE;
1107 ## reconsume
1108
1109 !!!emit ($self->{current_token}); # start tag or end tag
1110
1111 redo A;
1112 } else {
1113 if ($self->{next_char} == 0x003D) { # =
1114 !!!cp (93);
1115 !!!parse-error (type => 'bad attribute value');
1116 } else {
1117 !!!cp (94);
1118 }
1119 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1120 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1121 !!!next-input-character;
1122 redo A;
1123 }
1124 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1125 if ($self->{next_char} == 0x0022) { # "
1126 !!!cp (95);
1127 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1128 !!!next-input-character;
1129 redo A;
1130 } elsif ($self->{next_char} == 0x0026) { # &
1131 !!!cp (96);
1132 $self->{last_attribute_value_state} = $self->{state};
1133 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1134 !!!next-input-character;
1135 redo A;
1136 } elsif ($self->{next_char} == -1) {
1137 !!!parse-error (type => 'unclosed attribute value');
1138 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1139 !!!cp (97);
1140 $self->{current_token}->{first_start_tag}
1141 = not defined $self->{last_emitted_start_tag_name};
1142 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1143 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1144 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1145 if ($self->{current_token}->{attributes}) {
1146 !!!cp (98);
1147 !!!parse-error (type => 'end tag attribute');
1148 } else {
1149 ## NOTE: This state should never be reached.
1150 !!!cp (99);
1151 }
1152 } else {
1153 die "$0: $self->{current_token}->{type}: Unknown token type";
1154 }
1155 $self->{state} = DATA_STATE;
1156 ## reconsume
1157
1158 !!!emit ($self->{current_token}); # start tag or end tag
1159
1160 redo A;
1161 } else {
1162 !!!cp (100);
1163 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1164 ## Stay in the state
1165 !!!next-input-character;
1166 redo A;
1167 }
1168 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1169 if ($self->{next_char} == 0x0027) { # '
1170 !!!cp (101);
1171 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1172 !!!next-input-character;
1173 redo A;
1174 } elsif ($self->{next_char} == 0x0026) { # &
1175 !!!cp (102);
1176 $self->{last_attribute_value_state} = $self->{state};
1177 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1178 !!!next-input-character;
1179 redo A;
1180 } elsif ($self->{next_char} == -1) {
1181 !!!parse-error (type => 'unclosed attribute value');
1182 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1183 !!!cp (103);
1184 $self->{current_token}->{first_start_tag}
1185 = not defined $self->{last_emitted_start_tag_name};
1186 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1187 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1188 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1189 if ($self->{current_token}->{attributes}) {
1190 !!!cp (104);
1191 !!!parse-error (type => 'end tag attribute');
1192 } else {
1193 ## NOTE: This state should never be reached.
1194 !!!cp (105);
1195 }
1196 } else {
1197 die "$0: $self->{current_token}->{type}: Unknown token type";
1198 }
1199 $self->{state} = DATA_STATE;
1200 ## reconsume
1201
1202 !!!emit ($self->{current_token}); # start tag or end tag
1203
1204 redo A;
1205 } else {
1206 !!!cp (106);
1207 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1208 ## Stay in the state
1209 !!!next-input-character;
1210 redo A;
1211 }
1212 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1213 if ($self->{next_char} == 0x0009 or # HT
1214 $self->{next_char} == 0x000A or # LF
1215 $self->{next_char} == 0x000B or # VT
1216 $self->{next_char} == 0x000C or # FF
1217 $self->{next_char} == 0x0020) { # SP
1218 !!!cp (107);
1219 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1220 !!!next-input-character;
1221 redo A;
1222 } elsif ($self->{next_char} == 0x0026) { # &
1223 !!!cp (108);
1224 $self->{last_attribute_value_state} = $self->{state};
1225 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1226 !!!next-input-character;
1227 redo A;
1228 } elsif ($self->{next_char} == 0x003E) { # >
1229 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1230 !!!cp (109);
1231 $self->{current_token}->{first_start_tag}
1232 = not defined $self->{last_emitted_start_tag_name};
1233 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1234 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1235 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1236 if ($self->{current_token}->{attributes}) {
1237 !!!cp (110);
1238 !!!parse-error (type => 'end tag attribute');
1239 } else {
1240 ## NOTE: This state should never be reached.
1241 !!!cp (111);
1242 }
1243 } else {
1244 die "$0: $self->{current_token}->{type}: Unknown token type";
1245 }
1246 $self->{state} = DATA_STATE;
1247 !!!next-input-character;
1248
1249 !!!emit ($self->{current_token}); # start tag or end tag
1250
1251 redo A;
1252 } elsif ($self->{next_char} == -1) {
1253 !!!parse-error (type => 'unclosed tag');
1254 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1255 !!!cp (112);
1256 $self->{current_token}->{first_start_tag}
1257 = not defined $self->{last_emitted_start_tag_name};
1258 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1259 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1260 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1261 if ($self->{current_token}->{attributes}) {
1262 !!!cp (113);
1263 !!!parse-error (type => 'end tag attribute');
1264 } else {
1265 ## NOTE: This state should never be reached.
1266 !!!cp (114);
1267 }
1268 } else {
1269 die "$0: $self->{current_token}->{type}: Unknown token type";
1270 }
1271 $self->{state} = DATA_STATE;
1272 ## reconsume
1273
1274 !!!emit ($self->{current_token}); # start tag or end tag
1275
1276 redo A;
1277 } else {
1278 if ({
1279 0x0022 => 1, # "
1280 0x0027 => 1, # '
1281 0x003D => 1, # =
1282 }->{$self->{next_char}}) {
1283 !!!cp (115);
1284 !!!parse-error (type => 'bad attribute value');
1285 } else {
1286 !!!cp (116);
1287 }
1288 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1289 ## Stay in the state
1290 !!!next-input-character;
1291 redo A;
1292 }
1293 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1294 my $token = $self->_tokenize_attempt_to_consume_an_entity
1295 (1,
1296 $self->{last_attribute_value_state}
1297 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1298 $self->{last_attribute_value_state}
1299 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1300 -1);
1301
1302 unless (defined $token) {
1303 !!!cp (117);
1304 $self->{current_attribute}->{value} .= '&';
1305 } else {
1306 !!!cp (118);
1307 $self->{current_attribute}->{value} .= $token->{data};
1308 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1309 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1310 }
1311
1312 $self->{state} = $self->{last_attribute_value_state};
1313 # next-input-character is already done
1314 redo A;
1315 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1316 if ($self->{next_char} == 0x0009 or # HT
1317 $self->{next_char} == 0x000A or # LF
1318 $self->{next_char} == 0x000B or # VT
1319 $self->{next_char} == 0x000C or # FF
1320 $self->{next_char} == 0x0020) { # SP
1321 !!!cp (118);
1322 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1323 !!!next-input-character;
1324 redo A;
1325 } elsif ($self->{next_char} == 0x003E) { # >
1326 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1327 !!!cp (119);
1328 $self->{current_token}->{first_start_tag}
1329 = not defined $self->{last_emitted_start_tag_name};
1330 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1331 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1332 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1333 if ($self->{current_token}->{attributes}) {
1334 !!!cp (120);
1335 !!!parse-error (type => 'end tag attribute');
1336 } else {
1337 ## NOTE: This state should never be reached.
1338 !!!cp (121);
1339 }
1340 } else {
1341 die "$0: $self->{current_token}->{type}: Unknown token type";
1342 }
1343 $self->{state} = DATA_STATE;
1344 !!!next-input-character;
1345
1346 !!!emit ($self->{current_token}); # start tag or end tag
1347
1348 redo A;
1349 } elsif ($self->{next_char} == 0x002F) { # /
1350 !!!next-input-character;
1351 if ($self->{next_char} == 0x003E and # >
1352 $self->{current_token}->{type} == START_TAG_TOKEN and
1353 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1354 # permitted slash
1355 !!!cp (122);
1356 #
1357 } else {
1358 !!!cp (123);
1359 !!!parse-error (type => 'nestc');
1360 }
1361 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1362 # next-input-character is already done
1363 redo A;
1364 } else {
1365 !!!cp (124);
1366 !!!parse-error (type => 'no space between attributes');
1367 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1368 ## reconsume
1369 redo A;
1370 }
1371 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1372 ## (only happens in the PCDATA state)
1373
1374 my $token = {type => COMMENT_TOKEN, data => ''};
1375
1376 BC: {
1377 if ($self->{next_char} == 0x003E) { # >
1378 !!!cp (124);
1379 $self->{state} = DATA_STATE;
1380 !!!next-input-character;
1381
1382 !!!emit ($token);
1383
1384 redo A;
1385 } elsif ($self->{next_char} == -1) {
1386 !!!cp (125);
1387 $self->{state} = DATA_STATE;
1388 ## reconsume
1389
1390 !!!emit ($token);
1391
1392 redo A;
1393 } else {
1394 !!!cp (126);
1395 $token->{data} .= chr ($self->{next_char});
1396 !!!next-input-character;
1397 redo BC;
1398 }
1399 } # BC
1400
1401 die "$0: _get_next_token: unexpected case [BC]";
1402 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1403 ## (only happens in the PCDATA state)
1404
1405 my @next_char;
1406 push @next_char, $self->{next_char};
1407
1408 if ($self->{next_char} == 0x002D) { # -
1409 !!!next-input-character;
1410 push @next_char, $self->{next_char};
1411 if ($self->{next_char} == 0x002D) { # -
1412 !!!cp (127);
1413 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1414 $self->{state} = COMMENT_START_STATE;
1415 !!!next-input-character;
1416 redo A;
1417 } else {
1418 !!!cp (128);
1419 }
1420 } elsif ($self->{next_char} == 0x0044 or # D
1421 $self->{next_char} == 0x0064) { # d
1422 !!!next-input-character;
1423 push @next_char, $self->{next_char};
1424 if ($self->{next_char} == 0x004F or # O
1425 $self->{next_char} == 0x006F) { # o
1426 !!!next-input-character;
1427 push @next_char, $self->{next_char};
1428 if ($self->{next_char} == 0x0043 or # C
1429 $self->{next_char} == 0x0063) { # c
1430 !!!next-input-character;
1431 push @next_char, $self->{next_char};
1432 if ($self->{next_char} == 0x0054 or # T
1433 $self->{next_char} == 0x0074) { # t
1434 !!!next-input-character;
1435 push @next_char, $self->{next_char};
1436 if ($self->{next_char} == 0x0059 or # Y
1437 $self->{next_char} == 0x0079) { # y
1438 !!!next-input-character;
1439 push @next_char, $self->{next_char};
1440 if ($self->{next_char} == 0x0050 or # P
1441 $self->{next_char} == 0x0070) { # p
1442 !!!next-input-character;
1443 push @next_char, $self->{next_char};
1444 if ($self->{next_char} == 0x0045 or # E
1445 $self->{next_char} == 0x0065) { # e
1446 !!!cp (129);
1447 ## TODO: This nested character-by-character match is clumsy; simplify it.
1448 $self->{state} = DOCTYPE_STATE;
1449 !!!next-input-character;
1450 redo A;
1451 } else {
1452 !!!cp (130);
1453 }
1454 } else {
1455 !!!cp (131);
1456 }
1457 } else {
1458 !!!cp (132);
1459 }
1460 } else {
1461 !!!cp (133);
1462 }
1463 } else {
1464 !!!cp (134);
1465 }
1466 } else {
1467 !!!cp (135);
1468 }
1469 } else {
1470 !!!cp (136);
1471 }
1472
1473 !!!parse-error (type => 'bogus comment');
1474 $self->{next_char} = shift @next_char;
1475 !!!back-next-input-character (@next_char);
1476 $self->{state} = BOGUS_COMMENT_STATE;
1477 redo A;
1478
1479 ## ISSUE: typos in spec: chacacters, is is a parse error
1480 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1481 } elsif ($self->{state} == COMMENT_START_STATE) {
1482 if ($self->{next_char} == 0x002D) { # -
1483 !!!cp (137);
1484 $self->{state} = COMMENT_START_DASH_STATE;
1485 !!!next-input-character;
1486 redo A;
1487 } elsif ($self->{next_char} == 0x003E) { # >
1488 !!!cp (138);
1489 !!!parse-error (type => 'bogus comment');
1490 $self->{state} = DATA_STATE;
1491 !!!next-input-character;
1492
1493 !!!emit ($self->{current_token}); # comment
1494
1495 redo A;
1496 } elsif ($self->{next_char} == -1) {
1497 !!!cp (139);
1498 !!!parse-error (type => 'unclosed comment');
1499 $self->{state} = DATA_STATE;
1500 ## reconsume
1501
1502 !!!emit ($self->{current_token}); # comment
1503
1504 redo A;
1505 } else {
1506 !!!cp (140);
1507 $self->{current_token}->{data} # comment
1508 .= chr ($self->{next_char});
1509 $self->{state} = COMMENT_STATE;
1510 !!!next-input-character;
1511 redo A;
1512 }
1513 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1514 if ($self->{next_char} == 0x002D) { # -
1515 !!!cp (141);
1516 $self->{state} = COMMENT_END_STATE;
1517 !!!next-input-character;
1518 redo A;
1519 } elsif ($self->{next_char} == 0x003E) { # >
1520 !!!cp (142);
1521 !!!parse-error (type => 'bogus comment');
1522 $self->{state} = DATA_STATE;
1523 !!!next-input-character;
1524
1525 !!!emit ($self->{current_token}); # comment
1526
1527 redo A;
1528 } elsif ($self->{next_char} == -1) {
1529 !!!cp (143);
1530 !!!parse-error (type => 'unclosed comment');
1531 $self->{state} = DATA_STATE;
1532 ## reconsume
1533
1534 !!!emit ($self->{current_token}); # comment
1535
1536 redo A;
1537 } else {
1538 !!!cp (144);
1539 $self->{current_token}->{data} # comment
1540 .= '-' . chr ($self->{next_char});
1541 $self->{state} = COMMENT_STATE;
1542 !!!next-input-character;
1543 redo A;
1544 }
1545 } elsif ($self->{state} == COMMENT_STATE) {
1546 if ($self->{next_char} == 0x002D) { # -
1547 !!!cp (145);
1548 $self->{state} = COMMENT_END_DASH_STATE;
1549 !!!next-input-character;
1550 redo A;
1551 } elsif ($self->{next_char} == -1) {
1552 !!!cp (146);
1553 !!!parse-error (type => 'unclosed comment');
1554 $self->{state} = DATA_STATE;
1555 ## reconsume
1556
1557 !!!emit ($self->{current_token}); # comment
1558
1559 redo A;
1560 } else {
1561 !!!cp (147);
1562 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1563 ## Stay in the state
1564 !!!next-input-character;
1565 redo A;
1566 }
1567 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1568 if ($self->{next_char} == 0x002D) { # -
1569 !!!cp (148);
1570 $self->{state} = COMMENT_END_STATE;
1571 !!!next-input-character;
1572 redo A;
1573 } elsif ($self->{next_char} == -1) {
1574 !!!cp (149);
1575 !!!parse-error (type => 'unclosed comment');
1576 $self->{state} = DATA_STATE;
1577 ## reconsume
1578
1579 !!!emit ($self->{current_token}); # comment
1580
1581 redo A;
1582 } else {
1583 !!!cp (150);
1584 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
1585 $self->{state} = COMMENT_STATE;
1586 !!!next-input-character;
1587 redo A;
1588 }
1589 } elsif ($self->{state} == COMMENT_END_STATE) {
1590 if ($self->{next_char} == 0x003E) { # >
1591 !!!cp (151);
1592 $self->{state} = DATA_STATE;
1593 !!!next-input-character;
1594
1595 !!!emit ($self->{current_token}); # comment
1596
1597 redo A;
1598 } elsif ($self->{next_char} == 0x002D) { # -
1599 !!!cp (152);
1600 !!!parse-error (type => 'dash in comment');
1601 $self->{current_token}->{data} .= '-'; # comment
1602 ## Stay in the state
1603 !!!next-input-character;
1604 redo A;
1605 } elsif ($self->{next_char} == -1) {
1606 !!!cp (153);
1607 !!!parse-error (type => 'unclosed comment');
1608 $self->{state} = DATA_STATE;
1609 ## reconsume
1610
1611 !!!emit ($self->{current_token}); # comment
1612
1613 redo A;
1614 } else {
1615 !!!cp (154);
1616 !!!parse-error (type => 'dash in comment');
1617 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
1618 $self->{state} = COMMENT_STATE;
1619 !!!next-input-character;
1620 redo A;
1621 }
1622 } elsif ($self->{state} == DOCTYPE_STATE) {
1623 if ($self->{next_char} == 0x0009 or # HT
1624 $self->{next_char} == 0x000A or # LF
1625 $self->{next_char} == 0x000B or # VT
1626 $self->{next_char} == 0x000C or # FF
1627 $self->{next_char} == 0x0020) { # SP
1628 !!!cp (155);
1629 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1630 !!!next-input-character;
1631 redo A;
1632 } else {
1633 !!!cp (156);
1634 !!!parse-error (type => 'no space before DOCTYPE name');
1635 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1636 ## reconsume
1637 redo A;
1638 }
1639 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1640 if ($self->{next_char} == 0x0009 or # HT
1641 $self->{next_char} == 0x000A or # LF
1642 $self->{next_char} == 0x000B or # VT
1643 $self->{next_char} == 0x000C or # FF
1644 $self->{next_char} == 0x0020) { # SP
1645 !!!cp (157);
1646 ## Stay in the state
1647 !!!next-input-character;
1648 redo A;
1649 } elsif ($self->{next_char} == 0x003E) { # >
1650 !!!cp (158);
1651 !!!parse-error (type => 'no DOCTYPE name');
1652 $self->{state} = DATA_STATE;
1653 !!!next-input-character;
1654
1655 !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
1656
1657 redo A;
1658 } elsif ($self->{next_char} == -1) {
1659 !!!cp (159);
1660 !!!parse-error (type => 'no DOCTYPE name');
1661 $self->{state} = DATA_STATE;
1662 ## reconsume
1663
1664 !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
1665
1666 redo A;
1667 } else {
1668 !!!cp (160);
1669 $self->{current_token}
1670 = {type => DOCTYPE_TOKEN,
1671 name => chr ($self->{next_char}),
1672 #quirks => 0,
1673 };
1674 ## ISSUE: "Set the token's name name to the" in the spec
1675 $self->{state} = DOCTYPE_NAME_STATE;
1676 !!!next-input-character;
1677 redo A;
1678 }
1679 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1680 ## ISSUE: Redundant "First," in the spec.
1681 if ($self->{next_char} == 0x0009 or # HT
1682 $self->{next_char} == 0x000A or # LF
1683 $self->{next_char} == 0x000B or # VT
1684 $self->{next_char} == 0x000C or # FF
1685 $self->{next_char} == 0x0020) { # SP
1686 !!!cp (161);
1687 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1688 !!!next-input-character;
1689 redo A;
1690 } elsif ($self->{next_char} == 0x003E) { # >
1691 !!!cp (162);
1692 $self->{state} = DATA_STATE;
1693 !!!next-input-character;
1694
1695 !!!emit ($self->{current_token}); # DOCTYPE
1696
1697 redo A;
1698 } elsif ($self->{next_char} == -1) {
1699 !!!cp (163);
1700 !!!parse-error (type => 'unclosed DOCTYPE');
1701 $self->{state} = DATA_STATE;
1702 ## reconsume
1703
1704 $self->{current_token}->{quirks} = 1;
1705 !!!emit ($self->{current_token}); # DOCTYPE
1706
1707 redo A;
1708 } else {
1709 !!!cp (164);
1710 $self->{current_token}->{name}
1711 .= chr ($self->{next_char}); # DOCTYPE
1712 ## Stay in the state
1713 !!!next-input-character;
1714 redo A;
1715 }
1716 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1717 if ($self->{next_char} == 0x0009 or # HT
1718 $self->{next_char} == 0x000A or # LF
1719 $self->{next_char} == 0x000B or # VT
1720 $self->{next_char} == 0x000C or # FF
1721 $self->{next_char} == 0x0020) { # SP
1722 !!!cp (165);
1723 ## Stay in the state
1724 !!!next-input-character;
1725 redo A;
1726 } elsif ($self->{next_char} == 0x003E) { # >
1727 !!!cp (166);
1728 $self->{state} = DATA_STATE;
1729 !!!next-input-character;
1730
1731 !!!emit ($self->{current_token}); # DOCTYPE
1732
1733 redo A;
1734 } elsif ($self->{next_char} == -1) {
1735 !!!cp (167);
1736 !!!parse-error (type => 'unclosed DOCTYPE');
1737 $self->{state} = DATA_STATE;
1738 ## reconsume
1739
1740 $self->{current_token}->{quirks} = 1;
1741 !!!emit ($self->{current_token}); # DOCTYPE
1742
1743 redo A;
1744 } elsif ($self->{next_char} == 0x0050 or # P
1745 $self->{next_char} == 0x0070) { # p
1746 !!!next-input-character;
1747 if ($self->{next_char} == 0x0055 or # U
1748 $self->{next_char} == 0x0075) { # u
1749 !!!next-input-character;
1750 if ($self->{next_char} == 0x0042 or # B
1751 $self->{next_char} == 0x0062) { # b
1752 !!!next-input-character;
1753 if ($self->{next_char} == 0x004C or # L
1754 $self->{next_char} == 0x006C) { # l
1755 !!!next-input-character;
1756 if ($self->{next_char} == 0x0049 or # I
1757 $self->{next_char} == 0x0069) { # i
1758 !!!next-input-character;
1759 if ($self->{next_char} == 0x0043 or # C
1760 $self->{next_char} == 0x0063) { # c
1761 !!!cp (168);
1762 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1763 !!!next-input-character;
1764 redo A;
1765 } else {
1766 !!!cp (169);
1767 }
1768 } else {
1769 !!!cp (170);
1770 }
1771 } else {
1772 !!!cp (171);
1773 }
1774 } else {
1775 !!!cp (172);
1776 }
1777 } else {
1778 !!!cp (173);
1779 }
1780
1781 #
1782 } elsif ($self->{next_char} == 0x0053 or # S
1783 $self->{next_char} == 0x0073) { # s
1784 !!!next-input-character;
1785 if ($self->{next_char} == 0x0059 or # Y
1786 $self->{next_char} == 0x0079) { # y
1787 !!!next-input-character;
1788 if ($self->{next_char} == 0x0053 or # S
1789 $self->{next_char} == 0x0073) { # s
1790 !!!next-input-character;
1791 if ($self->{next_char} == 0x0054 or # T
1792 $self->{next_char} == 0x0074) { # t
1793 !!!next-input-character;
1794 if ($self->{next_char} == 0x0045 or # E
1795 $self->{next_char} == 0x0065) { # e
1796 !!!next-input-character;
1797 if ($self->{next_char} == 0x004D or # M
1798 $self->{next_char} == 0x006D) { # m
1799 !!!cp (174);
1800 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1801 !!!next-input-character;
1802 redo A;
1803 } else {
1804 !!!cp (175);
1805 }
1806 } else {
1807 !!!cp (176);
1808 }
1809 } else {
1810 !!!cp (177);
1811 }
1812 } else {
1813 !!!cp (178);
1814 }
1815 } else {
1816 !!!cp (179);
1817 }
1818
1819 #
1820 } else {
1821 !!!cp (180);
1822 !!!next-input-character;
1823 #
1824 }
1825
1826 !!!parse-error (type => 'string after DOCTYPE name');
1827 $self->{current_token}->{quirks} = 1;
1828
1829 $self->{state} = BOGUS_DOCTYPE_STATE;
1830 # next-input-character is already done
1831 redo A;
1832 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1833 if ({
1834 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1835 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1836 }->{$self->{next_char}}) {
1837 !!!cp (181);
1838 ## Stay in the state
1839 !!!next-input-character;
1840 redo A;
1841 } elsif ($self->{next_char} eq 0x0022) { # "
1842 !!!cp (182);
1843 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1844 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1845 !!!next-input-character;
1846 redo A;
1847 } elsif ($self->{next_char} eq 0x0027) { # '
1848 !!!cp (183);
1849 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1850 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1851 !!!next-input-character;
1852 redo A;
1853 } elsif ($self->{next_char} eq 0x003E) { # >
1854 !!!cp (184);
1855 !!!parse-error (type => 'no PUBLIC literal');
1856
1857 $self->{state} = DATA_STATE;
1858 !!!next-input-character;
1859
1860 $self->{current_token}->{quirks} = 1;
1861 !!!emit ($self->{current_token}); # DOCTYPE
1862
1863 redo A;
1864 } elsif ($self->{next_char} == -1) {
1865 !!!cp (185);
1866 !!!parse-error (type => 'unclosed DOCTYPE');
1867
1868 $self->{state} = DATA_STATE;
1869 ## reconsume
1870
1871 $self->{current_token}->{quirks} = 1;
1872 !!!emit ($self->{current_token}); # DOCTYPE
1873
1874 redo A;
1875 } else {
1876 !!!cp (186);
1877 !!!parse-error (type => 'string after PUBLIC');
1878 $self->{current_token}->{quirks} = 1;
1879
1880 $self->{state} = BOGUS_DOCTYPE_STATE;
1881 !!!next-input-character;
1882 redo A;
1883 }
1884 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1885 if ($self->{next_char} == 0x0022) { # "
1886 !!!cp (187);
1887 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1888 !!!next-input-character;
1889 redo A;
1890 } elsif ($self->{next_char} == 0x003E) { # >
1891 !!!cp (188);
1892 !!!parse-error (type => 'unclosed PUBLIC literal');
1893
1894 $self->{state} = DATA_STATE;
1895 !!!next-input-character;
1896
1897 $self->{current_token}->{quirks} = 1;
1898 !!!emit ($self->{current_token}); # DOCTYPE
1899
1900 redo A;
1901 } elsif ($self->{next_char} == -1) {
1902 !!!cp (189);
1903 !!!parse-error (type => 'unclosed PUBLIC literal');
1904
1905 $self->{state} = DATA_STATE;
1906 ## reconsume
1907
1908 $self->{current_token}->{quirks} = 1;
1909 !!!emit ($self->{current_token}); # DOCTYPE
1910
1911 redo A;
1912 } else {
1913 !!!cp (190);
1914 $self->{current_token}->{public_identifier} # DOCTYPE
1915 .= chr $self->{next_char};
1916 ## Stay in the state
1917 !!!next-input-character;
1918 redo A;
1919 }
1920 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1921 if ($self->{next_char} == 0x0027) { # '
1922 !!!cp (191);
1923 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1924 !!!next-input-character;
1925 redo A;
1926 } elsif ($self->{next_char} == 0x003E) { # >
1927 !!!cp (192);
1928 !!!parse-error (type => 'unclosed PUBLIC literal');
1929
1930 $self->{state} = DATA_STATE;
1931 !!!next-input-character;
1932
1933 $self->{current_token}->{quirks} = 1;
1934 !!!emit ($self->{current_token}); # DOCTYPE
1935
1936 redo A;
1937 } elsif ($self->{next_char} == -1) {
1938 !!!cp (193);
1939 !!!parse-error (type => 'unclosed PUBLIC literal');
1940
1941 $self->{state} = DATA_STATE;
1942 ## reconsume
1943
1944 $self->{current_token}->{quirks} = 1;
1945 !!!emit ($self->{current_token}); # DOCTYPE
1946
1947 redo A;
1948 } else {
1949 !!!cp (194);
1950 $self->{current_token}->{public_identifier} # DOCTYPE
1951 .= chr $self->{next_char};
1952 ## Stay in the state
1953 !!!next-input-character;
1954 redo A;
1955 }
1956 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1957 if ({
1958 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1959 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1960 }->{$self->{next_char}}) {
1961 !!!cp (195);
1962 ## Stay in the state
1963 !!!next-input-character;
1964 redo A;
1965 } elsif ($self->{next_char} == 0x0022) { # "
1966 !!!cp (196);
1967 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1968 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1969 !!!next-input-character;
1970 redo A;
1971 } elsif ($self->{next_char} == 0x0027) { # '
1972 !!!cp (197);
1973 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1974 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1975 !!!next-input-character;
1976 redo A;
1977 } elsif ($self->{next_char} == 0x003E) { # >
1978 !!!cp (198);
1979 $self->{state} = DATA_STATE;
1980 !!!next-input-character;
1981
1982 !!!emit ($self->{current_token}); # DOCTYPE
1983
1984 redo A;
1985 } elsif ($self->{next_char} == -1) {
1986 !!!cp (199);
1987 !!!parse-error (type => 'unclosed DOCTYPE');
1988
1989 $self->{state} = DATA_STATE;
1990 ## reconsume
1991
1992 $self->{current_token}->{quirks} = 1;
1993 !!!emit ($self->{current_token}); # DOCTYPE
1994
1995 redo A;
1996 } else {
1997 !!!cp (200);
1998 !!!parse-error (type => 'string after PUBLIC literal');
1999 $self->{current_token}->{quirks} = 1;
2000
2001 $self->{state} = BOGUS_DOCTYPE_STATE;
2002 !!!next-input-character;
2003 redo A;
2004 }
2005 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2006 if ({
2007 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2008 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2009 }->{$self->{next_char}}) {
2010 !!!cp (201);
2011 ## Stay in the state
2012 !!!next-input-character;
2013 redo A;
2014 } elsif ($self->{next_char} == 0x0022) { # "
2015 !!!cp (202);
2016 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2017 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2018 !!!next-input-character;
2019 redo A;
2020 } elsif ($self->{next_char} == 0x0027) { # '
2021 !!!cp (203);
2022 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2023 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2024 !!!next-input-character;
2025 redo A;
2026 } elsif ($self->{next_char} == 0x003E) { # >
2027 !!!cp (204);
2028 !!!parse-error (type => 'no SYSTEM literal');
2029 $self->{state} = DATA_STATE;
2030 !!!next-input-character;
2031
2032 $self->{current_token}->{quirks} = 1;
2033 !!!emit ($self->{current_token}); # DOCTYPE
2034
2035 redo A;
2036 } elsif ($self->{next_char} == -1) {
2037 !!!cp (205);
2038 !!!parse-error (type => 'unclosed DOCTYPE');
2039
2040 $self->{state} = DATA_STATE;
2041 ## reconsume
2042
2043 $self->{current_token}->{quirks} = 1;
2044 !!!emit ($self->{current_token}); # DOCTYPE
2045
2046 redo A;
2047 } else {
2048 !!!cp (206);
2049 !!!parse-error (type => 'string after SYSTEM');
2050 $self->{current_token}->{quirks} = 1;
2051
2052 $self->{state} = BOGUS_DOCTYPE_STATE;
2053 !!!next-input-character;
2054 redo A;
2055 }
2056 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2057 if ($self->{next_char} == 0x0022) { # "
2058 !!!cp (207);
2059 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2060 !!!next-input-character;
2061 redo A;
2062 } elsif ($self->{next_char} == 0x003E) { # >
2063 !!!cp (208);
2064 !!!parse-error (type => 'unclosed PUBLIC literal');
2065
2066 $self->{state} = DATA_STATE;
2067 !!!next-input-character;
2068
2069 $self->{current_token}->{quirks} = 1;
2070 !!!emit ($self->{current_token}); # DOCTYPE
2071
2072 redo A;
2073 } elsif ($self->{next_char} == -1) {
2074 !!!cp (209);
2075 !!!parse-error (type => 'unclosed SYSTEM literal');
2076
2077 $self->{state} = DATA_STATE;
2078 ## reconsume
2079
2080 $self->{current_token}->{quirks} = 1;
2081 !!!emit ($self->{current_token}); # DOCTYPE
2082
2083 redo A;
2084 } else {
2085 !!!cp (210);
2086 $self->{current_token}->{system_identifier} # DOCTYPE
2087 .= chr $self->{next_char};
2088 ## Stay in the state
2089 !!!next-input-character;
2090 redo A;
2091 }
2092 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2093 if ($self->{next_char} == 0x0027) { # '
2094 !!!cp (211);
2095 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2096 !!!next-input-character;
2097 redo A;
2098 } elsif ($self->{next_char} == 0x003E) { # >
2099 !!!cp (212);
2100 !!!parse-error (type => 'unclosed PUBLIC literal');
2101
2102 $self->{state} = DATA_STATE;
2103 !!!next-input-character;
2104
2105 $self->{current_token}->{quirks} = 1;
2106 !!!emit ($self->{current_token}); # DOCTYPE
2107
2108 redo A;
2109 } elsif ($self->{next_char} == -1) {
2110 !!!cp (213);
2111 !!!parse-error (type => 'unclosed SYSTEM literal');
2112
2113 $self->{state} = DATA_STATE;
2114 ## reconsume
2115
2116 $self->{current_token}->{quirks} = 1;
2117 !!!emit ($self->{current_token}); # DOCTYPE
2118
2119 redo A;
2120 } else {
2121 !!!cp (214);
2122 $self->{current_token}->{system_identifier} # DOCTYPE
2123 .= chr $self->{next_char};
2124 ## Stay in the state
2125 !!!next-input-character;
2126 redo A;
2127 }
2128 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2129 if ({
2130 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2131 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2132 }->{$self->{next_char}}) {
2133 !!!cp (215);
2134 ## Stay in the state
2135 !!!next-input-character;
2136 redo A;
2137 } elsif ($self->{next_char} == 0x003E) { # >
2138 !!!cp (216);
2139 $self->{state} = DATA_STATE;
2140 !!!next-input-character;
2141
2142 !!!emit ($self->{current_token}); # DOCTYPE
2143
2144 redo A;
2145 } elsif ($self->{next_char} == -1) {
2146 !!!cp (217);
2147 !!!parse-error (type => 'unclosed DOCTYPE');
2148
2149 $self->{state} = DATA_STATE;
2150 ## reconsume
2151
2152 $self->{current_token}->{quirks} = 1;
2153 !!!emit ($self->{current_token}); # DOCTYPE
2154
2155 redo A;
2156 } else {
2157 !!!cp (218);
2158 !!!parse-error (type => 'string after SYSTEM literal');
2159 #$self->{current_token}->{quirks} = 1;
2160
2161 $self->{state} = BOGUS_DOCTYPE_STATE;
2162 !!!next-input-character;
2163 redo A;
2164 }
2165 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2166 if ($self->{next_char} == 0x003E) { # >
2167 !!!cp (219);
2168 $self->{state} = DATA_STATE;
2169 !!!next-input-character;
2170
2171 !!!emit ($self->{current_token}); # DOCTYPE
2172
2173 redo A;
2174 } elsif ($self->{next_char} == -1) {
2175 !!!cp (220);
2176 !!!parse-error (type => 'unclosed DOCTYPE');
2177 $self->{state} = DATA_STATE;
2178 ## reconsume
2179
2180 !!!emit ($self->{current_token}); # DOCTYPE
2181
2182 redo A;
2183 } else {
2184 !!!cp (221);
2185 ## Stay in the state
2186 !!!next-input-character;
2187 redo A;
2188 }
2189 } else {
2190 die "$0: $self->{state}: Unknown state";
2191 }
2192 } # A
2193
2194 die "$0: _get_next_token: unexpected case";
2195 } # _get_next_token
2196
sub _tokenize_attempt_to_consume_an_entity ($$$) {
  my ($self, $in_attr, $additional) = @_;
  ## Tries to read a character reference beginning at the current
  ## next input character.
  ##
  ##   $in_attr    - If true, a named reference that lacks ";" and is
  ##                 followed by further name characters is returned
  ##                 as literal text instead of being replaced.
  ##   $additional - One more code point before which no reference is
  ##                 attempted.
  ##
  ## Returns |undef| when no reference is recognized.  Otherwise
  ## returns a hash reference for a CHARACTER_TOKEN; |has_reference|
  ## is set only when a reference has actually been replaced.

  ## HT, LF, VT, FF, SP, <, &, -1 (apparently the end-of-input marker)
  ## and the caller-supplied character.  CR (0x000D) is not listed.
  my %not_a_reference = map { ($_ => 1) } (
    0x0009, 0x000A, 0x000B, 0x000C, 0x0020, 0x003C, 0x0026, -1,
    $additional,
  );
  if ($not_a_reference{$self->{next_char}}) {
    !!!cp (1001);
    ## Don't consume
    ## No error
    return undef;
  }

  ## Numeric character reference
  if ($self->{next_char} == 0x0023) { # #
    !!!next-input-character;

    if ($self->{next_char} == 0x0078 or # x
        $self->{next_char} == 0x0058) { # X
      my $number;
      my $marker;
      HEXDIGIT: while (1) {
        ## $marker is the character preceding the one examined below:
        ## the "x" or "X" in the first round.
        $marker = $self->{next_char};
        !!!next-input-character;
        my $input = $self->{next_char};
        my $digit;
        if (0x0030 <= $input and $input <= 0x0039) { # 0..9
          !!!cp (1002);
          $digit = $input - 0x0030;
        } elsif (0x0061 <= $input and $input <= 0x0066) { # a..f
          !!!cp (1003);
          $digit = $input - 0x0060 + 9;
        } elsif (0x0041 <= $input and $input <= 0x0046) { # A..F
          !!!cp (1004);
          $digit = $input - 0x0040 + 9;
        } else {
          last HEXDIGIT;
        }
        $number = ($number || 0) * 0x10 + $digit;
      } # HEXDIGIT

      unless (defined $number) { # no hexadecimal digit
        !!!cp (1005);
        !!!parse-error (type => 'bare hcro');
        ## Push back the "x" and the character after it, and make
        ## "#" the next input character again.
        !!!back-next-input-character ($marker, $self->{next_char});
        $self->{next_char} = 0x0023; # #
        return undef;
      }

      if ($self->{next_char} == 0x003B) { # ;
        !!!cp (1006);
        !!!next-input-character;
      } else {
        !!!cp (1007);
        !!!parse-error (type => 'no refc');
      }

      if ($number == 0 or (0xD800 <= $number and $number <= 0xDFFF)) {
        !!!cp (1008);
        !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $number);
        $number = 0xFFFD;
      } elsif ($number > 0x10FFFF) {
        !!!cp (1009);
        !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $number);
        $number = 0xFFFD;
      } elsif ($number == 0x000D) {
        !!!cp (1010);
        !!!parse-error (type => 'CR character reference');
        $number = 0x000A;
      } elsif (0x80 <= $number and $number <= 0x9F) {
        !!!cp (1011);
        !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $number);
        $number = $c1_entity_char->{$number};
      }

      return {type => CHARACTER_TOKEN, data => chr $number,
              has_reference => 1};
    }

    if (0x0030 <= $self->{next_char} and
        $self->{next_char} <= 0x0039) { # 0..9
      my $number = $self->{next_char} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_char} and
             $self->{next_char} <= 0x0039) { # 0..9
        !!!cp (1012);
        $number = $number * 10 + ($self->{next_char} - 0x0030);

        !!!next-input-character;
      }

      if ($self->{next_char} == 0x003B) { # ;
        !!!cp (1013);
        !!!next-input-character;
      } else {
        !!!cp (1014);
        !!!parse-error (type => 'no refc');
      }

      if ($number == 0 or (0xD800 <= $number and $number <= 0xDFFF)) {
        !!!cp (1015);
        !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $number);
        $number = 0xFFFD;
      } elsif ($number > 0x10FFFF) {
        !!!cp (1016);
        !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $number);
        $number = 0xFFFD;
      } elsif ($number == 0x000D) {
        !!!cp (1017);
        !!!parse-error (type => 'CR character reference');
        $number = 0x000A;
      } elsif (0x80 <= $number and $number <= 0x9F) {
        !!!cp (1018);
        !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $number);
        $number = $c1_entity_char->{$number};
      }

      return {type => CHARACTER_TOKEN, data => chr $number, has_reference => 1};
    }

    ## "#" followed by neither "x"/"X" nor a decimal digit
    !!!cp (1019);
    !!!parse-error (type => 'bare nero');
    !!!back-next-input-character ($self->{next_char});
    $self->{next_char} = 0x0023; # #
    return undef;
  }

  ## Named character reference: it has to begin with an ASCII letter.
  my $initial = $self->{next_char};
  unless ((0x0041 <= $initial and $initial <= 0x005A) or # A..Z
          (0x0061 <= $initial and $initial <= 0x007A)) { # a..z
    !!!cp (1027);
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }

  my $name = chr $initial;
  !!!next-input-character;

  ## $data is the text to return.  $status is 0 while nothing has
  ## matched, 1 for a match terminated by ";", -1 for a match without
  ## ";", and drops below -1 once non-matching characters follow
  ## such a match.
  my $data = $name;
  my $status = 0;
  require Whatpm::_NamedEntityList;
  our $EntityChar;

  ## NOTE: 10 is some number greater than the maximum length of
  ## entity name.
  NAMECHAR: while (length ($name) < 10) {
    my $input = $self->{next_char};
    my $is_name_char = ((0x0041 <= $input and $input <= 0x005A) or # A..Z
                        (0x0061 <= $input and $input <= 0x007A) or # a..z
                        (0x0030 <= $input and $input <= 0x0039) or # 0..9
                        $input == 0x003B); # ;
    last NAMECHAR unless $is_name_char;

    $name .= chr $input;
    my $replacement = $EntityChar->{$name};
    if (not defined $replacement) {
      !!!cp (1022);
      $data .= chr $input;
      $status *= 2;
      !!!next-input-character;
    } elsif ($input == 0x003B) { # ;
      !!!cp (1020);
      $data = $replacement;
      $status = 1;
      !!!next-input-character;
      last NAMECHAR;
    } else {
      !!!cp (1021);
      $data = $replacement;
      $status = -1;
      !!!next-input-character;
    }
  } # NAMECHAR

  if ($status > 0) {
    !!!cp (1023);
    return {type => CHARACTER_TOKEN, data => $data, has_reference => 1};
  }

  unless ($status < 0) {
    !!!cp (1026);
    !!!parse-error (type => 'bare ero');
    ## NOTE: "No characters are consumed" in the spec.
    return {type => CHARACTER_TOKEN, data => '&'.$data};
  }

  ## A name has matched, but it was not terminated by ";".
  !!!parse-error (type => 'no refc');
  if ($in_attr and $status < -1) {
    !!!cp (1024);
    return {type => CHARACTER_TOKEN, data => '&'.$name};
  }

  !!!cp (1025);
  return {type => CHARACTER_TOKEN, data => $data, has_reference => 1};
} # _tokenize_attempt_to_consume_an_entity
2389
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  ## Prepares the document for tree construction: strict error
  ## checking is switched off and the document is flagged as HTML.
  ## NOTE: $self->{document} MUST be specified before this method is called
  my $document = $self->{document};
  $document->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $document->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2398
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  ## Switches the document's strict error checking back on.
  ## TODO: Turn mutation events on
  my $document = $self->{document};
  $document->strict_error_checking (1);
} # _terminate_tree_constructor
2404
2405 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2406
2407 { # tree construction stage
2408 my $token;
2409
sub _construct_tree ($) {
  my $self = shift;
  ## Runs the tree construction stage: fetches the first token, resets
  ## the per-parse state and then runs the initial, root element and
  ## main phases in this order.

  ## NOTE: It is not defined when an interactive UA renders the
  ## $self->{document} available to the user, or when it begins
  ## accepting user input.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is concatenation
  ## of all those characters. # MUST

  !!!next-token;

  $self->{open_elements} = [];
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  ## These three members are reset to |undef|.
  @{$self}{qw(form_element head_element inner_html_node)} = ();

  $self->_tree_construction_initial (); # MUST
  $self->_tree_construction_root_element ();
  $self->_tree_construction_main ();
} # _construct_tree
2433
sub _tree_construction_initial ($) {
  my $self = shift;
  ## Initial phase of tree construction.  Tokens are processed until
  ## either a DOCTYPE token has been handled or a token that implies
  ## a missing DOCTYPE is seen.  In both cases the document's
  ## compatibility mode is decided here.  White space characters are
  ## dropped and comments are appended to the document.
  ##
  ## On return the current $token is the one to be handled by the
  ## root element phase.
  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared ASCII case-insensitively:
        ## the identifier is upper-cased and every literal it is
        ## compared with below MUST therefore be written in upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## These two identifiers select quirks mode only if the
          ## system identifier is missing; with a system identifier
          ## they select limited quirks mode.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared in lower case.  This
        ## check runs independently of the public identifier checks
        ## above and can only switch the mode to quirks.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      ## Something other than white space precedes any DOCTYPE.
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL
} # _tree_construction_initial
2604
sub _tree_construction_root_element ($) {
  my ($self) = @_;
  ## Root element phase of tree construction.  DOCTYPE tokens are
  ## reported and dropped, comments are appended to the document and
  ## leading white space is removed.  For any other token the
  ## application cache selection callback is invoked, the |html|
  ## element is created, appended to the document and pushed onto the
  ## stack of open elements.  The current token is left as is, to be
  ## reprocessed by the main phase.

  ROOT: {
    my $type = $token->{type};

    if ($type == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token
      ## Stay in the phase
      !!!next-token;
      redo ROOT;
    }

    if ($type == COMMENT_TOKEN) {
      $self->{document}->append_child
          ($self->{document}->create_comment ($token->{data}));
      ## Stay in the phase
      !!!next-token;
      redo ROOT;
    }

    my $select_cache = $self->{application_cache_selection};

    if ($type == CHARACTER_TOKEN) {
      ## Leading white space is ignored.  If nothing else remains in
      ## the token, stay in the phase and look at the next token.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)// # \x0D
          and not length $token->{data}) {
        !!!next-token;
        redo ROOT;
      }

      $select_cache->(undef);
    } elsif ($type == START_TAG_TOKEN) {
      ## Only the |manifest| attribute of an |html| start tag is used.
      my $manifest = $token->{tag_name} eq 'html'
          ? $token->{attributes}->{manifest} : undef;
      if ($manifest) {
        $select_cache->($manifest->{value});
        ## ISSUE: No relative reference resolution?
      } else {
        $select_cache->(undef);
      }

      ## ISSUE: There is an issue in the spec
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$type}) {
      $select_cache->(undef);

      ## ISSUE: There is an issue in the spec
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    my $root_element;
    !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    ## reprocess
    return; ## Go to the main phase.
  } # ROOT
} # _tree_construction_root_element
2667
sub _reset_insertion_mode ($) {
  my $self = shift;
  ## Sets $self->{insertion_mode} by walking the stack of open
  ## elements from its last entry towards its first one until an
  ## entry whose element name decides the mode is found.  Each entry
  ## is used as an array reference [node, element name], as is
  ## $self->{inner_html_node} if it is defined.

  ## Insertion modes decided directly by the element name
  ## (steps 4..13).  The table does not change while walking.
  my $mode_by_name = {
    select => IN_SELECT_IM,
    td => IN_CELL_IM,
    th => IN_CELL_IM,
    tr => IN_ROW_IM,
    tbody => IN_TABLE_BODY_IM,
    thead => IN_TABLE_BODY_IM,
    tfoot => IN_TABLE_BODY_IM,
    caption => IN_CAPTION_IM,
    colgroup => IN_COLUMN_GROUP_IM,
    table => IN_TABLE_IM,
    head => IN_BODY_IM, # not in head!
    body => IN_BODY_IM,
    frameset => IN_FRAMESET_IM,
  };

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          #
        } else {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    ## The assignment and the return are separate statements so that
    ## the return does not depend on the truth value of the mode
    ## constant.
    my $new_mode = $mode_by_name->{$node->[1]};
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
2736
2737 sub _tree_construction_main ($) {
2738 my $self = shift;
2739
2740 my $active_formatting_elements = [];
2741
2742 my $reconstruct_active_formatting_elements = sub { # MUST
2743 my $insert = shift;
2744
2745 ## Step 1
2746 return unless @$active_formatting_elements;
2747
2748 ## Step 3
2749 my $i = -1;
2750 my $entry = $active_formatting_elements->[$i];
2751
2752 ## Step 2
2753 return if $entry->[0] eq '#marker';
2754 for (@{$self->{open_elements}}) {
2755 if ($entry->[0] eq $_->[0]) {
2756 return;
2757 }
2758 }
2759
2760 S4: {
2761 ## Step 4
2762 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2763
2764 ## Step 5
2765 $i--;
2766 $entry = $active_formatting_elements->[$i];
2767
2768 ## Step 6
2769 if ($entry->[0] eq '#marker') {
2770 #
2771 } else {
2772 my $in_open_elements;
2773 OE: for (@{$self->{open_elements}}) {
2774 if ($entry->[0] eq $_->[0]) {
2775 $in_open_elements = 1;
2776 last OE;
2777 }
2778 }
2779 if ($in_open_elements) {
2780 #
2781 } else {
2782 redo S4;
2783 }
2784 }
2785
2786 ## Step 7
2787 $i++;
2788 $entry = $active_formatting_elements->[$i];
2789 } # S4
2790
2791 S7: {
2792 ## Step 8
2793 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2794
2795 ## Step 9
2796 $insert->($clone->[0]);
2797 push @{$self->{open_elements}}, $clone;
2798
2799 ## Step 10
2800 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2801
2802 ## Step 11
2803 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2804 ## Step 7'
2805 $i++;
2806 $entry = $active_formatting_elements->[$i];
2807
2808 redo S7;
2809 }
2810 } # S7
2811 }; # $reconstruct_active_formatting_elements
2812
2813 my $clear_up_to_marker = sub {
2814 for (reverse 0..$#$active_formatting_elements) {
2815 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
2816 splice @$active_formatting_elements, $_;
2817 return;
2818 }
2819 }
2820 }; # $clear_up_to_marker
2821
2822 my $parse_rcdata = sub ($$) {
2823 my ($content_model_flag, $insert) = @_;
2824
2825 ## Step 1
2826 my $start_tag_name = $token->{tag_name};
2827 my $el;
2828 !!!create-element ($el, $start_tag_name, $token->{attributes});
2829
2830 ## Step 2
2831 $insert->($el); # /context node/->append_child ($el)
2832
2833 ## Step 3
2834 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2835 delete $self->{escape}; # MUST
2836
2837 ## Step 4
2838 my $text = '';
2839 !!!next-token;
2840 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2841 $text .= $token->{data};
2842 !!!next-token;
2843 }
2844
2845 ## Step 5
2846 if (length $text) {
2847 my $text = $self->{document}->create_text_node ($text);
2848 $el->append_child ($text);
2849 }
2850
2851 ## Step 6
2852 $self->{content_model} = PCDATA_CONTENT_MODEL;
2853
2854 ## Step 7
2855 if ($token->{type} == END_TAG_TOKEN and $token->{tag_name} eq $start_tag_name) {
2856 ## Ignore the token
2857 } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
2858 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2859 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2860 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2861 } else {
2862 die "$0: $content_model_flag in parse_rcdata";
2863 }
2864 !!!next-token;
2865 }; # $parse_rcdata
2866
2867 my $script_start_tag = sub ($) {
2868 my $insert = $_[0];
2869 my $script_el;
2870 !!!create-element ($script_el, 'script', $token->{attributes});
2871 ## TODO: mark as "parser-inserted"
2872
2873 $self->{content_model} = CDATA_CONTENT_MODEL;
2874 delete $self->{escape}; # MUST
2875
2876 my $text = '';
2877 !!!next-token;
2878 while ($token->{type} == CHARACTER_TOKEN) {
2879 $text .= $token->{data};
2880 !!!next-token;
2881 } # stop if non-character token or tokenizer stops tokenising
2882 if (length $text) {
2883 $script_el->manakai_append_text ($text);
2884 }
2885
2886 $self->{content_model} = PCDATA_CONTENT_MODEL;
2887
2888 if ($token->{type} == END_TAG_TOKEN and
2889 $token->{tag_name} eq 'script') {
2890 ## Ignore the token
2891 } else {
2892 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2893 ## ISSUE: And ignore?
2894 ## TODO: mark as "already executed"
2895 }
2896
2897 if (defined $self->{inner_html_node}) {
2898 ## TODO: mark as "already executed"
2899 } else {
2900 ## TODO: $old_insertion_point = current insertion point
2901 ## TODO: insertion point = just before the next input character
2902
2903 $insert->($script_el);
2904
2905 ## TODO: insertion point = $old_insertion_point (might be "undefined")
2906
2907 ## TODO: if there is a script that will execute as soon as the parser resume, then...
2908 }
2909
2910 !!!next-token;
2911 }; # $script_start_tag
2912
2913 my $formatting_end_tag = sub {
2914 my $tag_name = shift;
2915
2916 FET: {
2917 ## Step 1
2918 my $formatting_element;
2919 my $formatting_element_i_in_active;
2920 AFE: for (reverse 0..$#$active_formatting_elements) {
2921 if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
2922 $formatting_element = $active_formatting_elements->[$_];
2923 $formatting_element_i_in_active = $_;
2924 last AFE;
2925 } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
2926 last AFE;
2927 }
2928 } # AFE
2929 unless (defined $formatting_element) {
2930 !!!parse-error (type => 'unmatched end tag:'.$tag_name);
2931 ## Ignore the token
2932 !!!next-token;
2933 return;
2934 }
2935 ## has an element in scope
2936 my $in_scope = 1;
2937 my $formatting_element_i_in_open;
2938 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2939 my $node = $self->{open_elements}->[$_];
2940 if ($node->[0] eq $formatting_element->[0]) {
2941 if ($in_scope) {
2942 $formatting_element_i_in_open = $_;
2943 last INSCOPE;
2944 } else { # in open elements but not in scope
2945 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2946 ## Ignore the token
2947 !!!next-token;
2948 return;
2949 }
2950 } elsif ({
2951 table => 1, caption => 1, td => 1, th => 1,
2952 button => 1, marquee => 1, object => 1, html => 1,
2953 }->{$node->[1]}) {
2954 $in_scope = 0;
2955 }
2956 } # INSCOPE
2957 unless (defined $formatting_element_i_in_open) {
2958 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2959 pop @$active_formatting_elements; # $formatting_element
2960 !!!next-token; ## TODO: ok?
2961 return;
2962 }
2963 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
2964 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2965 }
2966
2967 ## Step 2
2968 my $furthest_block;
2969 my $furthest_block_i_in_open;
2970 OE: for (reverse 0..$#{$self->{open_elements}}) {
2971 my $node = $self->{open_elements}->[$_];
2972 if (not $formatting_category->{$node->[1]} and
2973 #not $phrasing_category->{$node->[1]} and
2974 ($special_category->{$node->[1]} or
2975 $scoping_category->{$node->[1]})) {
2976 $furthest_block = $node;
2977 $furthest_block_i_in_open = $_;
2978 } elsif ($node->[0] eq $formatting_element->[0]) {
2979 last OE;
2980 }
2981 } # OE
2982
2983 ## Step 3
2984 unless (defined $furthest_block) { # MUST
2985 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
2986 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
2987 !!!next-token;
2988 return;
2989 }
2990
2991 ## Step 4
2992 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
2993
2994 ## Step 5
2995 my $furthest_block_parent = $furthest_block->[0]->parent_node;
2996 if (defined $furthest_block_parent) {
2997 $furthest_block_parent->remove_child ($furthest_block->[0]);
2998 }
2999
3000 ## Step 6
3001 my $bookmark_prev_el
3002 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
3003 ->[0];
3004
3005 ## Step 7
3006 my $node = $furthest_block;
3007 my $node_i_in_open = $furthest_block_i_in_open;
3008 my $last_node = $furthest_block;
3009 S7: {
3010 ## Step 1
3011 $node_i_in_open--;
3012 $node = $self->{open_elements}->[$node_i_in_open];
3013
3014 ## Step 2
3015 my $node_i_in_active;
3016 S7S2: {
3017 for (reverse 0..$#$active_formatting_elements) {
3018 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
3019 $node_i_in_active = $_;
3020 last S7S2;
3021 }
3022 }
3023 splice @{$self->{open_elements}}, $node_i_in_open, 1;
3024 redo S7;
3025 } # S7S2
3026
3027 ## Step 3
3028 last S7 if $node->[0] eq $formatting_element->[0];
3029
3030 ## Step 4
3031 if ($last_node->[0] eq $furthest_block->[0]) {
3032 $bookmark_prev_el = $node->[0];
3033 }
3034
3035 ## Step 5
3036 if ($node->[0]->has_child_nodes ()) {
3037 my $clone = [$node->[0]->clone_node (0), $node->[1]];
3038 $active_formatting_elements->[$node_i_in_active] = $clone;
3039 $self->{open_elements}->[$node_i_in_open] = $clone;
3040 $node = $clone;
3041 }
3042
3043 ## Step 6
3044 $node->[0]->append_child ($last_node->[0]);
3045
3046 ## Step 7
3047 $last_node = $node;
3048
3049 ## Step 8
3050 redo S7;
3051 } # S7
3052
3053 ## Step 8
3054 $common_ancestor_node->[0]->append_child ($last_node->[0]);
3055
3056 ## Step 9
3057 my $clone = [$formatting_element->[0]->clone_node (0),
3058 $formatting_element->[1]];
3059
3060 ## Step 10
3061 my @cn = @{$furthest_block->[0]->child_nodes};
3062 $clone->[0]->append_child ($_) for @cn;
3063
3064 ## Step 11
3065 $furthest_block->[0]->append_child ($clone->[0]);
3066
3067 ## Step 12
3068 my $i;
3069 AFE: for (reverse 0..$#$active_formatting_elements) {
3070 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
3071 splice @$active_formatting_elements, $_, 1;
3072 $i-- and last AFE if defined $i;
3073 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
3074 $i = $_;
3075 }
3076 } # AFE
3077 splice @$active_formatting_elements, $i + 1, 0, $clone;
3078
3079 ## Step 13
3080 undef $i;
3081 OE: for (reverse 0..$#{$self->{open_elements}}) {
3082 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
3083 splice @{$self->{open_elements}}, $_, 1;
3084 $i-- and last OE if defined $i;
3085 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
3086 $i = $_;
3087 }
3088 } # OE
3089 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
3090
3091 ## Step 14
3092 redo FET;
3093 } # FET
3094 }; # $formatting_end_tag
3095
## Insertion callback: appends the node given as the first argument to
## the node held by the last entry of |$self->{open_elements}| (each
## entry is a |[node, tag name]| pair).  Returns whatever
## |append_child| returns.
my $insert_to_current = sub {
  my ($new_child) = @_;
  return $self->{open_elements}->[-1]->[0]->append_child ($new_child);
}; # $insert_to_current
3099
## Insertion callback used while in table-related insertion modes.
##
## If the last entry of |$self->{open_elements}| is not a |table|,
## |tbody|, |tfoot|, |thead|, or |tr|, the node is simply appended to
## it.  Otherwise the node is inserted into a "foster parent":
##
##   - If the nearest |table| in the stack (searching from the top)
##     has a parent whose |node_type| is 1 (an element node in the
##     DOM), the node is inserted into that parent, immediately before
##     the |table|.
##   - Otherwise the node is appended to the element just below that
##     |table| in the stack.
##   - If no foster parent has been chosen by the above, the first
##     entry of the stack is used.
my $insert_to_foster = sub {
  my ($new_child) = @_;

  my %is_table_structure = map { $_ => 1 } qw(table tbody tfoot thead tr);

  ## Ordinary case: the last open element can take the node directly.
  unless ($is_table_structure{$self->{open_elements}->[-1]->[1]}) {
    return $self->{open_elements}->[-1]->[0]->append_child ($new_child);
  }

  # MUST
  ## Locate the nearest |table| entry, scanning from the top of the stack.
  my $table_index;
  for my $i (reverse 0..$#{$self->{open_elements}}) {
    next unless $self->{open_elements}->[$i]->[1] eq 'table';
    $table_index = $i;
    last;
  }

  my ($foster_parent_element, $reference_node);
  if (defined $table_index) {
    my $table_node = $self->{open_elements}->[$table_index]->[0];
    my $table_parent = $table_node->parent_node;
    if (defined $table_parent and $table_parent->node_type == 1) {
      ## Insert just before the |table| within its parent element.
      ($foster_parent_element, $reference_node)
          = ($table_parent, $table_node);
    } else {
      ## No element parent: use the entry just below the |table|.  The
      ## reference node stays undefined, so |insert_before| is called
      ## without a reference child.
      $foster_parent_element
          = $self->{open_elements}->[$table_index - 1]->[0];
    }
  }

  ## Fall back to the first entry of the stack of open elements.
  $foster_parent_element = $self->{open_elements}->[0]->[0]
      unless defined $foster_parent_element;

  return $foster_parent_element->insert_before
      ($new_child, $reference_node);
}; # $insert_to_foster
3130
3131 my $insert;
3132
3133 B: {
3134 if ($token->{type} == DOCTYPE_TOKEN) {
3135 !!!parse-error (type => 'DOCTYPE in the middle');
3136 ## Ignore the token
3137 ## Stay in the phase
3138 !!!next-token;
3139 redo B;
3140 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
3141 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3142 #
3143 } else {
3144 ## Generate implied end tags
3145 if ({
3146 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3147 tbody => 1, tfoot=> 1, thead => 1,
3148 }->{$self->{open_elements}->[-1]->[1]}) {
3149 !!!back-token;
3150 $token = {type => END_TAG_TOKEN, tag_name => $self->{open_elements}->[-1]->[1]};
3151 redo B;
3152 }
3153
3154 if (@{$self->{open_elements}} > 2 or
3155 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
3156 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3157 } elsif (defined $self->{inner_html_node} and
3158 @{$self->{open_elements}} > 1 and
3159 $self->{open_elements}->[1]->[1] ne 'body') {
3160 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3161 }
3162
3163 ## ISSUE: There is an issue in the spec.
3164 }
3165
3166 ## Stop parsing
3167 last B;
3168 } elsif ($token->{type} == START_TAG_TOKEN and
3169 $token->{tag_name} eq 'html') {
3170 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3171 ## Turn into the main phase
3172 !!!parse-error (type => 'after html:html');
3173 $self->{insertion_mode} = AFTER_BODY_IM;
3174 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3175 ## Turn into the main phase
3176 !!!parse-error (type => 'after html:html');
3177 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3178 }
3179
3180 ## ISSUE: "aa<html>" is not a parse error.
3181 ## ISSUE: "<html>" in fragment is not a parse error.
3182 unless ($token->{first_start_tag}) {
3183 !!!parse-error (type => 'not first start tag');
3184 }
3185 my $top_el = $self->{open_elements}->[0]->[0];
3186 for my $attr_name (keys %{$token->{attributes}}) {
3187 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3188 $top_el->set_attribute_ns
3189 (undef, [undef, $attr_name],
3190 $token->{attributes}->{$attr_name}->{value});
3191 }
3192 }
3193 !!!next-token;
3194 redo B;
3195 } elsif ($token->{type} == COMMENT_TOKEN) {
3196 my $comment = $self->{document}->create_comment ($token->{data});
3197 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3198 $self->{document}->append_child ($comment);
3199 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3200 $self->{open_elements}->[0]->[0]->append_child ($comment);
3201 } else {
3202 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3203 }
3204 !!!next-token;
3205 redo B;
3206 } elsif ($self->{insertion_mode} & HEAD_IMS) {
3207 if ($token->{type} == CHARACTER_TOKEN) {
3208 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3209 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3210 unless (length $token->{data}) {
3211 !!!next-token;
3212 redo B;
3213 }
3214 }
3215
3216 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3217 ## As if <head>
3218 !!!create-element ($self->{head_element}, 'head');
3219 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3220 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3221
3222 ## Reprocess in the "in head" insertion mode...
3223 pop @{$self->{open_elements}};
3224
3225 ## Reprocess in the "after head" insertion mode...
3226 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3227 ## As if </noscript>
3228 pop @{$self->{open_elements}};
3229 !!!parse-error (type => 'in noscript:#character');
3230
3231 ## Reprocess in the "in head" insertion mode...
3232 ## As if </head>
3233 pop @{$self->{open_elements}};
3234
3235 ## Reprocess in the "after head" insertion mode...
3236 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3237 pop @{$self->{open_elements}};
3238
3239 ## Reprocess in the "after head" insertion mode...
3240 }
3241
3242 ## "after head" insertion mode
3243 ## As if <body>
3244 !!!insert-element ('body');
3245 $self->{insertion_mode} = IN_BODY_IM;
3246 ## reprocess
3247 redo B;
3248 } elsif ($token->{type} == START_TAG_TOKEN) {
3249 if ($token->{tag_name} eq 'head') {
3250 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3251 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
3252 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3253 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
3254 $self->{insertion_mode} = IN_HEAD_IM;
3255 !!!next-token;
3256 redo B;
3257 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3258 #
3259 } else {
3260 !!!parse-error (type => 'in head:head'); # or in head noscript
3261 ## Ignore the token
3262 !!!next-token;
3263 redo B;
3264 }
3265 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3266 ## As if <head>
3267 !!!create-element ($self->{head_element}, 'head');
3268 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3269 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3270
3271 $self->{insertion_mode} = IN_HEAD_IM;
3272 ## Reprocess in the "in head" insertion mode...
3273 }
3274
3275 if ($token->{tag_name} eq 'base') {
3276 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3277 ## As if </noscript>
3278 pop @{$self->{open_elements}};
3279 !!!parse-error (type => 'in noscript:base');
3280
3281 $self->{insertion_mode} = IN_HEAD_IM;
3282 ## Reprocess in the "in head" insertion mode...
3283 }
3284
3285 ## NOTE: There is a "as if in head" code clone.
3286 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3287 !!!parse-error (type => 'after head:'.$token->{tag_name});
3288 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3289 }
3290 !!!insert-element ($token->{tag_name}, $token->{attributes});
3291 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3292 pop @{$self->{open_elements}}
3293 if $self->{insertion_mode} == AFTER_HEAD_IM;
3294 !!!next-token;
3295 redo B;
3296 } elsif ($token->{tag_name} eq 'link') {
3297 ## NOTE: There is a "as if in head" code clone.
3298 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3299 !!!parse-error (type => 'after head:'.$token->{tag_name});
3300 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3301 }
3302 !!!insert-element ($token->{tag_name}, $token->{attributes});
3303 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3304 pop @{$self->{open_elements}}
3305 if $self->{insertion_mode} == AFTER_HEAD_IM;
3306 !!!next-token;
3307 redo B;
3308 } elsif ($token->{tag_name} eq 'meta') {
3309 ## NOTE: There is a "as if in head" code clone.
3310 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3311 !!!parse-error (type => 'after head:'.$token->{tag_name});
3312 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3313 }
3314 !!!insert-element ($token->{tag_name}, $token->{attributes});
3315 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3316
3317 unless ($self->{confident}) {
3318 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3319 $self->{change_encoding}
3320 ->($self, $token->{attributes}->{charset}->{value});
3321
3322 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3323 ->set_user_data (manakai_has_reference =>
3324 $token->{attributes}->{charset}
3325 ->{has_reference});
3326 } elsif ($token->{attributes}->{content}) {
3327 ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to the definition.
3328 if ($token->{attributes}->{content}->{value}
3329 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3330 [\x09-\x0D\x20]*=
3331 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3332 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3333 $self->{change_encoding}
3334 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
3335 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3336 ->set_user_data (manakai_has_reference =>
3337 $token->{attributes}->{content}
3338 ->{has_reference});
3339 }
3340 }
3341 } else {
3342 if ($token->{attributes}->{charset}) {
3343 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3344 ->set_user_data (manakai_has_reference =>
3345 $token->{attributes}->{charset}
3346 ->{has_reference});
3347 }
3348 if ($token->{attributes}->{content}) {
3349 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3350 ->set_user_data (manakai_has_reference =>
3351 $token->{attributes}->{content}
3352 ->{has_reference});
3353 }
3354 }
3355
3356 pop @{$self->{open_elements}}
3357 if $self->{insertion_mode} == AFTER_HEAD_IM;
3358 !!!next-token;
3359 redo B;
3360 } elsif ($token->{tag_name} eq 'title') {
3361 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3362 ## As if </noscript>
3363 pop @{$self->{open_elements}};
3364 !!!parse-error (type => 'in noscript:title');
3365
3366 $self->{insertion_mode} = IN_HEAD_IM;
3367 ## Reprocess in the "in head" insertion mode...
3368 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3369 !!!parse-error (type => 'after head:'.$token->{tag_name});
3370 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3371 }
3372
3373 ## NOTE: There is a "as if in head" code clone.
3374 my $parent = defined $self->{head_element} ? $self->{head_element}
3375 : $self->{open_elements}->[-1]->[0];
3376 $parse_rcdata->(RCDATA_CONTENT_MODEL,
3377 sub { $parent->append_child ($_[0]) });
3378 pop @{$self->{open_elements}}
3379 if $self->{insertion_mode} == AFTER_HEAD_IM;
3380 redo B;
3381 } elsif ($token->{tag_name} eq 'style') {
3382 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3383 ## insertion mode IN_HEAD_IM)
3384 ## NOTE: There is a "as if in head" code clone.
3385 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3386 !!!parse-error (type => 'after head:'.$token->{tag_name});
3387 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3388 }
3389 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
3390 pop @{$self->{open_elements}}
3391 if $self->{insertion_mode} == AFTER_HEAD_IM;
3392 redo B;
3393 } elsif ($token->{tag_name} eq 'noscript') {
3394 if ($self->{insertion_mode} == IN_HEAD_IM) {
3395 ## NOTE: and scripting is disabled
3396 !!!insert-element ($token->{tag_name}, $token->{attributes});
3397 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
3398 !!!next-token;
3399 redo B;
3400 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3401 !!!parse-error (type => 'in noscript:noscript');
3402 ## Ignore the token
3403 !!!next-token;
3404 redo B;
3405 } else {
3406 #
3407 }
3408 } elsif ($token->{tag_name} eq 'script') {
3409 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3410 ## As if </noscript>
3411 pop @{$self->{open_elements}};
3412 !!!parse-error (type => 'in noscript:script');
3413
3414 $self->{insertion_mode} = IN_HEAD_IM;
3415 ## Reprocess in the "in head" insertion mode...
3416 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3417 !!!parse-error (type => 'after head:'.$token->{tag_name});
3418 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3419 }
3420
3421 ## NOTE: There is a "as if in head" code clone.
3422 $script_start_tag->($insert_to_current);
3423 pop @{$self->{open_elements}}
3424 if $self->{insertion_mode} == AFTER_HEAD_IM;
3425 redo B;
3426 } elsif ($token->{tag_name} eq 'body' or
3427 $token->{tag_name} eq 'frameset') {
3428 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3429 ## As if </noscript>
3430 pop @{$self->{open_elements}};
3431 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
3432
3433 ## Reprocess in the "in head" insertion mode...
3434 ## As if </head>
3435 pop @{$self->{open_elements}};
3436
3437 ## Reprocess in the "after head" insertion mode...
3438 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3439 pop @{$self->{open_elements}};
3440
3441 ## Reprocess in the "after head" insertion mode...
3442 }
3443
3444 ## "after head" insertion mode
3445 !!!insert-element ($token->{tag_name}, $token->{attributes});
3446 if ($token->{tag_name} eq 'body') {
3447 $self->{insertion_mode} = IN_BODY_IM;
3448 } elsif ($token->{tag_name} eq 'frameset') {
3449 $self->{insertion_mode} = IN_FRAMESET_IM;
3450 } else {
3451 die "$0: tag name: $self->{tag_name}";
3452 }
3453 !!!next-token;
3454 redo B;
3455 } else {
3456 #
3457 }
3458
3459 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3460 ## As if </noscript>
3461 pop @{$self->{open_elements}};
3462 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3463
3464 ## Reprocess in the "in head" insertion mode...
3465 ## As if </head>
3466 pop @{$self->{open_elements}};
3467
3468 ## Reprocess in the "after head" insertion mode...
3469 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3470 ## As if </head>
3471 pop @{$self->{open_elements}};
3472
3473 ## Reprocess in the "after head" insertion mode...
3474 }
3475
3476 ## "after head" insertion mode
3477 ## As if <body>
3478 !!!insert-element ('body');
3479 $self->{insertion_mode} = IN_BODY_IM;
3480 ## reprocess
3481 redo B;
3482 } elsif ($token->{type} == END_TAG_TOKEN) {
3483 if ($token->{tag_name} eq 'head') {
3484 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3485 ## As if <head>
3486 !!!create-element ($self->{head_element}, 'head');
3487 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3488 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3489
3490 ## Reprocess in the "in head" insertion mode...
3491 pop @{$self->{open_elements}};
3492 $self->{insertion_mode} = AFTER_HEAD_IM;
3493 !!!next-token;
3494 redo B;
3495 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3496 ## As if </noscript>
3497 pop @{$self->{open_elements}};
3498 !!!parse-error (type => 'in noscript:script');
3499
3500 ## Reprocess in the "in head" insertion mode...
3501 pop @{$self->{open_elements}};
3502 $self->{insertion_mode} = AFTER_HEAD_IM;
3503 !!!next-token;
3504 redo B;
3505 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3506 pop @{$self->{open_elements}};
3507 $self->{insertion_mode} = AFTER_HEAD_IM;
3508 !!!next-token;
3509 redo B;
3510 } else {
3511 #
3512 }
3513 } elsif ($token->{tag_name} eq 'noscript') {
3514 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3515 pop @{$self->{open_elements}};
3516 $self->{insertion_mode} = IN_HEAD_IM;
3517 !!!next-token;
3518 redo B;
3519 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3520 !!!parse-error (type => 'unmatched end tag:noscript');
3521 ## Ignore the token ## ISSUE: An issue in the spec.
3522 !!!next-token;
3523 redo B;
3524 } else {
3525 #
3526 }
3527 } elsif ({
3528 body => 1, html => 1,
3529 }->{$token->{tag_name}}) {
3530 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3531 ## As if <head>
3532 !!!create-element ($self->{head_element}, 'head');
3533 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3534 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3535
3536 $self->{insertion_mode} = IN_HEAD_IM;
3537 ## Reprocess in the "in head" insertion mode...
3538 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3539 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3540 ## Ignore the token
3541 !!!next-token;
3542 redo B;
3543 }
3544
3545 #
3546 } elsif ({
3547 p => 1, br => 1,
3548 }->{$token->{tag_name}}) {
3549 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3550 ## As if <head>
3551 !!!create-element ($self->{head_element}, 'head');
3552 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3553 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3554
3555 $self->{insertion_mode} = IN_HEAD_IM;
3556 ## Reprocess in the "in head" insertion mode...
3557 }
3558
3559 #
3560 } else {
3561 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3562 #
3563 } else {
3564 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3565 ## Ignore the token
3566 !!!next-token;
3567 redo B;
3568 }
3569 }
3570
3571 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3572 ## As if </noscript>
3573 pop @{$self->{open_elements}};
3574 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3575
3576 ## Reprocess in the "in head" insertion mode...
3577 ## As if </head>
3578 pop @{$self->{open_elements}};
3579
3580 ## Reprocess in the "after head" insertion mode...
3581 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3582 ## As if </head>
3583 pop @{$self->{open_elements}};
3584
3585 ## Reprocess in the "after head" insertion mode...
3586 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3587 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3588 ## Ignore the token ## ISSUE: An issue in the spec.
3589 !!!next-token;
3590 redo B;
3591 }
3592
3593 ## "after head" insertion mode
3594 ## As if <body>
3595 !!!insert-element ('body');
3596 $self->{insertion_mode} = IN_BODY_IM;
3597 ## reprocess
3598 redo B;
3599 } else {
3600 die "$0: $token->{type}: Unknown token type";
3601 }
3602
3603 ## ISSUE: An issue in the spec.
3604 } elsif ($self->{insertion_mode} & BODY_IMS) {
3605 if ($token->{type} == CHARACTER_TOKEN) {
3606 ## NOTE: There is a code clone of "character in body".
3607 $reconstruct_active_formatting_elements->($insert_to_current);
3608
3609 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3610
3611 !!!next-token;
3612 redo B;
3613 } elsif ($token->{type} == START_TAG_TOKEN) {
3614 if ({
3615 caption => 1, col => 1, colgroup => 1, tbody => 1,
3616 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3617 }->{$token->{tag_name}}) {
3618 if ($self->{insertion_mode} == IN_CELL_IM) {
3619 ## have an element in table scope
3620 my $tn;
3621 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3622 my $node = $self->{open_elements}->[$_];
3623 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3624 $tn = $node->[1];
3625 last INSCOPE;
3626 } elsif ({
3627 table => 1, html => 1,
3628 }->{$node->[1]}) {
3629 last INSCOPE;
3630 }
3631 } # INSCOPE
3632 unless (defined $tn) {
3633 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3634 ## Ignore the token
3635 !!!next-token;
3636 redo B;
3637 }
3638
3639 ## Close the cell
3640 !!!back-token; # <?>
3641 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3642 redo B;
3643 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3644 !!!parse-error (type => 'not closed:caption');
3645
3646 ## As if </caption>
3647 ## have a table element in table scope
3648 my $i;
3649 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3650 my $node = $self->{open_elements}->[$_];
3651 if ($node->[1] eq 'caption') {
3652 $i = $_;
3653 last INSCOPE;
3654 } elsif ({
3655 table => 1, html => 1,
3656 }->{$node->[1]}) {
3657 last INSCOPE;
3658 }
3659 } # INSCOPE
3660 unless (defined $i) {
3661 !!!parse-error (type => 'unmatched end tag:caption');
3662 ## Ignore the token
3663 !!!next-token;
3664 redo B;
3665 }
3666
3667 ## generate implied end tags
3668 if ({
3669 dd => 1, dt => 1, li => 1, p => 1,
3670 td => 1, th => 1, tr => 1,
3671 tbody => 1, tfoot=> 1, thead => 1,
3672 }->{$self->{open_elements}->[-1]->[1]}) {
3673 !!!back-token; # <?>
3674 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3675 !!!back-token;
3676 $token = {type => END_TAG_TOKEN,
3677 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3678 redo B;
3679 }
3680
3681 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3682 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3683 }
3684
3685 splice @{$self->{open_elements}}, $i;
3686
3687 $clear_up_to_marker->();
3688
3689 $self->{insertion_mode} = IN_TABLE_IM;
3690
3691 ## reprocess
3692 redo B;
3693 } else {
3694 #
3695 }
3696 } else {
3697 #
3698 }
3699 } elsif ($token->{type} == END_TAG_TOKEN) {
3700 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3701 if ($self->{insertion_mode} == IN_CELL_IM) {
3702 ## have an element in table scope
3703 my $i;
3704 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3705 my $node = $self->{open_elements}->[$_];
3706 if ($node->[1] eq $token->{tag_name}) {
3707 $i = $_;
3708 last INSCOPE;
3709 } elsif ({
3710 table => 1, html => 1,
3711 }->{$node->[1]}) {
3712 last INSCOPE;
3713 }
3714 } # INSCOPE
3715 unless (defined $i) {
3716 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3717 ## Ignore the token
3718 !!!next-token;
3719 redo B;
3720 }
3721
3722 ## generate implied end tags
3723 if ({
3724 dd => 1, dt => 1, li => 1, p => 1,
3725 td => ($token->{tag_name} eq 'th'),
3726 th => ($token->{tag_name} eq 'td'),
3727 tr => 1,
3728 tbody => 1, tfoot=> 1, thead => 1,
3729 }->{$self->{open_elements}->[-1]->[1]}) {
3730 !!!back-token;
3731 $token = {type => END_TAG_TOKEN,
3732 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3733 redo B;
3734 }
3735
3736 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3737 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3738 }
3739
3740 splice @{$self->{open_elements}}, $i;
3741
3742 $clear_up_to_marker->();
3743
3744 $self->{insertion_mode} = IN_ROW_IM;
3745
3746 !!!next-token;
3747 redo B;
3748 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3749 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3750 ## Ignore the token
3751 !!!next-token;
3752 redo B;
3753 } else {
3754 #
3755 }
3756 } elsif ($token->{tag_name} eq 'caption') {
3757 if ($self->{insertion_mode} == IN_CAPTION_IM) {
3758 ## have a table element in table scope
3759 my $i;
3760 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3761 my $node = $self->{open_elements}->[$_];
3762 if ($node->[1] eq $token->{tag_name}) {
3763 $i = $_;
3764 last INSCOPE;
3765 } elsif ({
3766 table => 1, html => 1,
3767 }->{$node->[1]}) {
3768 last INSCOPE;
3769 }
3770 } # INSCOPE
3771 unless (defined $i) {
3772 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3773 ## Ignore the token
3774 !!!next-token;
3775 redo B;
3776 }
3777
3778 ## generate implied end tags
3779 if ({
3780 dd => 1, dt => 1, li => 1, p => 1,
3781 td => 1, th => 1, tr => 1,
3782 tbody => 1, tfoot=> 1, thead => 1,
3783 }->{$self->{open_elements}->[-1]->[1]}) {
3784 !!!back-token;
3785 $token = {type => END_TAG_TOKEN,
3786 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3787 redo B;
3788 }
3789
3790 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3791 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3792 }
3793
3794 splice @{$self->{open_elements}}, $i;
3795
3796 $clear_up_to_marker->();
3797
3798 $self->{insertion_mode} = IN_TABLE_IM;
3799
3800 !!!next-token;
3801 redo B;
3802 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
3803 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3804 ## Ignore the token
3805 !!!next-token;
3806 redo B;
3807 } else {
3808 #
3809 }
3810 } elsif ({
3811 table => 1, tbody => 1, tfoot => 1,
3812 thead => 1, tr => 1,
3813 }->{$token->{tag_name}} and
3814 $self->{insertion_mode} == IN_CELL_IM) {
3815 ## have an element in table scope
3816 my $i;
3817 my $tn;
3818 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3819 my $node = $self->{open_elements}->[$_];
3820 if ($node->[1] eq $token->{tag_name}) {
3821 $i = $_;
3822 last INSCOPE;
3823 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3824 $tn = $node->[1];
3825 ## NOTE: There is exactly one |td| or |th| element
3826 ## in scope in the stack of open elements by definition.
3827 } elsif ({
3828 table => 1, html => 1,
3829 }->{$node->[1]}) {
3830 last INSCOPE;
3831 }
3832 } # INSCOPE
3833 unless (defined $i) {
3834 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3835 ## Ignore the token
3836 !!!next-token;
3837 redo B;
3838 }
3839
3840 ## Close the cell
3841 !!!back-token; # </?>
3842 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3843 redo B;
3844 } elsif ($token->{tag_name} eq 'table' and
3845 $self->{insertion_mode} == IN_CAPTION_IM) {
3846 !!!parse-error (type => 'not closed:caption');
3847
3848 ## As if </caption>
3849 ## have a table element in table scope
3850 my $i;
3851 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3852 my $node = $self->{open_elements}->[$_];
3853 if ($node->[1] eq 'caption') {
3854 $i = $_;
3855 last INSCOPE;
3856 } elsif ({
3857 table => 1, html => 1,
3858 }->{$node->[1]}) {
3859 last INSCOPE;
3860 }
3861 } # INSCOPE
3862 unless (defined $i) {
3863 !!!parse-error (type => 'unmatched end tag:caption');
3864 ## Ignore the token
3865 !!!next-token;
3866 redo B;
3867 }
3868
3869 ## generate implied end tags
3870 if ({
3871 dd => 1, dt => 1, li => 1, p => 1,
3872 td => 1, th => 1, tr => 1,
3873 tbody => 1, tfoot=> 1, thead => 1,
3874 }->{$self->{open_elements}->[-1]->[1]}) {
3875 !!!back-token; # </table>
3876 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3877 !!!back-token;
3878 $token = {type => END_TAG_TOKEN,
3879 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3880 redo B;
3881 }
3882
3883 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3884 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3885 }
3886
3887 splice @{$self->{open_elements}}, $i;
3888
3889 $clear_up_to_marker->();
3890
3891 $self->{insertion_mode} = IN_TABLE_IM;
3892
3893 ## reprocess
3894 redo B;
3895 } elsif ({
3896 body => 1, col => 1, colgroup => 1, html => 1,
3897 }->{$token->{tag_name}}) {
3898 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
3899 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3900 ## Ignore the token
3901 !!!next-token;
3902 redo B;
3903 } else {
3904 #
3905 }
3906 } elsif ({
3907 tbody => 1, tfoot => 1,
3908 thead => 1, tr => 1,
3909 }->{$token->{tag_name}} and
3910 $self->{insertion_mode} == IN_CAPTION_IM) {
3911 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3912 ## Ignore the token
3913 !!!next-token;
3914 redo B;
3915 } else {
3916 #
3917 }
3918 } else {
3919 die "$0: $token->{type}: Unknown token type";
3920 }
3921
3922 $insert = $insert_to_current;
3923 #
3924 } elsif ($self->{insertion_mode} & TABLE_IMS) {
3925 if ($token->{type} == CHARACTER_TOKEN) {
3926 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3927 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3928
3929 unless (length $token->{data}) {
3930 !!!next-token;
3931 redo B;
3932 }
3933 }
3934
3935 !!!parse-error (type => 'in table:#character');
3936
3937 ## As if in body, but insert into foster parent element
3938 ## ISSUE: Spec says that "whenever a node would be inserted
3939 ## into the current node" while characters might not
3940 ## result in a new Text node.
3941 $reconstruct_active_formatting_elements->($insert_to_foster);
3942
3943 if ({
3944 table => 1, tbody => 1, tfoot => 1,
3945 thead => 1, tr => 1,
3946 }->{$self->{open_elements}->[-1]->[1]}) {
3947 # MUST
3948 my $foster_parent_element;
3949 my $next_sibling;
3950 my $prev_sibling;
3951 OE: for (reverse 0..$#{$self->{open_elements}}) {
3952 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3953 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3954 if (defined $parent and $parent->node_type == 1) {
3955 $foster_parent_element = $parent;
3956 $next_sibling = $self->{open_elements}->[$_]->[0];
3957 $prev_sibling = $next_sibling->previous_sibling;
3958 } else {
3959 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3960 $prev_sibling = $foster_parent_element->last_child;
3961 }
3962 last OE;
3963 }
3964 } # OE
3965 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3966 $prev_sibling = $foster_parent_element->last_child
3967 unless defined $foster_parent_element;
3968 if (defined $prev_sibling and
3969 $prev_sibling->node_type == 3) {
3970 $prev_sibling->manakai_append_text ($token->{data});
3971 } else {
3972 $foster_parent_element->insert_before
3973 ($self->{document}->create_text_node ($token->{data}),
3974 $next_sibling);
3975 }
3976 } else {
3977 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3978 }
3979
3980 !!!next-token;
3981 redo B;
3982 } elsif ($token->{type} == START_TAG_TOKEN) {
3983 if ({
3984 tr => ($self->{insertion_mode} != IN_ROW_IM),
3985 th => 1, td => 1,
3986 }->{$token->{tag_name}}) {
3987 if ($self->{insertion_mode} == IN_TABLE_IM) {
3988 ## Clear back to table context
3989 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3990 $self->{open_elements}->[-1]->[1] ne 'html') {
3991 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3992 pop @{$self->{open_elements}};
3993 }
3994
3995 !!!insert-element ('tbody');
3996 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3997 ## reprocess in the "in table body" insertion mode...
3998 }
3999
4000 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4001 unless ($token->{tag_name} eq 'tr') {
4002 !!!parse-error (type => 'missing start tag:tr');
4003 }
4004
4005 ## Clear back to table body context
4006 while (not {
4007 tbody => 1, tfoot => 1, thead => 1, html => 1,
4008 }->{$self->{open_elements}->[-1]->[1]}) {
4009 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4010 pop @{$self->{open_elements}};
4011 }
4012
4013 $self->{insertion_mode} = IN_ROW_IM;
4014 if ($token->{tag_name} eq 'tr') {
4015 !!!insert-element ($token->{tag_name}, $token->{attributes});
4016 !!!next-token;
4017 redo B;
4018 } else {
4019 !!!insert-element ('tr');
4020 ## reprocess in the "in row" insertion mode
4021 }
4022 }
4023
4024 ## Clear back to table row context
4025 while (not {
4026 tr => 1, html => 1,
4027 }->{$self->{open_elements}->[-1]->[1]}) {
4028 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4029 pop @{$self->{open_elements}};
4030 }
4031
4032 !!!insert-element ($token->{tag_name}, $token->{attributes});
4033 $self->{insertion_mode} = IN_CELL_IM;
4034
4035 push @$active_formatting_elements, ['#marker', ''];
4036
4037 !!!next-token;
4038 redo B;
4039 } elsif ({
4040 caption => 1, col => 1, colgroup => 1,
4041 tbody => 1, tfoot => 1, thead => 1,
4042 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4043 }->{$token->{tag_name}}) {
4044 if ($self->{insertion_mode} == IN_ROW_IM) {
4045 ## As if </tr>
4046 ## have an element in table scope
4047 my $i;
4048 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4049 my $node = $self->{open_elements}->[$_];
4050 if ($node->[1] eq 'tr') {
4051 $i = $_;
4052 last INSCOPE;
4053 } elsif ({
4054 table => 1, html => 1,
4055 }->{$node->[1]}) {
4056 last INSCOPE;
4057 }
4058 } # INSCOPE
4059 unless (defined $i) {
4060 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4061 ## Ignore the token (no tr element was found in table scope, so the implied </tr> has nothing to close)
4062 !!!next-token;
4063 redo B;
4064 }
4065
4066 ## Clear back to table row context
4067 while (not {
4068 tr => 1, html => 1,
4069 }->{$self->{open_elements}->[-1]->[1]}) {
4070 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4071 pop @{$self->{open_elements}};
4072 }
4073
4074 pop @{$self->{open_elements}}; # tr
4075 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4076 if ($token->{tag_name} eq 'tr') {
4077 ## reprocess
4078 redo B;
4079 } else {
4080 ## reprocess in the "in table body" insertion mode...
4081 }
4082 }
4083
4084 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4085 ## have an element in table scope
4086 my $i;
4087 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4088 my $node = $self->{open_elements}->[$_];
4089 if ({
4090 tbody => 1, thead => 1, tfoot => 1,
4091 }->{$node->[1]}) {
4092 $i = $_;
4093 last INSCOPE;
4094 } elsif ({
4095 table => 1, html => 1,
4096 }->{$node->[1]}) {
4097 last INSCOPE;
4098 }
4099 } # INSCOPE
4100 unless (defined $i) {
4101 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4102 ## Ignore the token
4103 !!!next-token;
4104 redo B;
4105 }
4106
4107 ## Clear back to table body context
4108 while (not {
4109 tbody => 1, tfoot => 1, thead => 1, html => 1,
4110 }->{$self->{open_elements}->[-1]->[1]}) {
4111 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4112 pop @{$self->{open_elements}};
4113 }
4114
4115 ## As if </{current node}>
4116 ## have an element in table scope
4117 ## true by definition
4118
4119 ## Clear back to table body context
4120 ## nop by definition
4121
4122 pop @{$self->{open_elements}};
4123 $self->{insertion_mode} = IN_TABLE_IM;
4124 ## reprocess in "in table" insertion mode...
4125 }
4126
4127 if ($token->{tag_name} eq 'col') {
4128 ## Clear back to table context
4129 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4130 $self->{open_elements}->[-1]->[1] ne 'html') {
4131 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4132 pop @{$self->{open_elements}};
4133 }
4134
4135 !!!insert-element ('colgroup');
4136 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
4137 ## reprocess
4138 redo B;
4139 } elsif ({
4140 caption => 1,
4141 colgroup => 1,
4142 tbody => 1, tfoot => 1, thead => 1,
4143 }->{$token->{tag_name}}) {
4144 ## Clear back to table context
4145 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4146 $self->{open_elements}->[-1]->[1] ne 'html') {
4147 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4148 pop @{$self->{open_elements}};
4149 }
4150
4151 push @$active_formatting_elements, ['#marker', '']
4152 if $token->{tag_name} eq 'caption';
4153
4154 !!!insert-element ($token->{tag_name}, $token->{attributes});
4155 $self->{insertion_mode} = {
4156 caption => IN_CAPTION_IM,
4157 colgroup => IN_COLUMN_GROUP_IM,
4158 tbody => IN_TABLE_BODY_IM,
4159 tfoot => IN_TABLE_BODY_IM,
4160 thead => IN_TABLE_BODY_IM,
4161 }->{$token->{tag_name}};
4162 !!!next-token;
4163 redo B;
4164 } else {
4165 die "$0: in table: <>: $token->{tag_name}";
4166 }
4167 } elsif ($token->{tag_name} eq 'table') {
4168 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4169
4170 ## As if </table>
4171 ## have a table element in table scope
4172 my $i;
4173 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4174 my $node = $self->{open_elements}->[$_];
4175 if ($node->[1] eq 'table') {
4176 $i = $_;
4177 last INSCOPE;
4178 } elsif ({
4179 table => 1, html => 1,
4180 }->{$node->[1]}) {
4181 last INSCOPE;
4182 }
4183 } # INSCOPE
4184 unless (defined $i) {
4185 !!!parse-error (type => 'unmatched end tag:table');
4186 ## Ignore tokens </table><table>
4187 !!!next-token;
4188 redo B;
4189 }
4190
4191 ## generate implied end tags
4192 if ({
4193 dd => 1, dt => 1, li => 1, p => 1,
4194 td => 1, th => 1, tr => 1,
4195 tbody => 1, tfoot=> 1, thead => 1,
4196 }->{$self->{open_elements}->[-1]->[1]}) {
4197 !!!back-token; # <table>
4198 $token = {type => END_TAG_TOKEN, tag_name => 'table'};
4199 !!!back-token;
4200 $token = {type => END_TAG_TOKEN,
4201 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4202 redo B;
4203 }
4204
4205 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4206 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4207 }
4208
4209 splice @{$self->{open_elements}}, $i;
4210
4211 $self->_reset_insertion_mode;
4212
4213 ## reprocess
4214 redo B;
4215 } else {
4216 !!!parse-error (type => 'in table:'.$token->{tag_name});
4217
4218 $insert = $insert_to_foster;
4219 #
4220 }
4221 } elsif ($token->{type} == END_TAG_TOKEN) {
4222 if ($token->{tag_name} eq 'tr' and
4223 $self->{insertion_mode} == IN_ROW_IM) {
4224 ## have an element in table scope
4225 my $i;
4226 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4227 my $node = $self->{open_elements}->[$_];
4228 if ($node->[1] eq $token->{tag_name}) {
4229 $i = $_;
4230 last INSCOPE;
4231 } elsif ({
4232 table => 1, html => 1,
4233 }->{$node->[1]}) {
4234 last INSCOPE;
4235 }
4236 } # INSCOPE
4237 unless (defined $i) {
4238 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4239 ## Ignore the token
4240 !!!next-token;
4241 redo B;
4242 }
4243
4244 ## Clear back to table row context
4245 while (not {
4246 tr => 1, html => 1,
4247 }->{$self->{open_elements}->[-1]->[1]}) {
4248 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4249 pop @{$self->{open_elements}};
4250 }
4251
4252 pop @{$self->{open_elements}}; # tr
4253 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4254 !!!next-token;
4255 redo B;
4256 } elsif ($token->{tag_name} eq 'table') {
4257 if ($self->{insertion_mode} == IN_ROW_IM) {
4258 ## As if </tr>
4259 ## have an element in table scope
4260 my $i;
4261 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4262 my $node = $self->{open_elements}->[$_];
4263 if ($node->[1] eq 'tr') {
4264 $i = $_;
4265 last INSCOPE;
4266 } elsif ({
4267 table => 1, html => 1,
4268 }->{$node->[1]}) {
4269 last INSCOPE;
4270 }
4271 } # INSCOPE
4272 unless (defined $i) {
4273 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4274 ## Ignore the token (no tr element was found in table scope); the error names the tag, not the numeric token type
4275 !!!next-token;
4276 redo B;
4277 }
4278
4279 ## Clear back to table row context
4280 while (not {
4281 tr => 1, html => 1,
4282 }->{$self->{open_elements}->[-1]->[1]}) {
4283 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4284 pop @{$self->{open_elements}};
4285 }
4286
4287 pop @{$self->{open_elements}}; # tr
4288 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4289 ## reprocess in the "in table body" insertion mode...
4290 }
4291
4292 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4293 ## have an element in table scope
4294 my $i;
4295 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4296 my $node = $self->{open_elements}->[$_];
4297 if ({
4298 tbody => 1, thead => 1, tfoot => 1,
4299 }->{$node->[1]}) {
4300 $i = $_;
4301 last INSCOPE;
4302 } elsif ({
4303 table => 1, html => 1,
4304 }->{$node->[1]}) {
4305 last INSCOPE;
4306 }
4307 } # INSCOPE
4308 unless (defined $i) {
4309 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4310 ## Ignore the token
4311 !!!next-token;
4312 redo B;
4313 }
4314
4315 ## Clear back to table body context
4316 while (not {
4317 tbody => 1, tfoot => 1, thead => 1, html => 1,
4318 }->{$self->{open_elements}->[-1]->[1]}) {
4319 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4320 pop @{$self->{open_elements}};
4321 }
4322
4323 ## As if </{current node}>
4324 ## have an element in table scope
4325 ## true by definition
4326
4327 ## Clear back to table body context
4328 ## nop by definition
4329
4330 pop @{$self->{open_elements}};
4331 $self->{insertion_mode} = IN_TABLE_IM;
4332 ## reprocess in the "in table" insertion mode...
4333 }
4334
4335 ## have a table element in table scope
4336 my $i;
4337 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4338 my $node = $self->{open_elements}->[$_];
4339 if ($node->[1] eq $token->{tag_name}) {
4340 $i = $_;
4341 last INSCOPE;
4342 } elsif ({
4343 table => 1, html => 1,
4344 }->{$node->[1]}) {
4345 last INSCOPE;
4346 }
4347 } # INSCOPE
4348 unless (defined $i) {
4349 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4350 ## Ignore the token
4351 !!!next-token;
4352 redo B;
4353 }
4354
4355 ## generate implied end tags
4356 if ({
4357 dd => 1, dt => 1, li => 1, p => 1,
4358 td => 1, th => 1, tr => 1,
4359 tbody => 1, tfoot=> 1, thead => 1,
4360 }->{$self->{open_elements}->[-1]->[1]}) {
4361 !!!back-token;
4362 $token = {type => END_TAG_TOKEN,
4363 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4364 redo B;
4365 }
4366
4367 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4368 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4369 }
4370
4371 splice @{$self->{open_elements}}, $i;
4372
4373 $self->_reset_insertion_mode;
4374
4375 !!!next-token;
4376 redo B;
4377 } elsif ({
4378 tbody => 1, tfoot => 1, thead => 1,
4379 }->{$token->{tag_name}} and
4380 $self->{insertion_mode} & ROW_IMS) {
4381 if ($self->{insertion_mode} == IN_ROW_IM) {
4382 ## have an element in table scope
4383 my $i;
4384 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4385 my $node = $self->{open_elements}->[$_];
4386 if ($node->[1] eq $token->{tag_name}) {
4387 $i = $_;
4388 last INSCOPE;
4389 } elsif ({
4390 table => 1, html => 1,
4391 }->{$node->[1]}) {
4392 last INSCOPE;
4393 }
4394 } # INSCOPE
4395 unless (defined $i) {
4396 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4397 ## Ignore the token
4398 !!!next-token;
4399 redo B;
4400 }
4401
4402 ## As if </tr>
4403 ## have an element in table scope ($i from the first scope check above is reset and reused, not redeclared)
4404 undef $i;
4405 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4406 my $node = $self->{open_elements}->[$_];
4407 if ($node->[1] eq 'tr') {
4408 $i = $_;
4409 last INSCOPE;
4410 } elsif ({
4411 table => 1, html => 1,
4412 }->{$node->[1]}) {
4413 last INSCOPE;
4414 }
4415 } # INSCOPE
4416 unless (defined $i) {
4417 !!!parse-error (type => 'unmatched end tag:tr');
4418 ## Ignore the token
4419 !!!next-token;
4420 redo B;
4421 }
4422
4423 ## Clear back to table row context
4424 while (not {
4425 tr => 1, html => 1,
4426 }->{$self->{open_elements}->[-1]->[1]}) {
4427 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4428 pop @{$self->{open_elements}};
4429 }
4430
4431 pop @{$self->{open_elements}}; # tr
4432 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4433 ## reprocess in the "in table body" insertion mode...
4434 }
4435
4436 ## have an element in table scope
4437 my $i;
4438 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4439 my $node = $self->{open_elements}->[$_];
4440 if ($node->[1] eq $token->{tag_name}) {
4441 $i = $_;
4442 last INSCOPE;
4443 } elsif ({
4444 table => 1, html => 1,
4445 }->{$node->[1]}) {
4446 last INSCOPE;
4447 }
4448 } # INSCOPE
4449 unless (defined $i) {
4450 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4451 ## Ignore the token
4452 !!!next-token;
4453 redo B;
4454 }
4455
4456 ## Clear back to table body context
4457 while (not {
4458 tbody => 1, tfoot => 1, thead => 1, html => 1,
4459 }->{$self->{open_elements}->[-1]->[1]}) {
4460 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4461 pop @{$self->{open_elements}};
4462 }
4463
4464 pop @{$self->{open_elements}};
4465 $self->{insertion_mode} = IN_TABLE_IM;
4466 !!!next-token;
4467 redo B;
4468 } elsif ({
4469 body => 1, caption => 1, col => 1, colgroup => 1,
4470 html => 1, td => 1, th => 1,
4471 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4472 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
4473 }->{$token->{tag_name}}) {
4474 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4475 ## Ignore the token
4476 !!!next-token;
4477 redo B;
4478 } else {
4479 !!!parse-error (type => 'in table:/'.$token->{tag_name});
4480
4481 $insert = $insert_to_foster;
4482 #
4483 }
4484 } else {
4485 die "$0: $token->{type}: Unknown token type";
4486 }
4487 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4488 if ($token->{type} == CHARACTER_TOKEN) {
4489 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4490 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4491 unless (length $token->{data}) {
4492 !!!next-token;
4493 redo B;
4494 }
4495 }
4496
4497 #
4498 } elsif ($token->{type} == START_TAG_TOKEN) {
4499 if ($token->{tag_name} eq 'col') {
4500 !!!insert-element ($token->{tag_name}, $token->{attributes});
4501 pop @{$self->{open_elements}};
4502 !!!next-token;
4503 redo B;
4504 } else {
4505 #
4506 }
4507 } elsif ($token->{type} == END_TAG_TOKEN) {
4508 if ($token->{tag_name} eq 'colgroup') {
4509 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4510 !!!parse-error (type => 'unmatched end tag:colgroup');
4511 ## Ignore the token
4512 !!!next-token;
4513 redo B;
4514 } else {
4515 pop @{$self->{open_elements}}; # colgroup
4516 $self->{insertion_mode} = IN_TABLE_IM;
4517 !!!next-token;
4518 redo B;
4519 }
4520 } elsif ($token->{tag_name} eq 'col') {
4521 !!!parse-error (type => 'unmatched end tag:col');
4522 ## Ignore the token
4523 !!!next-token;
4524 redo B;
4525 } else {
4526 #
4527 }
4528 } else {
4529 #
4530 }
4531
4532 ## As if </colgroup>
4533 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4534 !!!parse-error (type => 'unmatched end tag:colgroup');
4535 ## Ignore the token
4536 !!!next-token;
4537 redo B;
4538 } else {
4539 pop @{$self->{open_elements}}; # colgroup
4540 $self->{insertion_mode} = IN_TABLE_IM;
4541 ## reprocess
4542 redo B;
4543 }
4544 } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
4545 if ($token->{type} == CHARACTER_TOKEN) {
4546 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4547 !!!next-token;
4548 redo B;
4549 } elsif ($token->{type} == START_TAG_TOKEN) {
4550 if ($token->{tag_name} eq 'option') {
4551 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4552 ## As if </option>
4553 pop @{$self->{open_elements}};
4554 }
4555
4556 !!!insert-element ($token->{tag_name}, $token->{attributes});
4557 !!!next-token;
4558 redo B;
4559 } elsif ($token->{tag_name} eq 'optgroup') {
4560 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4561 ## As if </option>
4562 pop @{$self->{open_elements}};
4563 }
4564
4565 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4566 ## As if </optgroup>
4567 pop @{$self->{open_elements}};
4568 }
4569
4570 !!!insert-element ($token->{tag_name}, $token->{attributes});
4571 !!!next-token;
4572 redo B;
4573 } elsif ($token->{tag_name} eq 'select') {
4574 !!!parse-error (type => 'not closed:select');
4575 ## As if </select> instead
4576 ## have an element in table scope
4577 my $i;
4578 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4579 my $node = $self->{open_elements}->[$_];
4580 if ($node->[1] eq $token->{tag_name}) {
4581 $i = $_;
4582 last INSCOPE;
4583 } elsif ({
4584 table => 1, html => 1,
4585 }->{$node->[1]}) {
4586 last INSCOPE;
4587 }
4588 } # INSCOPE
4589 unless (defined $i) {
4590 !!!parse-error (type => 'unmatched end tag:select');
4591 ## Ignore the token
4592 !!!next-token;
4593 redo B;
4594 }
4595
4596 splice @{$self->{open_elements}}, $i;
4597
4598 $self->_reset_insertion_mode;
4599
4600 !!!next-token;
4601 redo B;
4602 } else {
4603 !!!parse-error (type => 'in select:'.$token->{tag_name});
4604 ## Ignore the token
4605 !!!next-token;
4606 redo B;
4607 }
4608 } elsif ($token->{type} == END_TAG_TOKEN) {
4609 if ($token->{tag_name} eq 'optgroup') {
4610 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4611 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4612 ## As if </option>
4613 splice @{$self->{open_elements}}, -2;
4614 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4615 pop @{$self->{open_elements}};
4616 } else {
4617 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4618 ## Ignore the token
4619 }
4620 !!!next-token;
4621 redo B;
4622 } elsif ($token->{tag_name} eq 'option') {
4623 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4624 pop @{$self->{open_elements}};
4625 } else {
4626 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4627 ## Ignore the token
4628 }
4629 !!!next-token;
4630 redo B;
4631 } elsif ($token->{tag_name} eq 'select') {
4632 ## have an element in table scope
4633 my $i;
4634 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4635 my $node = $self->{open_elements}->[$_];
4636 if ($node->[1] eq $token->{tag_name}) {
4637 $i = $_;
4638 last INSCOPE;
4639 } elsif ({
4640 table => 1, html => 1,
4641 }->{$node->[1]}) {
4642 last INSCOPE;
4643 }
4644 } # INSCOPE
4645 unless (defined $i) {
4646 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4647 ## Ignore the token
4648 !!!next-token;
4649 redo B;
4650 }
4651
4652 splice @{$self->{open_elements}}, $i;
4653
4654 $self->_reset_insertion_mode;
4655
4656 !!!next-token;
4657 redo B;
4658 } elsif ({
4659 caption => 1, table => 1, tbody => 1,
4660 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4661 }->{$token->{tag_name}}) {
4662 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4663
4664 ## have an element in table scope
4665 my $i;
4666 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4667 my $node = $self->{open_elements}->[$_];
4668 if ($node->[1] eq $token->{tag_name}) {
4669 $i = $_;
4670 last INSCOPE;
4671 } elsif ({
4672 table => 1, html => 1,
4673 }->{$node->[1]}) {
4674 last INSCOPE;
4675 }
4676 } # INSCOPE
4677 unless (defined $i) {
4678 ## Ignore the token
4679 !!!next-token;
4680 redo B;
4681 }
4682
4683 ## As if </select>
4684 ## have an element in table scope
4685 undef $i;
4686 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4687 my $node = $self->{open_elements}->[$_];
4688 if ($node->[1] eq 'select') {
4689 $i = $_;
4690 last INSCOPE;
4691 } elsif ({
4692 table => 1, html => 1,
4693 }->{$node->[1]}) {
4694 last INSCOPE;
4695 }
4696 } # INSCOPE
4697 unless (defined $i) {
4698 !!!parse-error (type => 'unmatched end tag:select');
4699 ## Ignore the </select> token
4700 !!!next-token; ## TODO: ok?
4701 redo B;
4702 }
4703
4704 splice @{$self->{open_elements}}, $i;
4705
4706 $self->_reset_insertion_mode;
4707
4708 ## reprocess
4709 redo B;
4710 } else {
4711 !!!parse-error (type => 'in select:/'.$token->{tag_name});
4712 ## Ignore the token
4713 !!!next-token;
4714 redo B;
4715 }
4716 } else {
4717 die "$0: $token->{type}: Unknown token type";
4718 }
4719 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4720 if ($token->{type} == CHARACTER_TOKEN) {
4721 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4722 my $data = $1;
4723 ## As if in body; the stripped leading white space is kept in $data so that appending it does not rely on $1 after the call below
4724 $reconstruct_active_formatting_elements->($insert_to_current);
4725
4726 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
4727
4728 unless (length $token->{data}) {
4729 !!!next-token;
4730 redo B;
4731 }
4732 }
4733
4734 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4735 !!!parse-error (type => 'after html:#character');
4736
4737 ## Reprocess in the "main" phase, "after body" insertion mode...
4738 }
4739
4740 ## "after body" insertion mode
4741 !!!parse-error (type => 'after body:#character');
4742
4743 $self->{insertion_mode} = IN_BODY_IM;
4744 ## reprocess
4745 redo B;
4746 } elsif ($token->{type} == START_TAG_TOKEN) {
4747 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4748 !!!parse-error (type => 'after html:'.$token->{tag_name});
4749
4750 ## Reprocess in the "main" phase, "after body" insertion mode...
4751 }
4752
4753 ## "after body" insertion mode
4754 !!!parse-error (type => 'after body:'.$token->{tag_name});
4755
4756 $self->{insertion_mode} = IN_BODY_IM;
4757 ## reprocess
4758 redo B;
4759 } elsif ($token->{type} == END_TAG_TOKEN) {
4760 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4761 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4762
4763 $self->{insertion_mode} = AFTER_BODY_IM;
4764 ## Reprocess in the "main" phase, "after body" insertion mode...
4765 }
4766
4767 ## "after body" insertion mode
4768 if ($token->{tag_name} eq 'html') {
4769 if (defined $self->{inner_html_node}) {
4770 !!!parse-error (type => 'unmatched end tag:html');
4771 ## Ignore the token
4772 !!!next-token;
4773 redo B;
4774 } else {
4775 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
4776 !!!next-token;
4777 redo B;
4778 }
4779 } else {
4780 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4781
4782 $self->{insertion_mode} = IN_BODY_IM;
4783 ## reprocess
4784 redo B;
4785 }
4786 } else {
4787 die "$0: $token->{type}: Unknown token type";
4788 }
4789 } elsif ($self->{insertion_mode} & FRAME_IMS) {
4790 if ($token->{type} == CHARACTER_TOKEN) {
4791 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4792 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4793
4794 unless (length $token->{data}) {
4795 !!!next-token;
4796 redo B;
4797 }
4798 }
4799
4800 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4801 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4802 !!!parse-error (type => 'in frameset:#character');
4803 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
4804 !!!parse-error (type => 'after frameset:#character');
4805 } else { # "after html frameset"
4806 !!!parse-error (type => 'after html:#character');
4807
4808 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4809 ## Reprocess in the "main" phase, "after frameset"...
4810 !!!parse-error (type => 'after frameset:#character');
4811 }
4812
4813 ## Ignore the token.
4814 if (length $token->{data}) {
4815 ## reprocess the rest of characters
4816 } else {
4817 !!!next-token;
4818 }
4819 redo B;
4820 }
4821
4822 die qq[$0: Character "$token->{data}"];
4823 } elsif ($token->{type} == START_TAG_TOKEN) {
4824 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4825 !!!parse-error (type => 'after html:'.$token->{tag_name});
4826
4827 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4828 ## Process in the "main" phase, "after frameset" insertion mode...
4829 }
4830
4831 if ($token->{tag_name} eq 'frameset' and
4832 $self->{insertion_mode} == IN_FRAMESET_IM) {
4833 !!!insert-element ($token->{tag_name}, $token->{attributes});
4834 !!!next-token;
4835 redo B;
4836 } elsif ($token->{tag_name} eq 'frame' and
4837 $self->{insertion_mode} == IN_FRAMESET_IM) {
4838 !!!insert-element ($token->{tag_name}, $token->{attributes});
4839 pop @{$self->{open_elements}};
4840 !!!next-token;
4841 redo B;
4842 } elsif ($token->{tag_name} eq 'noframes') {
4843 ## NOTE: As if in body.
4844 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4845 redo B;
4846 } else {
4847 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4848 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4849 } else {
4850 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4851 }
4852 ## Ignore the token
4853 !!!next-token;
4854 redo B;
4855 }
4856 } elsif ($token->{type} == END_TAG_TOKEN) {
4857 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4858 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4859
4860 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4861 ## Process in the "main" phase, "after frameset" insertion mode...
4862 }
4863
4864 if ($token->{tag_name} eq 'frameset' and
4865 $self->{insertion_mode} == IN_FRAMESET_IM) {
4866 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4867 @{$self->{open_elements}} == 1) {
4868 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4869 ## Ignore the token
4870 !!!next-token;
4871 } else {
4872 pop @{$self->{open_elements}};
4873 !!!next-token;
4874 }
4875
4876 if (not defined $self->{inner_html_node} and
4877 $self->{open_elements}->[-1]->[1] ne 'frameset') {
4878 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4879 }
4880 redo B;
4881 } elsif ($token->{tag_name} eq 'html' and
4882 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
4883 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
4884 !!!next-token;
4885 redo B;
4886 } else {
4887 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4888 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4889 } else {
4890 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4891 }
4892 ## Ignore the token
4893 !!!next-token;
4894 redo B;
4895 }
4896 } else {
4897 die "$0: $token->{type}: Unknown token type";
4898 }
4899
4900 ## ISSUE: An issue in spec here
4901 } else {
4902 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4903 }
4904
4905 ## "in body" insertion mode
4906 if ($token->{type} == START_TAG_TOKEN) {
4907 if ($token->{tag_name} eq 'script') {
4908 ## NOTE: This is an "as if in head" code clone
4909 $script_start_tag->($insert);
4910 redo B;
4911 } elsif ($token->{tag_name} eq 'style') {
4912 ## NOTE: This is an "as if in head" code clone
4913 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4914 redo B;
4915 } elsif ({
4916 base => 1, link => 1,
4917 }->{$token->{tag_name}}) {
4918 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4919 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4920 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4921 !!!next-token;
4922 redo B;
4923 } elsif ($token->{tag_name} eq 'meta') {
4924 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4925 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4926 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4927
4928 unless ($self->{confident}) {
4929 if ($token->{attributes}->{charset}) { ## TODO: And if supported
4930 $self->{change_encoding}
4931 ->($self, $token->{attributes}->{charset}->{value});
4932
4933 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4934 ->set_user_data (manakai_has_reference =>
4935 $token->{attributes}->{charset}
4936 ->{has_reference});
4937 } elsif ($token->{attributes}->{content}) {
4938 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4939 if ($token->{attributes}->{content}->{value}
4940 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4941 [\x09-\x0D\x20]*=
4942 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4943 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4944 $self->{change_encoding}
4945 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4946 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4947 ->set_user_data (manakai_has_reference =>
4948 $token->{attributes}->{content}
4949 ->{has_reference});
4950 }
4951 }
4952 } else {
4953 if ($token->{attributes}->{charset}) {
4954 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4955 ->set_user_data (manakai_has_reference =>
4956 $token->{attributes}->{charset}
4957 ->{has_reference});
4958 }
4959 if ($token->{attributes}->{content}) {
4960 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4961 ->set_user_data (manakai_has_reference =>
4962 $token->{attributes}->{content}
4963 ->{has_reference});
4964 }
4965 }
4966
4967 !!!next-token;
4968 redo B;
4969 } elsif ($token->{tag_name} eq 'title') {
4970 !!!parse-error (type => 'in body:title');
4971 ## NOTE: This is an "as if in head" code clone
4972 $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4973 if (defined $self->{head_element}) {
4974 $self->{head_element}->append_child ($_[0]);
4975 } else {
4976 $insert->($_[0]);
4977 }
4978 });
4979 redo B;
4980 } elsif ($token->{tag_name} eq 'body') {
4981 !!!parse-error (type => 'in body:body');
4982
4983 if (@{$self->{open_elements}} == 1 or
4984 $self->{open_elements}->[1]->[1] ne 'body') {
4985 ## Ignore the token
4986 } else {
4987 my $body_el = $self->{open_elements}->[1]->[0];
4988 for my $attr_name (keys %{$token->{attributes}}) {
4989 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4990 $body_el->set_attribute_ns
4991 (undef, [undef, $attr_name],
4992 $token->{attributes}->{$attr_name}->{value});
4993 }
4994 }
4995 }
4996 !!!next-token;
4997 redo B;
4998 } elsif ({
4999 address => 1, blockquote => 1, center => 1, dir => 1,
5000 div => 1, dl => 1, fieldset => 1, listing => 1,
5001 menu => 1, ol => 1, p => 1, ul => 1,
5002 pre => 1,
5003 }->{$token->{tag_name}}) {
5004 ## has a p element in scope
5005 INSCOPE: for (reverse @{$self->{open_elements}}) {
5006 if ($_->[1] eq 'p') {
5007 !!!back-token;
5008 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5009 redo B;
5010 } elsif ({
5011 table => 1, caption => 1, td => 1, th => 1,
5012 button => 1, marquee => 1, object => 1, html => 1,
5013 }->{$_->[1]}) {
5014 last INSCOPE;
5015 }
5016 } # INSCOPE
5017
5018 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5019 if ($token->{tag_name} eq 'pre') {
5020 !!!next-token;
5021 if ($token->{type} == CHARACTER_TOKEN) {
5022 $token->{data} =~ s/^\x0A//;
5023 unless (length $token->{data}) {
5024 !!!next-token;
5025 }
5026 }
5027 } else {
5028 !!!next-token;
5029 }
5030 redo B;
5031 } elsif ($token->{tag_name} eq 'form') {
5032 if (defined $self->{form_element}) {
5033 !!!parse-error (type => 'in form:form');
5034 ## Ignore the token
5035 !!!next-token;
5036 redo B;
5037 } else {
5038 ## has a p element in scope
5039 INSCOPE: for (reverse @{$self->{open_elements}}) {
5040 if ($_->[1] eq 'p') {
5041 !!!back-token;
5042 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5043 redo B;
5044 } elsif ({
5045 table => 1, caption => 1, td => 1, th => 1,
5046 button => 1, marquee => 1, object => 1, html => 1,
5047 }->{$_->[1]}) {
5048 last INSCOPE;
5049 }
5050 } # INSCOPE
5051
5052 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5053 $self->{form_element} = $self->{open_elements}->[-1]->[0];
5054 !!!next-token;
5055 redo B;
5056 }
5057 } elsif ($token->{tag_name} eq 'li') {
5058 ## has a p element in scope
5059 INSCOPE: for (reverse @{$self->{open_elements}}) {
5060 if ($_->[1] eq 'p') {
5061 !!!back-token;
5062 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5063 redo B;
5064 } elsif ({
5065 table => 1, caption => 1, td => 1, th => 1,
5066 button => 1, marquee => 1, object => 1, html => 1,
5067 }->{$_->[1]}) {
5068 last INSCOPE;
5069 }
5070 } # INSCOPE
5071
5072 ## Step 1
5073 my $i = -1;
5074 my $node = $self->{open_elements}->[$i];
5075 LI: {
5076 ## Step 2
5077 if ($node->[1] eq 'li') {
5078 if ($i != -1) {
5079 !!!parse-error (type => 'end tag missing:'.
5080 $self->{open_elements}->[-1]->[1]);
5081 }
5082 splice @{$self->{open_elements}}, $i;
5083 last LI;
5084 }
5085
5086 ## Step 3
5087 if (not $formatting_category->{$node->[1]} and
5088 #not $phrasing_category->{$node->[1]} and
5089 ($special_category->{$node->[1]} or
5090 $scoping_category->{$node->[1]}) and
5091 $node->[1] ne 'address' and $node->[1] ne 'div') {
5092 last LI;
5093 }
5094
5095 ## Step 4
5096 $i--;
5097 $node = $self->{open_elements}->[$i];
5098 redo LI;
5099 } # LI
5100
5101 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5102 !!!next-token;
5103 redo B;
5104 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
5105 ## has a p element in scope
5106 INSCOPE: for (reverse @{$self->{open_elements}}) {
5107 if ($_->[1] eq 'p') {
5108 !!!back-token;
5109 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5110 redo B;
5111 } elsif ({
5112 table => 1, caption => 1, td => 1, th => 1,
5113 button => 1, marquee => 1, object => 1, html => 1,
5114 }->{$_->[1]}) {
5115 last INSCOPE;
5116 }
5117 } # INSCOPE
5118
5119 ## Step 1
5120 my $i = -1;
5121 my $node = $self->{open_elements}->[$i];
5122 LI: {
5123 ## Step 2
5124 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
5125 if ($i != -1) {
5126 !!!parse-error (type => 'end tag missing:'.
5127 $self->{open_elements}->[-1]->[1]);
5128 }
5129 splice @{$self->{open_elements}}, $i;
5130 last LI;
5131 }
5132
5133 ## Step 3
5134 if (not $formatting_category->{$node->[1]} and
5135 #not $phrasing_category->{$node->[1]} and
5136 ($special_category->{$node->[1]} or
5137 $scoping_category->{$node->[1]}) and
5138 $node->[1] ne 'address' and $node->[1] ne 'div') {
5139 last LI;
5140 }
5141
5142 ## Step 4
5143 $i--;
5144 $node = $self->{open_elements}->[$i];
5145 redo LI;
5146 } # LI
5147
5148 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5149 !!!next-token;
5150 redo B;
5151 } elsif ($token->{tag_name} eq 'plaintext') {
5152 ## has a p element in scope
5153 INSCOPE: for (reverse @{$self->{open_elements}}) {
5154 if ($_->[1] eq 'p') {
5155 !!!back-token;
5156 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5157 redo B;
5158 } elsif ({
5159 table => 1, caption => 1, td => 1, th => 1,
5160 button => 1, marquee => 1, object => 1, html => 1,
5161 }->{$_->[1]}) {
5162 last INSCOPE;
5163 }
5164 } # INSCOPE
5165
5166 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5167
5168 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
5169
5170 !!!next-token;
5171 redo B;
5172 } elsif ({
5173 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5174 }->{$token->{tag_name}}) {
5175 ## has a p element in scope
5176 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5177 my $node = $self->{open_elements}->[$_];
5178 if ($node->[1] eq 'p') {
5179 !!!back-token;
5180 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5181 redo B;
5182 } elsif ({
5183 table => 1, caption => 1, td => 1, th => 1,
5184 button => 1, marquee => 1, object => 1, html => 1,
5185 }->{$node->[1]}) {
5186 last INSCOPE;
5187 }
5188 } # INSCOPE
5189
5190 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
5191 ## has an element in scope
5192 #my $i;
5193 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5194 # my $node = $self->{open_elements}->[$_];
5195 # if ({
5196 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5197 # }->{$node->[1]}) {
5198 # $i = $_;
5199 # last INSCOPE;
5200 # } elsif ({
5201 # table => 1, caption => 1, td => 1, th => 1,
5202 # button => 1, marquee => 1, object => 1, html => 1,
5203 # }->{$node->[1]}) {
5204 # last INSCOPE;
5205 # }
5206 #} # INSCOPE
5207 #
5208 #if (defined $i) {
5209 # !!! parse-error (type => 'in hn:hn');
5210 # splice @{$self->{open_elements}}, $i;
5211 #}
5212
5213 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5214
5215 !!!next-token;
5216 redo B;
5217 } elsif ($token->{tag_name} eq 'a') {
5218 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
5219 my $node = $active_formatting_elements->[$i];
5220 if ($node->[1] eq 'a') {
5221 !!!parse-error (type => 'in a:a');
5222
5223 !!!back-token;
5224 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
5225 $formatting_end_tag->($token->{tag_name});
5226
5227 AFE2: for (reverse 0..$#$active_formatting_elements) {
5228 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
5229 splice @$active_formatting_elements, $_, 1;
5230 last AFE2;
5231 }
5232 } # AFE2
5233 OE: for (reverse 0..$#{$self->{open_elements}}) {
5234 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
5235 splice @{$self->{open_elements}}, $_, 1;
5236 last OE;
5237 }
5238 } # OE
5239 last AFE;
5240 } elsif ($node->[0] eq '#marker') {
5241 last AFE;
5242 }
5243 } # AFE
5244
5245 $reconstruct_active_formatting_elements->($insert_to_current);
5246
5247 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5248 push @$active_formatting_elements, $self->{open_elements}->[-1];
5249
5250 !!!next-token;
5251 redo B;
5252 } elsif ({
5253 b => 1, big => 1, em => 1, font => 1, i => 1,
5254 s => 1, small => 1, strile => 1,
5255 strong => 1, tt => 1, u => 1,
5256 }->{$token->{tag_name}}) {
5257 $reconstruct_active_formatting_elements->($insert_to_current);
5258
5259 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5260 push @$active_formatting_elements, $self->{open_elements}->[-1];
5261
5262 !!!next-token;
5263 redo B;
5264 } elsif ($token->{tag_name} eq 'nobr') {
5265 $reconstruct_active_formatting_elements->($insert_to_current);
5266
5267 ## has a |nobr| element in scope
5268 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5269 my $node = $self->{open_elements}->[$_];
5270 if ($node->[1] eq 'nobr') {
5271 !!!parse-error (type => 'in nobr:nobr');
5272 !!!back-token;
5273 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
5274 redo B;
5275 } elsif ({
5276 table => 1, caption => 1, td => 1, th => 1,
5277 button => 1, marquee => 1, object => 1, html => 1,
5278 }->{$node->[1]}) {
5279 last INSCOPE;
5280 }
5281 } # INSCOPE
5282
5283 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5284 push @$active_formatting_elements, $self->{open_elements}->[-1];
5285
5286 !!!next-token;
5287 redo B;
5288 } elsif ($token->{tag_name} eq 'button') {
5289 ## has a button element in scope
5290 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5291 my $node = $self->{open_elements}->[$_];
5292 if ($node->[1] eq 'button') {
5293 !!!parse-error (type => 'in button:button');
5294 !!!back-token;
5295 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
5296 redo B;
5297 } elsif ({
5298 table => 1, caption => 1, td => 1, th => 1,
5299 button => 1, marquee => 1, object => 1, html => 1,
5300 }->{$node->[1]}) {
5301 last INSCOPE;
5302 }
5303 } # INSCOPE
5304
5305 $reconstruct_active_formatting_elements->($insert_to_current);
5306
5307 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5308 push @$active_formatting_elements, ['#marker', ''];
5309
5310 !!!next-token;
5311 redo B;
5312 } elsif ($token->{tag_name} eq 'marquee' or
5313 $token->{tag_name} eq 'object') {
5314 $reconstruct_active_formatting_elements->($insert_to_current);
5315
5316 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5317 push @$active_formatting_elements, ['#marker', ''];
5318
5319 !!!next-token;
5320 redo B;
5321 } elsif ($token->{tag_name} eq 'xmp') {
5322 $reconstruct_active_formatting_elements->($insert_to_current);
5323 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
5324 redo B;
5325 } elsif ($token->{tag_name} eq 'table') {
5326 ## has a p element in scope
5327 INSCOPE: for (reverse @{$self->{open_elements}}) {
5328 if ($_->[1] eq 'p') {
5329 !!!back-token;
5330 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5331 redo B;
5332 } elsif ({
5333 table => 1, caption => 1, td => 1, th => 1,
5334 button => 1, marquee => 1, object => 1, html => 1,
5335 }->{$_->[1]}) {
5336 last INSCOPE;
5337 }
5338 } # INSCOPE
5339
5340 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5341
5342 $self->{insertion_mode} = IN_TABLE_IM;
5343
5344 !!!next-token;
5345 redo B;
5346 } elsif ({
5347 area => 1, basefont => 1, bgsound => 1, br => 1,
5348 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
5349 image => 1,
5350 }->{$token->{tag_name}}) {
5351 if ($token->{tag_name} eq 'image') {
5352 !!!parse-error (type => 'image');
5353 $token->{tag_name} = 'img';
5354 }
5355
5356 ## NOTE: There is an "as if <br>" code clone.
5357 $reconstruct_active_formatting_elements->($insert_to_current);
5358
5359 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5360 pop @{$self->{open_elements}};
5361
5362 !!!next-token;
5363 redo B;
5364 } elsif ($token->{tag_name} eq 'hr') {
5365 ## has a p element in scope
5366 INSCOPE: for (reverse @{$self->{open_elements}}) {
5367 if ($_->[1] eq 'p') {
5368 !!!back-token;
5369 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5370 redo B;
5371 } elsif ({
5372 table => 1, caption => 1, td => 1, th => 1,
5373 button => 1, marquee => 1, object => 1, html => 1,
5374 }->{$_->[1]}) {
5375 last INSCOPE;
5376 }
5377 } # INSCOPE
5378
5379 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5380 pop @{$self->{open_elements}};
5381
5382 !!!next-token;
5383 redo B;
5384 } elsif ($token->{tag_name} eq 'input') {
5385 $reconstruct_active_formatting_elements->($insert_to_current);
5386
5387 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5388 ## TODO: associate with $self->{form_element} if defined
5389 pop @{$self->{open_elements}};
5390
5391 !!!next-token;
5392 redo B;
5393 } elsif ($token->{tag_name} eq 'isindex') {
5394 !!!parse-error (type => 'isindex');
5395
5396 if (defined $self->{form_element}) {
5397 ## Ignore the token
5398 !!!next-token;
5399 redo B;
5400 } else {
5401 my $at = $token->{attributes};
5402 my $form_attrs;
5403 $form_attrs->{action} = $at->{action} if $at->{action};
5404 my $prompt_attr = $at->{prompt};
5405 $at->{name} = {name => 'name', value => 'isindex'};
5406 delete $at->{action};
5407 delete $at->{prompt};
5408 my @tokens = (
5409 {type => START_TAG_TOKEN, tag_name => 'form',
5410 attributes => $form_attrs},
5411 {type => START_TAG_TOKEN, tag_name => 'hr'},
5412 {type => START_TAG_TOKEN, tag_name => 'p'},
5413 {type => START_TAG_TOKEN, tag_name => 'label'},
5414 );
5415 if ($prompt_attr) {
5416 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
5417 } else {
5418 push @tokens, {type => CHARACTER_TOKEN,
5419 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
5420 ## TODO: make this configurable
5421 }
5422 push @tokens,
5423 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
5424 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
5425 {type => END_TAG_TOKEN, tag_name => 'label'},
5426 {type => END_TAG_TOKEN, tag_name => 'p'},
5427 {type => START_TAG_TOKEN, tag_name => 'hr'},
5428 {type => END_TAG_TOKEN, tag_name => 'form'};
5429 $token = shift @tokens;
5430 !!!back-token (@tokens);
5431 redo B;
5432 }
5433 } elsif ($token->{tag_name} eq 'textarea') {
5434 my $tag_name = $token->{tag_name};
5435 my $el;
5436 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
5437
5438 ## TODO: $self->{form_element} if defined
5439 $self->{content_model} = RCDATA_CONTENT_MODEL;
5440 delete $self->{escape}; # MUST
5441
5442 $insert->($el);
5443
5444 my $text = '';
5445 !!!next-token;
5446 if ($token->{type} == CHARACTER_TOKEN) {
5447 $token->{data} =~ s/^\x0A//;
5448 unless (length $token->{data}) {
5449 !!!next-token;
5450 }
5451 }
5452 while ($token->{type} == CHARACTER_TOKEN) {
5453 $text .= $token->{data};
5454 !!!next-token;
5455 }
5456 if (length $text) {
5457 $el->manakai_append_text ($text);
5458 }
5459
5460 $self->{content_model} = PCDATA_CONTENT_MODEL;
5461
5462 if ($token->{type} == END_TAG_TOKEN and
5463 $token->{tag_name} eq $tag_name) {
5464 ## Ignore the token
5465 } else {
5466 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
5467 }
5468 !!!next-token;
5469 redo B;
5470 } elsif ({
5471 iframe => 1,
5472 noembed => 1,
5473 noframes => 1,
5474 noscript => 0, ## TODO: 1 if scripting is enabled
5475 }->{$token->{tag_name}}) {
5476 ## NOTE: There is an "as if in body" code clone.
5477 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
5478 redo B;
5479 } elsif ($token->{tag_name} eq 'select') {
5480 $reconstruct_active_formatting_elements->($insert_to_current);
5481
5482 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5483
5484 $self->{insertion_mode} = IN_SELECT_IM;
5485 !!!next-token;
5486 redo B;
5487 } elsif ({
5488 caption => 1, col => 1, colgroup => 1, frame => 1,
5489 frameset => 1, head => 1, option => 1, optgroup => 1,
5490 tbody => 1, td => 1, tfoot => 1, th => 1,
5491 thead => 1, tr => 1,
5492 }->{$token->{tag_name}}) {
5493 !!!parse-error (type => 'in body:'.$token->{tag_name});
5494 ## Ignore the token
5495 !!!next-token;
5496 redo B;
5497
5498 ## ISSUE: An issue on HTML5 new elements in the spec.
5499 } else {
5500 $reconstruct_active_formatting_elements->($insert_to_current);
5501
5502 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5503
5504 !!!next-token;
5505 redo B;
5506 }
5507 } elsif ($token->{type} == END_TAG_TOKEN) {
5508 if ($token->{tag_name} eq 'body') {
5509 if (@{$self->{open_elements}} > 1 and
5510 $self->{open_elements}->[1]->[1] eq 'body') {
5511 for (@{$self->{open_elements}}) {
5512 unless ({
5513 dd => 1, dt => 1, li => 1, p => 1, td => 1,
5514 th => 1, tr => 1, body => 1, html => 1,
5515 tbody => 1, tfoot => 1, thead => 1,
5516 }->{$_->[1]}) {
5517 !!!parse-error (type => 'not closed:'.$_->[1]);
5518 }
5519 }
5520
5521 $self->{insertion_mode} = AFTER_BODY_IM;
5522 !!!next-token;
5523 redo B;
5524 } else {
5525 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5526 ## Ignore the token
5527 !!!next-token;
5528 redo B;
5529 }
5530 } elsif ($token->{tag_name} eq 'html') {
5531 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
5532 ## ISSUE: There is an issue in the spec.
5533 if ($self->{open_elements}->[-1]->[1] ne 'body') {
5534 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
5535 }
5536 $self->{insertion_mode} = AFTER_BODY_IM;
5537 ## reprocess
5538 redo B;
5539 } else {
5540 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5541 ## Ignore the token
5542 !!!next-token;
5543 redo B;
5544 }
5545 } elsif ({
5546 address => 1, blockquote => 1, center => 1, dir => 1,
5547 div => 1, dl => 1, fieldset => 1, listing => 1,
5548 menu => 1, ol => 1, pre => 1, ul => 1,
5549 p => 1,
5550 dd => 1, dt => 1, li => 1,
5551 button => 1, marquee => 1, object => 1,
5552 }->{$token->{tag_name}}) {
5553 ## has an element in scope
5554 my $i;
5555 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5556 my $node = $self->{open_elements}->[$_];
5557 if ($node->[1] eq $token->{tag_name}) {
5558 ## generate implied end tags
5559 if ({
5560 dd => ($token->{tag_name} ne 'dd'),
5561 dt => ($token->{tag_name} ne 'dt'),
5562 li => ($token->{tag_name} ne 'li'),
5563 p => ($token->{tag_name} ne 'p'),
5564 td => 1, th => 1, tr => 1,
5565 tbody => 1, tfoot=> 1, thead => 1,
5566 }->{$self->{open_elements}->[-1]->[1]}) {
5567 !!!back-token;
5568 $token = {type => END_TAG_TOKEN,
5569 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5570 redo B;
5571 }
5572 $i = $_;
5573 last INSCOPE unless $token->{tag_name} eq 'p';
5574 } elsif ({
5575 table => 1, caption => 1, td => 1, th => 1,
5576 button => 1, marquee => 1, object => 1, html => 1,
5577 }->{$node->[1]}) {
5578 last INSCOPE;
5579 }
5580 } # INSCOPE
5581
5582 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5583 if (defined $i) {
5584 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5585 } else {
5586 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5587 }
5588 }
5589
5590 if (defined $i) {
5591 splice @{$self->{open_elements}}, $i;
5592 } elsif ($token->{tag_name} eq 'p') {
5593 ## As if <p>, then reprocess the current token
5594 my $el;
5595 !!!create-element ($el, 'p');
5596 $insert->($el);
5597 }
5598 $clear_up_to_marker->()
5599 if {
5600 button => 1, marquee => 1, object => 1,
5601 }->{$token->{tag_name}};
5602 !!!next-token;
5603 redo B;
5604 } elsif ($token->{tag_name} eq 'form') {
5605 ## has an element in scope
5606 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5607 my $node = $self->{open_elements}->[$_];
5608 if ($node->[1] eq $token->{tag_name}) {
5609 ## generate implied end tags
5610 if ({
5611 dd => 1, dt => 1, li => 1, p => 1,
5612 td => 1, th => 1, tr => 1,
5613 tbody => 1, tfoot=> 1, thead => 1,
5614 }->{$self->{open_elements}->[-1]->[1]}) {
5615 !!!back-token;
5616 $token = {type => END_TAG_TOKEN,
5617 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5618 redo B;
5619 }
5620 last INSCOPE;
5621 } elsif ({
5622 table => 1, caption => 1, td => 1, th => 1,
5623 button => 1, marquee => 1, object => 1, html => 1,
5624 }->{$node->[1]}) {
5625 last INSCOPE;
5626 }
5627 } # INSCOPE
5628
5629 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5630 pop @{$self->{open_elements}};
5631 } else {
5632 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5633 }
5634
5635 undef $self->{form_element};
5636 !!!next-token;
5637 redo B;
5638 } elsif ({
5639 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5640 }->{$token->{tag_name}}) {
5641 ## has an element in scope
5642 my $i;
5643 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5644 my $node = $self->{open_elements}->[$_];
5645 if ({
5646 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5647 }->{$node->[1]}) {
5648 ## generate implied end tags
5649 if ({
5650 dd => 1, dt => 1, li => 1, p => 1,
5651 td => 1, th => 1, tr => 1,
5652 tbody => 1, tfoot=> 1, thead => 1,
5653 }->{$self->{open_elements}->[-1]->[1]}) {
5654 !!!back-token;
5655 $token = {type => END_TAG_TOKEN,
5656 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5657 redo B;
5658 }
5659 $i = $_;
5660 last INSCOPE;
5661 } elsif ({
5662 table => 1, caption => 1, td => 1, th => 1,
5663 button => 1, marquee => 1, object => 1, html => 1,
5664 }->{$node->[1]}) {
5665 last INSCOPE;
5666 }
5667 } # INSCOPE
5668
5669 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5670 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5671 }
5672
5673 splice @{$self->{open_elements}}, $i if defined $i;
5674 !!!next-token;
5675 redo B;
5676 } elsif ({
5677 a => 1,
5678 b => 1, big => 1, em => 1, font => 1, i => 1,
5679 nobr => 1, s => 1, small => 1, strile => 1,
5680 strong => 1, tt => 1, u => 1,
5681 }->{$token->{tag_name}}) {
5682 $formatting_end_tag->($token->{tag_name});
5683 redo B;
5684 } elsif ($token->{tag_name} eq 'br') {
5685 !!!parse-error (type => 'unmatched end tag:br');
5686
5687 ## As if <br>
5688 $reconstruct_active_formatting_elements->($insert_to_current);
5689
5690 my $el;
5691 !!!create-element ($el, 'br');
5692 $insert->($el);
5693
5694 ## Ignore the token.
5695 !!!next-token;
5696 redo B;
5697 } elsif ({
5698 caption => 1, col => 1, colgroup => 1, frame => 1,
5699 frameset => 1, head => 1, option => 1, optgroup => 1,
5700 tbody => 1, td => 1, tfoot => 1, th => 1,
5701 thead => 1, tr => 1,
5702 area => 1, basefont => 1, bgsound => 1,
5703 embed => 1, hr => 1, iframe => 1, image => 1,
5704 img => 1, input => 1, isindex => 1, noembed => 1,
5705 noframes => 1, param => 1, select => 1, spacer => 1,
5706 table => 1, textarea => 1, wbr => 1,
5707 noscript => 0, ## TODO: if scripting is enabled
5708 }->{$token->{tag_name}}) {
5709 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5710 ## Ignore the token
5711 !!!next-token;
5712 redo B;
5713
5714 ## ISSUE: Issue on HTML5 new elements in spec
5715
5716 } else {
5717 ## Step 1
5718 my $node_i = -1;
5719 my $node = $self->{open_elements}->[$node_i];
5720
5721 ## Step 2
5722 S2: {
5723 if ($node->[1] eq $token->{tag_name}) {
5724 ## Step 1
5725 ## generate implied end tags
5726 if ({
5727 dd => 1, dt => 1, li => 1, p => 1,
5728 td => 1, th => 1, tr => 1,
5729 tbody => 1, tfoot => 1, thead => 1,
5730 }->{$self->{open_elements}->[-1]->[1]}) {
5731 !!!back-token;
5732 $token = {type => END_TAG_TOKEN,
5733 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5734 redo B;
5735 }
5736
5737 ## Step 2
5738 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5739 ## NOTE: <x><y></x>
5740 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5741 }
5742
5743 ## Step 3
5744 splice @{$self->{open_elements}}, $node_i;
5745
5746 !!!next-token;
5747 last S2;
5748 } else {
5749 ## Step 3
5750 if (not $formatting_category->{$node->[1]} and
5751 #not $phrasing_category->{$node->[1]} and
5752 ($special_category->{$node->[1]} or
5753 $scoping_category->{$node->[1]})) {
5754 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5755 ## Ignore the token
5756 !!!next-token;
5757 last S2;
5758 }
5759 }
5760
5761 ## Step 4
5762 $node_i--;
5763 $node = $self->{open_elements}->[$node_i];
5764
5765 ## Step 5;
5766 redo S2;
5767 } # S2
5768 redo B;
5769 }
5770 }
5771 redo B;
5772 } # B
5773
5774 ## NOTE: The "trailing end" phase in HTML5 is split into
5775 ## two insertion modes: "after html body" and "after html frameset".
5776 ## NOTE: States in the main stage is preserved while
5777 ## the parser stays in the trailing end phase. # MUST
5778
5779 ## Stop parsing # MUST
5780
5781 ## TODO: script stuffs
5782 } # _tree_construct_main
5783
## Implements the |innerHTML| setter for a Document or an Element node.
##
## Called as a class method with the following arguments:
##
##   $node    - The node whose children are replaced.  Node type 9
##              (document) and node type 1 (element) are supported.
##   $string  - The markup to parse.  It is reached through a reference
##              to the caller's argument rather than through a copy.
##   $onerror - Optional code reference that receives parse errors as a
##              key/value list.  In the element case |line| and |column|
##              are appended to that list, and a handler that |warn|s is
##              used when no code reference is given.
##
## Dies when |$node| has any other node type.
sub set_inner_html ($$$) {
  my ($class, $node) = @_[0, 1];
  my $src_ref = \$_[2];
  my $onerror = $_[3];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;

  if ($nt == 9) {
    ## Document case # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Iterate over a snapshot of the child list (presumably because
    ## |child_nodes| is a live list -- confirm against the DOM
    ## implementation in use).
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$src_ref => $node, $onerror);
  }

  unless ($nt == 1) {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }

  ## Element case
  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## The fragment is parsed into a scratch HTML document created from
  ## the implementation of the node's owner document.
  my $owner_doc = $node->owner_document;
  my $scratch_doc = $owner_doc->implementation->create_document;
  $scratch_doc->manakai_is_html (1);
  my $parser = $class->new;
  $parser->{document} = $scratch_doc;

  ## Step 9 # MUST
  ## Input stream state shared by the two closures below.
  my ($pos, $line_no, $col_no) = (0, 1, 0);
  $parser->{set_next_char} = sub {
    my $self = shift;

    ## Shift the three-slot history of previously read characters.
    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    ## End of input is reported as -1.
    if ($pos >= length $$src_ref) {
      $self->{next_char} = -1;
      return;
    }

    my $code = ord substr ($$src_ref, $pos++, 1);
    $self->{next_char} = $code;
    $col_no++;

    if ($code == 0x000A) { # LF
      $line_no++;
      $col_no = 0;
    } elsif ($code == 0x000D) { # CR
      ## A CR LF pair is consumed as one character.
      $pos++ if substr ($$src_ref, $pos, 1) eq "\x0A";
      $self->{next_char} = 0x000A; # LF # MUST
      $line_no++;
      $col_no = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $parser->{prev_char} = [-1, -1, -1];
  $parser->{next_char} = -1;

  ## Error reporting: use the caller's handler if one was given,
  ## otherwise warn.  The current input position is always appended.
  my $report_error = $onerror || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $parser->{parse_error} = sub {
    $report_error->(@_, line => $line_no, column => $col_no);
  };

  $parser->_initialize_tokenizer;
  $parser->_initialize_tree_constructor;

  ## Step 2
  ## The initial content model depends on the local name of the context
  ## element; any name that is not listed gets PCDATA.
  ## NOTE: |noscript| is mapped to CDATA unconditionally here.
  my $node_ln = $node->manakai_local_name;
  my $initial_model = {
    title => RCDATA_CONTENT_MODEL,
    textarea => RCDATA_CONTENT_MODEL,
    style => CDATA_CONTENT_MODEL,
    script => CDATA_CONTENT_MODEL,
    xmp => CDATA_CONTENT_MODEL,
    iframe => CDATA_CONTENT_MODEL,
    noembed => CDATA_CONTENT_MODEL,
    noframes => CDATA_CONTENT_MODEL,
    noscript => CDATA_CONTENT_MODEL,
    plaintext => PLAINTEXT_CONTENT_MODEL,
  }->{$node_ln};
  $parser->{content_model}
      = defined $initial_model ? $initial_model : PCDATA_CONTENT_MODEL;
  ## ISSUE: What is "the name of the element"? local name?

  $parser->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $scratch_doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $scratch_doc->append_child ($root);

  ## Step 6 # MUST
  push @{$parser->{open_elements}}, [$root, 'html'];

  undef $parser->{head_element};

  ## Step 7 # MUST
  $parser->_reset_insertion_mode;

  ## Step 8 # MUST
  ## Starting from the context node itself, find the nearest XHTML
  ## |form| element and record it as the form element pointer.
  ANCESTOR: for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next ANCESTOR unless $anc->node_type == 1;
    my $ns = $anc->namespace_uri;
    next ANCESTOR
        unless defined $ns and $ns eq 'http://www.w3.org/1999/xhtml';
    next ANCESTOR unless $anc->manakai_local_name eq 'form';
    $parser->{form_element} = $anc;
    last ANCESTOR;
  } # ANCESTOR

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    ## The parser is made available as |$self| in this scope,
    ## presumably because the next-token macro expands to code that
    ## refers to |$self| -- confirm against the macro definition.
    my $self = $parser;
    !!!next-token;
  }
  $parser->_tree_construction_main;

  ## Step 11 # MUST
  my @stale_children = @{$node->child_nodes};
  for my $child (@stale_children) {
    $node->remove_child ($child);
  }
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## Move everything parsed under the scratch root into the context
  ## node, adopting each node into the context node's owner document.
  my @parsed_children = @{$root->child_nodes};
  for my $child (@parsed_children) {
    $owner_doc->adopt_node ($child);
    $node->append_child ($child);
  }
  ## ISSUE: mutation events?

  $parser->_terminate_tree_constructor;
} # set_inner_html
5941
5942 } # tree construction stage
5943
## Whatpm::HTML::RestartParser is a subclass of |Error| (the module
## loaded by |use Error qw(:try)| at the top of this file).
## NOTE(review): judging by its name, this is the exception used to
## signal that parsing must be restarted; confirm at the throw site.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

1;
5948 # $Date: 2008/03/03 10:20:19 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24