/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.72 - (show annotations) (download) (as text)
Sun Mar 2 14:32:26 2008 UTC (17 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.71: +89 -9 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	2 Mar 2008 14:06:22 -0000
	* tokenizer-test-1.test: Tests for |<span ===>| is added (HTML5
	revision 1292).  Tests for & at the end of attribute value
	are added (HTML5 revision 1296).  Tests for bogus comments
	are added (HTML5 revision 1297).  Tests for |=| in
	unquoted attribute values are added (HTML5 revision 1299).
	Tests for single or double quotes in unquoted attribute
	values or attribute names and tests for missing spaces
	between attributes are added (HTML5 revision 1303).

2008-03-02  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	2 Mar 2008 14:05:38 -0000
	* HTML.pm.src: Raise a parse error for |<span ===>| (HTML5 revision
	1292).  Entities are not parsed in comment-like part in RCDATA
	elements (HTML5 revision 1294).  Allow bare & at the end
	of attribute value literals (HTML5 revision 1296).  More
	quirks mode doctypes (HTML5 revision 1302).  Requires spaces
	between attributes and ban attribute names or unquoted
	attribute values containing single or double quotes (HTML5
	revision 1303).

2008-03-02  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.71 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12 ## TODO: 1252 parse error (revision 1264)
13 ## TODO: 8859-11 = 874 (revision 1271)
14
## Start tag names for which a trailing "/" immediately before ">"
## (as in |<br/>|) is accepted by the tokenizer without raising the
## 'nestc' parse error.  Keys are lowercase tag names; values are 1.
my $permitted_slash_tag_name = {
  map { $_ => 1 } qw(
    base link meta hr br img embed param area col input
  )
};
28
## Replacement code points for the code points 0x80..0x9F.  The keys
## are the (decimal-stringified) source code points and the values are
## the code points substituted for them; positions with no assigned
## replacement map to U+FFFD REPLACEMENT CHARACTER.
## NOTE(review): the values look like the Windows-1252 interpretation
## of the C1 range; the code that consults this table is not in view.
my $c1_entity_char = do {
  ## Listed in source order, starting at 0x80.
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  +{ map { ((0x80 + $_) => $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
63
## Element names in the "special" category.  Keys are lowercase element
## names; values are 1.  Elements listed in none of the category tables
## are treated as "phrasing".
my $special_category = {
  map { $_ => 1 } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  )
};
## Element names in the "scoping" category.  Keys are lowercase element
## names; values are 1.
my $scoping_category = {
  map { $_ => 1 } qw(button caption html marquee object table td th)
};
## Element names in the "formatting" category.  Keys are lowercase
## element names; values are 1.
## NOTE: The key |strike| was previously misspelled as |strile|, which
## left the |strike| element out of this category.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
84 # $phrasing_category: all other elements
85
## Parse a string of bytes as an HTML document.
##
##   $return = $parser_or_class->parse_byte_string
##       ($charset, $bytes, @parse_char_string_args);
##
## Arguments:
##   invocant - A parser object, or a class name (a new parser is then
##              created by |new|).
##   $charset - Encoding name used to decode the bytes.  If it is not
##              defined, the encoding is guessed from the first 1024
##              bytes by |Whatpm::Charset::UniversalCharDet|, falling
##              back to |windows-1252|, and the parser is marked as not
##              confident about the encoding.
##   $bytes   - The byte string, or a reference to it.
##   The remaining arguments are handed to |parse_char_string| as is.
##
## Returns the value returned by |parse_char_string|.
##
## While parsing, |$self->{change_encoding}| may throw
## |Whatpm::HTML::RestartParser|; the input is then decoded again with
## the requested encoding and parsed once more from the beginning.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## |Encode::decode| is used by both branches below and by the restart
  ## handler, so the module has to be loaded whichever path is taken.
  require Encode; ## TODO: decode(utf8) don't delete BOM

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 1;
  } else {
    ## TODO: Implement HTML5 detection algorithm
    require Whatpm::Charset::UniversalCharDet;
    $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
        (substr ($$bytes_s, 0, 1024));
    $charset ||= 'windows-1252';
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset;
    $self->{confident} = 0;
  }

  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    ## NOTE: |$charset| has been lowercased above while a detected
    ## |input_encoding| is stored as reported by the detector, so the
    ## comparison has to ignore case; otherwise the parser would be
    ## restarted for an encoding that is already in use.
    if (defined $self->{input_encoding} and
        lc ($self->{input_encoding}) eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
        ':'.$charset, level => 'w');

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    throw Whatpm::HTML::RestartParser (charset => $charset);
  }; # $self->{change_encoding}

  ## Everything after the byte string is passed through unchanged.
  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Decode the original bytes again with the newly declared encoding
    ## and parse from the beginning, now confident about the encoding.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: normalize
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
155
156 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
157 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
158 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
159 ## because the core part of our HTML parser expects a string of characters,
160 ## not a string of bytes or code units or anything which might contain a BOM.
161 ## Therefore, any parser interface that accepts a string of bytes,
162 ## such as |parse_byte_string| in this module, must ensure that it does
163 ## strip the BOM and never strip any ZWNBSP.
164
## |parse_char_string| is another name for |parse_string|.
*parse_char_string = \&parse_string;

## Parse a string of characters into a document.
##
##   $doc = $parser_or_class->parse_string ($s, $doc, $onerror);
##
## Arguments:
##   invocant - A parser object, or a class name (a new parser is then
##              created by |new|).
##   $s       - The character string, or a reference to it.
##   $doc     - The document object to populate; its existing child
##              nodes are removed first.
##   $onerror - Optional code reference called for each parse error
##              with a list of name/value pairs, to which |line| and
##              |column| are appended.  By default the error is
##              reported by |warn|.
##
## Returns |$self->{document}|.
sub parse_string ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  ## Refer to the caller's string instead of copying it.
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  my $doc = $self->{document} = $_[1];
  @{$doc->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  exists $self->{confident} or $self->{confident} = 1;
  if (defined $self->{input_encoding}) {
    $doc->input_encoding ($self->{input_encoding});
  }

  ## Input position, shared by the two closures below.
  my $i = 0;
  my $line = 1;
  my $column = 0;

  ## Advance to the next input character: the current character is
  ## pushed to the front of |prev_input_character| (dropping its last
  ## entry) and |next_input_character| becomes the code point of the
  ## next character, or -1 at the end of the input.
  $self->{set_next_input_character} = sub {
    my $self = shift;

    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    if ($i >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    my $code = $self->{next_input_character} = ord substr ($$s, $i++, 1);
    $column++;

    if ($code == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($code == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $i++ if substr ($$s, $i, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Append the current input position to every reported error.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
225
## Create a new parser object.
##
##   $parser = Whatpm::HTML->new;
##
## The object is a blessed hash whose callback slots are filled with
## defaults:
##   set_next_input_character    - Sets |next_input_character| of the
##                                 parser given as its first argument
##                                 to -1 (no input).
##   parse_error                 - Does nothing.
##   change_encoding             - Does nothing.
##   application_cache_selection - Does nothing.
##
## Returns the new object.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_input_character} = sub {
    ## The parser is taken from the argument list, as the callbacks
    ## installed by |parse_string| and |parse_byte_string| do.  Closing
    ## over the outer |$self| would make the object refer to itself
    ## through this closure and keep it from ever being freed.
    my $self = shift;
    $self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
246
## Content model flags.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, expressed as combinations of the flags above.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
255
## Tokenizer states; |$self->{state}| holds one of these values.
use constant {
  DATA_STATE                                    => 0,
  ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
};
290
## Token types; the |type| member of a token holds one of these values.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
297
## Insertion mode group flags (|*_IMS|).  Each group occupies one bit
## so that membership in a group can be tested with a bitwise AND.
use constant {
  AFTER_HTML_IMS => 0b100,
  HEAD_IMS       => 0b1000,
  BODY_IMS       => 0b10000,
  BODY_TABLE_IMS => 0b100000,
  TABLE_IMS      => 0b1000000,
  ROW_IMS        => 0b10000000,
  BODY_AFTER_IMS => 0b100000000,
  FRAME_IMS      => 0b1000000000,
};

## Insertion modes (|*_IM|): group flags combined with a value in the
## two lowest bits that distinguishes the modes sharing the same groups.
use constant {
  AFTER_HTML_BODY_IM     => AFTER_HTML_IMS | BODY_AFTER_IMS,
  AFTER_HTML_FRAMESET_IM => AFTER_HTML_IMS | FRAME_IMS,
  IN_HEAD_IM             => HEAD_IMS | 0b00,
  IN_HEAD_NOSCRIPT_IM    => HEAD_IMS | 0b01,
  AFTER_HEAD_IM          => HEAD_IMS | 0b10,
  BEFORE_HEAD_IM         => HEAD_IMS | 0b11,
  IN_BODY_IM             => BODY_IMS,
  IN_CELL_IM             => BODY_IMS | BODY_TABLE_IMS | 0b01,
  IN_CAPTION_IM          => BODY_IMS | BODY_TABLE_IMS | 0b10,
  IN_ROW_IM              => TABLE_IMS | ROW_IMS | 0b01,
  IN_TABLE_BODY_IM       => TABLE_IMS | ROW_IMS | 0b10,
  IN_TABLE_IM            => TABLE_IMS,
  AFTER_BODY_IM          => BODY_AFTER_IMS,
  IN_FRAMESET_IM         => FRAME_IMS | 0b01,
  AFTER_FRAMESET_IM      => FRAME_IMS | 0b10,
  IN_SELECT_IM           => 0b01,
  IN_COLUMN_GROUP_IM     => 0b10,
};
324
325 ## Implementations MUST act as if state machine in the spec
326
## Reset the tokenizer: the state becomes the data state, the content
## model becomes PCDATA, the token and attribute under construction and
## the remembered start tag name are cleared, and the first input
## character is read.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## The tokenizer MUST start in the data state with the PCDATA
  ## content model.
  @$self{qw(state content_model)} = (DATA_STATE, PCDATA_CONTENT_MODEL);

  ## |current_token| is a start tag, end tag, comment, or DOCTYPE
  ## token while one is being built.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);

  ## NOTE(review): |char| is emptied before the first character is
  ## read; presumably it holds characters given back to the input by
  ## |!!!back-next-input-character| - confirm against the macro.
  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;

  ## Queue of tokens already produced but not yet returned by
  ## |_get_next_token|.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
341
342 ## A token has:
343 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
344 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
345 ## ->{name} (DOCTYPE_TOKEN)
346 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
347 ## ->{public_identifier} (DOCTYPE_TOKEN)
348 ## ->{system_identifier} (DOCTYPE_TOKEN)
349 ## ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
350 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
351 ## ->{name}
352 ## ->{value}
353 ## ->{has_reference} == 1 or 0
354 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
355
356 ## Emitted token MUST immediately be handled by the tree construction state.
357
358 ## Before each step, UA MAY check to see if either one of the scripts in
359 ## "list of scripts that will execute as soon as possible" or the first
360 ## script in the "list of scripts that will execute asynchronously",
361 ## has completed loading. If one has, then it MUST be executed
362 ## and removed from the list.
363
364 ## NOTE: HTML5 "Writing HTML documents" section, applied to
365 ## documents and not to user agents and conformance checkers,
366 ## contains some requirements that are not detected by the
367 ## parsing algorithm:
368 ## - Some requirements on character encoding declarations. ## TODO
369 ## - "Elements MUST NOT contain content that their content model disallows."
370 ## ... Some are parse error, some are not (will be reported by c.c.).
371 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
372 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
373 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
374
375 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
376 ## be detected by the HTML5 parsing algorithm:
377 ## - Text,
378
379 sub _get_next_token ($) {
380 my $self = shift;
381 if (@{$self->{token}}) {
382 return shift @{$self->{token}};
383 }
384
385 A: {
386 if ($self->{state} == DATA_STATE) {
387 if ($self->{next_input_character} == 0x0026) { # &
388 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
389 not $self->{escape}) {
390 $self->{state} = ENTITY_DATA_STATE;
391 !!!next-input-character;
392 redo A;
393 } else {
394 #
395 }
396 } elsif ($self->{next_input_character} == 0x002D) { # -
397 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
398 unless ($self->{escape}) {
399 if ($self->{prev_input_character}->[0] == 0x002D and # -
400 $self->{prev_input_character}->[1] == 0x0021 and # !
401 $self->{prev_input_character}->[2] == 0x003C) { # <
402 $self->{escape} = 1;
403 }
404 }
405 }
406
407 #
408 } elsif ($self->{next_input_character} == 0x003C) { # <
409 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
410 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
411 not $self->{escape})) {
412 $self->{state} = TAG_OPEN_STATE;
413 !!!next-input-character;
414 redo A;
415 } else {
416 #
417 }
418 } elsif ($self->{next_input_character} == 0x003E) { # >
419 if ($self->{escape} and
420 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
421 if ($self->{prev_input_character}->[0] == 0x002D and # -
422 $self->{prev_input_character}->[1] == 0x002D) { # -
423 delete $self->{escape};
424 }
425 }
426
427 #
428 } elsif ($self->{next_input_character} == -1) {
429 !!!emit ({type => END_OF_FILE_TOKEN});
430 last A; ## TODO: ok?
431 }
432 # Anything else
433 my $token = {type => CHARACTER_TOKEN,
434 data => chr $self->{next_input_character}};
435 ## Stay in the data state
436 !!!next-input-character;
437
438 !!!emit ($token);
439
440 redo A;
441 } elsif ($self->{state} == ENTITY_DATA_STATE) {
442 ## (cannot happen in CDATA state)
443
444 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
445
446 $self->{state} = DATA_STATE;
447 # next-input-character is already done
448
449 unless (defined $token) {
450 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
451 } else {
452 !!!emit ($token);
453 }
454
455 redo A;
456 } elsif ($self->{state} == TAG_OPEN_STATE) {
457 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
458 if ($self->{next_input_character} == 0x002F) { # /
459 !!!next-input-character;
460 $self->{state} = CLOSE_TAG_OPEN_STATE;
461 redo A;
462 } else {
463 ## reconsume
464 $self->{state} = DATA_STATE;
465
466 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
467
468 redo A;
469 }
470 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
471 if ($self->{next_input_character} == 0x0021) { # !
472 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
473 !!!next-input-character;
474 redo A;
475 } elsif ($self->{next_input_character} == 0x002F) { # /
476 $self->{state} = CLOSE_TAG_OPEN_STATE;
477 !!!next-input-character;
478 redo A;
479 } elsif (0x0041 <= $self->{next_input_character} and
480 $self->{next_input_character} <= 0x005A) { # A..Z
481 $self->{current_token}
482 = {type => START_TAG_TOKEN,
483 tag_name => chr ($self->{next_input_character} + 0x0020)};
484 $self->{state} = TAG_NAME_STATE;
485 !!!next-input-character;
486 redo A;
487 } elsif (0x0061 <= $self->{next_input_character} and
488 $self->{next_input_character} <= 0x007A) { # a..z
489 $self->{current_token} = {type => START_TAG_TOKEN,
490 tag_name => chr ($self->{next_input_character})};
491 $self->{state} = TAG_NAME_STATE;
492 !!!next-input-character;
493 redo A;
494 } elsif ($self->{next_input_character} == 0x003E) { # >
495 !!!parse-error (type => 'empty start tag');
496 $self->{state} = DATA_STATE;
497 !!!next-input-character;
498
499 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
500
501 redo A;
502 } elsif ($self->{next_input_character} == 0x003F) { # ?
503 !!!parse-error (type => 'pio');
504 $self->{state} = BOGUS_COMMENT_STATE;
505 ## $self->{next_input_character} is intentionally left as is
506 redo A;
507 } else {
508 !!!parse-error (type => 'bare stago');
509 $self->{state} = DATA_STATE;
510 ## reconsume
511
512 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
513
514 redo A;
515 }
516 } else {
517 die "$0: $self->{content_model} in tag open";
518 }
519 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
520 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
521 if (defined $self->{last_emitted_start_tag_name}) {
522 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
523 my @next_char;
524 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
525 push @next_char, $self->{next_input_character};
526 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
527 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
528 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
529 !!!next-input-character;
530 next TAGNAME;
531 } else {
532 $self->{next_input_character} = shift @next_char; # reconsume
533 !!!back-next-input-character (@next_char);
534 $self->{state} = DATA_STATE;
535
536 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
537
538 redo A;
539 }
540 }
541 push @next_char, $self->{next_input_character};
542
543 unless ($self->{next_input_character} == 0x0009 or # HT
544 $self->{next_input_character} == 0x000A or # LF
545 $self->{next_input_character} == 0x000B or # VT
546 $self->{next_input_character} == 0x000C or # FF
547 $self->{next_input_character} == 0x0020 or # SP
548 $self->{next_input_character} == 0x003E or # >
549 $self->{next_input_character} == 0x002F or # /
550 $self->{next_input_character} == -1) {
551 $self->{next_input_character} = shift @next_char; # reconsume
552 !!!back-next-input-character (@next_char);
553 $self->{state} = DATA_STATE;
554 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
555 redo A;
556 } else {
557 $self->{next_input_character} = shift @next_char;
558 !!!back-next-input-character (@next_char);
559 # and consume...
560 }
561 } else {
562 ## No start tag token has ever been emitted
563 # next-input-character is already done
564 $self->{state} = DATA_STATE;
565 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
566 redo A;
567 }
568 }
569
570 if (0x0041 <= $self->{next_input_character} and
571 $self->{next_input_character} <= 0x005A) { # A..Z
572 $self->{current_token} = {type => END_TAG_TOKEN,
573 tag_name => chr ($self->{next_input_character} + 0x0020)};
574 $self->{state} = TAG_NAME_STATE;
575 !!!next-input-character;
576 redo A;
577 } elsif (0x0061 <= $self->{next_input_character} and
578 $self->{next_input_character} <= 0x007A) { # a..z
579 $self->{current_token} = {type => END_TAG_TOKEN,
580 tag_name => chr ($self->{next_input_character})};
581 $self->{state} = TAG_NAME_STATE;
582 !!!next-input-character;
583 redo A;
584 } elsif ($self->{next_input_character} == 0x003E) { # >
585 !!!parse-error (type => 'empty end tag');
586 $self->{state} = DATA_STATE;
587 !!!next-input-character;
588 redo A;
589 } elsif ($self->{next_input_character} == -1) {
590 !!!parse-error (type => 'bare etago');
591 $self->{state} = DATA_STATE;
592 # reconsume
593
594 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
595
596 redo A;
597 } else {
598 !!!parse-error (type => 'bogus end tag');
599 $self->{state} = BOGUS_COMMENT_STATE;
600 ## $self->{next_input_character} is intentionally left as is
601 redo A;
602 }
603 } elsif ($self->{state} == TAG_NAME_STATE) {
604 if ($self->{next_input_character} == 0x0009 or # HT
605 $self->{next_input_character} == 0x000A or # LF
606 $self->{next_input_character} == 0x000B or # VT
607 $self->{next_input_character} == 0x000C or # FF
608 $self->{next_input_character} == 0x0020) { # SP
609 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
610 !!!next-input-character;
611 redo A;
612 } elsif ($self->{next_input_character} == 0x003E) { # >
613 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
614 $self->{current_token}->{first_start_tag}
615 = not defined $self->{last_emitted_start_tag_name};
616 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
617 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
618 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
619 if ($self->{current_token}->{attributes}) {
620 !!!parse-error (type => 'end tag attribute');
621 }
622 } else {
623 die "$0: $self->{current_token}->{type}: Unknown token type";
624 }
625 $self->{state} = DATA_STATE;
626 !!!next-input-character;
627
628 !!!emit ($self->{current_token}); # start tag or end tag
629
630 redo A;
631 } elsif (0x0041 <= $self->{next_input_character} and
632 $self->{next_input_character} <= 0x005A) { # A..Z
633 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
634 # start tag or end tag
635 ## Stay in this state
636 !!!next-input-character;
637 redo A;
638 } elsif ($self->{next_input_character} == -1) {
639 !!!parse-error (type => 'unclosed tag');
640 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
641 $self->{current_token}->{first_start_tag}
642 = not defined $self->{last_emitted_start_tag_name};
643 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
644 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
645 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
646 if ($self->{current_token}->{attributes}) {
647 !!!parse-error (type => 'end tag attribute');
648 }
649 } else {
650 die "$0: $self->{current_token}->{type}: Unknown token type";
651 }
652 $self->{state} = DATA_STATE;
653 # reconsume
654
655 !!!emit ($self->{current_token}); # start tag or end tag
656
657 redo A;
658 } elsif ($self->{next_input_character} == 0x002F) { # /
659 !!!next-input-character;
660 if ($self->{next_input_character} == 0x003E and # >
661 $self->{current_token}->{type} == START_TAG_TOKEN and
662 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
663 # permitted slash
664 #
665 } else {
666 !!!parse-error (type => 'nestc');
667 }
668 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
669 # next-input-character is already done
670 redo A;
671 } else {
672 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
673 # start tag or end tag
674 ## Stay in the state
675 !!!next-input-character;
676 redo A;
677 }
678 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
679 if ($self->{next_input_character} == 0x0009 or # HT
680 $self->{next_input_character} == 0x000A or # LF
681 $self->{next_input_character} == 0x000B or # VT
682 $self->{next_input_character} == 0x000C or # FF
683 $self->{next_input_character} == 0x0020) { # SP
684 ## Stay in the state
685 !!!next-input-character;
686 redo A;
687 } elsif ($self->{next_input_character} == 0x003E) { # >
688 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
689 $self->{current_token}->{first_start_tag}
690 = not defined $self->{last_emitted_start_tag_name};
691 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
692 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
693 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
694 if ($self->{current_token}->{attributes}) {
695 !!!parse-error (type => 'end tag attribute');
696 }
697 } else {
698 die "$0: $self->{current_token}->{type}: Unknown token type";
699 }
700 $self->{state} = DATA_STATE;
701 !!!next-input-character;
702
703 !!!emit ($self->{current_token}); # start tag or end tag
704
705 redo A;
706 } elsif (0x0041 <= $self->{next_input_character} and
707 $self->{next_input_character} <= 0x005A) { # A..Z
708 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
709 value => ''};
710 $self->{state} = ATTRIBUTE_NAME_STATE;
711 !!!next-input-character;
712 redo A;
713 } elsif ($self->{next_input_character} == 0x002F) { # /
714 !!!next-input-character;
715 if ($self->{next_input_character} == 0x003E and # >
716 $self->{current_token}->{type} == START_TAG_TOKEN and
717 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
718 # permitted slash
719 #
720 } else {
721 !!!parse-error (type => 'nestc');
722 }
723 ## Stay in the state
724 # next-input-character is already done
725 redo A;
726 } elsif ($self->{next_input_character} == -1) {
727 !!!parse-error (type => 'unclosed tag');
728 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
729 $self->{current_token}->{first_start_tag}
730 = not defined $self->{last_emitted_start_tag_name};
731 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
732 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
733 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
734 if ($self->{current_token}->{attributes}) {
735 !!!parse-error (type => 'end tag attribute');
736 }
737 } else {
738 die "$0: $self->{current_token}->{type}: Unknown token type";
739 }
740 $self->{state} = DATA_STATE;
741 # reconsume
742
743 !!!emit ($self->{current_token}); # start tag or end tag
744
745 redo A;
746 } else {
747 if ({
748 0x0022 => 1, # "
749 0x0027 => 1, # '
750 0x003D => 1, # =
751 }->{$self->{next_input_character}}) {
752 !!!parse-error (type => 'bad attribute name');
753 }
754 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
755 value => ''};
756 $self->{state} = ATTRIBUTE_NAME_STATE;
757 !!!next-input-character;
758 redo A;
759 }
760 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
761 my $before_leave = sub {
762 if (exists $self->{current_token}->{attributes} # start tag or end tag
763 ->{$self->{current_attribute}->{name}}) { # MUST
764 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
765 ## Discard $self->{current_attribute} # MUST
766 } else {
767 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
768 = $self->{current_attribute};
769 }
770 }; # $before_leave
771
772 if ($self->{next_input_character} == 0x0009 or # HT
773 $self->{next_input_character} == 0x000A or # LF
774 $self->{next_input_character} == 0x000B or # VT
775 $self->{next_input_character} == 0x000C or # FF
776 $self->{next_input_character} == 0x0020) { # SP
777 $before_leave->();
778 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
779 !!!next-input-character;
780 redo A;
781 } elsif ($self->{next_input_character} == 0x003D) { # =
782 $before_leave->();
783 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
784 !!!next-input-character;
785 redo A;
786 } elsif ($self->{next_input_character} == 0x003E) { # >
787 $before_leave->();
788 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
789 $self->{current_token}->{first_start_tag}
790 = not defined $self->{last_emitted_start_tag_name};
791 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
792 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
793 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
794 if ($self->{current_token}->{attributes}) {
795 !!!parse-error (type => 'end tag attribute');
796 }
797 } else {
798 die "$0: $self->{current_token}->{type}: Unknown token type";
799 }
800 $self->{state} = DATA_STATE;
801 !!!next-input-character;
802
803 !!!emit ($self->{current_token}); # start tag or end tag
804
805 redo A;
806 } elsif (0x0041 <= $self->{next_input_character} and
807 $self->{next_input_character} <= 0x005A) { # A..Z
808 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
809 ## Stay in the state
810 !!!next-input-character;
811 redo A;
812 } elsif ($self->{next_input_character} == 0x002F) { # /
813 $before_leave->();
814 !!!next-input-character;
815 if ($self->{next_input_character} == 0x003E and # >
816 $self->{current_token}->{type} == START_TAG_TOKEN and
817 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
818 # permitted slash
819 #
820 } else {
821 !!!parse-error (type => 'nestc');
822 }
823 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
824 # next-input-character is already done
825 redo A;
826 } elsif ($self->{next_input_character} == -1) {
827 !!!parse-error (type => 'unclosed tag');
828 $before_leave->();
829 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
830 $self->{current_token}->{first_start_tag}
831 = not defined $self->{last_emitted_start_tag_name};
832 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
833 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
834 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
835 if ($self->{current_token}->{attributes}) {
836 !!!parse-error (type => 'end tag attribute');
837 }
838 } else {
839 die "$0: $self->{current_token}->{type}: Unknown token type";
840 }
841 $self->{state} = DATA_STATE;
842 # reconsume
843
844 !!!emit ($self->{current_token}); # start tag or end tag
845
846 redo A;
847 } else {
848 if ($self->{next_input_character} == 0x0022 or # "
849 $self->{next_input_character} == 0x0027) { # '
850 !!!parse-error (type => 'bad attribute name');
851 }
852 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
853 ## Stay in the state
854 !!!next-input-character;
855 redo A;
856 }
857 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
858 if ($self->{next_input_character} == 0x0009 or # HT
859 $self->{next_input_character} == 0x000A or # LF
860 $self->{next_input_character} == 0x000B or # VT
861 $self->{next_input_character} == 0x000C or # FF
862 $self->{next_input_character} == 0x0020) { # SP
863 ## Stay in the state
864 !!!next-input-character;
865 redo A;
866 } elsif ($self->{next_input_character} == 0x003D) { # =
867 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
868 !!!next-input-character;
869 redo A;
870 } elsif ($self->{next_input_character} == 0x003E) { # >
871 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
872 $self->{current_token}->{first_start_tag}
873 = not defined $self->{last_emitted_start_tag_name};
874 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
875 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
876 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
877 if ($self->{current_token}->{attributes}) {
878 !!!parse-error (type => 'end tag attribute');
879 }
880 } else {
881 die "$0: $self->{current_token}->{type}: Unknown token type";
882 }
883 $self->{state} = DATA_STATE;
884 !!!next-input-character;
885
886 !!!emit ($self->{current_token}); # start tag or end tag
887
888 redo A;
889 } elsif (0x0041 <= $self->{next_input_character} and
890 $self->{next_input_character} <= 0x005A) { # A..Z
891 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
892 value => ''};
893 $self->{state} = ATTRIBUTE_NAME_STATE;
894 !!!next-input-character;
895 redo A;
896 } elsif ($self->{next_input_character} == 0x002F) { # /
897 !!!next-input-character;
898 if ($self->{next_input_character} == 0x003E and # >
899 $self->{current_token}->{type} == START_TAG_TOKEN and
900 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
901 # permitted slash
902 #
903 } else {
904 !!!parse-error (type => 'nestc');
905 ## TODO: Different error type for <aa / bb> than <aa/>
906 }
907 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
908 # next-input-character is already done
909 redo A;
910 } elsif ($self->{next_input_character} == -1) {
911 !!!parse-error (type => 'unclosed tag');
912 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
913 $self->{current_token}->{first_start_tag}
914 = not defined $self->{last_emitted_start_tag_name};
915 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
916 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
917 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
918 if ($self->{current_token}->{attributes}) {
919 !!!parse-error (type => 'end tag attribute');
920 }
921 } else {
922 die "$0: $self->{current_token}->{type}: Unknown token type";
923 }
924 $self->{state} = DATA_STATE;
925 # reconsume
926
927 !!!emit ($self->{current_token}); # start tag or end tag
928
929 redo A;
930 } else {
931 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
932 value => ''};
933 $self->{state} = ATTRIBUTE_NAME_STATE;
934 !!!next-input-character;
935 redo A;
936 }
937 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
938 if ($self->{next_input_character} == 0x0009 or # HT
939 $self->{next_input_character} == 0x000A or # LF
940 $self->{next_input_character} == 0x000B or # VT
941 $self->{next_input_character} == 0x000C or # FF
942 $self->{next_input_character} == 0x0020) { # SP
943 ## Stay in the state
944 !!!next-input-character;
945 redo A;
946 } elsif ($self->{next_input_character} == 0x0022) { # "
947 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
948 !!!next-input-character;
949 redo A;
950 } elsif ($self->{next_input_character} == 0x0026) { # &
951 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
952 ## reconsume
953 redo A;
954 } elsif ($self->{next_input_character} == 0x0027) { # '
955 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
956 !!!next-input-character;
957 redo A;
958 } elsif ($self->{next_input_character} == 0x003E) { # >
959 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
960 $self->{current_token}->{first_start_tag}
961 = not defined $self->{last_emitted_start_tag_name};
962 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
963 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
964 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
965 if ($self->{current_token}->{attributes}) {
966 !!!parse-error (type => 'end tag attribute');
967 }
968 } else {
969 die "$0: $self->{current_token}->{type}: Unknown token type";
970 }
971 $self->{state} = DATA_STATE;
972 !!!next-input-character;
973
974 !!!emit ($self->{current_token}); # start tag or end tag
975
976 redo A;
977 } elsif ($self->{next_input_character} == -1) {
978 !!!parse-error (type => 'unclosed tag');
979 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
980 $self->{current_token}->{first_start_tag}
981 = not defined $self->{last_emitted_start_tag_name};
982 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
983 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
984 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
985 if ($self->{current_token}->{attributes}) {
986 !!!parse-error (type => 'end tag attribute');
987 }
988 } else {
989 die "$0: $self->{current_token}->{type}: Unknown token type";
990 }
991 $self->{state} = DATA_STATE;
992 ## reconsume
993
994 !!!emit ($self->{current_token}); # start tag or end tag
995
996 redo A;
997 } else {
998 if ($self->{next_input_character} == 0x003D) { # =
999 !!!parse-error (type => 'bad attribute value');
1000 }
1001 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1002 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1003 !!!next-input-character;
1004 redo A;
1005 }
1006 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1007 if ($self->{next_input_character} == 0x0022) { # "
1008 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1009 !!!next-input-character;
1010 redo A;
1011 } elsif ($self->{next_input_character} == 0x0026) { # &
1012 $self->{last_attribute_value_state} = $self->{state};
1013 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1014 !!!next-input-character;
1015 redo A;
1016 } elsif ($self->{next_input_character} == -1) {
1017 !!!parse-error (type => 'unclosed attribute value');
1018 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1019 $self->{current_token}->{first_start_tag}
1020 = not defined $self->{last_emitted_start_tag_name};
1021 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1022 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1023 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1024 if ($self->{current_token}->{attributes}) {
1025 !!!parse-error (type => 'end tag attribute');
1026 }
1027 } else {
1028 die "$0: $self->{current_token}->{type}: Unknown token type";
1029 }
1030 $self->{state} = DATA_STATE;
1031 ## reconsume
1032
1033 !!!emit ($self->{current_token}); # start tag or end tag
1034
1035 redo A;
1036 } else {
1037 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1038 ## Stay in the state
1039 !!!next-input-character;
1040 redo A;
1041 }
1042 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1043 if ($self->{next_input_character} == 0x0027) { # '
1044 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1045 !!!next-input-character;
1046 redo A;
1047 } elsif ($self->{next_input_character} == 0x0026) { # &
1048 $self->{last_attribute_value_state} = $self->{state};
1049 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1050 !!!next-input-character;
1051 redo A;
1052 } elsif ($self->{next_input_character} == -1) {
1053 !!!parse-error (type => 'unclosed attribute value');
1054 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1055 $self->{current_token}->{first_start_tag}
1056 = not defined $self->{last_emitted_start_tag_name};
1057 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1058 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1059 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1060 if ($self->{current_token}->{attributes}) {
1061 !!!parse-error (type => 'end tag attribute');
1062 }
1063 } else {
1064 die "$0: $self->{current_token}->{type}: Unknown token type";
1065 }
1066 $self->{state} = DATA_STATE;
1067 ## reconsume
1068
1069 !!!emit ($self->{current_token}); # start tag or end tag
1070
1071 redo A;
1072 } else {
1073 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1074 ## Stay in the state
1075 !!!next-input-character;
1076 redo A;
1077 }
1078 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1079 if ($self->{next_input_character} == 0x0009 or # HT
1080 $self->{next_input_character} == 0x000A or # LF
1081 $self->{next_input_character} == 0x000B or # VT
1082 $self->{next_input_character} == 0x000C or # FF
1083 $self->{next_input_character} == 0x0020) { # SP
1084 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1085 !!!next-input-character;
1086 redo A;
1087 } elsif ($self->{next_input_character} == 0x0026) { # &
1088 $self->{last_attribute_value_state} = $self->{state};
1089 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1090 !!!next-input-character;
1091 redo A;
1092 } elsif ($self->{next_input_character} == 0x003E) { # >
1093 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1094 $self->{current_token}->{first_start_tag}
1095 = not defined $self->{last_emitted_start_tag_name};
1096 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1097 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1098 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1099 if ($self->{current_token}->{attributes}) {
1100 !!!parse-error (type => 'end tag attribute');
1101 }
1102 } else {
1103 die "$0: $self->{current_token}->{type}: Unknown token type";
1104 }
1105 $self->{state} = DATA_STATE;
1106 !!!next-input-character;
1107
1108 !!!emit ($self->{current_token}); # start tag or end tag
1109
1110 redo A;
1111 } elsif ($self->{next_input_character} == -1) {
1112 !!!parse-error (type => 'unclosed tag');
1113 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1114 $self->{current_token}->{first_start_tag}
1115 = not defined $self->{last_emitted_start_tag_name};
1116 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1117 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1118 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1119 if ($self->{current_token}->{attributes}) {
1120 !!!parse-error (type => 'end tag attribute');
1121 }
1122 } else {
1123 die "$0: $self->{current_token}->{type}: Unknown token type";
1124 }
1125 $self->{state} = DATA_STATE;
1126 ## reconsume
1127
1128 !!!emit ($self->{current_token}); # start tag or end tag
1129
1130 redo A;
1131 } else {
1132 if ({
1133 0x0022 => 1, # "
1134 0x0027 => 1, # '
1135 0x003D => 1, # =
1136 }->{$self->{next_input_character}}) {
1137 !!!parse-error (type => 'bad attribute value');
1138 }
1139 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1140 ## Stay in the state
1141 !!!next-input-character;
1142 redo A;
1143 }
1144 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1145 my $token = $self->_tokenize_attempt_to_consume_an_entity
1146 (1,
1147 $self->{last_attribute_value_state}
1148 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1149 $self->{last_attribute_value_state}
1150 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1151 -1);
1152
1153 unless (defined $token) {
1154 $self->{current_attribute}->{value} .= '&';
1155 } else {
1156 $self->{current_attribute}->{value} .= $token->{data};
1157 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1158 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1159 }
1160
1161 $self->{state} = $self->{last_attribute_value_state};
1162 # next-input-character is already done
1163 redo A;
1164 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1165 if ($self->{next_input_character} == 0x0009 or # HT
1166 $self->{next_input_character} == 0x000A or # LF
1167 $self->{next_input_character} == 0x000B or # VT
1168 $self->{next_input_character} == 0x000C or # FF
1169 $self->{next_input_character} == 0x0020) { # SP
1170 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1171 !!!next-input-character;
1172 redo A;
1173 } elsif ($self->{next_input_character} == 0x003E) { # >
1174 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1175 $self->{current_token}->{first_start_tag}
1176 = not defined $self->{last_emitted_start_tag_name};
1177 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1178 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1179 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1180 if ($self->{current_token}->{attributes}) {
1181 !!!parse-error (type => 'end tag attribute');
1182 }
1183 } else {
1184 die "$0: $self->{current_token}->{type}: Unknown token type";
1185 }
1186 $self->{state} = DATA_STATE;
1187 !!!next-input-character;
1188
1189 !!!emit ($self->{current_token}); # start tag or end tag
1190
1191 redo A;
1192 } elsif ($self->{next_input_character} == 0x002F) { # /
1193 !!!next-input-character;
1194 if ($self->{next_input_character} == 0x003E and # >
1195 $self->{current_token}->{type} == START_TAG_TOKEN and
1196 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1197 # permitted slash
1198 #
1199 } else {
1200 !!!parse-error (type => 'nestc');
1201 }
1202 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1203 # next-input-character is already done
1204 redo A;
1205 } else {
1206 !!!parse-error (type => 'no space between attributes');
1207 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1208 ## reconsume
1209 redo A;
1210 }
1211 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1212 ## (only happen if PCDATA state)
1213
1214 my $token = {type => COMMENT_TOKEN, data => ''};
1215
1216 BC: {
1217 if ($self->{next_input_character} == 0x003E) { # >
1218 $self->{state} = DATA_STATE;
1219 !!!next-input-character;
1220
1221 !!!emit ($token);
1222
1223 redo A;
1224 } elsif ($self->{next_input_character} == -1) {
1225 $self->{state} = DATA_STATE;
1226 ## reconsume
1227
1228 !!!emit ($token);
1229
1230 redo A;
1231 } else {
1232 $token->{data} .= chr ($self->{next_input_character});
1233 !!!next-input-character;
1234 redo BC;
1235 }
1236 } # BC
1237 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1238 ## (only happen if PCDATA state)
1239
1240 my @next_char;
1241 push @next_char, $self->{next_input_character};
1242
1243 if ($self->{next_input_character} == 0x002D) { # -
1244 !!!next-input-character;
1245 push @next_char, $self->{next_input_character};
1246 if ($self->{next_input_character} == 0x002D) { # -
1247 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1248 $self->{state} = COMMENT_START_STATE;
1249 !!!next-input-character;
1250 redo A;
1251 }
1252 } elsif ($self->{next_input_character} == 0x0044 or # D
1253 $self->{next_input_character} == 0x0064) { # d
1254 !!!next-input-character;
1255 push @next_char, $self->{next_input_character};
1256 if ($self->{next_input_character} == 0x004F or # O
1257 $self->{next_input_character} == 0x006F) { # o
1258 !!!next-input-character;
1259 push @next_char, $self->{next_input_character};
1260 if ($self->{next_input_character} == 0x0043 or # C
1261 $self->{next_input_character} == 0x0063) { # c
1262 !!!next-input-character;
1263 push @next_char, $self->{next_input_character};
1264 if ($self->{next_input_character} == 0x0054 or # T
1265 $self->{next_input_character} == 0x0074) { # t
1266 !!!next-input-character;
1267 push @next_char, $self->{next_input_character};
1268 if ($self->{next_input_character} == 0x0059 or # Y
1269 $self->{next_input_character} == 0x0079) { # y
1270 !!!next-input-character;
1271 push @next_char, $self->{next_input_character};
1272 if ($self->{next_input_character} == 0x0050 or # P
1273 $self->{next_input_character} == 0x0070) { # p
1274 !!!next-input-character;
1275 push @next_char, $self->{next_input_character};
1276 if ($self->{next_input_character} == 0x0045 or # E
1277 $self->{next_input_character} == 0x0065) { # e
1278 ## ISSUE: What a stupid code this is!
1279 $self->{state} = DOCTYPE_STATE;
1280 !!!next-input-character;
1281 redo A;
1282 }
1283 }
1284 }
1285 }
1286 }
1287 }
1288 }
1289
1290 !!!parse-error (type => 'bogus comment');
1291 $self->{next_input_character} = shift @next_char;
1292 !!!back-next-input-character (@next_char);
1293 $self->{state} = BOGUS_COMMENT_STATE;
1294 redo A;
1295
1296 ## ISSUE: typos in spec: chacacters, is is a parse error
1297 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1298 } elsif ($self->{state} == COMMENT_START_STATE) {
1299 if ($self->{next_input_character} == 0x002D) { # -
1300 $self->{state} = COMMENT_START_DASH_STATE;
1301 !!!next-input-character;
1302 redo A;
1303 } elsif ($self->{next_input_character} == 0x003E) { # >
1304 !!!parse-error (type => 'bogus comment');
1305 $self->{state} = DATA_STATE;
1306 !!!next-input-character;
1307
1308 !!!emit ($self->{current_token}); # comment
1309
1310 redo A;
1311 } elsif ($self->{next_input_character} == -1) {
1312 !!!parse-error (type => 'unclosed comment');
1313 $self->{state} = DATA_STATE;
1314 ## reconsume
1315
1316 !!!emit ($self->{current_token}); # comment
1317
1318 redo A;
1319 } else {
1320 $self->{current_token}->{data} # comment
1321 .= chr ($self->{next_input_character});
1322 $self->{state} = COMMENT_STATE;
1323 !!!next-input-character;
1324 redo A;
1325 }
1326 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1327 if ($self->{next_input_character} == 0x002D) { # -
1328 $self->{state} = COMMENT_END_STATE;
1329 !!!next-input-character;
1330 redo A;
1331 } elsif ($self->{next_input_character} == 0x003E) { # >
1332 !!!parse-error (type => 'bogus comment');
1333 $self->{state} = DATA_STATE;
1334 !!!next-input-character;
1335
1336 !!!emit ($self->{current_token}); # comment
1337
1338 redo A;
1339 } elsif ($self->{next_input_character} == -1) {
1340 !!!parse-error (type => 'unclosed comment');
1341 $self->{state} = DATA_STATE;
1342 ## reconsume
1343
1344 !!!emit ($self->{current_token}); # comment
1345
1346 redo A;
1347 } else {
1348 $self->{current_token}->{data} # comment
1349 .= '-' . chr ($self->{next_input_character});
1350 $self->{state} = COMMENT_STATE;
1351 !!!next-input-character;
1352 redo A;
1353 }
1354 } elsif ($self->{state} == COMMENT_STATE) {
1355 if ($self->{next_input_character} == 0x002D) { # -
1356 $self->{state} = COMMENT_END_DASH_STATE;
1357 !!!next-input-character;
1358 redo A;
1359 } elsif ($self->{next_input_character} == -1) {
1360 !!!parse-error (type => 'unclosed comment');
1361 $self->{state} = DATA_STATE;
1362 ## reconsume
1363
1364 !!!emit ($self->{current_token}); # comment
1365
1366 redo A;
1367 } else {
1368 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1369 ## Stay in the state
1370 !!!next-input-character;
1371 redo A;
1372 }
1373 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1374 if ($self->{next_input_character} == 0x002D) { # -
1375 $self->{state} = COMMENT_END_STATE;
1376 !!!next-input-character;
1377 redo A;
1378 } elsif ($self->{next_input_character} == -1) {
1379 !!!parse-error (type => 'unclosed comment');
1380 $self->{state} = DATA_STATE;
1381 ## reconsume
1382
1383 !!!emit ($self->{current_token}); # comment
1384
1385 redo A;
1386 } else {
1387 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1388 $self->{state} = COMMENT_STATE;
1389 !!!next-input-character;
1390 redo A;
1391 }
1392 } elsif ($self->{state} == COMMENT_END_STATE) {
1393 if ($self->{next_input_character} == 0x003E) { # >
1394 $self->{state} = DATA_STATE;
1395 !!!next-input-character;
1396
1397 !!!emit ($self->{current_token}); # comment
1398
1399 redo A;
1400 } elsif ($self->{next_input_character} == 0x002D) { # -
1401 !!!parse-error (type => 'dash in comment');
1402 $self->{current_token}->{data} .= '-'; # comment
1403 ## Stay in the state
1404 !!!next-input-character;
1405 redo A;
1406 } elsif ($self->{next_input_character} == -1) {
1407 !!!parse-error (type => 'unclosed comment');
1408 $self->{state} = DATA_STATE;
1409 ## reconsume
1410
1411 !!!emit ($self->{current_token}); # comment
1412
1413 redo A;
1414 } else {
1415 !!!parse-error (type => 'dash in comment');
1416 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1417 $self->{state} = COMMENT_STATE;
1418 !!!next-input-character;
1419 redo A;
1420 }
1421 } elsif ($self->{state} == DOCTYPE_STATE) {
1422 if ($self->{next_input_character} == 0x0009 or # HT
1423 $self->{next_input_character} == 0x000A or # LF
1424 $self->{next_input_character} == 0x000B or # VT
1425 $self->{next_input_character} == 0x000C or # FF
1426 $self->{next_input_character} == 0x0020) { # SP
1427 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1428 !!!next-input-character;
1429 redo A;
1430 } else {
1431 !!!parse-error (type => 'no space before DOCTYPE name');
1432 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1433 ## reconsume
1434 redo A;
1435 }
1436 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1437 if ($self->{next_input_character} == 0x0009 or # HT
1438 $self->{next_input_character} == 0x000A or # LF
1439 $self->{next_input_character} == 0x000B or # VT
1440 $self->{next_input_character} == 0x000C or # FF
1441 $self->{next_input_character} == 0x0020) { # SP
1442 ## Stay in the state
1443 !!!next-input-character;
1444 redo A;
1445 } elsif ($self->{next_input_character} == 0x003E) { # >
1446 !!!parse-error (type => 'no DOCTYPE name');
1447 $self->{state} = DATA_STATE;
1448 !!!next-input-character;
1449
1450 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1451
1452 redo A;
1453 } elsif ($self->{next_input_character} == -1) {
1454 !!!parse-error (type => 'no DOCTYPE name');
1455 $self->{state} = DATA_STATE;
1456 ## reconsume
1457
1458 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1459
1460 redo A;
1461 } else {
1462 $self->{current_token}
1463 = {type => DOCTYPE_TOKEN,
1464 name => chr ($self->{next_input_character}),
1465 correct => 1};
1466 ## ISSUE: "Set the token's name name to the" in the spec
1467 $self->{state} = DOCTYPE_NAME_STATE;
1468 !!!next-input-character;
1469 redo A;
1470 }
1471 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1472 ## ISSUE: Redundant "First," in the spec.
1473 if ($self->{next_input_character} == 0x0009 or # HT
1474 $self->{next_input_character} == 0x000A or # LF
1475 $self->{next_input_character} == 0x000B or # VT
1476 $self->{next_input_character} == 0x000C or # FF
1477 $self->{next_input_character} == 0x0020) { # SP
1478 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1479 !!!next-input-character;
1480 redo A;
1481 } elsif ($self->{next_input_character} == 0x003E) { # >
1482 $self->{state} = DATA_STATE;
1483 !!!next-input-character;
1484
1485 !!!emit ($self->{current_token}); # DOCTYPE
1486
1487 redo A;
1488 } elsif ($self->{next_input_character} == -1) {
1489 !!!parse-error (type => 'unclosed DOCTYPE');
1490 $self->{state} = DATA_STATE;
1491 ## reconsume
1492
1493 delete $self->{current_token}->{correct};
1494 !!!emit ($self->{current_token}); # DOCTYPE
1495
1496 redo A;
1497 } else {
1498 $self->{current_token}->{name}
1499 .= chr ($self->{next_input_character}); # DOCTYPE
1500 ## Stay in the state
1501 !!!next-input-character;
1502 redo A;
1503 }
1504 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1505 if ($self->{next_input_character} == 0x0009 or # HT
1506 $self->{next_input_character} == 0x000A or # LF
1507 $self->{next_input_character} == 0x000B or # VT
1508 $self->{next_input_character} == 0x000C or # FF
1509 $self->{next_input_character} == 0x0020) { # SP
1510 ## Stay in the state
1511 !!!next-input-character;
1512 redo A;
1513 } elsif ($self->{next_input_character} == 0x003E) { # >
1514 $self->{state} = DATA_STATE;
1515 !!!next-input-character;
1516
1517 !!!emit ($self->{current_token}); # DOCTYPE
1518
1519 redo A;
1520 } elsif ($self->{next_input_character} == -1) {
1521 !!!parse-error (type => 'unclosed DOCTYPE');
1522 $self->{state} = DATA_STATE;
1523 ## reconsume
1524
1525 delete $self->{current_token}->{correct};
1526 !!!emit ($self->{current_token}); # DOCTYPE
1527
1528 redo A;
1529 } elsif ($self->{next_input_character} == 0x0050 or # P
1530 $self->{next_input_character} == 0x0070) { # p
1531 !!!next-input-character;
1532 if ($self->{next_input_character} == 0x0055 or # U
1533 $self->{next_input_character} == 0x0075) { # u
1534 !!!next-input-character;
1535 if ($self->{next_input_character} == 0x0042 or # B
1536 $self->{next_input_character} == 0x0062) { # b
1537 !!!next-input-character;
1538 if ($self->{next_input_character} == 0x004C or # L
1539 $self->{next_input_character} == 0x006C) { # l
1540 !!!next-input-character;
1541 if ($self->{next_input_character} == 0x0049 or # I
1542 $self->{next_input_character} == 0x0069) { # i
1543 !!!next-input-character;
1544 if ($self->{next_input_character} == 0x0043 or # C
1545 $self->{next_input_character} == 0x0063) { # c
1546 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1547 !!!next-input-character;
1548 redo A;
1549 }
1550 }
1551 }
1552 }
1553 }
1554
1555 #
1556 } elsif ($self->{next_input_character} == 0x0053 or # S
1557 $self->{next_input_character} == 0x0073) { # s
1558 !!!next-input-character;
1559 if ($self->{next_input_character} == 0x0059 or # Y
1560 $self->{next_input_character} == 0x0079) { # y
1561 !!!next-input-character;
1562 if ($self->{next_input_character} == 0x0053 or # S
1563 $self->{next_input_character} == 0x0073) { # s
1564 !!!next-input-character;
1565 if ($self->{next_input_character} == 0x0054 or # T
1566 $self->{next_input_character} == 0x0074) { # t
1567 !!!next-input-character;
1568 if ($self->{next_input_character} == 0x0045 or # E
1569 $self->{next_input_character} == 0x0065) { # e
1570 !!!next-input-character;
1571 if ($self->{next_input_character} == 0x004D or # M
1572 $self->{next_input_character} == 0x006D) { # m
1573 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1574 !!!next-input-character;
1575 redo A;
1576 }
1577 }
1578 }
1579 }
1580 }
1581
1582 #
1583 } else {
1584 !!!next-input-character;
1585 #
1586 }
1587
1588 !!!parse-error (type => 'string after DOCTYPE name');
1589 $self->{state} = BOGUS_DOCTYPE_STATE;
1590 # next-input-character is already done
1591 redo A;
1592 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1593 if ({
1594 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1595 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1596 }->{$self->{next_input_character}}) {
1597 ## Stay in the state
1598 !!!next-input-character;
1599 redo A;
1600 } elsif ($self->{next_input_character} eq 0x0022) { # "
1601 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1602 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1603 !!!next-input-character;
1604 redo A;
1605 } elsif ($self->{next_input_character} eq 0x0027) { # '
1606 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1607 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1608 !!!next-input-character;
1609 redo A;
1610 } elsif ($self->{next_input_character} eq 0x003E) { # >
1611 !!!parse-error (type => 'no PUBLIC literal');
1612
1613 $self->{state} = DATA_STATE;
1614 !!!next-input-character;
1615
1616 delete $self->{current_token}->{correct};
1617 !!!emit ($self->{current_token}); # DOCTYPE
1618
1619 redo A;
1620 } elsif ($self->{next_input_character} == -1) {
1621 !!!parse-error (type => 'unclosed DOCTYPE');
1622
1623 $self->{state} = DATA_STATE;
1624 ## reconsume
1625
1626 delete $self->{current_token}->{correct};
1627 !!!emit ($self->{current_token}); # DOCTYPE
1628
1629 redo A;
1630 } else {
1631 !!!parse-error (type => 'string after PUBLIC');
1632 $self->{state} = BOGUS_DOCTYPE_STATE;
1633 !!!next-input-character;
1634 redo A;
1635 }
1636 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1637 if ($self->{next_input_character} == 0x0022) { # "
1638 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1639 !!!next-input-character;
1640 redo A;
1641 } elsif ($self->{next_input_character} == 0x003E) { # >
1642 !!!parse-error (type => 'unclosed PUBLIC literal');
1643
1644 $self->{state} = DATA_STATE;
1645 !!!next-input-character;
1646
1647 delete $self->{current_token}->{correct};
1648 !!!emit ($self->{current_token}); # DOCTYPE
1649
1650 redo A;
1651 } elsif ($self->{next_input_character} == -1) {
1652 !!!parse-error (type => 'unclosed PUBLIC literal');
1653
1654 $self->{state} = DATA_STATE;
1655 ## reconsume
1656
1657 delete $self->{current_token}->{correct};
1658 !!!emit ($self->{current_token}); # DOCTYPE
1659
1660 redo A;
1661 } else {
1662 $self->{current_token}->{public_identifier} # DOCTYPE
1663 .= chr $self->{next_input_character};
1664 ## Stay in the state
1665 !!!next-input-character;
1666 redo A;
1667 }
1668 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1669 if ($self->{next_input_character} == 0x0027) { # '
1670 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1671 !!!next-input-character;
1672 redo A;
1673 } elsif ($self->{next_input_character} == 0x003E) { # >
1674 !!!parse-error (type => 'unclosed PUBLIC literal');
1675
1676 $self->{state} = DATA_STATE;
1677 !!!next-input-character;
1678
1679 delete $self->{current_token}->{correct};
1680 !!!emit ($self->{current_token}); # DOCTYPE
1681
1682 redo A;
1683 } elsif ($self->{next_input_character} == -1) {
1684 !!!parse-error (type => 'unclosed PUBLIC literal');
1685
1686 $self->{state} = DATA_STATE;
1687 ## reconsume
1688
1689 delete $self->{current_token}->{correct};
1690 !!!emit ($self->{current_token}); # DOCTYPE
1691
1692 redo A;
1693 } else {
1694 $self->{current_token}->{public_identifier} # DOCTYPE
1695 .= chr $self->{next_input_character};
1696 ## Stay in the state
1697 !!!next-input-character;
1698 redo A;
1699 }
1700 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1701 if ({
1702 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1703 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1704 }->{$self->{next_input_character}}) {
1705 ## Stay in the state
1706 !!!next-input-character;
1707 redo A;
1708 } elsif ($self->{next_input_character} == 0x0022) { # "
1709 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1710 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1711 !!!next-input-character;
1712 redo A;
1713 } elsif ($self->{next_input_character} == 0x0027) { # '
1714 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1715 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1716 !!!next-input-character;
1717 redo A;
1718 } elsif ($self->{next_input_character} == 0x003E) { # >
1719 $self->{state} = DATA_STATE;
1720 !!!next-input-character;
1721
1722 !!!emit ($self->{current_token}); # DOCTYPE
1723
1724 redo A;
1725 } elsif ($self->{next_input_character} == -1) {
1726 !!!parse-error (type => 'unclosed DOCTYPE');
1727
1728 $self->{state} = DATA_STATE;
1729 ## reconsume
1730
1731 delete $self->{current_token}->{correct};
1732 !!!emit ($self->{current_token}); # DOCTYPE
1733
1734 redo A;
1735 } else {
1736 !!!parse-error (type => 'string after PUBLIC literal');
1737 $self->{state} = BOGUS_DOCTYPE_STATE;
1738 !!!next-input-character;
1739 redo A;
1740 }
1741 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1742 if ({
1743 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1744 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1745 }->{$self->{next_input_character}}) {
1746 ## Stay in the state
1747 !!!next-input-character;
1748 redo A;
1749 } elsif ($self->{next_input_character} == 0x0022) { # "
1750 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1751 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1752 !!!next-input-character;
1753 redo A;
1754 } elsif ($self->{next_input_character} == 0x0027) { # '
1755 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1756 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1757 !!!next-input-character;
1758 redo A;
1759 } elsif ($self->{next_input_character} == 0x003E) { # >
1760 !!!parse-error (type => 'no SYSTEM literal');
1761 $self->{state} = DATA_STATE;
1762 !!!next-input-character;
1763
1764 delete $self->{current_token}->{correct};
1765 !!!emit ($self->{current_token}); # DOCTYPE
1766
1767 redo A;
1768 } elsif ($self->{next_input_character} == -1) {
1769 !!!parse-error (type => 'unclosed DOCTYPE');
1770
1771 $self->{state} = DATA_STATE;
1772 ## reconsume
1773
1774 delete $self->{current_token}->{correct};
1775 !!!emit ($self->{current_token}); # DOCTYPE
1776
1777 redo A;
1778 } else {
1779 !!!parse-error (type => 'string after SYSTEM');
1780 $self->{state} = BOGUS_DOCTYPE_STATE;
1781 !!!next-input-character;
1782 redo A;
1783 }
1784 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1785 if ($self->{next_input_character} == 0x0022) { # "
1786 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1787 !!!next-input-character;
1788 redo A;
1789 } elsif ($self->{next_input_character} == 0x003E) { # >
1790 !!!parse-error (type => 'unclosed PUBLIC literal');
1791
1792 $self->{state} = DATA_STATE;
1793 !!!next-input-character;
1794
1795 delete $self->{current_token}->{correct};
1796 !!!emit ($self->{current_token}); # DOCTYPE
1797
1798 redo A;
1799 } elsif ($self->{next_input_character} == -1) {
1800 !!!parse-error (type => 'unclosed SYSTEM literal');
1801
1802 $self->{state} = DATA_STATE;
1803 ## reconsume
1804
1805 delete $self->{current_token}->{correct};
1806 !!!emit ($self->{current_token}); # DOCTYPE
1807
1808 redo A;
1809 } else {
1810 $self->{current_token}->{system_identifier} # DOCTYPE
1811 .= chr $self->{next_input_character};
1812 ## Stay in the state
1813 !!!next-input-character;
1814 redo A;
1815 }
1816 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
1817 if ($self->{next_input_character} == 0x0027) { # '
1818 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1819 !!!next-input-character;
1820 redo A;
1821 } elsif ($self->{next_input_character} == 0x003E) { # >
1822 !!!parse-error (type => 'unclosed PUBLIC literal');
1823
1824 $self->{state} = DATA_STATE;
1825 !!!next-input-character;
1826
1827 delete $self->{current_token}->{correct};
1828 !!!emit ($self->{current_token}); # DOCTYPE
1829
1830 redo A;
1831 } elsif ($self->{next_input_character} == -1) {
1832 !!!parse-error (type => 'unclosed SYSTEM literal');
1833
1834 $self->{state} = DATA_STATE;
1835 ## reconsume
1836
1837 delete $self->{current_token}->{correct};
1838 !!!emit ($self->{current_token}); # DOCTYPE
1839
1840 redo A;
1841 } else {
1842 $self->{current_token}->{system_identifier} # DOCTYPE
1843 .= chr $self->{next_input_character};
1844 ## Stay in the state
1845 !!!next-input-character;
1846 redo A;
1847 }
1848 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1849 if ({
1850 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1851 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1852 }->{$self->{next_input_character}}) {
1853 ## Stay in the state
1854 !!!next-input-character;
1855 redo A;
1856 } elsif ($self->{next_input_character} == 0x003E) { # >
1857 $self->{state} = DATA_STATE;
1858 !!!next-input-character;
1859
1860 !!!emit ($self->{current_token}); # DOCTYPE
1861
1862 redo A;
1863 } elsif ($self->{next_input_character} == -1) {
1864 !!!parse-error (type => 'unclosed DOCTYPE');
1865
1866 $self->{state} = DATA_STATE;
1867 ## reconsume
1868
1869 delete $self->{current_token}->{correct};
1870 !!!emit ($self->{current_token}); # DOCTYPE
1871
1872 redo A;
1873 } else {
1874 !!!parse-error (type => 'string after SYSTEM literal');
1875 $self->{state} = BOGUS_DOCTYPE_STATE;
1876 !!!next-input-character;
1877 redo A;
1878 }
1879 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
1880 if ($self->{next_input_character} == 0x003E) { # >
1881 $self->{state} = DATA_STATE;
1882 !!!next-input-character;
1883
1884 delete $self->{current_token}->{correct};
1885 !!!emit ($self->{current_token}); # DOCTYPE
1886
1887 redo A;
1888 } elsif ($self->{next_input_character} == -1) {
1889 !!!parse-error (type => 'unclosed DOCTYPE');
1890 $self->{state} = DATA_STATE;
1891 ## reconsume
1892
1893 delete $self->{current_token}->{correct};
1894 !!!emit ($self->{current_token}); # DOCTYPE
1895
1896 redo A;
1897 } else {
1898 ## Stay in the state
1899 !!!next-input-character;
1900 redo A;
1901 }
1902 } else {
1903 die "$0: $self->{state}: Unknown state";
1904 }
1905 } # A
1906
1907 die "$0: _get_next_token: unexpected case";
1908 } # _get_next_token
1909
## Maps the code point written in a numeric character reference to
## the code point that should actually be emitted.
##
##   - U+0000, surrogates (U+D800..U+DFFF) and values above U+10FFFF
##     are replaced by U+FFFD.
##   - U+000D (CR) is replaced by U+000A (LF).
##   - U+0080..U+009F are looked up in |$c1_entity_char|.
##
## A parse error is reported for each of these replacements.  Any
## other code point is returned unchanged and no error is reported.
sub _tokenize_normalize_character_reference_code ($$) {
  my ($self, $code) = @_;

  if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
    !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
    return 0xFFFD;
  } elsif ($code > 0x10FFFF) {
    !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
    return 0xFFFD;
  } elsif ($code == 0x000D) {
    !!!parse-error (type => 'CR character reference');
    return 0x000A;
  } elsif (0x80 <= $code and $code <= 0x9F) {
    !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
    return $c1_entity_char->{$code};
  }

  return $code;
} # _tokenize_normalize_character_reference_code

## Attempts to consume a character reference starting at the next
## input character.  The "&" itself is presumably consumed by the
## caller (the literal data returned below is prefixed with "&").
##
## Arguments:
##   $in_attr     True if the reference occurs in an attribute value.
##                In that case a known entity name that lacks the
##                terminating ";" and is followed by further name
##                characters is returned literally, not expanded.
##   $additional  Code of one more character that, like white space,
##                "<" and "&", prevents anything from being consumed.
##
## Returns a CHARACTER_TOKEN hash reference, or |undef| if no
## reference has been consumed.  Tokens that result from a reference
## have |has_reference| set.
sub _tokenize_attempt_to_consume_an_entity ($$$) {
  my ($self, $in_attr, $additional) = @_;

  if ({
       0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
       0x0020 => 1, 0x003C => 1, 0x0026 => 1, # SP, <, & # 0x000D # CR
       -1 => 1, # presumably the end-of-input marker
       $additional => 1,
      }->{$self->{next_input_character}}) {
    ## Don't consume
    ## No error
    return undef;
  } elsif ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      my $code;
      X: {
        ## |$x_char| is only pushed back while |$code| is still
        ## undefined, i.e. in the first iteration, where it holds the
        ## "x" or "X".
        my $x_char = $self->{next_input_character};
        !!!next-input-character;
        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $code ||= 0;
          $code *= 0x10;
          $code += $self->{next_input_character} - 0x0030;
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          $code ||= 0;
          $code *= 0x10;
          $code += $self->{next_input_character} - 0x0060 + 9;
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          $code ||= 0;
          $code *= 0x10;
          $code += $self->{next_input_character} - 0x0040 + 9;
          redo X;
        } elsif (not defined $code) { # no hexadecimal digit
          !!!parse-error (type => 'bare hcro');
          !!!back-next-input-character ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error (type => 'no refc');
        }

        $code = $self->_tokenize_normalize_character_reference_code ($code);

        return {type => CHARACTER_TOKEN, data => chr $code,
                has_reference => 1};
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error (type => 'no refc');
      }

      $code = $self->_tokenize_normalize_character_reference_code ($code);

      return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
    } else {
      !!!parse-error (type => 'bare nero');
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## |$value| is the text to emit; |$match| records the outcome:
    ##   > 0  a name terminated by ";" has been matched;
    ##   -1   a name without ";" has been matched and nothing follows;
    ##   < -1 a name without ";" has been matched and further
    ##        characters have been consumed after it;
    ##   0    no known name has been matched.
    my $value = $entity_name;
    my $match = 0;
    require Whatpm::_NamedEntityList;
    our $EntityChar;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and # A
             $self->{next_input_character} <= 0x005A) or # Z
            (0x0061 <= $self->{next_input_character} and # a
             $self->{next_input_character} <= 0x007A) or # z
            (0x0030 <= $self->{next_input_character} and # 0
             $self->{next_input_character} <= 0x0039) or # 9
            $self->{next_input_character} == 0x003B)) { # ;
      $entity_name .= chr $self->{next_input_character};
      if (defined $EntityChar->{$entity_name}) {
        if ($self->{next_input_character} == 0x003B) { # ;
          $value = $EntityChar->{$entity_name};
          $match = 1;
          !!!next-input-character;
          last;
        } else {
          $value = $EntityChar->{$entity_name};
          $match = -1;
          !!!next-input-character;
        }
      } else {
        $value .= chr $self->{next_input_character};
        $match *= 2;
        !!!next-input-character;
      }
    }

    if ($match > 0) {
      return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
    } elsif ($match < 0) {
      !!!parse-error (type => 'no refc');
      if ($in_attr and $match < -1) {
        return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
      } else {
        return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
      }
    } else {
      !!!parse-error (type => 'bare ero');
      ## NOTE: "No characters are consumed" in the spec.
      return {type => CHARACTER_TOKEN, data => '&'.$value};
    }
  } else {
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
2075
## Prepares |$self->{document}| for tree construction: strict error
## checking is switched off and the document is flagged as an HTML
## document.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  ## NOTE: $self->{document} MUST be specified before this method is called
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2084
## Switches strict error checking of |$self->{document}| back on once
## tree construction has finished.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;

  my $doc = $self->{document};
  $doc->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
2090
2091 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2092
2093 { # tree construction stage
2094 my $token;
2095
## Entry point of the tree construction stage: fetches the first
## token, resets the per-parse state kept in |$self| and then runs the
## initial, root element and main phases in that order.
sub _construct_tree ($) {
  my $self = shift;

  ## It is not defined when an interactive UA makes $self->{document}
  ## available to the user, or when it begins to accept user input.

  ## Appending a character means: collect it together with all
  ## consecutive characters that follow and insert a single Text node
  ## whose data is the concatenation of those characters. # MUST

  !!!next-token;

  ## Parser state
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## Phases
  $self->_tree_construction_initial (); # MUST
  $self->_tree_construction_root_element ();
  $self->_tree_construction_main ();
} # _construct_tree
2119
## Initial phase of tree construction.
##
## Works on the current token held in the |$token| lexical that is
## shared by the tree construction subroutines:
##
##   - A DOCTYPE token is turned into a document type definition node
##     appended to |$self->{document}|, and the compatibility mode of
##     the document is derived from its name and identifiers.
##   - Comment tokens are appended to the document.
##   - Character tokens consisting of white space only are ignored.
##   - Any other token (or non-white-space text) means the DOCTYPE is
##     missing; a parse error is reported and quirks mode is selected.
##
## Returns once the root element phase should take over.  After a
## DOCTYPE the next token has already been fetched; in the other
## cases the current token is left to be reprocessed.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared case-insensitively: it is
        ## upper-cased here, so every literal below has to be written
        ## in upper case as well.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## NOTE(review): the spelling "EXPERIMETNAL" is kept exactly
          ## as found -- TODO confirm against the list in the spec.
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## For these two public identifiers a missing system
          ## identifier selects quirks mode, while a system identifier
          ## that is present selects limited quirks mode.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared case-insensitively, too,
        ## but in lower case.  A match overrides whatever mode has
        ## been chosen above.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      ## Non-white-space text is left in the token for reprocessing.
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL
} # _tree_construction_initial
2290
## Root element phase of tree construction.
##
## DOCTYPE tokens are reported and dropped, comments are appended to
## the document and leading white space in character tokens is
## discarded.  The first other token triggers the application cache
## selection callback (with the value of the |manifest| attribute of
## an |html| start tag, or |undef| otherwise) and the creation of the
## root |html| element, which becomes the first open element.  That
## token is left as is to be reprocessed by the main phase.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  ROOT: while (1) {
    my $type = $token->{type};

    if ($type == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## The token is ignored; the phase does not change.
      !!!next-token;
      next ROOT;
    }

    if ($type == COMMENT_TOKEN) {
      $self->{document}->append_child
          ($self->{document}->create_comment ($token->{data}));
      ## The phase does not change.
      !!!next-token;
      next ROOT;
    }

    ## Value handed to the application cache selection callback.
    my $manifest;

    if ($type == CHARACTER_TOKEN) {
      ## Leading white space is dropped.  If nothing else remains,
      ## the whole token is ignored.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)// and # \x0D
          not length $token->{data}) {
        !!!next-token;
        next ROOT;
      }
    } elsif ($type == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html' and
          $token->{attributes}->{manifest}) {
        ## ISSUE: No relative reference resolution?
        $manifest = $token->{attributes}->{manifest}->{value};
      }
      ## ISSUE: There is an issue in the spec
    } elsif ($type == END_TAG_TOKEN or $type == END_OF_FILE_TOKEN) {
      ## ISSUE: There is an issue in the spec
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    $self->{application_cache_selection}->($manifest);
    last ROOT;
  } # ROOT

  my $root_element;
  !!!create-element ($root_element, 'html');
  $self->{document}->append_child ($root_element);
  push @{$self->{open_elements}}, [$root_element, 'html'];

  ## The current token is reprocessed in the main phase.
  return;
} # _tree_construction_root_element
2353
## Resets |$self->{insertion_mode}| according to the stack of open
## elements.
##
## The stack (|$self->{open_elements}|, whose entries are
## |[node, local name]| pairs) is walked from the current node
## towards the first entry.  The first entry whose local name has an
## insertion mode associated with it decides the new mode.  If the
## first entry of the stack is reached, |$self->{inner_html_node}|
## (when defined and neither |td| nor |th|) is examined in its place,
## and the walk stops there, falling back to IN_BODY_IM.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          #
        } else {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    my $new_mode = {
      select => IN_SELECT_IM,
      td => IN_CELL_IM,
      th => IN_CELL_IM,
      tr => IN_ROW_IM,
      tbody => IN_TABLE_BODY_IM,
      thead => IN_TABLE_BODY_IM,
      tfoot => IN_TABLE_BODY_IM,
      caption => IN_CAPTION_IM,
      colgroup => IN_COLUMN_GROUP_IM,
      table => IN_TABLE_IM,
      head => IN_BODY_IM, # not in head!
      body => IN_BODY_IM,
      frameset => IN_FRAMESET_IM,
    }->{$node->[1]};
    ## Assign and return in separate statements so that the return
    ## does not depend on the mode constant being a true value.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
2422
2423 sub _tree_construction_main ($) {
2424 my $self = shift;
2425
2426 my $active_formatting_elements = [];
2427
2428 my $reconstruct_active_formatting_elements = sub { # MUST
2429 my $insert = shift;
2430
2431 ## Step 1
2432 return unless @$active_formatting_elements;
2433
2434 ## Step 3
2435 my $i = -1;
2436 my $entry = $active_formatting_elements->[$i];
2437
2438 ## Step 2
2439 return if $entry->[0] eq '#marker';
2440 for (@{$self->{open_elements}}) {
2441 if ($entry->[0] eq $_->[0]) {
2442 return;
2443 }
2444 }
2445
2446 S4: {
2447 ## Step 4
2448 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2449
2450 ## Step 5
2451 $i--;
2452 $entry = $active_formatting_elements->[$i];
2453
2454 ## Step 6
2455 if ($entry->[0] eq '#marker') {
2456 #
2457 } else {
2458 my $in_open_elements;
2459 OE: for (@{$self->{open_elements}}) {
2460 if ($entry->[0] eq $_->[0]) {
2461 $in_open_elements = 1;
2462 last OE;
2463 }
2464 }
2465 if ($in_open_elements) {
2466 #
2467 } else {
2468 redo S4;
2469 }
2470 }
2471
2472 ## Step 7
2473 $i++;
2474 $entry = $active_formatting_elements->[$i];
2475 } # S4
2476
2477 S7: {
2478 ## Step 8
2479 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2480
2481 ## Step 9
2482 $insert->($clone->[0]);
2483 push @{$self->{open_elements}}, $clone;
2484
2485 ## Step 10
2486 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2487
2488 ## Step 11
2489 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2490 ## Step 7'
2491 $i++;
2492 $entry = $active_formatting_elements->[$i];
2493
2494 redo S7;
2495 }
2496 } # S7
2497 }; # $reconstruct_active_formatting_elements
2498
2499 my $clear_up_to_marker = sub {
2500 for (reverse 0..$#$active_formatting_elements) {
2501 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
2502 splice @$active_formatting_elements, $_;
2503 return;
2504 }
2505 }
2506 }; # $clear_up_to_marker
2507
2508 my $parse_rcdata = sub ($$) {
2509 my ($content_model_flag, $insert) = @_;
2510
2511 ## Step 1
2512 my $start_tag_name = $token->{tag_name};
2513 my $el;
2514 !!!create-element ($el, $start_tag_name, $token->{attributes});
2515
2516 ## Step 2
2517 $insert->($el); # /context node/->append_child ($el)
2518
2519 ## Step 3
2520 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2521 delete $self->{escape}; # MUST
2522
2523 ## Step 4
2524 my $text = '';
2525 !!!next-token;
2526 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2527 $text .= $token->{data};
2528 !!!next-token;
2529 }
2530
2531 ## Step 5
2532 if (length $text) {
2533 my $text = $self->{document}->create_text_node ($text);
2534 $el->append_child ($text);
2535 }
2536
2537 ## Step 6
2538 $self->{content_model} = PCDATA_CONTENT_MODEL;
2539
2540 ## Step 7
2541 if ($token->{type} == END_TAG_TOKEN and $token->{tag_name} eq $start_tag_name) {
2542 ## Ignore the token
2543 } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
2544 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2545 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2546 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2547 } else {
2548 die "$0: $content_model_flag in parse_rcdata";
2549 }
2550 !!!next-token;
2551 }; # $parse_rcdata
2552
2553 my $script_start_tag = sub ($) {
2554 my $insert = $_[0];
2555 my $script_el;
2556 !!!create-element ($script_el, 'script', $token->{attributes});
2557 ## TODO: mark as "parser-inserted"
2558
2559 $self->{content_model} = CDATA_CONTENT_MODEL;
2560 delete $self->{escape}; # MUST
2561
2562 my $text = '';
2563 !!!next-token;
2564 while ($token->{type} == CHARACTER_TOKEN) {
2565 $text .= $token->{data};
2566 !!!next-token;
2567 } # stop if non-character token or tokenizer stops tokenising
2568 if (length $text) {
2569 $script_el->manakai_append_text ($text);
2570 }
2571
2572 $self->{content_model} = PCDATA_CONTENT_MODEL;
2573
2574 if ($token->{type} == END_TAG_TOKEN and
2575 $token->{tag_name} eq 'script') {
2576 ## Ignore the token
2577 } else {
2578 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2579 ## ISSUE: And ignore?
2580 ## TODO: mark as "already executed"
2581 }
2582
2583 if (defined $self->{inner_html_node}) {
2584 ## TODO: mark as "already executed"
2585 } else {
2586 ## TODO: $old_insertion_point = current insertion point
2587 ## TODO: insertion point = just before the next input character
2588
2589 $insert->($script_el);
2590
2591 ## TODO: insertion point = $old_insertion_point (might be "undefined")
2592
2593 ## TODO: if there is a script that will execute as soon as the parser resume, then...
2594 }
2595
2596 !!!next-token;
2597 }; # $script_start_tag
2598
2599 my $formatting_end_tag = sub {
2600 my $tag_name = shift;
2601
2602 FET: {
2603 ## Step 1
2604 my $formatting_element;
2605 my $formatting_element_i_in_active;
2606 AFE: for (reverse 0..$#$active_formatting_elements) {
2607 if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
2608 $formatting_element = $active_formatting_elements->[$_];
2609 $formatting_element_i_in_active = $_;
2610 last AFE;
2611 } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
2612 last AFE;
2613 }
2614 } # AFE
2615 unless (defined $formatting_element) {
2616 !!!parse-error (type => 'unmatched end tag:'.$tag_name);
2617 ## Ignore the token
2618 !!!next-token;
2619 return;
2620 }
2621 ## has an element in scope
2622 my $in_scope = 1;
2623 my $formatting_element_i_in_open;
2624 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2625 my $node = $self->{open_elements}->[$_];
2626 if ($node->[0] eq $formatting_element->[0]) {
2627 if ($in_scope) {
2628 $formatting_element_i_in_open = $_;
2629 last INSCOPE;
2630 } else { # in open elements but not in scope
2631 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2632 ## Ignore the token
2633 !!!next-token;
2634 return;
2635 }
2636 } elsif ({
2637 table => 1, caption => 1, td => 1, th => 1,
2638 button => 1, marquee => 1, object => 1, html => 1,
2639 }->{$node->[1]}) {
2640 $in_scope = 0;
2641 }
2642 } # INSCOPE
2643 unless (defined $formatting_element_i_in_open) {
2644 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2645 pop @$active_formatting_elements; # $formatting_element
2646 !!!next-token; ## TODO: ok?
2647 return;
2648 }
2649 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
2650 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2651 }
2652
2653 ## Step 2
2654 my $furthest_block;
2655 my $furthest_block_i_in_open;
2656 OE: for (reverse 0..$#{$self->{open_elements}}) {
2657 my $node = $self->{open_elements}->[$_];
2658 if (not $formatting_category->{$node->[1]} and
2659 #not $phrasing_category->{$node->[1]} and
2660 ($special_category->{$node->[1]} or
2661 $scoping_category->{$node->[1]})) {
2662 $furthest_block = $node;
2663 $furthest_block_i_in_open = $_;
2664 } elsif ($node->[0] eq $formatting_element->[0]) {
2665 last OE;
2666 }
2667 } # OE
2668
2669 ## Step 3
2670 unless (defined $furthest_block) { # MUST
2671 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
2672 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
2673 !!!next-token;
2674 return;
2675 }
2676
2677 ## Step 4
2678 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
2679
2680 ## Step 5
2681 my $furthest_block_parent = $furthest_block->[0]->parent_node;
2682 if (defined $furthest_block_parent) {
2683 $furthest_block_parent->remove_child ($furthest_block->[0]);
2684 }
2685
2686 ## Step 6
2687 my $bookmark_prev_el
2688 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
2689 ->[0];
2690
2691 ## Step 7
2692 my $node = $furthest_block;
2693 my $node_i_in_open = $furthest_block_i_in_open;
2694 my $last_node = $furthest_block;
2695 S7: {
2696 ## Step 1
2697 $node_i_in_open--;
2698 $node = $self->{open_elements}->[$node_i_in_open];
2699
2700 ## Step 2
2701 my $node_i_in_active;
2702 S7S2: {
2703 for (reverse 0..$#$active_formatting_elements) {
2704 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2705 $node_i_in_active = $_;
2706 last S7S2;
2707 }
2708 }
2709 splice @{$self->{open_elements}}, $node_i_in_open, 1;
2710 redo S7;
2711 } # S7S2
2712
2713 ## Step 3
2714 last S7 if $node->[0] eq $formatting_element->[0];
2715
2716 ## Step 4
2717 if ($last_node->[0] eq $furthest_block->[0]) {
2718 $bookmark_prev_el = $node->[0];
2719 }
2720
2721 ## Step 5
2722 if ($node->[0]->has_child_nodes ()) {
2723 my $clone = [$node->[0]->clone_node (0), $node->[1]];
2724 $active_formatting_elements->[$node_i_in_active] = $clone;
2725 $self->{open_elements}->[$node_i_in_open] = $clone;
2726 $node = $clone;
2727 }
2728
2729 ## Step 6
2730 $node->[0]->append_child ($last_node->[0]);
2731
2732 ## Step 7
2733 $last_node = $node;
2734
2735 ## Step 8
2736 redo S7;
2737 } # S7
2738
2739 ## Step 8
2740 $common_ancestor_node->[0]->append_child ($last_node->[0]);
2741
2742 ## Step 9
2743 my $clone = [$formatting_element->[0]->clone_node (0),
2744 $formatting_element->[1]];
2745
2746 ## Step 10
2747 my @cn = @{$furthest_block->[0]->child_nodes};
2748 $clone->[0]->append_child ($_) for @cn;
2749
2750 ## Step 11
2751 $furthest_block->[0]->append_child ($clone->[0]);
2752
2753 ## Step 12
2754 my $i;
2755 AFE: for (reverse 0..$#$active_formatting_elements) {
2756 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
2757 splice @$active_formatting_elements, $_, 1;
2758 $i-- and last AFE if defined $i;
2759 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
2760 $i = $_;
2761 }
2762 } # AFE
2763 splice @$active_formatting_elements, $i + 1, 0, $clone;
2764
2765 ## Step 13
2766 undef $i;
2767 OE: for (reverse 0..$#{$self->{open_elements}}) {
2768 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
2769 splice @{$self->{open_elements}}, $_, 1;
2770 $i-- and last OE if defined $i;
2771 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
2772 $i = $_;
2773 }
2774 } # OE
2775 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
2776
2777 ## Step 14
2778 redo FET;
2779 } # FET
2780 }; # $formatting_end_tag
2781
## Insertion callback: appends the given node as the last child of the
## current node, i.e. the node held by the bottommost entry of the
## stack of open elements (|$self->{open_elements}|).
##
## Argument: the node to insert.
## Returns whatever |append_child| returns.
my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
2785
## Insertion callback used while in table-related insertion modes.
##
## If the current node is a table-structure element (|table|, |tbody|,
## |tfoot|, |thead| or |tr|), the child is "foster parented": it is
## inserted just before the nearest |table| in the stack of open
## elements (when that table has an element parent), or appended to
## the element preceding that table in the stack, or - when no |table|
## is found at all - appended to the root entry of the stack.
## Otherwise the child is simply appended to the current node.
##
## Argument: the node to insert.
## Returns whatever |insert_before| / |append_child| returns.
my $insert_to_foster = sub {
  my ($child) = @_;
  my $open = $self->{open_elements};
  my %needs_foster_parent
      = map { $_ => 1 } qw(table tbody tfoot thead tr);

  ## Ordinary case: the current node may contain the child directly.
  return $open->[-1]->[0]->append_child ($child)
      unless $needs_foster_parent{$open->[-1]->[1]};

  # MUST
  my ($foster_parent, $next_sibling);
  for my $i (reverse 0..$#$open) {
    my $entry = $open->[$i];
    next unless $entry->[1] eq 'table';

    my $table = $entry->[0];
    my $table_parent = $table->parent_node;
    if (defined $table_parent and $table_parent->node_type == 1) {
      ## The table is attached to an element (node type 1): insert
      ## the child right before the table, under the same parent.
      ($foster_parent, $next_sibling) = ($table_parent, $table);
    } else {
      ## Detached table (or non-element parent): use the entry just
      ## before the table in the stack; |$next_sibling| stays
      ## undefined.
      $foster_parent = $open->[$i - 1]->[0];
    }
    last; ## Only the nearest |table| is considered.
  }

  ## No |table| in the stack: fall back to the root entry.
  $foster_parent = $open->[0]->[0] unless defined $foster_parent;

  return $foster_parent->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2816
2817 my $insert;
2818
2819 B: {
2820 if ($token->{type} == DOCTYPE_TOKEN) {
2821 !!!parse-error (type => 'DOCTYPE in the middle');
2822 ## Ignore the token
2823 ## Stay in the phase
2824 !!!next-token;
2825 redo B;
2826 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
2827 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2828 #
2829 } else {
2830 ## Generate implied end tags
2831 if ({
2832 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2833 tbody => 1, tfoot=> 1, thead => 1,
2834 }->{$self->{open_elements}->[-1]->[1]}) {
2835 !!!back-token;
2836 $token = {type => END_TAG_TOKEN, tag_name => $self->{open_elements}->[-1]->[1]};
2837 redo B;
2838 }
2839
2840 if (@{$self->{open_elements}} > 2 or
2841 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
2842 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2843 } elsif (defined $self->{inner_html_node} and
2844 @{$self->{open_elements}} > 1 and
2845 $self->{open_elements}->[1]->[1] ne 'body') {
2846 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2847 }
2848
2849 ## ISSUE: There is an issue in the spec.
2850 }
2851
2852 ## Stop parsing
2853 last B;
2854 } elsif ($token->{type} == START_TAG_TOKEN and
2855 $token->{tag_name} eq 'html') {
2856 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
2857 ## Turn into the main phase
2858 !!!parse-error (type => 'after html:html');
2859 $self->{insertion_mode} = AFTER_BODY_IM;
2860 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
2861 ## Turn into the main phase
2862 !!!parse-error (type => 'after html:html');
2863 $self->{insertion_mode} = AFTER_FRAMESET_IM;
2864 }
2865
2866 ## ISSUE: "aa<html>" is not a parse error.
2867 ## ISSUE: "<html>" in fragment is not a parse error.
2868 unless ($token->{first_start_tag}) {
2869 !!!parse-error (type => 'not first start tag');
2870 }
2871 my $top_el = $self->{open_elements}->[0]->[0];
2872 for my $attr_name (keys %{$token->{attributes}}) {
2873 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2874 $top_el->set_attribute_ns
2875 (undef, [undef, $attr_name],
2876 $token->{attributes}->{$attr_name}->{value});
2877 }
2878 }
2879 !!!next-token;
2880 redo B;
2881 } elsif ($token->{type} == COMMENT_TOKEN) {
2882 my $comment = $self->{document}->create_comment ($token->{data});
2883 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2884 $self->{document}->append_child ($comment);
2885 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
2886 $self->{open_elements}->[0]->[0]->append_child ($comment);
2887 } else {
2888 $self->{open_elements}->[-1]->[0]->append_child ($comment);
2889 }
2890 !!!next-token;
2891 redo B;
2892 } elsif ($self->{insertion_mode} & HEAD_IMS) {
2893 if ($token->{type} == CHARACTER_TOKEN) {
2894 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2895 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
2896 unless (length $token->{data}) {
2897 !!!next-token;
2898 redo B;
2899 }
2900 }
2901
2902 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2903 ## As if <head>
2904 !!!create-element ($self->{head_element}, 'head');
2905 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2906 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2907
2908 ## Reprocess in the "in head" insertion mode...
2909 pop @{$self->{open_elements}};
2910
2911 ## Reprocess in the "after head" insertion mode...
2912 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2913 ## As if </noscript>
2914 pop @{$self->{open_elements}};
2915 !!!parse-error (type => 'in noscript:#character');
2916
2917 ## Reprocess in the "in head" insertion mode...
2918 ## As if </head>
2919 pop @{$self->{open_elements}};
2920
2921 ## Reprocess in the "after head" insertion mode...
2922 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2923 pop @{$self->{open_elements}};
2924
2925 ## Reprocess in the "after head" insertion mode...
2926 }
2927
2928 ## "after head" insertion mode
2929 ## As if <body>
2930 !!!insert-element ('body');
2931 $self->{insertion_mode} = IN_BODY_IM;
2932 ## reprocess
2933 redo B;
2934 } elsif ($token->{type} == START_TAG_TOKEN) {
2935 if ($token->{tag_name} eq 'head') {
2936 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2937 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
2938 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2939 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
2940 $self->{insertion_mode} = IN_HEAD_IM;
2941 !!!next-token;
2942 redo B;
2943 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2944 #
2945 } else {
2946 !!!parse-error (type => 'in head:head'); # or in head noscript
2947 ## Ignore the token
2948 !!!next-token;
2949 redo B;
2950 }
2951 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2952 ## As if <head>
2953 !!!create-element ($self->{head_element}, 'head');
2954 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2955 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2956
2957 $self->{insertion_mode} = IN_HEAD_IM;
2958 ## Reprocess in the "in head" insertion mode...
2959 }
2960
2961 if ($token->{tag_name} eq 'base') {
2962 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2963 ## As if </noscript>
2964 pop @{$self->{open_elements}};
2965 !!!parse-error (type => 'in noscript:base');
2966
2967 $self->{insertion_mode} = IN_HEAD_IM;
2968 ## Reprocess in the "in head" insertion mode...
2969 }
2970
2971 ## NOTE: There is a "as if in head" code clone.
2972 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2973 !!!parse-error (type => 'after head:'.$token->{tag_name});
2974 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2975 }
2976 !!!insert-element ($token->{tag_name}, $token->{attributes});
2977 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2978 pop @{$self->{open_elements}}
2979 if $self->{insertion_mode} == AFTER_HEAD_IM;
2980 !!!next-token;
2981 redo B;
2982 } elsif ($token->{tag_name} eq 'link') {
2983 ## NOTE: There is a "as if in head" code clone.
2984 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2985 !!!parse-error (type => 'after head:'.$token->{tag_name});
2986 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2987 }
2988 !!!insert-element ($token->{tag_name}, $token->{attributes});
2989 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2990 pop @{$self->{open_elements}}
2991 if $self->{insertion_mode} == AFTER_HEAD_IM;
2992 !!!next-token;
2993 redo B;
2994 } elsif ($token->{tag_name} eq 'meta') {
2995 ## NOTE: There is a "as if in head" code clone.
2996 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2997 !!!parse-error (type => 'after head:'.$token->{tag_name});
2998 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2999 }
3000 !!!insert-element ($token->{tag_name}, $token->{attributes});
3001 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3002
3003 unless ($self->{confident}) {
3004 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3005 $self->{change_encoding}
3006 ->($self, $token->{attributes}->{charset}->{value});
3007
3008 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3009 ->set_user_data (manakai_has_reference =>
3010 $token->{attributes}->{charset}
3011 ->{has_reference});
3012 } elsif ($token->{attributes}->{content}) {
3013 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3014 if ($token->{attributes}->{content}->{value}
3015 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3016 [\x09-\x0D\x20]*=
3017 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3018 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3019 $self->{change_encoding}
3020 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
3021 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3022 ->set_user_data (manakai_has_reference =>
3023 $token->{attributes}->{content}
3024 ->{has_reference});
3025 }
3026 }
3027 } else {
3028 if ($token->{attributes}->{charset}) {
3029 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3030 ->set_user_data (manakai_has_reference =>
3031 $token->{attributes}->{charset}
3032 ->{has_reference});
3033 }
3034 if ($token->{attributes}->{content}) {
3035 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3036 ->set_user_data (manakai_has_reference =>
3037 $token->{attributes}->{content}
3038 ->{has_reference});
3039 }
3040 }
3041
3042 pop @{$self->{open_elements}}
3043 if $self->{insertion_mode} == AFTER_HEAD_IM;
3044 !!!next-token;
3045 redo B;
3046 } elsif ($token->{tag_name} eq 'title') {
3047 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3048 ## As if </noscript>
3049 pop @{$self->{open_elements}};
3050 !!!parse-error (type => 'in noscript:title');
3051
3052 $self->{insertion_mode} = IN_HEAD_IM;
3053 ## Reprocess in the "in head" insertion mode...
3054 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3055 !!!parse-error (type => 'after head:'.$token->{tag_name});
3056 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3057 }
3058
3059 ## NOTE: There is a "as if in head" code clone.
3060 my $parent = defined $self->{head_element} ? $self->{head_element}
3061 : $self->{open_elements}->[-1]->[0];
3062 $parse_rcdata->(RCDATA_CONTENT_MODEL,
3063 sub { $parent->append_child ($_[0]) });
3064 pop @{$self->{open_elements}}
3065 if $self->{insertion_mode} == AFTER_HEAD_IM;
3066 redo B;
3067 } elsif ($token->{tag_name} eq 'style') {
3068 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3069 ## insertion mode IN_HEAD_IM)
3070 ## NOTE: There is a "as if in head" code clone.
3071 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3072 !!!parse-error (type => 'after head:'.$token->{tag_name});
3073 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3074 }
3075 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
3076 pop @{$self->{open_elements}}
3077 if $self->{insertion_mode} == AFTER_HEAD_IM;
3078 redo B;
3079 } elsif ($token->{tag_name} eq 'noscript') {
3080 if ($self->{insertion_mode} == IN_HEAD_IM) {
3081 ## NOTE: and scripting is disabled
3082 !!!insert-element ($token->{tag_name}, $token->{attributes});
3083 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
3084 !!!next-token;
3085 redo B;
3086 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3087 !!!parse-error (type => 'in noscript:noscript');
3088 ## Ignore the token
3089 !!!next-token;
3090 redo B;
3091 } else {
3092 #
3093 }
3094 } elsif ($token->{tag_name} eq 'script') {
3095 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3096 ## As if </noscript>
3097 pop @{$self->{open_elements}};
3098 !!!parse-error (type => 'in noscript:script');
3099
3100 $self->{insertion_mode} = IN_HEAD_IM;
3101 ## Reprocess in the "in head" insertion mode...
3102 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3103 !!!parse-error (type => 'after head:'.$token->{tag_name});
3104 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3105 }
3106
3107 ## NOTE: There is a "as if in head" code clone.
3108 $script_start_tag->($insert_to_current);
3109 pop @{$self->{open_elements}}
3110 if $self->{insertion_mode} == AFTER_HEAD_IM;
3111 redo B;
3112 } elsif ($token->{tag_name} eq 'body' or
3113 $token->{tag_name} eq 'frameset') {
3114 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3115 ## As if </noscript>
3116 pop @{$self->{open_elements}};
3117 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
3118
3119 ## Reprocess in the "in head" insertion mode...
3120 ## As if </head>
3121 pop @{$self->{open_elements}};
3122
3123 ## Reprocess in the "after head" insertion mode...
3124 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3125 pop @{$self->{open_elements}};
3126
3127 ## Reprocess in the "after head" insertion mode...
3128 }
3129
3130 ## "after head" insertion mode
3131 !!!insert-element ($token->{tag_name}, $token->{attributes});
3132 if ($token->{tag_name} eq 'body') {
3133 $self->{insertion_mode} = IN_BODY_IM;
3134 } elsif ($token->{tag_name} eq 'frameset') {
3135 $self->{insertion_mode} = IN_FRAMESET_IM;
3136 } else {
3137 die "$0: tag name: $self->{tag_name}";
3138 }
3139 !!!next-token;
3140 redo B;
3141 } else {
3142 #
3143 }
3144
3145 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3146 ## As if </noscript>
3147 pop @{$self->{open_elements}};
3148 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3149
3150 ## Reprocess in the "in head" insertion mode...
3151 ## As if </head>
3152 pop @{$self->{open_elements}};
3153
3154 ## Reprocess in the "after head" insertion mode...
3155 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3156 ## As if </head>
3157 pop @{$self->{open_elements}};
3158
3159 ## Reprocess in the "after head" insertion mode...
3160 }
3161
3162 ## "after head" insertion mode
3163 ## As if <body>
3164 !!!insert-element ('body');
3165 $self->{insertion_mode} = IN_BODY_IM;
3166 ## reprocess
3167 redo B;
3168 } elsif ($token->{type} == END_TAG_TOKEN) {
3169 if ($token->{tag_name} eq 'head') {
3170 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3171 ## As if <head>
3172 !!!create-element ($self->{head_element}, 'head');
3173 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3174 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3175
3176 ## Reprocess in the "in head" insertion mode...
3177 pop @{$self->{open_elements}};
3178 $self->{insertion_mode} = AFTER_HEAD_IM;
3179 !!!next-token;
3180 redo B;
3181 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3182 ## As if </noscript>
3183 pop @{$self->{open_elements}};
3184 !!!parse-error (type => 'in noscript:script');
3185
3186 ## Reprocess in the "in head" insertion mode...
3187 pop @{$self->{open_elements}};
3188 $self->{insertion_mode} = AFTER_HEAD_IM;
3189 !!!next-token;
3190 redo B;
3191 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3192 pop @{$self->{open_elements}};
3193 $self->{insertion_mode} = AFTER_HEAD_IM;
3194 !!!next-token;
3195 redo B;
3196 } else {
3197 #
3198 }
3199 } elsif ($token->{tag_name} eq 'noscript') {
3200 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3201 pop @{$self->{open_elements}};
3202 $self->{insertion_mode} = IN_HEAD_IM;
3203 !!!next-token;
3204 redo B;
3205 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3206 !!!parse-error (type => 'unmatched end tag:noscript');
3207 ## Ignore the token ## ISSUE: An issue in the spec.
3208 !!!next-token;
3209 redo B;
3210 } else {
3211 #
3212 }
3213 } elsif ({
3214 body => 1, html => 1,
3215 }->{$token->{tag_name}}) {
3216 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3217 ## As if <head>
3218 !!!create-element ($self->{head_element}, 'head');
3219 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3220 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3221
3222 $self->{insertion_mode} = IN_HEAD_IM;
3223 ## Reprocess in the "in head" insertion mode...
3224 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3225 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3226 ## Ignore the token
3227 !!!next-token;
3228 redo B;
3229 }
3230
3231 #
3232 } elsif ({
3233 p => 1, br => 1,
3234 }->{$token->{tag_name}}) {
3235 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3236 ## As if <head>
3237 !!!create-element ($self->{head_element}, 'head');
3238 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3239 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3240
3241 $self->{insertion_mode} = IN_HEAD_IM;
3242 ## Reprocess in the "in head" insertion mode...
3243 }
3244
3245 #
3246 } else {
3247 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3248 #
3249 } else {
3250 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3251 ## Ignore the token
3252 !!!next-token;
3253 redo B;
3254 }
3255 }
3256
3257 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3258 ## As if </noscript>
3259 pop @{$self->{open_elements}};
3260 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3261
3262 ## Reprocess in the "in head" insertion mode...
3263 ## As if </head>
3264 pop @{$self->{open_elements}};
3265
3266 ## Reprocess in the "after head" insertion mode...
3267 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3268 ## As if </head>
3269 pop @{$self->{open_elements}};
3270
3271 ## Reprocess in the "after head" insertion mode...
3272 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3273 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3274 ## Ignore the token ## ISSUE: An issue in the spec.
3275 !!!next-token;
3276 redo B;
3277 }
3278
3279 ## "after head" insertion mode
3280 ## As if <body>
3281 !!!insert-element ('body');
3282 $self->{insertion_mode} = IN_BODY_IM;
3283 ## reprocess
3284 redo B;
3285 } else {
3286 die "$0: $token->{type}: Unknown token type";
3287 }
3288
3289 ## ISSUE: An issue in the spec.
3290 } elsif ($self->{insertion_mode} & BODY_IMS) {
3291 if ($token->{type} == CHARACTER_TOKEN) {
3292 ## NOTE: There is a code clone of "character in body".
3293 $reconstruct_active_formatting_elements->($insert_to_current);
3294
3295 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3296
3297 !!!next-token;
3298 redo B;
3299 } elsif ($token->{type} == START_TAG_TOKEN) {
3300 if ({
3301 caption => 1, col => 1, colgroup => 1, tbody => 1,
3302 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3303 }->{$token->{tag_name}}) {
3304 if ($self->{insertion_mode} == IN_CELL_IM) {
3305 ## have an element in table scope
3306 my $tn;
3307 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3308 my $node = $self->{open_elements}->[$_];
3309 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3310 $tn = $node->[1];
3311 last INSCOPE;
3312 } elsif ({
3313 table => 1, html => 1,
3314 }->{$node->[1]}) {
3315 last INSCOPE;
3316 }
3317 } # INSCOPE
3318 unless (defined $tn) {
3319 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3320 ## Ignore the token
3321 !!!next-token;
3322 redo B;
3323 }
3324
3325 ## Close the cell
3326 !!!back-token; # <?>
3327 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3328 redo B;
3329 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3330 !!!parse-error (type => 'not closed:caption');
3331
3332 ## As if </caption>
3333 ## have a table element in table scope
3334 my $i;
3335 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3336 my $node = $self->{open_elements}->[$_];
3337 if ($node->[1] eq 'caption') {
3338 $i = $_;
3339 last INSCOPE;
3340 } elsif ({
3341 table => 1, html => 1,
3342 }->{$node->[1]}) {
3343 last INSCOPE;
3344 }
3345 } # INSCOPE
3346 unless (defined $i) {
3347 !!!parse-error (type => 'unmatched end tag:caption');
3348 ## Ignore the token
3349 !!!next-token;
3350 redo B;
3351 }
3352
3353 ## generate implied end tags
3354 if ({
3355 dd => 1, dt => 1, li => 1, p => 1,
3356 td => 1, th => 1, tr => 1,
3357 tbody => 1, tfoot=> 1, thead => 1,
3358 }->{$self->{open_elements}->[-1]->[1]}) {
3359 !!!back-token; # <?>
3360 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3361 !!!back-token;
3362 $token = {type => END_TAG_TOKEN,
3363 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3364 redo B;
3365 }
3366
3367 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3368 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3369 }
3370
3371 splice @{$self->{open_elements}}, $i;
3372
3373 $clear_up_to_marker->();
3374
3375 $self->{insertion_mode} = IN_TABLE_IM;
3376
3377 ## reprocess
3378 redo B;
3379 } else {
3380 #
3381 }
3382 } else {
3383 #
3384 }
3385 } elsif ($token->{type} == END_TAG_TOKEN) {
3386 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3387 if ($self->{insertion_mode} == IN_CELL_IM) {
3388 ## have an element in table scope
3389 my $i;
3390 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3391 my $node = $self->{open_elements}->[$_];
3392 if ($node->[1] eq $token->{tag_name}) {
3393 $i = $_;
3394 last INSCOPE;
3395 } elsif ({
3396 table => 1, html => 1,
3397 }->{$node->[1]}) {
3398 last INSCOPE;
3399 }
3400 } # INSCOPE
3401 unless (defined $i) {
3402 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3403 ## Ignore the token
3404 !!!next-token;
3405 redo B;
3406 }
3407
3408 ## generate implied end tags
3409 if ({
3410 dd => 1, dt => 1, li => 1, p => 1,
3411 td => ($token->{tag_name} eq 'th'),
3412 th => ($token->{tag_name} eq 'td'),
3413 tr => 1,
3414 tbody => 1, tfoot=> 1, thead => 1,
3415 }->{$self->{open_elements}->[-1]->[1]}) {
3416 !!!back-token;
3417 $token = {type => END_TAG_TOKEN,
3418 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3419 redo B;
3420 }
3421
3422 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3423 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3424 }
3425
3426 splice @{$self->{open_elements}}, $i;
3427
3428 $clear_up_to_marker->();
3429
3430 $self->{insertion_mode} = IN_ROW_IM;
3431
3432 !!!next-token;
3433 redo B;
3434 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3435 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3436 ## Ignore the token
3437 !!!next-token;
3438 redo B;
3439 } else {
3440 #
3441 }
3442 } elsif ($token->{tag_name} eq 'caption') {
3443 if ($self->{insertion_mode} == IN_CAPTION_IM) {
3444 ## have a table element in table scope
3445 my $i;
3446 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3447 my $node = $self->{open_elements}->[$_];
3448 if ($node->[1] eq $token->{tag_name}) {
3449 $i = $_;
3450 last INSCOPE;
3451 } elsif ({
3452 table => 1, html => 1,
3453 }->{$node->[1]}) {
3454 last INSCOPE;
3455 }
3456 } # INSCOPE
3457 unless (defined $i) {
3458 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3459 ## Ignore the token
3460 !!!next-token;
3461 redo B;
3462 }
3463
3464 ## generate implied end tags
3465 if ({
3466 dd => 1, dt => 1, li => 1, p => 1,
3467 td => 1, th => 1, tr => 1,
3468 tbody => 1, tfoot=> 1, thead => 1,
3469 }->{$self->{open_elements}->[-1]->[1]}) {
3470 !!!back-token;
3471 $token = {type => END_TAG_TOKEN,
3472 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3473 redo B;
3474 }
3475
3476 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3477 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3478 }
3479
3480 splice @{$self->{open_elements}}, $i;
3481
3482 $clear_up_to_marker->();
3483
3484 $self->{insertion_mode} = IN_TABLE_IM;
3485
3486 !!!next-token;
3487 redo B;
3488 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
3489 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3490 ## Ignore the token
3491 !!!next-token;
3492 redo B;
3493 } else {
3494 #
3495 }
3496 } elsif ({
3497 table => 1, tbody => 1, tfoot => 1,
3498 thead => 1, tr => 1,
3499 }->{$token->{tag_name}} and
3500 $self->{insertion_mode} == IN_CELL_IM) {
3501 ## have an element in table scope
3502 my $i;
3503 my $tn;
3504 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3505 my $node = $self->{open_elements}->[$_];
3506 if ($node->[1] eq $token->{tag_name}) {
3507 $i = $_;
3508 last INSCOPE;
3509 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3510 $tn = $node->[1];
3511 ## NOTE: There is exactly one |td| or |th| element
3512 ## in scope in the stack of open elements by definition.
3513 } elsif ({
3514 table => 1, html => 1,
3515 }->{$node->[1]}) {
3516 last INSCOPE;
3517 }
3518 } # INSCOPE
3519 unless (defined $i) {
3520 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3521 ## Ignore the token
3522 !!!next-token;
3523 redo B;
3524 }
3525
3526 ## Close the cell
3527 !!!back-token; # </?>
3528 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3529 redo B;
3530 } elsif ($token->{tag_name} eq 'table' and
3531 $self->{insertion_mode} == IN_CAPTION_IM) {
3532 !!!parse-error (type => 'not closed:caption');
3533
3534 ## As if </caption>
3535 ## have a table element in table scope
3536 my $i;
3537 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3538 my $node = $self->{open_elements}->[$_];
3539 if ($node->[1] eq 'caption') {
3540 $i = $_;
3541 last INSCOPE;
3542 } elsif ({
3543 table => 1, html => 1,
3544 }->{$node->[1]}) {
3545 last INSCOPE;
3546 }
3547 } # INSCOPE
3548 unless (defined $i) {
3549 !!!parse-error (type => 'unmatched end tag:caption');
3550 ## Ignore the token
3551 !!!next-token;
3552 redo B;
3553 }
3554
3555 ## generate implied end tags
3556 if ({
3557 dd => 1, dt => 1, li => 1, p => 1,
3558 td => 1, th => 1, tr => 1,
3559 tbody => 1, tfoot=> 1, thead => 1,
3560 }->{$self->{open_elements}->[-1]->[1]}) {
3561 !!!back-token; # </table>
3562 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3563 !!!back-token;
3564 $token = {type => END_TAG_TOKEN,
3565 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3566 redo B;
3567 }
3568
3569 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3570 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3571 }
3572
3573 splice @{$self->{open_elements}}, $i;
3574
3575 $clear_up_to_marker->();
3576
3577 $self->{insertion_mode} = IN_TABLE_IM;
3578
3579 ## reprocess
3580 redo B;
3581 } elsif ({
3582 body => 1, col => 1, colgroup => 1, html => 1,
3583 }->{$token->{tag_name}}) {
3584 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
3585 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3586 ## Ignore the token
3587 !!!next-token;
3588 redo B;
3589 } else {
3590 #
3591 }
3592 } elsif ({
3593 tbody => 1, tfoot => 1,
3594 thead => 1, tr => 1,
3595 }->{$token->{tag_name}} and
3596 $self->{insertion_mode} == IN_CAPTION_IM) {
3597 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3598 ## Ignore the token
3599 !!!next-token;
3600 redo B;
3601 } else {
3602 #
3603 }
3604 } else {
3605 die "$0: $token->{type}: Unknown token type";
3606 }
3607
3608 $insert = $insert_to_current;
3609 #
3610 } elsif ($self->{insertion_mode} & TABLE_IMS) {
3611 if ($token->{type} == CHARACTER_TOKEN) {
3612 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3613 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3614
3615 unless (length $token->{data}) {
3616 !!!next-token;
3617 redo B;
3618 }
3619 }
3620
3621 !!!parse-error (type => 'in table:#character');
3622
3623 ## As if in body, but insert into foster parent element
3624 ## ISSUE: Spec says that "whenever a node would be inserted
3625 ## into the current node" while characters might not
3626 ## result in a new Text node.
3627 $reconstruct_active_formatting_elements->($insert_to_foster);
3628
3629 if ({
3630 table => 1, tbody => 1, tfoot => 1,
3631 thead => 1, tr => 1,
3632 }->{$self->{open_elements}->[-1]->[1]}) {
3633 # MUST
3634 my $foster_parent_element;
3635 my $next_sibling;
3636 my $prev_sibling;
3637 OE: for (reverse 0..$#{$self->{open_elements}}) {
3638 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3639 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3640 if (defined $parent and $parent->node_type == 1) {
3641 $foster_parent_element = $parent;
3642 $next_sibling = $self->{open_elements}->[$_]->[0];
3643 $prev_sibling = $next_sibling->previous_sibling;
3644 } else {
3645 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3646 $prev_sibling = $foster_parent_element->last_child;
3647 }
3648 last OE;
3649 }
3650 } # OE
3651 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3652 $prev_sibling = $foster_parent_element->last_child
3653 unless defined $foster_parent_element;
3654 if (defined $prev_sibling and
3655 $prev_sibling->node_type == 3) {
3656 $prev_sibling->manakai_append_text ($token->{data});
3657 } else {
3658 $foster_parent_element->insert_before
3659 ($self->{document}->create_text_node ($token->{data}),
3660 $next_sibling);
3661 }
3662 } else {
3663 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3664 }
3665
3666 !!!next-token;
3667 redo B;
3668 } elsif ($token->{type} == START_TAG_TOKEN) {
3669 if ({
3670 tr => ($self->{insertion_mode} != IN_ROW_IM),
3671 th => 1, td => 1,
3672 }->{$token->{tag_name}}) {
3673 if ($self->{insertion_mode} == IN_TABLE_IM) {
3674 ## Clear back to table context
3675 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3676 $self->{open_elements}->[-1]->[1] ne 'html') {
3677 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3678 pop @{$self->{open_elements}};
3679 }
3680
3681 !!!insert-element ('tbody');
3682 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3683 ## reprocess in the "in table body" insertion mode...
3684 }
3685
3686 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3687 unless ($token->{tag_name} eq 'tr') {
3688 !!!parse-error (type => 'missing start tag:tr');
3689 }
3690
3691 ## Clear back to table body context
3692 while (not {
3693 tbody => 1, tfoot => 1, thead => 1, html => 1,
3694 }->{$self->{open_elements}->[-1]->[1]}) {
3695 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3696 pop @{$self->{open_elements}};
3697 }
3698
3699 $self->{insertion_mode} = IN_ROW_IM;
3700 if ($token->{tag_name} eq 'tr') {
3701 !!!insert-element ($token->{tag_name}, $token->{attributes});
3702 !!!next-token;
3703 redo B;
3704 } else {
3705 !!!insert-element ('tr');
3706 ## reprocess in the "in row" insertion mode
3707 }
3708 }
3709
3710 ## Clear back to table row context
3711 while (not {
3712 tr => 1, html => 1,
3713 }->{$self->{open_elements}->[-1]->[1]}) {
3714 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3715 pop @{$self->{open_elements}};
3716 }
3717
3718 !!!insert-element ($token->{tag_name}, $token->{attributes});
3719 $self->{insertion_mode} = IN_CELL_IM;
3720
3721 push @$active_formatting_elements, ['#marker', ''];
3722
3723 !!!next-token;
3724 redo B;
3725 } elsif ({
3726 caption => 1, col => 1, colgroup => 1,
3727 tbody => 1, tfoot => 1, thead => 1,
3728 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3729 }->{$token->{tag_name}}) {
3730 if ($self->{insertion_mode} == IN_ROW_IM) {
3731 ## As if </tr>
3732 ## have an element in table scope
3733 my $i;
3734 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3735 my $node = $self->{open_elements}->[$_];
3736 if ($node->[1] eq 'tr') {
3737 $i = $_;
3738 last INSCOPE;
3739 } elsif ({
3740 table => 1, html => 1,
3741 }->{$node->[1]}) {
3742 last INSCOPE;
3743 }
3744 } # INSCOPE
3745 unless (defined $i) {
3746 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
3747 ## Ignore the token
3748 !!!next-token;
3749 redo B;
3750 }
3751
3752 ## Clear back to table row context
3753 while (not {
3754 tr => 1, html => 1,
3755 }->{$self->{open_elements}->[-1]->[1]}) {
3756 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3757 pop @{$self->{open_elements}};
3758 }
3759
3760 pop @{$self->{open_elements}}; # tr
3761 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3762 if ($token->{tag_name} eq 'tr') {
3763 ## reprocess
3764 redo B;
3765 } else {
3766 ## reprocess in the "in table body" insertion mode...
3767 }
3768 }
3769
3770 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3771 ## have an element in table scope
3772 my $i;
3773 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3774 my $node = $self->{open_elements}->[$_];
3775 if ({
3776 tbody => 1, thead => 1, tfoot => 1,
3777 }->{$node->[1]}) {
3778 $i = $_;
3779 last INSCOPE;
3780 } elsif ({
3781 table => 1, html => 1,
3782 }->{$node->[1]}) {
3783 last INSCOPE;
3784 }
3785 } # INSCOPE
3786 unless (defined $i) {
3787 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3788 ## Ignore the token
3789 !!!next-token;
3790 redo B;
3791 }
3792
3793 ## Clear back to table body context
3794 while (not {
3795 tbody => 1, tfoot => 1, thead => 1, html => 1,
3796 }->{$self->{open_elements}->[-1]->[1]}) {
3797 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3798 pop @{$self->{open_elements}};
3799 }
3800
3801 ## As if an end tag for the current node (tbody, tfoot or thead) had been seen
3802 ## have an element in table scope
3803 ## true by definition
3804
3805 ## Clear back to table body context
3806 ## nop by definition
3807
3808 pop @{$self->{open_elements}};
3809 $self->{insertion_mode} = IN_TABLE_IM;
3810 ## reprocess in "in table" insertion mode...
3811 }
3812
3813 if ($token->{tag_name} eq 'col') {
3814 ## Clear back to table context
3815 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3816 $self->{open_elements}->[-1]->[1] ne 'html') {
3817 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3818 pop @{$self->{open_elements}};
3819 }
3820
3821 !!!insert-element ('colgroup');
3822 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
3823 ## reprocess
3824 redo B;
3825 } elsif ({
3826 caption => 1,
3827 colgroup => 1,
3828 tbody => 1, tfoot => 1, thead => 1,
3829 }->{$token->{tag_name}}) {
3830 ## Clear back to table context
3831 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3832 $self->{open_elements}->[-1]->[1] ne 'html') {
3833 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3834 pop @{$self->{open_elements}};
3835 }
3836
3837 push @$active_formatting_elements, ['#marker', '']
3838 if $token->{tag_name} eq 'caption';
3839
3840 !!!insert-element ($token->{tag_name}, $token->{attributes});
3841 $self->{insertion_mode} = {
3842 caption => IN_CAPTION_IM,
3843 colgroup => IN_COLUMN_GROUP_IM,
3844 tbody => IN_TABLE_BODY_IM,
3845 tfoot => IN_TABLE_BODY_IM,
3846 thead => IN_TABLE_BODY_IM,
3847 }->{$token->{tag_name}};
3848 !!!next-token;
3849 redo B;
3850 } else {
3851 die "$0: in table: <>: $token->{tag_name}";
3852 }
3853 } elsif ($token->{tag_name} eq 'table') {
3854 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3855
3856 ## As if </table>
3857 ## have a table element in table scope
3858 my $i;
3859 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3860 my $node = $self->{open_elements}->[$_];
3861 if ($node->[1] eq 'table') {
3862 $i = $_;
3863 last INSCOPE;
3864 } elsif ({
3865 table => 1, html => 1,
3866 }->{$node->[1]}) {
3867 last INSCOPE;
3868 }
3869 } # INSCOPE
3870 unless (defined $i) {
3871 !!!parse-error (type => 'unmatched end tag:table');
3872 ## Ignore tokens </table><table>
3873 !!!next-token;
3874 redo B;
3875 }
3876
3877 ## generate implied end tags
3878 if ({
3879 dd => 1, dt => 1, li => 1, p => 1,
3880 td => 1, th => 1, tr => 1,
3881 tbody => 1, tfoot=> 1, thead => 1,
3882 }->{$self->{open_elements}->[-1]->[1]}) {
3883 !!!back-token; # <table>
3884 $token = {type => END_TAG_TOKEN, tag_name => 'table'};
3885 !!!back-token;
3886 $token = {type => END_TAG_TOKEN,
3887 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3888 redo B;
3889 }
3890
3891 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3892 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3893 }
3894
3895 splice @{$self->{open_elements}}, $i;
3896
3897 $self->_reset_insertion_mode;
3898
3899 ## reprocess
3900 redo B;
3901 } else {
3902 !!!parse-error (type => 'in table:'.$token->{tag_name});
3903
3904 $insert = $insert_to_foster;
3905 #
3906 }
3907 } elsif ($token->{type} == END_TAG_TOKEN) {
3908 if ($token->{tag_name} eq 'tr' and
3909 $self->{insertion_mode} == IN_ROW_IM) {
3910 ## have an element in table scope
3911 my $i;
3912 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3913 my $node = $self->{open_elements}->[$_];
3914 if ($node->[1] eq $token->{tag_name}) {
3915 $i = $_;
3916 last INSCOPE;
3917 } elsif ({
3918 table => 1, html => 1,
3919 }->{$node->[1]}) {
3920 last INSCOPE;
3921 }
3922 } # INSCOPE
3923 unless (defined $i) {
3924 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3925 ## Ignore the token
3926 !!!next-token;
3927 redo B;
3928 }
3929
3930 ## Clear back to table row context
3931 while (not {
3932 tr => 1, html => 1,
3933 }->{$self->{open_elements}->[-1]->[1]}) {
3934 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3935 pop @{$self->{open_elements}};
3936 }
3937
3938 pop @{$self->{open_elements}}; # tr
3939 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3940 !!!next-token;
3941 redo B;
3942 } elsif ($token->{tag_name} eq 'table') {
3943 if ($self->{insertion_mode} == IN_ROW_IM) {
3944 ## As if </tr>
3945 ## have an element in table scope
3946 my $i;
3947 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3948 my $node = $self->{open_elements}->[$_];
3949 if ($node->[1] eq 'tr') {
3950 $i = $_;
3951 last INSCOPE;
3952 } elsif ({
3953 table => 1, html => 1,
3954 }->{$node->[1]}) {
3955 last INSCOPE;
3956 }
3957 } # INSCOPE
3958 unless (defined $i) {
3959 !!!parse-error (type => 'unmatched end tag:'.$token->{type});
3960 ## Ignore the token
3961 !!!next-token;
3962 redo B;
3963 }
3964
3965 ## Clear back to table row context
3966 while (not {
3967 tr => 1, html => 1,
3968 }->{$self->{open_elements}->[-1]->[1]}) {
3969 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3970 pop @{$self->{open_elements}};
3971 }
3972
3973 pop @{$self->{open_elements}}; # tr
3974 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3975 ## reprocess in the "in table body" insertion mode...
3976 }
3977
3978 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3979 ## have an element in table scope
3980 my $i;
3981 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3982 my $node = $self->{open_elements}->[$_];
3983 if ({
3984 tbody => 1, thead => 1, tfoot => 1,
3985 }->{$node->[1]}) {
3986 $i = $_;
3987 last INSCOPE;
3988 } elsif ({
3989 table => 1, html => 1,
3990 }->{$node->[1]}) {
3991 last INSCOPE;
3992 }
3993 } # INSCOPE
3994 unless (defined $i) {
3995 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3996 ## Ignore the token
3997 !!!next-token;
3998 redo B;
3999 }
4000
4001 ## Clear back to table body context
4002 while (not {
4003 tbody => 1, tfoot => 1, thead => 1, html => 1,
4004 }->{$self->{open_elements}->[-1]->[1]}) {
4005 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4006 pop @{$self->{open_elements}};
4007 }
4008
4009 ## As if an end tag for the current node (tbody, tfoot or thead) had been seen
4010 ## have an element in table scope
4011 ## true by definition
4012
4013 ## Clear back to table body context
4014 ## nop by definition
4015
4016 pop @{$self->{open_elements}};
4017 $self->{insertion_mode} = IN_TABLE_IM;
4018 ## reprocess in the "in table" insertion mode...
4019 }
4020
4021 ## have a table element in table scope
4022 my $i;
4023 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4024 my $node = $self->{open_elements}->[$_];
4025 if ($node->[1] eq $token->{tag_name}) {
4026 $i = $_;
4027 last INSCOPE;
4028 } elsif ({
4029 table => 1, html => 1,
4030 }->{$node->[1]}) {
4031 last INSCOPE;
4032 }
4033 } # INSCOPE
4034 unless (defined $i) {
4035 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4036 ## Ignore the token
4037 !!!next-token;
4038 redo B;
4039 }
4040
4041 ## generate implied end tags
4042 if ({
4043 dd => 1, dt => 1, li => 1, p => 1,
4044 td => 1, th => 1, tr => 1,
4045 tbody => 1, tfoot=> 1, thead => 1,
4046 }->{$self->{open_elements}->[-1]->[1]}) {
4047 !!!back-token;
4048 $token = {type => END_TAG_TOKEN,
4049 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4050 redo B;
4051 }
4052
4053 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4054 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4055 }
4056
4057 splice @{$self->{open_elements}}, $i;
4058
4059 $self->_reset_insertion_mode;
4060
4061 !!!next-token;
4062 redo B;
4063 } elsif ({
4064 tbody => 1, tfoot => 1, thead => 1,
4065 }->{$token->{tag_name}} and
4066 $self->{insertion_mode} & ROW_IMS) {
4067 if ($self->{insertion_mode} == IN_ROW_IM) {
4068 ## have an element in table scope
4069 my $i;
4070 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4071 my $node = $self->{open_elements}->[$_];
4072 if ($node->[1] eq $token->{tag_name}) {
4073 $i = $_;
4074 last INSCOPE;
4075 } elsif ({
4076 table => 1, html => 1,
4077 }->{$node->[1]}) {
4078 last INSCOPE;
4079 }
4080 } # INSCOPE
4081 unless (defined $i) {
4082 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4083 ## Ignore the token
4084 !!!next-token;
4085 redo B;
4086 }
4087
4088 ## As if </tr>
4089 ## have an element in table scope
4090 my $i;
4091 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4092 my $node = $self->{open_elements}->[$_];
4093 if ($node->[1] eq 'tr') {
4094 $i = $_;
4095 last INSCOPE;
4096 } elsif ({
4097 table => 1, html => 1,
4098 }->{$node->[1]}) {
4099 last INSCOPE;
4100 }
4101 } # INSCOPE
4102 unless (defined $i) {
4103 !!!parse-error (type => 'unmatched end tag:tr');
4104 ## Ignore the token
4105 !!!next-token;
4106 redo B;
4107 }
4108
4109 ## Clear back to table row context
4110 while (not {
4111 tr => 1, html => 1,
4112 }->{$self->{open_elements}->[-1]->[1]}) {
4113 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4114 pop @{$self->{open_elements}};
4115 }
4116
4117 pop @{$self->{open_elements}}; # tr
4118 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4119 ## reprocess in the "in table body" insertion mode...
4120 }
4121
4122 ## have an element in table scope
4123 my $i;
4124 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4125 my $node = $self->{open_elements}->[$_];
4126 if ($node->[1] eq $token->{tag_name}) {
4127 $i = $_;
4128 last INSCOPE;
4129 } elsif ({
4130 table => 1, html => 1,
4131 }->{$node->[1]}) {
4132 last INSCOPE;
4133 }
4134 } # INSCOPE
4135 unless (defined $i) {
4136 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4137 ## Ignore the token
4138 !!!next-token;
4139 redo B;
4140 }
4141
4142 ## Clear back to table body context
4143 while (not {
4144 tbody => 1, tfoot => 1, thead => 1, html => 1,
4145 }->{$self->{open_elements}->[-1]->[1]}) {
4146 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4147 pop @{$self->{open_elements}};
4148 }
4149
4150 pop @{$self->{open_elements}};
4151 $self->{insertion_mode} = IN_TABLE_IM;
4152 !!!next-token;
4153 redo B;
4154 } elsif ({
4155 body => 1, caption => 1, col => 1, colgroup => 1,
4156 html => 1, td => 1, th => 1,
4157 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4158 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
4159 }->{$token->{tag_name}}) {
4160 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4161 ## Ignore the token
4162 !!!next-token;
4163 redo B;
4164 } else {
4165 !!!parse-error (type => 'in table:/'.$token->{tag_name});
4166
4167 $insert = $insert_to_foster;
4168 #
4169 }
4170 } else {
4171 die "$0: $token->{type}: Unknown token type";
4172 }
4173 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4174 if ($token->{type} == CHARACTER_TOKEN) {
4175 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4176 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4177 unless (length $token->{data}) {
4178 !!!next-token;
4179 redo B;
4180 }
4181 }
4182
4183 #
4184 } elsif ($token->{type} == START_TAG_TOKEN) {
4185 if ($token->{tag_name} eq 'col') {
4186 !!!insert-element ($token->{tag_name}, $token->{attributes});
4187 pop @{$self->{open_elements}};
4188 !!!next-token;
4189 redo B;
4190 } else {
4191 #
4192 }
4193 } elsif ($token->{type} == END_TAG_TOKEN) {
4194 if ($token->{tag_name} eq 'colgroup') {
4195 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4196 !!!parse-error (type => 'unmatched end tag:colgroup');
4197 ## Ignore the token
4198 !!!next-token;
4199 redo B;
4200 } else {
4201 pop @{$self->{open_elements}}; # colgroup
4202 $self->{insertion_mode} = IN_TABLE_IM;
4203 !!!next-token;
4204 redo B;
4205 }
4206 } elsif ($token->{tag_name} eq 'col') {
4207 !!!parse-error (type => 'unmatched end tag:col');
4208 ## Ignore the token
4209 !!!next-token;
4210 redo B;
4211 } else {
4212 #
4213 }
4214 } else {
4215 #
4216 }
4217
4218 ## As if </colgroup>
4219 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4220 !!!parse-error (type => 'unmatched end tag:colgroup');
4221 ## Ignore the token
4222 !!!next-token;
4223 redo B;
4224 } else {
4225 pop @{$self->{open_elements}}; # colgroup
4226 $self->{insertion_mode} = IN_TABLE_IM;
4227 ## reprocess
4228 redo B;
4229 }
4230 } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
4231 if ($token->{type} == CHARACTER_TOKEN) {
4232 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4233 !!!next-token;
4234 redo B;
4235 } elsif ($token->{type} == START_TAG_TOKEN) {
4236 if ($token->{tag_name} eq 'option') {
4237 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4238 ## As if </option>
4239 pop @{$self->{open_elements}};
4240 }
4241
4242 !!!insert-element ($token->{tag_name}, $token->{attributes});
4243 !!!next-token;
4244 redo B;
4245 } elsif ($token->{tag_name} eq 'optgroup') {
4246 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4247 ## As if </option>
4248 pop @{$self->{open_elements}};
4249 }
4250
4251 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4252 ## As if </optgroup>
4253 pop @{$self->{open_elements}};
4254 }
4255
4256 !!!insert-element ($token->{tag_name}, $token->{attributes});
4257 !!!next-token;
4258 redo B;
4259 } elsif ($token->{tag_name} eq 'select') {
4260 !!!parse-error (type => 'not closed:select');
4261 ## As if </select> instead
4262 ## have an element in table scope
4263 my $i;
4264 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4265 my $node = $self->{open_elements}->[$_];
4266 if ($node->[1] eq $token->{tag_name}) {
4267 $i = $_;
4268 last INSCOPE;
4269 } elsif ({
4270 table => 1, html => 1,
4271 }->{$node->[1]}) {
4272 last INSCOPE;
4273 }
4274 } # INSCOPE
4275 unless (defined $i) {
4276 !!!parse-error (type => 'unmatched end tag:select');
4277 ## Ignore the token
4278 !!!next-token;
4279 redo B;
4280 }
4281
4282 splice @{$self->{open_elements}}, $i;
4283
4284 $self->_reset_insertion_mode;
4285
4286 !!!next-token;
4287 redo B;
4288 } else {
4289 !!!parse-error (type => 'in select:'.$token->{tag_name});
4290 ## Ignore the token
4291 !!!next-token;
4292 redo B;
4293 }
4294 } elsif ($token->{type} == END_TAG_TOKEN) {
4295 if ($token->{tag_name} eq 'optgroup') {
4296 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4297 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4298 ## As if </option>
4299 splice @{$self->{open_elements}}, -2;
4300 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4301 pop @{$self->{open_elements}};
4302 } else {
4303 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4304 ## Ignore the token
4305 }
4306 !!!next-token;
4307 redo B;
4308 } elsif ($token->{tag_name} eq 'option') {
4309 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4310 pop @{$self->{open_elements}};
4311 } else {
4312 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4313 ## Ignore the token
4314 }
4315 !!!next-token;
4316 redo B;
4317 } elsif ($token->{tag_name} eq 'select') {
4318 ## have an element in table scope
4319 my $i;
4320 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4321 my $node = $self->{open_elements}->[$_];
4322 if ($node->[1] eq $token->{tag_name}) {
4323 $i = $_;
4324 last INSCOPE;
4325 } elsif ({
4326 table => 1, html => 1,
4327 }->{$node->[1]}) {
4328 last INSCOPE;
4329 }
4330 } # INSCOPE
4331 unless (defined $i) {
4332 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4333 ## Ignore the token
4334 !!!next-token;
4335 redo B;
4336 }
4337
4338 splice @{$self->{open_elements}}, $i;
4339
4340 $self->_reset_insertion_mode;
4341
4342 !!!next-token;
4343 redo B;
4344 } elsif ({
4345 caption => 1, table => 1, tbody => 1,
4346 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4347 }->{$token->{tag_name}}) {
4348 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4349
4350 ## have an element in table scope
4351 my $i;
4352 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4353 my $node = $self->{open_elements}->[$_];
4354 if ($node->[1] eq $token->{tag_name}) {
4355 $i = $_;
4356 last INSCOPE;
4357 } elsif ({
4358 table => 1, html => 1,
4359 }->{$node->[1]}) {
4360 last INSCOPE;
4361 }
4362 } # INSCOPE
4363 unless (defined $i) {
4364 ## Ignore the token
4365 !!!next-token;
4366 redo B;
4367 }
4368
4369 ## As if </select>
4370 ## have an element in table scope
4371 undef $i;
4372 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4373 my $node = $self->{open_elements}->[$_];
4374 if ($node->[1] eq 'select') {
4375 $i = $_;
4376 last INSCOPE;
4377 } elsif ({
4378 table => 1, html => 1,
4379 }->{$node->[1]}) {
4380 last INSCOPE;
4381 }
4382 } # INSCOPE
4383 unless (defined $i) {
4384 !!!parse-error (type => 'unmatched end tag:select');
4385 ## Ignore the </select> token
4386 !!!next-token; ## TODO: ok?
4387 redo B;
4388 }
4389
4390 splice @{$self->{open_elements}}, $i;
4391
4392 $self->_reset_insertion_mode;
4393
4394 ## reprocess
4395 redo B;
4396 } else {
4397 !!!parse-error (type => 'in select:/'.$token->{tag_name});
4398 ## Ignore the token
4399 !!!next-token;
4400 redo B;
4401 }
4402 } else {
4403 die "$0: $token->{type}: Unknown token type";
4404 }
4405 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4406 if ($token->{type} == CHARACTER_TOKEN) {
4407 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4408 my $data = $1;
4409 ## As if in body
4410 $reconstruct_active_formatting_elements->($insert_to_current);
4411
4412 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4413
4414 unless (length $token->{data}) {
4415 !!!next-token;
4416 redo B;
4417 }
4418 }
4419
4420 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4421 !!!parse-error (type => 'after html:#character');
4422
4423 ## Reprocess in the "main" phase, "after body" insertion mode...
4424 }
4425
4426 ## "after body" insertion mode
4427 !!!parse-error (type => 'after body:#character');
4428
4429 $self->{insertion_mode} = IN_BODY_IM;
4430 ## reprocess
4431 redo B;
4432 } elsif ($token->{type} == START_TAG_TOKEN) {
4433 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4434 !!!parse-error (type => 'after html:'.$token->{tag_name});
4435
4436 ## Reprocess in the "main" phase, "after body" insertion mode...
4437 }
4438
4439 ## "after body" insertion mode
4440 !!!parse-error (type => 'after body:'.$token->{tag_name});
4441
4442 $self->{insertion_mode} = IN_BODY_IM;
4443 ## reprocess
4444 redo B;
4445 } elsif ($token->{type} == END_TAG_TOKEN) {
4446 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4447 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4448
4449 $self->{insertion_mode} = AFTER_BODY_IM;
4450 ## Reprocess in the "main" phase, "after body" insertion mode...
4451 }
4452
4453 ## "after body" insertion mode
4454 if ($token->{tag_name} eq 'html') {
4455 if (defined $self->{inner_html_node}) {
4456 !!!parse-error (type => 'unmatched end tag:html');
4457 ## Ignore the token
4458 !!!next-token;
4459 redo B;
4460 } else {
4461 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
4462 !!!next-token;
4463 redo B;
4464 }
4465 } else {
4466 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4467
4468 $self->{insertion_mode} = IN_BODY_IM;
4469 ## reprocess
4470 redo B;
4471 }
4472 } else {
4473 die "$0: $token->{type}: Unknown token type";
4474 }
4475 } elsif ($self->{insertion_mode} & FRAME_IMS) {
4476 if ($token->{type} == CHARACTER_TOKEN) {
4477 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4478 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4479
4480 unless (length $token->{data}) {
4481 !!!next-token;
4482 redo B;
4483 }
4484 }
4485
4486 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4487 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4488 !!!parse-error (type => 'in frameset:#character');
4489 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
4490 !!!parse-error (type => 'after frameset:#character');
4491 } else { # "after html frameset"
4492 !!!parse-error (type => 'after html:#character');
4493
4494 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4495 ## Reprocess in the "main" phase, "after frameset"...
4496 !!!parse-error (type => 'after frameset:#character');
4497 }
4498
4499 ## Ignore the token.
4500 if (length $token->{data}) {
4501 ## reprocess the rest of characters
4502 } else {
4503 !!!next-token;
4504 }
4505 redo B;
4506 }
4507
4508 die qq[$0: Character "$token->{data}"];
4509 } elsif ($token->{type} == START_TAG_TOKEN) {
4510 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4511 !!!parse-error (type => 'after html:'.$token->{tag_name});
4512
4513 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4514 ## Process in the "main" phase, "after frameset" insertion mode...
4515 }
4516
4517 if ($token->{tag_name} eq 'frameset' and
4518 $self->{insertion_mode} == IN_FRAMESET_IM) {
4519 !!!insert-element ($token->{tag_name}, $token->{attributes});
4520 !!!next-token;
4521 redo B;
4522 } elsif ($token->{tag_name} eq 'frame' and
4523 $self->{insertion_mode} == IN_FRAMESET_IM) {
4524 !!!insert-element ($token->{tag_name}, $token->{attributes});
4525 pop @{$self->{open_elements}};
4526 !!!next-token;
4527 redo B;
4528 } elsif ($token->{tag_name} eq 'noframes') {
4529 ## NOTE: As if in body.
4530 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4531 redo B;
4532 } else {
4533 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4534 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4535 } else {
4536 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4537 }
4538 ## Ignore the token
4539 !!!next-token;
4540 redo B;
4541 }
4542 } elsif ($token->{type} == END_TAG_TOKEN) {
4543 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4544 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4545
4546 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4547 ## Process in the "main" phase, "after frameset" insertion mode...
4548 }
4549
4550 if ($token->{tag_name} eq 'frameset' and
4551 $self->{insertion_mode} == IN_FRAMESET_IM) {
4552 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4553 @{$self->{open_elements}} == 1) {
4554 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4555 ## Ignore the token
4556 !!!next-token;
4557 } else {
4558 pop @{$self->{open_elements}};
4559 !!!next-token;
4560 }
4561
4562 if (not defined $self->{inner_html_node} and
4563 $self->{open_elements}->[-1]->[1] ne 'frameset') {
4564 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4565 }
4566 redo B;
4567 } elsif ($token->{tag_name} eq 'html' and
4568 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
4569 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
4570 !!!next-token;
4571 redo B;
4572 } else {
4573 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4574 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4575 } else {
4576 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4577 }
4578 ## Ignore the token
4579 !!!next-token;
4580 redo B;
4581 }
4582 } else {
4583 die "$0: $token->{type}: Unknown token type";
4584 }
4585
4586 ## ISSUE: An issue in spec here
4587 } else {
4588 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4589 }
4590
4591 ## "in body" insertion mode
4592 if ($token->{type} == START_TAG_TOKEN) {
4593 if ($token->{tag_name} eq 'script') {
4594 ## NOTE: This is an "as if in head" code clone
4595 $script_start_tag->($insert);
4596 redo B;
4597 } elsif ($token->{tag_name} eq 'style') {
4598 ## NOTE: This is an "as if in head" code clone
4599 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4600 redo B;
4601 } elsif ({
4602 base => 1, link => 1,
4603 }->{$token->{tag_name}}) {
4604 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4605 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4606 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4607 !!!next-token;
4608 redo B;
4609 } elsif ($token->{tag_name} eq 'meta') {
4610 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4611 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4612 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4613
4614 unless ($self->{confident}) {
4615 if ($token->{attributes}->{charset}) { ## TODO: And if supported
4616 $self->{change_encoding}
4617 ->($self, $token->{attributes}->{charset}->{value});
4618
4619 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4620 ->set_user_data (manakai_has_reference =>
4621 $token->{attributes}->{charset}
4622 ->{has_reference});
4623 } elsif ($token->{attributes}->{content}) {
4624 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4625 if ($token->{attributes}->{content}->{value}
4626 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4627 [\x09-\x0D\x20]*=
4628 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4629 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4630 $self->{change_encoding}
4631 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4632 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4633 ->set_user_data (manakai_has_reference =>
4634 $token->{attributes}->{content}
4635 ->{has_reference});
4636 }
4637 }
4638 } else {
4639 if ($token->{attributes}->{charset}) {
4640 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4641 ->set_user_data (manakai_has_reference =>
4642 $token->{attributes}->{charset}
4643 ->{has_reference});
4644 }
4645 if ($token->{attributes}->{content}) {
4646 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4647 ->set_user_data (manakai_has_reference =>
4648 $token->{attributes}->{content}
4649 ->{has_reference});
4650 }
4651 }
4652
4653 !!!next-token;
4654 redo B;
4655 } elsif ($token->{tag_name} eq 'title') {
4656 !!!parse-error (type => 'in body:title');
4657 ## NOTE: This is an "as if in head" code clone
4658 $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4659 if (defined $self->{head_element}) {
4660 $self->{head_element}->append_child ($_[0]);
4661 } else {
4662 $insert->($_[0]);
4663 }
4664 });
4665 redo B;
4666 } elsif ($token->{tag_name} eq 'body') {
4667 !!!parse-error (type => 'in body:body');
4668
4669 if (@{$self->{open_elements}} == 1 or
4670 $self->{open_elements}->[1]->[1] ne 'body') {
4671 ## Ignore the token
4672 } else {
4673 my $body_el = $self->{open_elements}->[1]->[0];
4674 for my $attr_name (keys %{$token->{attributes}}) {
4675 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4676 $body_el->set_attribute_ns
4677 (undef, [undef, $attr_name],
4678 $token->{attributes}->{$attr_name}->{value});
4679 }
4680 }
4681 }
4682 !!!next-token;
4683 redo B;
4684 } elsif ({
4685 address => 1, blockquote => 1, center => 1, dir => 1,
4686 div => 1, dl => 1, fieldset => 1, listing => 1,
4687 menu => 1, ol => 1, p => 1, ul => 1,
4688 pre => 1,
4689 }->{$token->{tag_name}}) {
4690 ## has a p element in scope
4691 INSCOPE: for (reverse @{$self->{open_elements}}) {
4692 if ($_->[1] eq 'p') {
4693 !!!back-token;
4694 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4695 redo B;
4696 } elsif ({
4697 table => 1, caption => 1, td => 1, th => 1,
4698 button => 1, marquee => 1, object => 1, html => 1,
4699 }->{$_->[1]}) {
4700 last INSCOPE;
4701 }
4702 } # INSCOPE
4703
4704 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4705 if ($token->{tag_name} eq 'pre') {
4706 !!!next-token;
4707 if ($token->{type} == CHARACTER_TOKEN) {
4708 $token->{data} =~ s/^\x0A//;
4709 unless (length $token->{data}) {
4710 !!!next-token;
4711 }
4712 }
4713 } else {
4714 !!!next-token;
4715 }
4716 redo B;
4717 } elsif ($token->{tag_name} eq 'form') {
4718 if (defined $self->{form_element}) {
4719 !!!parse-error (type => 'in form:form');
4720 ## Ignore the token
4721 !!!next-token;
4722 redo B;
4723 } else {
4724 ## has a p element in scope
4725 INSCOPE: for (reverse @{$self->{open_elements}}) {
4726 if ($_->[1] eq 'p') {
4727 !!!back-token;
4728 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4729 redo B;
4730 } elsif ({
4731 table => 1, caption => 1, td => 1, th => 1,
4732 button => 1, marquee => 1, object => 1, html => 1,
4733 }->{$_->[1]}) {
4734 last INSCOPE;
4735 }
4736 } # INSCOPE
4737
4738 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4739 $self->{form_element} = $self->{open_elements}->[-1]->[0];
4740 !!!next-token;
4741 redo B;
4742 }
4743 } elsif ($token->{tag_name} eq 'li') {
4744 ## has a p element in scope
4745 INSCOPE: for (reverse @{$self->{open_elements}}) {
4746 if ($_->[1] eq 'p') {
4747 !!!back-token;
4748 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4749 redo B;
4750 } elsif ({
4751 table => 1, caption => 1, td => 1, th => 1,
4752 button => 1, marquee => 1, object => 1, html => 1,
4753 }->{$_->[1]}) {
4754 last INSCOPE;
4755 }
4756 } # INSCOPE
4757
4758 ## Step 1
4759 my $i = -1;
4760 my $node = $self->{open_elements}->[$i];
4761 LI: {
4762 ## Step 2
4763 if ($node->[1] eq 'li') {
4764 if ($i != -1) {
4765 !!!parse-error (type => 'end tag missing:'.
4766 $self->{open_elements}->[-1]->[1]);
4767 }
4768 splice @{$self->{open_elements}}, $i;
4769 last LI;
4770 }
4771
4772 ## Step 3
4773 if (not $formatting_category->{$node->[1]} and
4774 #not $phrasing_category->{$node->[1]} and
4775 ($special_category->{$node->[1]} or
4776 $scoping_category->{$node->[1]}) and
4777 $node->[1] ne 'address' and $node->[1] ne 'div') {
4778 last LI;
4779 }
4780
4781 ## Step 4
4782 $i--;
4783 $node = $self->{open_elements}->[$i];
4784 redo LI;
4785 } # LI
4786
4787 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4788 !!!next-token;
4789 redo B;
4790 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
4791 ## has a p element in scope
4792 INSCOPE: for (reverse @{$self->{open_elements}}) {
4793 if ($_->[1] eq 'p') {
4794 !!!back-token;
4795 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4796 redo B;
4797 } elsif ({
4798 table => 1, caption => 1, td => 1, th => 1,
4799 button => 1, marquee => 1, object => 1, html => 1,
4800 }->{$_->[1]}) {
4801 last INSCOPE;
4802 }
4803 } # INSCOPE
4804
4805 ## Step 1
4806 my $i = -1;
4807 my $node = $self->{open_elements}->[$i];
4808 LI: {
4809 ## Step 2
4810 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
4811 if ($i != -1) {
4812 !!!parse-error (type => 'end tag missing:'.
4813 $self->{open_elements}->[-1]->[1]);
4814 }
4815 splice @{$self->{open_elements}}, $i;
4816 last LI;
4817 }
4818
4819 ## Step 3
4820 if (not $formatting_category->{$node->[1]} and
4821 #not $phrasing_category->{$node->[1]} and
4822 ($special_category->{$node->[1]} or
4823 $scoping_category->{$node->[1]}) and
4824 $node->[1] ne 'address' and $node->[1] ne 'div') {
4825 last LI;
4826 }
4827
4828 ## Step 4
4829 $i--;
4830 $node = $self->{open_elements}->[$i];
4831 redo LI;
4832 } # LI
4833
4834 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4835 !!!next-token;
4836 redo B;
4837 } elsif ($token->{tag_name} eq 'plaintext') {
4838 ## has a p element in scope
4839 INSCOPE: for (reverse @{$self->{open_elements}}) {
4840 if ($_->[1] eq 'p') {
4841 !!!back-token;
4842 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4843 redo B;
4844 } elsif ({
4845 table => 1, caption => 1, td => 1, th => 1,
4846 button => 1, marquee => 1, object => 1, html => 1,
4847 }->{$_->[1]}) {
4848 last INSCOPE;
4849 }
4850 } # INSCOPE
4851
4852 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4853
4854 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
4855
4856 !!!next-token;
4857 redo B;
4858 } elsif ({
4859 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4860 }->{$token->{tag_name}}) {
4861 ## has a p element in scope
4862 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4863 my $node = $self->{open_elements}->[$_];
4864 if ($node->[1] eq 'p') {
4865 !!!back-token;
4866 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4867 redo B;
4868 } elsif ({
4869 table => 1, caption => 1, td => 1, th => 1,
4870 button => 1, marquee => 1, object => 1, html => 1,
4871 }->{$node->[1]}) {
4872 last INSCOPE;
4873 }
4874 } # INSCOPE
4875
4876 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
4877 ## has an element in scope
4878 #my $i;
4879 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4880 # my $node = $self->{open_elements}->[$_];
4881 # if ({
4882 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4883 # }->{$node->[1]}) {
4884 # $i = $_;
4885 # last INSCOPE;
4886 # } elsif ({
4887 # table => 1, caption => 1, td => 1, th => 1,
4888 # button => 1, marquee => 1, object => 1, html => 1,
4889 # }->{$node->[1]}) {
4890 # last INSCOPE;
4891 # }
4892 #} # INSCOPE
4893 #
4894 #if (defined $i) {
4895 # !!! parse-error (type => 'in hn:hn');
4896 # splice @{$self->{open_elements}}, $i;
4897 #}
4898
4899 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4900
4901 !!!next-token;
4902 redo B;
4903 } elsif ($token->{tag_name} eq 'a') {
4904 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
4905 my $node = $active_formatting_elements->[$i];
4906 if ($node->[1] eq 'a') {
4907 !!!parse-error (type => 'in a:a');
4908
4909 !!!back-token;
4910 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
4911 $formatting_end_tag->($token->{tag_name});
4912
4913 AFE2: for (reverse 0..$#$active_formatting_elements) {
4914 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4915 splice @$active_formatting_elements, $_, 1;
4916 last AFE2;
4917 }
4918 } # AFE2
4919 OE: for (reverse 0..$#{$self->{open_elements}}) {
4920 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
4921 splice @{$self->{open_elements}}, $_, 1;
4922 last OE;
4923 }
4924 } # OE
4925 last AFE;
4926 } elsif ($node->[0] eq '#marker') {
4927 last AFE;
4928 }
4929 } # AFE
4930
4931 $reconstruct_active_formatting_elements->($insert_to_current);
4932
4933 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4934 push @$active_formatting_elements, $self->{open_elements}->[-1];
4935
4936 !!!next-token;
4937 redo B;
4938 } elsif ({
4939 b => 1, big => 1, em => 1, font => 1, i => 1,
4940 s => 1, small => 1, strile => 1,
4941 strong => 1, tt => 1, u => 1,
4942 }->{$token->{tag_name}}) {
4943 $reconstruct_active_formatting_elements->($insert_to_current);
4944
4945 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4946 push @$active_formatting_elements, $self->{open_elements}->[-1];
4947
4948 !!!next-token;
4949 redo B;
4950 } elsif ($token->{tag_name} eq 'nobr') {
4951 $reconstruct_active_formatting_elements->($insert_to_current);
4952
4953 ## has a |nobr| element in scope
4954 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4955 my $node = $self->{open_elements}->[$_];
4956 if ($node->[1] eq 'nobr') {
4957 !!!parse-error (type => 'in nobr:nobr');
4958 !!!back-token;
4959 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
4960 redo B;
4961 } elsif ({
4962 table => 1, caption => 1, td => 1, th => 1,
4963 button => 1, marquee => 1, object => 1, html => 1,
4964 }->{$node->[1]}) {
4965 last INSCOPE;
4966 }
4967 } # INSCOPE
4968
4969 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4970 push @$active_formatting_elements, $self->{open_elements}->[-1];
4971
4972 !!!next-token;
4973 redo B;
4974 } elsif ($token->{tag_name} eq 'button') {
4975 ## has a button element in scope
4976 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4977 my $node = $self->{open_elements}->[$_];
4978 if ($node->[1] eq 'button') {
4979 !!!parse-error (type => 'in button:button');
4980 !!!back-token;
4981 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
4982 redo B;
4983 } elsif ({
4984 table => 1, caption => 1, td => 1, th => 1,
4985 button => 1, marquee => 1, object => 1, html => 1,
4986 }->{$node->[1]}) {
4987 last INSCOPE;
4988 }
4989 } # INSCOPE
4990
4991 $reconstruct_active_formatting_elements->($insert_to_current);
4992
4993 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4994 push @$active_formatting_elements, ['#marker', ''];
4995
4996 !!!next-token;
4997 redo B;
4998 } elsif ($token->{tag_name} eq 'marquee' or
4999 $token->{tag_name} eq 'object') {
5000 $reconstruct_active_formatting_elements->($insert_to_current);
5001
5002 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5003 push @$active_formatting_elements, ['#marker', ''];
5004
5005 !!!next-token;
5006 redo B;
5007 } elsif ($token->{tag_name} eq 'xmp') {
5008 $reconstruct_active_formatting_elements->($insert_to_current);
5009 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
5010 redo B;
5011 } elsif ($token->{tag_name} eq 'table') {
5012 ## has a p element in scope
5013 INSCOPE: for (reverse @{$self->{open_elements}}) {
5014 if ($_->[1] eq 'p') {
5015 !!!back-token;
5016 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5017 redo B;
5018 } elsif ({
5019 table => 1, caption => 1, td => 1, th => 1,
5020 button => 1, marquee => 1, object => 1, html => 1,
5021 }->{$_->[1]}) {
5022 last INSCOPE;
5023 }
5024 } # INSCOPE
5025
5026 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5027
5028 $self->{insertion_mode} = IN_TABLE_IM;
5029
5030 !!!next-token;
5031 redo B;
5032 } elsif ({
5033 area => 1, basefont => 1, bgsound => 1, br => 1,
5034 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
5035 image => 1,
5036 }->{$token->{tag_name}}) {
5037 if ($token->{tag_name} eq 'image') {
5038 !!!parse-error (type => 'image');
5039 $token->{tag_name} = 'img';
5040 }
5041
5042 ## NOTE: There is an "as if <br>" code clone.
5043 $reconstruct_active_formatting_elements->($insert_to_current);
5044
5045 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5046 pop @{$self->{open_elements}};
5047
5048 !!!next-token;
5049 redo B;
5050 } elsif ($token->{tag_name} eq 'hr') {
5051 ## has a p element in scope
5052 INSCOPE: for (reverse @{$self->{open_elements}}) {
5053 if ($_->[1] eq 'p') {
5054 !!!back-token;
5055 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
5056 redo B;
5057 } elsif ({
5058 table => 1, caption => 1, td => 1, th => 1,
5059 button => 1, marquee => 1, object => 1, html => 1,
5060 }->{$_->[1]}) {
5061 last INSCOPE;
5062 }
5063 } # INSCOPE
5064
5065 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5066 pop @{$self->{open_elements}};
5067
5068 !!!next-token;
5069 redo B;
5070 } elsif ($token->{tag_name} eq 'input') {
5071 $reconstruct_active_formatting_elements->($insert_to_current);
5072
5073 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5074 ## TODO: associate with $self->{form_element} if defined
5075 pop @{$self->{open_elements}};
5076
5077 !!!next-token;
5078 redo B;
5079 } elsif ($token->{tag_name} eq 'isindex') {
5080 !!!parse-error (type => 'isindex');
5081
5082 if (defined $self->{form_element}) {
5083 ## Ignore the token
5084 !!!next-token;
5085 redo B;
5086 } else {
5087 my $at = $token->{attributes};
5088 my $form_attrs;
5089 $form_attrs->{action} = $at->{action} if $at->{action};
5090 my $prompt_attr = $at->{prompt};
5091 $at->{name} = {name => 'name', value => 'isindex'};
5092 delete $at->{action};
5093 delete $at->{prompt};
5094 my @tokens = (
5095 {type => START_TAG_TOKEN, tag_name => 'form',
5096 attributes => $form_attrs},
5097 {type => START_TAG_TOKEN, tag_name => 'hr'},
5098 {type => START_TAG_TOKEN, tag_name => 'p'},
5099 {type => START_TAG_TOKEN, tag_name => 'label'},
5100 );
5101 if ($prompt_attr) {
5102 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
5103 } else {
5104 push @tokens, {type => CHARACTER_TOKEN,
5105 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
5106 ## TODO: make this configurable
5107 }
5108 push @tokens,
5109 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
5110 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
5111 {type => END_TAG_TOKEN, tag_name => 'label'},
5112 {type => END_TAG_TOKEN, tag_name => 'p'},
5113 {type => START_TAG_TOKEN, tag_name => 'hr'},
5114 {type => END_TAG_TOKEN, tag_name => 'form'};
5115 $token = shift @tokens;
5116 !!!back-token (@tokens);
5117 redo B;
5118 }
5119 } elsif ($token->{tag_name} eq 'textarea') {
5120 my $tag_name = $token->{tag_name};
5121 my $el;
5122 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
5123
5124 ## TODO: $self->{form_element} if defined
5125 $self->{content_model} = RCDATA_CONTENT_MODEL;
5126 delete $self->{escape}; # MUST
5127
5128 $insert->($el);
5129
5130 my $text = '';
5131 !!!next-token;
5132 if ($token->{type} == CHARACTER_TOKEN) {
5133 $token->{data} =~ s/^\x0A//;
5134 unless (length $token->{data}) {
5135 !!!next-token;
5136 }
5137 }
5138 while ($token->{type} == CHARACTER_TOKEN) {
5139 $text .= $token->{data};
5140 !!!next-token;
5141 }
5142 if (length $text) {
5143 $el->manakai_append_text ($text);
5144 }
5145
5146 $self->{content_model} = PCDATA_CONTENT_MODEL;
5147
5148 if ($token->{type} == END_TAG_TOKEN and
5149 $token->{tag_name} eq $tag_name) {
5150 ## Ignore the token
5151 } else {
5152 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
5153 }
5154 !!!next-token;
5155 redo B;
5156 } elsif ({
5157 iframe => 1,
5158 noembed => 1,
5159 noframes => 1,
5160 noscript => 0, ## TODO: 1 if scripting is enabled
5161 }->{$token->{tag_name}}) {
5162 ## NOTE: There is an "as if in body" code clone.
5163 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
5164 redo B;
5165 } elsif ($token->{tag_name} eq 'select') {
5166 $reconstruct_active_formatting_elements->($insert_to_current);
5167
5168 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5169
5170 $self->{insertion_mode} = IN_SELECT_IM;
5171 !!!next-token;
5172 redo B;
5173 } elsif ({
5174 caption => 1, col => 1, colgroup => 1, frame => 1,
5175 frameset => 1, head => 1, option => 1, optgroup => 1,
5176 tbody => 1, td => 1, tfoot => 1, th => 1,
5177 thead => 1, tr => 1,
5178 }->{$token->{tag_name}}) {
5179 !!!parse-error (type => 'in body:'.$token->{tag_name});
5180 ## Ignore the token
5181 !!!next-token;
5182 redo B;
5183
5184 ## ISSUE: An issue on HTML5 new elements in the spec.
5185 } else {
5186 $reconstruct_active_formatting_elements->($insert_to_current);
5187
5188 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
5189
5190 !!!next-token;
5191 redo B;
5192 }
5193 } elsif ($token->{type} == END_TAG_TOKEN) {
5194 if ($token->{tag_name} eq 'body') {
5195 if (@{$self->{open_elements}} > 1 and
5196 $self->{open_elements}->[1]->[1] eq 'body') {
5197 for (@{$self->{open_elements}}) {
5198 unless ({
5199 dd => 1, dt => 1, li => 1, p => 1, td => 1,
5200 th => 1, tr => 1, body => 1, html => 1,
5201 tbody => 1, tfoot => 1, thead => 1,
5202 }->{$_->[1]}) {
5203 !!!parse-error (type => 'not closed:'.$_->[1]);
5204 }
5205 }
5206
5207 $self->{insertion_mode} = AFTER_BODY_IM;
5208 !!!next-token;
5209 redo B;
5210 } else {
5211 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5212 ## Ignore the token
5213 !!!next-token;
5214 redo B;
5215 }
5216 } elsif ($token->{tag_name} eq 'html') {
5217 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
5218 ## ISSUE: There is an issue in the spec.
5219 if ($self->{open_elements}->[-1]->[1] ne 'body') {
5220 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
5221 }
5222 $self->{insertion_mode} = AFTER_BODY_IM;
5223 ## reprocess
5224 redo B;
5225 } else {
5226 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5227 ## Ignore the token
5228 !!!next-token;
5229 redo B;
5230 }
5231 } elsif ({
5232 address => 1, blockquote => 1, center => 1, dir => 1,
5233 div => 1, dl => 1, fieldset => 1, listing => 1,
5234 menu => 1, ol => 1, pre => 1, ul => 1,
5235 p => 1,
5236 dd => 1, dt => 1, li => 1,
5237 button => 1, marquee => 1, object => 1,
5238 }->{$token->{tag_name}}) {
5239 ## has an element in scope
5240 my $i;
5241 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5242 my $node = $self->{open_elements}->[$_];
5243 if ($node->[1] eq $token->{tag_name}) {
5244 ## generate implied end tags
5245 if ({
5246 dd => ($token->{tag_name} ne 'dd'),
5247 dt => ($token->{tag_name} ne 'dt'),
5248 li => ($token->{tag_name} ne 'li'),
5249 p => ($token->{tag_name} ne 'p'),
5250 td => 1, th => 1, tr => 1,
5251 tbody => 1, tfoot=> 1, thead => 1,
5252 }->{$self->{open_elements}->[-1]->[1]}) {
5253 !!!back-token;
5254 $token = {type => END_TAG_TOKEN,
5255 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5256 redo B;
5257 }
5258 $i = $_;
5259 last INSCOPE unless $token->{tag_name} eq 'p';
5260 } elsif ({
5261 table => 1, caption => 1, td => 1, th => 1,
5262 button => 1, marquee => 1, object => 1, html => 1,
5263 }->{$node->[1]}) {
5264 last INSCOPE;
5265 }
5266 } # INSCOPE
5267
5268 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5269 if (defined $i) {
5270 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5271 } else {
5272 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5273 }
5274 }
5275
5276 if (defined $i) {
5277 splice @{$self->{open_elements}}, $i;
5278 } elsif ($token->{tag_name} eq 'p') {
5279 ## As if <p>, then reprocess the current token
5280 my $el;
5281 !!!create-element ($el, 'p');
5282 $insert->($el);
5283 }
5284 $clear_up_to_marker->()
5285 if {
5286 button => 1, marquee => 1, object => 1,
5287 }->{$token->{tag_name}};
5288 !!!next-token;
5289 redo B;
5290 } elsif ($token->{tag_name} eq 'form') {
5291 ## has an element in scope
5292 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5293 my $node = $self->{open_elements}->[$_];
5294 if ($node->[1] eq $token->{tag_name}) {
5295 ## generate implied end tags
5296 if ({
5297 dd => 1, dt => 1, li => 1, p => 1,
5298 td => 1, th => 1, tr => 1,
5299 tbody => 1, tfoot=> 1, thead => 1,
5300 }->{$self->{open_elements}->[-1]->[1]}) {
5301 !!!back-token;
5302 $token = {type => END_TAG_TOKEN,
5303 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5304 redo B;
5305 }
5306 last INSCOPE;
5307 } elsif ({
5308 table => 1, caption => 1, td => 1, th => 1,
5309 button => 1, marquee => 1, object => 1, html => 1,
5310 }->{$node->[1]}) {
5311 last INSCOPE;
5312 }
5313 } # INSCOPE
5314
5315 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5316 pop @{$self->{open_elements}};
5317 } else {
5318 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5319 }
5320
5321 undef $self->{form_element};
5322 !!!next-token;
5323 redo B;
5324 } elsif ({
5325 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5326 }->{$token->{tag_name}}) {
5327 ## has an element in scope
5328 my $i;
5329 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5330 my $node = $self->{open_elements}->[$_];
5331 if ({
5332 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5333 }->{$node->[1]}) {
5334 ## generate implied end tags
5335 if ({
5336 dd => 1, dt => 1, li => 1, p => 1,
5337 td => 1, th => 1, tr => 1,
5338 tbody => 1, tfoot=> 1, thead => 1,
5339 }->{$self->{open_elements}->[-1]->[1]}) {
5340 !!!back-token;
5341 $token = {type => END_TAG_TOKEN,
5342 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5343 redo B;
5344 }
5345 $i = $_;
5346 last INSCOPE;
5347 } elsif ({
5348 table => 1, caption => 1, td => 1, th => 1,
5349 button => 1, marquee => 1, object => 1, html => 1,
5350 }->{$node->[1]}) {
5351 last INSCOPE;
5352 }
5353 } # INSCOPE
5354
5355 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5356 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5357 }
5358
5359 splice @{$self->{open_elements}}, $i if defined $i;
5360 !!!next-token;
5361 redo B;
5362 } elsif ({
5363 a => 1,
5364 b => 1, big => 1, em => 1, font => 1, i => 1,
5365 nobr => 1, s => 1, small => 1, strile => 1,
5366 strong => 1, tt => 1, u => 1,
5367 }->{$token->{tag_name}}) {
5368 $formatting_end_tag->($token->{tag_name});
5369 redo B;
5370 } elsif ($token->{tag_name} eq 'br') {
5371 !!!parse-error (type => 'unmatched end tag:br');
5372
5373 ## As if <br>
5374 $reconstruct_active_formatting_elements->($insert_to_current);
5375
5376 my $el;
5377 !!!create-element ($el, 'br');
5378 $insert->($el);
5379
5380 ## Ignore the token.
5381 !!!next-token;
5382 redo B;
5383 } elsif ({
5384 caption => 1, col => 1, colgroup => 1, frame => 1,
5385 frameset => 1, head => 1, option => 1, optgroup => 1,
5386 tbody => 1, td => 1, tfoot => 1, th => 1,
5387 thead => 1, tr => 1,
5388 area => 1, basefont => 1, bgsound => 1,
5389 embed => 1, hr => 1, iframe => 1, image => 1,
5390 img => 1, input => 1, isindex => 1, noembed => 1,
5391 noframes => 1, param => 1, select => 1, spacer => 1,
5392 table => 1, textarea => 1, wbr => 1,
5393 noscript => 0, ## TODO: if scripting is enabled
5394 }->{$token->{tag_name}}) {
5395 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5396 ## Ignore the token
5397 !!!next-token;
5398 redo B;
5399
5400 ## ISSUE: Issue on HTML5 new elements in spec
5401
5402 } else {
5403 ## Step 1
5404 my $node_i = -1;
5405 my $node = $self->{open_elements}->[$node_i];
5406
5407 ## Step 2
5408 S2: {
5409 if ($node->[1] eq $token->{tag_name}) {
5410 ## Step 1
5411 ## generate implied end tags
5412 if ({
5413 dd => 1, dt => 1, li => 1, p => 1,
5414 td => 1, th => 1, tr => 1,
5415 tbody => 1, tfoot => 1, thead => 1,
5416 }->{$self->{open_elements}->[-1]->[1]}) {
5417 !!!back-token;
5418 $token = {type => END_TAG_TOKEN,
5419 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5420 redo B;
5421 }
5422
5423 ## Step 2
5424 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5425 ## NOTE: <x><y></x>
5426 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5427 }
5428
5429 ## Step 3
5430 splice @{$self->{open_elements}}, $node_i;
5431
5432 !!!next-token;
5433 last S2;
5434 } else {
5435 ## Step 3
5436 if (not $formatting_category->{$node->[1]} and
5437 #not $phrasing_category->{$node->[1]} and
5438 ($special_category->{$node->[1]} or
5439 $scoping_category->{$node->[1]})) {
5440 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5441 ## Ignore the token
5442 !!!next-token;
5443 last S2;
5444 }
5445 }
5446
5447 ## Step 4
5448 $node_i--;
5449 $node = $self->{open_elements}->[$node_i];
5450
5451 ## Step 5;
5452 redo S2;
5453 } # S2
5454 redo B;
5455 }
5456 }
5457 redo B;
5458 } # B
5459
5460 ## NOTE: The "trailing end" phase in HTML5 is split into
5461 ## two insertion modes: "after html body" and "after html frameset".
5462 ## NOTE: States in the main stage are preserved while
5463 ## the parser stays in the trailing end phase. # MUST
5464
5465 ## Stop parsing # MUST
5466
5467 ## TODO: script stuffs
5468 } # _tree_construct_main
5469
## set_inner_html ($class, $node, $string, $onerror)
##
## Replaces the children of |$node| with the nodes obtained by parsing
## |$string| as HTML.
##
## Arguments (after the invocant):
##   $node    - Target node.  Only node types 9 and 1 are handled
##              (presumably DOM document and element nodes).
##   $string  - Markup to parse.  Only a reference to the caller's
##              scalar is kept; the string itself is not copied.
##   $onerror - Optional code reference called for parse errors.  For a
##              type-1 node it receives the error's key/value pairs
##              followed by |line| and |column|; when omitted, errors
##              are reported with |warn|.  For a type-9 node it is
##              handed to |parse_string| unchanged.
##
## Dies if |$node| has any other node type.
sub set_inner_html ($$$) {
  my ($class, $node) = (shift, shift);
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;

  ## Reject unsupported node types up front.
  unless ($nt == 9 or $nt == 1) {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }

  if ($nt == 9) {
    ## Step 1 (MUST)
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 (MUST): remove every existing child.  The list is copied
    ## before iterating so that removals do not disturb the loop.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Steps 3, 4, 5 (MUST): parse the string straight into the node.
    return $class->parse_string ($$s => $node, $onerror);
  }

  ## From here on |$node| has node type 1.
  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  my $xhtml_ns = 'http://www.w3.org/1999/xhtml';

  ## Step 1 (MUST): a scratch HTML document owned by a fresh parser.
  my $this_doc = $node->owner_document;
  my $doc = $this_doc->implementation->create_document;
  $doc->manakai_is_html (1);
  my $parser = $class->new;
  $parser->{document} = $doc;

  ## Step 9 (MUST): input stream over |$$s|.  |$pos| is the offset of
  ## the next unread character; |$line| and |$column| are reported
  ## together with parse errors.
  my $pos = 0;
  my $line = 1;
  my $column = 0;
  $parser->{set_next_input_character} = sub {
    my $self = shift;

    ## Slide the history: drop the last entry, put the current
    ## character in front.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## End of input is signalled by -1.
    if ($pos >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    my $c = ord substr $$s, $pos++, 1;
    $self->{next_input_character} = $c;
    $column++;

    if ($c == 0x000A or $c == 0x000D) { # LF or CR
      if ($c == 0x000D) {
        ## CR and CR LF are both delivered as a single LF (MUST).
        $pos++ if substr ($$s, $pos, 1) eq "\x0A";
        $self->{next_input_character} = 0x000A;
      }
      $line++;
      $column = 0;
    } elsif ($c > 0x10FFFF) {
      ## Out-of-range code point -> REPLACEMENT CHARACTER (MUST)
      $self->{next_input_character} = 0xFFFD;
    } elsif ($c == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      ## NULL -> REPLACEMENT CHARACTER (MUST)
      $self->{next_input_character} = 0xFFFD;
    }
  };
  $parser->{prev_input_character} = [-1, -1, -1];
  $parser->{next_input_character} = -1;

  ## Without a caller-supplied handler, parse errors become warnings.
  my $ponerror = $onerror;
  unless ($ponerror) {
    $ponerror = sub {
      my (%opt) = @_;
      warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
    };
  }
  $parser->{parse_error} = sub {
    $ponerror->(@_, line => $line, column => $column);
  };

  $parser->_initialize_tokenizer;
  $parser->_initialize_tree_constructor;

  ## Step 2: the initial content model is chosen by the local name of
  ## the context element; names not listed here get PCDATA.
  ## ISSUE: What is "the name of the element"? local name?
  my $node_ln = $node->manakai_local_name;
  my %model_for = (
    title => RCDATA_CONTENT_MODEL,
    textarea => RCDATA_CONTENT_MODEL,
    style => CDATA_CONTENT_MODEL,
    script => CDATA_CONTENT_MODEL,
    xmp => CDATA_CONTENT_MODEL,
    iframe => CDATA_CONTENT_MODEL,
    noembed => CDATA_CONTENT_MODEL,
    noframes => CDATA_CONTENT_MODEL,
    noscript => CDATA_CONTENT_MODEL,
    plaintext => PLAINTEXT_CONTENT_MODEL,
  );
  my $model = $model_for{$node_ln};
  $parser->{content_model} = defined $model ? $model : PCDATA_CONTENT_MODEL;

  $parser->{inner_html_node} = [$node, $node_ln];

  ## Step 4: create the root |html| element ...
  my $root = $doc->create_element_ns ($xhtml_ns, [undef, 'html']);

  ## Step 5 (MUST): ... attach it to the scratch document ...
  $doc->append_child ($root);

  ## Step 6 (MUST): ... and make it the only open element.
  push @{$parser->{open_elements}}, [$root, 'html'];

  $parser->{head_element} = undef;

  ## Step 7 (MUST)
  $parser->_reset_insertion_mode;

  ## Step 8 (MUST): walk from the context node up through its parents;
  ## the first XHTML-namespace |form| element found becomes the
  ## parser's form element.
  for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next unless $anc->node_type == 1;
    my $nsuri = $anc->namespace_uri;
    next unless defined $nsuri and $nsuri eq $xhtml_ns;
    next unless $anc->manakai_local_name eq 'form';
    $parser->{form_element} = $anc;
    last;
  }

  ## Steps 3 and 10 (MUST): fetch the first token and build the tree.
  ## The macro below refers to |$self|, hence the local alias.
  {
    my $self = $parser;
    !!!next-token;
  }
  $parser->_tree_construction_main;

  ## Step 11 (MUST): discard the context node's current children.
  ## ISSUE: mutation events? read-only?
  my @old_children = @{$node->child_nodes};
  $node->remove_child ($_) for @old_children;

  ## Step 12 (MUST): move everything parsed under the temporary root
  ## into the context node, adopting each node into its document first.
  ## ISSUE: mutation events?
  my @new_children = @{$root->child_nodes};
  for my $child (@new_children) {
    $this_doc->adopt_node ($child);
    $node->append_child ($child);
  }

  $parser->_terminate_tree_constructor;
} # set_inner_html
5627
5628 } # tree construction stage
5629
## Whatpm::HTML::RestartParser - a class whose only content here is its
## inheritance from |Error|.  NOTE(review): presumably thrown to make
## the parser start over; confirm against the code that raises it.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';
5632
5633 1;
5634 # $Date: 2008/03/02 03:39:41 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24