/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.58 - (show annotations) (download) (as text)
Tue Sep 4 11:19:07 2007 UTC (17 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.57: +42 -41 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	4 Sep 2007 11:19:01 -0000
2007-09-04  Wakaba  <wakaba@suika.fam.cx>

	* tree-test-1.dat: New tests are added.

++ whatpm/Whatpm/ChangeLog	4 Sep 2007 11:18:29 -0000
2007-09-04  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src: Some error types were wrong.

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.57 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## ISSUE:
6 ## var doc = implementation.createDocument (null, null, null);
7 ## doc.write ('');
8 ## alert (doc.compatMode);
9
10 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
11 ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
12 ## is not yet clear.
13 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
14 ## "{U+FEFF}..." in GB18030?
15
## Start tags that may be written with a trailing "/" before ">"
## (e.g. <br/>).  The tokenizer reports a "nestc" parse error for a
## slash on any other tag.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
29
## Replacement code points for the code points 0x80..0x9F, keyed by
## the original code point.
## NOTE(review): the values look like the Windows-1252 layout with
## U+FFFD for the unassigned positions, presumably applied to numeric
## character references -- the consumer is outside this chunk; confirm.
my $c1_entity_char = do {
  my @replacement = (
    ## 0x80 .. 0x87
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    ## 0x88 .. 0x8F
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
    ## 0x90 .. 0x97
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    ## 0x98 .. 0x9F
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178,
  );
  my %table;
  @table{0x80 .. 0x9F} = @replacement;
  \%table;
}; # $c1_entity_char
64
## Element names that belong to the "special" category.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  )
};
## Element names that belong to the "scoping" category.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Element names that belong to the "formatting" category.
## The list follows HTML5's formatting elements: a, b, big, em, font,
## i, nobr, s, small, strike, strong, tt and u.  (An earlier revision
## misspelled |strike| as |strile|, so <strike> was never treated as a
## formatting element.)
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
85 # $phrasing_category: all other elements
86
## Parse an HTML string into a document.
##
## Called as a method; a fresh parser object is created with |new| on
## the invocant.  Arguments after the invocant:
##   $_[0]  the input string (read through a reference, not copied)
##   $_[1]  the document object, stored as |$self->{document}|
##   $_[2]  optional error callback; it is called with the error's
##          key/value pairs followed by |line| and |column|.  When
##          omitted, a one-line message is emitted with |warn|.
## Returns the document object that was passed in.
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $input = \$_[0];
  $self->{document} = $_[1];

  ## NOTE: |set_inner_html| copies most of this method's code

  ## Reader position shared by the closures below.
  my ($offset, $line, $column) = (0, 1, 0);

  ## LF and CR (with or without a following LF) both start a new line.
  my $start_new_line = sub {
    $line++;
    $column = 0;
  };

  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Remember the character just left behind, newest first.  The
    ## history is seeded with three entries below, and one is dropped
    ## for each one added.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## -1 stands for the end of the input.
    if ($offset >= length $$input) {
      $self->{next_input_character} = -1;
      return;
    }

    my $char = $self->{next_input_character}
        = ord substr $$input, $offset++, 1;
    $column++;

    if ($char == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $offset++ if substr ($$input, $offset, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $start_new_line->();
    } elsif ($char == 0x000A) { # LF
      $start_new_line->();
    } elsif ($char == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($char > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Every reported error is tagged with the current reader position.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
140
## Create a parser object with inert default callbacks.
##
## The default |set_next_input_character| callback always reports the
## end of input (-1) and the default |parse_error| callback ignores
## the error.  |parse_string| replaces both of them.
##
## Returns the new object, blessed into the invocant class.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The callback is stored inside $self, so closing over $self itself
  ## would form a reference cycle and the object would never be freed
  ## unless the callback were replaced.  Close over a weakened copy
  ## instead.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_input_character} = sub {
    $weak_self->{next_input_character} = -1 if $weak_self;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
152
## Content model feature flags (one bit each).
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the flags above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
161
## Tokenizer states, numbered consecutively from 0.

## Character data and tags
sub DATA_STATE                          () {  0 }
sub ENTITY_DATA_STATE                   () {  1 }
sub TAG_OPEN_STATE                      () {  2 }
sub CLOSE_TAG_OPEN_STATE                () {  3 }
sub TAG_NAME_STATE                      () {  4 }

## Attributes
sub BEFORE_ATTRIBUTE_NAME_STATE         () {  5 }
sub ATTRIBUTE_NAME_STATE                () {  6 }
sub AFTER_ATTRIBUTE_NAME_STATE          () {  7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE        () {  8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () {  9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE      () { 11 }
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE     () { 12 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE       () { 13 }
sub COMMENT_START_STATE                 () { 14 }
sub COMMENT_START_DASH_STATE            () { 15 }
sub COMMENT_STATE                       () { 16 }
sub COMMENT_END_STATE                   () { 17 }
sub COMMENT_END_DASH_STATE              () { 18 }
sub BOGUS_COMMENT_STATE                 () { 19 }

## DOCTYPE
sub DOCTYPE_STATE                                 () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE                     () { 21 }
sub DOCTYPE_NAME_STATE                            () { 22 }
sub AFTER_DOCTYPE_NAME_STATE                      () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         () { 31 }
sub BOGUS_DOCTYPE_STATE                           () { 32 }
195
## Token types, stored in a token's |type| member.
sub DOCTYPE_TOKEN     () { 1 }
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }
202
## Insertion mode group flags (one bit each).  The two lowest bits are
## left free to tell apart the individual modes within a group.
sub AFTER_HTML_IMS () { 1 << 2 }
sub HEAD_IMS       () { 1 << 3 }
sub BODY_IMS       () { 1 << 4 }
sub BODY_TABLE_IMS () { 1 << 5 }
sub TABLE_IMS      () { 1 << 6 }
sub ROW_IMS        () { 1 << 7 }
sub BODY_AFTER_IMS () { 1 << 8 }
sub FRAME_IMS      () { 1 << 9 }

## Insertion modes: a combination of group flags, plus a number in the
## two lowest bits where a group holds more than one mode.
sub AFTER_HTML_BODY_IM     () { BODY_AFTER_IMS | AFTER_HTML_IMS }
sub AFTER_HTML_FRAMESET_IM () { FRAME_IMS | AFTER_HTML_IMS }

sub IN_HEAD_IM          () { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 1 }
sub AFTER_HEAD_IM       () { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM      () { HEAD_IMS | 3 }

sub IN_BODY_IM    () { BODY_IMS }
sub IN_CELL_IM    () { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 2 }

sub IN_ROW_IM        () { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM      () { TABLE_IMS }

sub AFTER_BODY_IM () { BODY_AFTER_IMS }

sub IN_FRAMESET_IM    () { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM () { FRAME_IMS | 2 }

## Modes that do not belong to any group.
sub IN_SELECT_IM       () { 1 }
sub IN_COLUMN_GROUP_IM () { 2 }
229
230 ## Implementations MUST act as if they used the state machine in the spec
231
## Put the tokenizer into its initial condition and read the first
## input character.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## The tokenizer MUST start in the data state, with the PCDATA
  ## content model.
  $self->{state} = DATA_STATE;
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Nothing is under construction and nothing has been emitted yet.
  ## |current_token| will hold a start tag, end tag, comment, or
  ## DOCTYPE token.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);

  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;
  ## Tokens waiting to be returned by |_get_next_token|.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
246
247 ## A token has:
248 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
249 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
250 ## ->{name} (DOCTYPE_TOKEN)
251 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
252 ## ->{public_identifier} (DOCTYPE_TOKEN)
253 ## ->{system_identifier} (DOCTYPE_TOKEN)
254 ## ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
255 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
256 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
257
258 ## Emitted token MUST immediately be handled by the tree construction stage.
259
260 ## Before each step, UA MAY check to see if either one of the scripts in
261 ## "list of scripts that will execute as soon as possible" or the first
262 ## script in the "list of scripts that will execute asynchronously",
263 ## has completed loading. If one has, then it MUST be executed
264 ## and removed from the list.
265
266 sub _get_next_token ($) {
267 my $self = shift;
268 if (@{$self->{token}}) {
269 return shift @{$self->{token}};
270 }
271
272 A: {
273 if ($self->{state} == DATA_STATE) {
274 if ($self->{next_input_character} == 0x0026) { # &
275 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
276 $self->{state} = ENTITY_DATA_STATE;
277 !!!next-input-character;
278 redo A;
279 } else {
280 #
281 }
282 } elsif ($self->{next_input_character} == 0x002D) { # -
283 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
284 unless ($self->{escape}) {
285 if ($self->{prev_input_character}->[0] == 0x002D and # -
286 $self->{prev_input_character}->[1] == 0x0021 and # !
287 $self->{prev_input_character}->[2] == 0x003C) { # <
288 $self->{escape} = 1;
289 }
290 }
291 }
292
293 #
294 } elsif ($self->{next_input_character} == 0x003C) { # <
295 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
296 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
297 not $self->{escape})) {
298 $self->{state} = TAG_OPEN_STATE;
299 !!!next-input-character;
300 redo A;
301 } else {
302 #
303 }
304 } elsif ($self->{next_input_character} == 0x003E) { # >
305 if ($self->{escape} and
306 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
307 if ($self->{prev_input_character}->[0] == 0x002D and # -
308 $self->{prev_input_character}->[1] == 0x002D) { # -
309 delete $self->{escape};
310 }
311 }
312
313 #
314 } elsif ($self->{next_input_character} == -1) {
315 !!!emit ({type => END_OF_FILE_TOKEN});
316 last A; ## TODO: ok?
317 }
318 # Anything else
319 my $token = {type => CHARACTER_TOKEN,
320 data => chr $self->{next_input_character}};
321 ## Stay in the data state
322 !!!next-input-character;
323
324 !!!emit ($token);
325
326 redo A;
327 } elsif ($self->{state} == ENTITY_DATA_STATE) {
328 ## (cannot happen in CDATA state)
329
330 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
331
332 $self->{state} = DATA_STATE;
333 # next-input-character is already done
334
335 unless (defined $token) {
336 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
337 } else {
338 !!!emit ($token);
339 }
340
341 redo A;
342 } elsif ($self->{state} == TAG_OPEN_STATE) {
343 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
344 if ($self->{next_input_character} == 0x002F) { # /
345 !!!next-input-character;
346 $self->{state} = CLOSE_TAG_OPEN_STATE;
347 redo A;
348 } else {
349 ## reconsume
350 $self->{state} = DATA_STATE;
351
352 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
353
354 redo A;
355 }
356 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
357 if ($self->{next_input_character} == 0x0021) { # !
358 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
359 !!!next-input-character;
360 redo A;
361 } elsif ($self->{next_input_character} == 0x002F) { # /
362 $self->{state} = CLOSE_TAG_OPEN_STATE;
363 !!!next-input-character;
364 redo A;
365 } elsif (0x0041 <= $self->{next_input_character} and
366 $self->{next_input_character} <= 0x005A) { # A..Z
367 $self->{current_token}
368 = {type => START_TAG_TOKEN,
369 tag_name => chr ($self->{next_input_character} + 0x0020)};
370 $self->{state} = TAG_NAME_STATE;
371 !!!next-input-character;
372 redo A;
373 } elsif (0x0061 <= $self->{next_input_character} and
374 $self->{next_input_character} <= 0x007A) { # a..z
375 $self->{current_token} = {type => START_TAG_TOKEN,
376 tag_name => chr ($self->{next_input_character})};
377 $self->{state} = TAG_NAME_STATE;
378 !!!next-input-character;
379 redo A;
380 } elsif ($self->{next_input_character} == 0x003E) { # >
381 !!!parse-error (type => 'empty start tag');
382 $self->{state} = DATA_STATE;
383 !!!next-input-character;
384
385 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
386
387 redo A;
388 } elsif ($self->{next_input_character} == 0x003F) { # ?
389 !!!parse-error (type => 'pio');
390 $self->{state} = BOGUS_COMMENT_STATE;
391 ## $self->{next_input_character} is intentionally left as is
392 redo A;
393 } else {
394 !!!parse-error (type => 'bare stago');
395 $self->{state} = DATA_STATE;
396 ## reconsume
397
398 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
399
400 redo A;
401 }
402 } else {
403 die "$0: $self->{content_model} in tag open";
404 }
405 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
406 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
407 if (defined $self->{last_emitted_start_tag_name}) {
408 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
409 my @next_char;
410 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
411 push @next_char, $self->{next_input_character};
412 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
413 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
414 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
415 !!!next-input-character;
416 next TAGNAME;
417 } else {
418 $self->{next_input_character} = shift @next_char; # reconsume
419 !!!back-next-input-character (@next_char);
420 $self->{state} = DATA_STATE;
421
422 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
423
424 redo A;
425 }
426 }
427 push @next_char, $self->{next_input_character};
428
429 unless ($self->{next_input_character} == 0x0009 or # HT
430 $self->{next_input_character} == 0x000A or # LF
431 $self->{next_input_character} == 0x000B or # VT
432 $self->{next_input_character} == 0x000C or # FF
433 $self->{next_input_character} == 0x0020 or # SP
434 $self->{next_input_character} == 0x003E or # >
435 $self->{next_input_character} == 0x002F or # /
436 $self->{next_input_character} == -1) {
437 $self->{next_input_character} = shift @next_char; # reconsume
438 !!!back-next-input-character (@next_char);
439 $self->{state} = DATA_STATE;
440 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
441 redo A;
442 } else {
443 $self->{next_input_character} = shift @next_char;
444 !!!back-next-input-character (@next_char);
445 # and consume...
446 }
447 } else {
448 ## No start tag token has ever been emitted
449 # next-input-character is already done
450 $self->{state} = DATA_STATE;
451 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
452 redo A;
453 }
454 }
455
456 if (0x0041 <= $self->{next_input_character} and
457 $self->{next_input_character} <= 0x005A) { # A..Z
458 $self->{current_token} = {type => END_TAG_TOKEN,
459 tag_name => chr ($self->{next_input_character} + 0x0020)};
460 $self->{state} = TAG_NAME_STATE;
461 !!!next-input-character;
462 redo A;
463 } elsif (0x0061 <= $self->{next_input_character} and
464 $self->{next_input_character} <= 0x007A) { # a..z
465 $self->{current_token} = {type => END_TAG_TOKEN,
466 tag_name => chr ($self->{next_input_character})};
467 $self->{state} = TAG_NAME_STATE;
468 !!!next-input-character;
469 redo A;
470 } elsif ($self->{next_input_character} == 0x003E) { # >
471 !!!parse-error (type => 'empty end tag');
472 $self->{state} = DATA_STATE;
473 !!!next-input-character;
474 redo A;
475 } elsif ($self->{next_input_character} == -1) {
476 !!!parse-error (type => 'bare etago');
477 $self->{state} = DATA_STATE;
478 # reconsume
479
480 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
481
482 redo A;
483 } else {
484 !!!parse-error (type => 'bogus end tag');
485 $self->{state} = BOGUS_COMMENT_STATE;
486 ## $self->{next_input_character} is intentionally left as is
487 redo A;
488 }
489 } elsif ($self->{state} == TAG_NAME_STATE) {
490 if ($self->{next_input_character} == 0x0009 or # HT
491 $self->{next_input_character} == 0x000A or # LF
492 $self->{next_input_character} == 0x000B or # VT
493 $self->{next_input_character} == 0x000C or # FF
494 $self->{next_input_character} == 0x0020) { # SP
495 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
496 !!!next-input-character;
497 redo A;
498 } elsif ($self->{next_input_character} == 0x003E) { # >
499 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
500 $self->{current_token}->{first_start_tag}
501 = not defined $self->{last_emitted_start_tag_name};
502 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
503 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
504 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
505 if ($self->{current_token}->{attributes}) {
506 !!!parse-error (type => 'end tag attribute');
507 }
508 } else {
509 die "$0: $self->{current_token}->{type}: Unknown token type";
510 }
511 $self->{state} = DATA_STATE;
512 !!!next-input-character;
513
514 !!!emit ($self->{current_token}); # start tag or end tag
515
516 redo A;
517 } elsif (0x0041 <= $self->{next_input_character} and
518 $self->{next_input_character} <= 0x005A) { # A..Z
519 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
520 # start tag or end tag
521 ## Stay in this state
522 !!!next-input-character;
523 redo A;
524 } elsif ($self->{next_input_character} == -1) {
525 !!!parse-error (type => 'unclosed tag');
526 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
527 $self->{current_token}->{first_start_tag}
528 = not defined $self->{last_emitted_start_tag_name};
529 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
530 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
531 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
532 if ($self->{current_token}->{attributes}) {
533 !!!parse-error (type => 'end tag attribute');
534 }
535 } else {
536 die "$0: $self->{current_token}->{type}: Unknown token type";
537 }
538 $self->{state} = DATA_STATE;
539 # reconsume
540
541 !!!emit ($self->{current_token}); # start tag or end tag
542
543 redo A;
544 } elsif ($self->{next_input_character} == 0x002F) { # /
545 !!!next-input-character;
546 if ($self->{next_input_character} == 0x003E and # >
547 $self->{current_token}->{type} == START_TAG_TOKEN and
548 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
549 # permitted slash
550 #
551 } else {
552 !!!parse-error (type => 'nestc');
553 }
554 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
555 # next-input-character is already done
556 redo A;
557 } else {
558 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
559 # start tag or end tag
560 ## Stay in the state
561 !!!next-input-character;
562 redo A;
563 }
564 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
565 if ($self->{next_input_character} == 0x0009 or # HT
566 $self->{next_input_character} == 0x000A or # LF
567 $self->{next_input_character} == 0x000B or # VT
568 $self->{next_input_character} == 0x000C or # FF
569 $self->{next_input_character} == 0x0020) { # SP
570 ## Stay in the state
571 !!!next-input-character;
572 redo A;
573 } elsif ($self->{next_input_character} == 0x003E) { # >
574 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
575 $self->{current_token}->{first_start_tag}
576 = not defined $self->{last_emitted_start_tag_name};
577 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
578 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
579 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
580 if ($self->{current_token}->{attributes}) {
581 !!!parse-error (type => 'end tag attribute');
582 }
583 } else {
584 die "$0: $self->{current_token}->{type}: Unknown token type";
585 }
586 $self->{state} = DATA_STATE;
587 !!!next-input-character;
588
589 !!!emit ($self->{current_token}); # start tag or end tag
590
591 redo A;
592 } elsif (0x0041 <= $self->{next_input_character} and
593 $self->{next_input_character} <= 0x005A) { # A..Z
594 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
595 value => ''};
596 $self->{state} = ATTRIBUTE_NAME_STATE;
597 !!!next-input-character;
598 redo A;
599 } elsif ($self->{next_input_character} == 0x002F) { # /
600 !!!next-input-character;
601 if ($self->{next_input_character} == 0x003E and # >
602 $self->{current_token}->{type} == START_TAG_TOKEN and
603 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
604 # permitted slash
605 #
606 } else {
607 !!!parse-error (type => 'nestc');
608 }
609 ## Stay in the state
610 # next-input-character is already done
611 redo A;
612 } elsif ($self->{next_input_character} == -1) {
613 !!!parse-error (type => 'unclosed tag');
614 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
615 $self->{current_token}->{first_start_tag}
616 = not defined $self->{last_emitted_start_tag_name};
617 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
618 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
619 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
620 if ($self->{current_token}->{attributes}) {
621 !!!parse-error (type => 'end tag attribute');
622 }
623 } else {
624 die "$0: $self->{current_token}->{type}: Unknown token type";
625 }
626 $self->{state} = DATA_STATE;
627 # reconsume
628
629 !!!emit ($self->{current_token}); # start tag or end tag
630
631 redo A;
632 } else {
633 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
634 value => ''};
635 $self->{state} = ATTRIBUTE_NAME_STATE;
636 !!!next-input-character;
637 redo A;
638 }
639 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
640 my $before_leave = sub {
641 if (exists $self->{current_token}->{attributes} # start tag or end tag
642 ->{$self->{current_attribute}->{name}}) { # MUST
643 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
644 ## Discard $self->{current_attribute} # MUST
645 } else {
646 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
647 = $self->{current_attribute};
648 }
649 }; # $before_leave
650
651 if ($self->{next_input_character} == 0x0009 or # HT
652 $self->{next_input_character} == 0x000A or # LF
653 $self->{next_input_character} == 0x000B or # VT
654 $self->{next_input_character} == 0x000C or # FF
655 $self->{next_input_character} == 0x0020) { # SP
656 $before_leave->();
657 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
658 !!!next-input-character;
659 redo A;
660 } elsif ($self->{next_input_character} == 0x003D) { # =
661 $before_leave->();
662 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
663 !!!next-input-character;
664 redo A;
665 } elsif ($self->{next_input_character} == 0x003E) { # >
666 $before_leave->();
667 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
668 $self->{current_token}->{first_start_tag}
669 = not defined $self->{last_emitted_start_tag_name};
670 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
671 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
672 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
673 if ($self->{current_token}->{attributes}) {
674 !!!parse-error (type => 'end tag attribute');
675 }
676 } else {
677 die "$0: $self->{current_token}->{type}: Unknown token type";
678 }
679 $self->{state} = DATA_STATE;
680 !!!next-input-character;
681
682 !!!emit ($self->{current_token}); # start tag or end tag
683
684 redo A;
685 } elsif (0x0041 <= $self->{next_input_character} and
686 $self->{next_input_character} <= 0x005A) { # A..Z
687 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
688 ## Stay in the state
689 !!!next-input-character;
690 redo A;
691 } elsif ($self->{next_input_character} == 0x002F) { # /
692 $before_leave->();
693 !!!next-input-character;
694 if ($self->{next_input_character} == 0x003E and # >
695 $self->{current_token}->{type} == START_TAG_TOKEN and
696 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
697 # permitted slash
698 #
699 } else {
700 !!!parse-error (type => 'nestc');
701 }
702 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
703 # next-input-character is already done
704 redo A;
705 } elsif ($self->{next_input_character} == -1) {
706 !!!parse-error (type => 'unclosed tag');
707 $before_leave->();
708 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
709 $self->{current_token}->{first_start_tag}
710 = not defined $self->{last_emitted_start_tag_name};
711 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
712 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
713 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
714 if ($self->{current_token}->{attributes}) {
715 !!!parse-error (type => 'end tag attribute');
716 }
717 } else {
718 die "$0: $self->{current_token}->{type}: Unknown token type";
719 }
720 $self->{state} = DATA_STATE;
721 # reconsume
722
723 !!!emit ($self->{current_token}); # start tag or end tag
724
725 redo A;
726 } else {
727 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
728 ## Stay in the state
729 !!!next-input-character;
730 redo A;
731 }
732 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
733 if ($self->{next_input_character} == 0x0009 or # HT
734 $self->{next_input_character} == 0x000A or # LF
735 $self->{next_input_character} == 0x000B or # VT
736 $self->{next_input_character} == 0x000C or # FF
737 $self->{next_input_character} == 0x0020) { # SP
738 ## Stay in the state
739 !!!next-input-character;
740 redo A;
741 } elsif ($self->{next_input_character} == 0x003D) { # =
742 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
743 !!!next-input-character;
744 redo A;
745 } elsif ($self->{next_input_character} == 0x003E) { # >
746 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
747 $self->{current_token}->{first_start_tag}
748 = not defined $self->{last_emitted_start_tag_name};
749 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
750 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
751 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
752 if ($self->{current_token}->{attributes}) {
753 !!!parse-error (type => 'end tag attribute');
754 }
755 } else {
756 die "$0: $self->{current_token}->{type}: Unknown token type";
757 }
758 $self->{state} = DATA_STATE;
759 !!!next-input-character;
760
761 !!!emit ($self->{current_token}); # start tag or end tag
762
763 redo A;
764 } elsif (0x0041 <= $self->{next_input_character} and
765 $self->{next_input_character} <= 0x005A) { # A..Z
766 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
767 value => ''};
768 $self->{state} = ATTRIBUTE_NAME_STATE;
769 !!!next-input-character;
770 redo A;
771 } elsif ($self->{next_input_character} == 0x002F) { # /
772 !!!next-input-character;
773 if ($self->{next_input_character} == 0x003E and # >
774 $self->{current_token}->{type} == START_TAG_TOKEN and
775 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
776 # permitted slash
777 #
778 } else {
779 !!!parse-error (type => 'nestc');
780 ## TODO: Different error type for <aa / bb> than <aa/>
781 }
782 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
783 # next-input-character is already done
784 redo A;
785 } elsif ($self->{next_input_character} == -1) {
786 !!!parse-error (type => 'unclosed tag');
787 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
788 $self->{current_token}->{first_start_tag}
789 = not defined $self->{last_emitted_start_tag_name};
790 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
791 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
792 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
793 if ($self->{current_token}->{attributes}) {
794 !!!parse-error (type => 'end tag attribute');
795 }
796 } else {
797 die "$0: $self->{current_token}->{type}: Unknown token type";
798 }
799 $self->{state} = DATA_STATE;
800 # reconsume
801
802 !!!emit ($self->{current_token}); # start tag or end tag
803
804 redo A;
805 } else {
806 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
807 value => ''};
808 $self->{state} = ATTRIBUTE_NAME_STATE;
809 !!!next-input-character;
810 redo A;
811 }
812 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
813 if ($self->{next_input_character} == 0x0009 or # HT
814 $self->{next_input_character} == 0x000A or # LF
815 $self->{next_input_character} == 0x000B or # VT
816 $self->{next_input_character} == 0x000C or # FF
817 $self->{next_input_character} == 0x0020) { # SP
818 ## Stay in the state
819 !!!next-input-character;
820 redo A;
821 } elsif ($self->{next_input_character} == 0x0022) { # "
822 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
823 !!!next-input-character;
824 redo A;
825 } elsif ($self->{next_input_character} == 0x0026) { # &
826 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
827 ## reconsume
828 redo A;
829 } elsif ($self->{next_input_character} == 0x0027) { # '
830 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
831 !!!next-input-character;
832 redo A;
833 } elsif ($self->{next_input_character} == 0x003E) { # >
834 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
835 $self->{current_token}->{first_start_tag}
836 = not defined $self->{last_emitted_start_tag_name};
837 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
838 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
839 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
840 if ($self->{current_token}->{attributes}) {
841 !!!parse-error (type => 'end tag attribute');
842 }
843 } else {
844 die "$0: $self->{current_token}->{type}: Unknown token type";
845 }
846 $self->{state} = DATA_STATE;
847 !!!next-input-character;
848
849 !!!emit ($self->{current_token}); # start tag or end tag
850
851 redo A;
852 } elsif ($self->{next_input_character} == -1) {
853 !!!parse-error (type => 'unclosed tag');
854 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
855 $self->{current_token}->{first_start_tag}
856 = not defined $self->{last_emitted_start_tag_name};
857 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
858 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
859 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
860 if ($self->{current_token}->{attributes}) {
861 !!!parse-error (type => 'end tag attribute');
862 }
863 } else {
864 die "$0: $self->{current_token}->{type}: Unknown token type";
865 }
866 $self->{state} = DATA_STATE;
867 ## reconsume
868
869 !!!emit ($self->{current_token}); # start tag or end tag
870
871 redo A;
872 } else {
873 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
874 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
875 !!!next-input-character;
876 redo A;
877 }
878 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
879 if ($self->{next_input_character} == 0x0022) { # "
880 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
881 !!!next-input-character;
882 redo A;
883 } elsif ($self->{next_input_character} == 0x0026) { # &
884 $self->{last_attribute_value_state} = $self->{state};
885 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
886 !!!next-input-character;
887 redo A;
888 } elsif ($self->{next_input_character} == -1) {
889 !!!parse-error (type => 'unclosed attribute value');
890 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
891 $self->{current_token}->{first_start_tag}
892 = not defined $self->{last_emitted_start_tag_name};
893 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
894 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
895 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
896 if ($self->{current_token}->{attributes}) {
897 !!!parse-error (type => 'end tag attribute');
898 }
899 } else {
900 die "$0: $self->{current_token}->{type}: Unknown token type";
901 }
902 $self->{state} = DATA_STATE;
903 ## reconsume
904
905 !!!emit ($self->{current_token}); # start tag or end tag
906
907 redo A;
908 } else {
909 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
910 ## Stay in the state
911 !!!next-input-character;
912 redo A;
913 }
914 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
915 if ($self->{next_input_character} == 0x0027) { # '
916 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
917 !!!next-input-character;
918 redo A;
919 } elsif ($self->{next_input_character} == 0x0026) { # &
920 $self->{last_attribute_value_state} = $self->{state};
921 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
922 !!!next-input-character;
923 redo A;
924 } elsif ($self->{next_input_character} == -1) {
925 !!!parse-error (type => 'unclosed attribute value');
926 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
927 $self->{current_token}->{first_start_tag}
928 = not defined $self->{last_emitted_start_tag_name};
929 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
930 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
931 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
932 if ($self->{current_token}->{attributes}) {
933 !!!parse-error (type => 'end tag attribute');
934 }
935 } else {
936 die "$0: $self->{current_token}->{type}: Unknown token type";
937 }
938 $self->{state} = DATA_STATE;
939 ## reconsume
940
941 !!!emit ($self->{current_token}); # start tag or end tag
942
943 redo A;
944 } else {
945 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
946 ## Stay in the state
947 !!!next-input-character;
948 redo A;
949 }
950 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
951 if ($self->{next_input_character} == 0x0009 or # HT
952 $self->{next_input_character} == 0x000A or # LF
953 $self->{next_input_character} == 0x000B or # VT
954 $self->{next_input_character} == 0x000C or # FF
955 $self->{next_input_character} == 0x0020) { # SP
956 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
957 !!!next-input-character;
958 redo A;
959 } elsif ($self->{next_input_character} == 0x0026) { # &
960 $self->{last_attribute_value_state} = $self->{state};
961 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
962 !!!next-input-character;
963 redo A;
964 } elsif ($self->{next_input_character} == 0x003E) { # >
965 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
966 $self->{current_token}->{first_start_tag}
967 = not defined $self->{last_emitted_start_tag_name};
968 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
969 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
970 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
971 if ($self->{current_token}->{attributes}) {
972 !!!parse-error (type => 'end tag attribute');
973 }
974 } else {
975 die "$0: $self->{current_token}->{type}: Unknown token type";
976 }
977 $self->{state} = DATA_STATE;
978 !!!next-input-character;
979
980 !!!emit ($self->{current_token}); # start tag or end tag
981
982 redo A;
983 } elsif ($self->{next_input_character} == -1) {
984 !!!parse-error (type => 'unclosed tag');
985 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
986 $self->{current_token}->{first_start_tag}
987 = not defined $self->{last_emitted_start_tag_name};
988 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
989 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
990 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
991 if ($self->{current_token}->{attributes}) {
992 !!!parse-error (type => 'end tag attribute');
993 }
994 } else {
995 die "$0: $self->{current_token}->{type}: Unknown token type";
996 }
997 $self->{state} = DATA_STATE;
998 ## reconsume
999
1000 !!!emit ($self->{current_token}); # start tag or end tag
1001
1002 redo A;
1003 } else {
1004 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1005 ## Stay in the state
1006 !!!next-input-character;
1007 redo A;
1008 }
1009 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1010 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
1011
1012 unless (defined $token) {
1013 $self->{current_attribute}->{value} .= '&';
1014 } else {
1015 $self->{current_attribute}->{value} .= $token->{data};
1016 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1017 }
1018
1019 $self->{state} = $self->{last_attribute_value_state};
1020 # next-input-character is already done
1021 redo A;
1022 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1023 ## (only happen if PCDATA state)
1024
1025 my $token = {type => COMMENT_TOKEN, data => ''};
1026
1027 BC: {
1028 if ($self->{next_input_character} == 0x003E) { # >
1029 $self->{state} = DATA_STATE;
1030 !!!next-input-character;
1031
1032 !!!emit ($token);
1033
1034 redo A;
1035 } elsif ($self->{next_input_character} == -1) {
1036 $self->{state} = DATA_STATE;
1037 ## reconsume
1038
1039 !!!emit ($token);
1040
1041 redo A;
1042 } else {
1043 $token->{data} .= chr ($self->{next_input_character});
1044 !!!next-input-character;
1045 redo BC;
1046 }
1047 } # BC
1048 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1049 ## (only happen if PCDATA state)
1050
1051 my @next_char;
1052 push @next_char, $self->{next_input_character};
1053
1054 if ($self->{next_input_character} == 0x002D) { # -
1055 !!!next-input-character;
1056 push @next_char, $self->{next_input_character};
1057 if ($self->{next_input_character} == 0x002D) { # -
1058 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1059 $self->{state} = COMMENT_START_STATE;
1060 !!!next-input-character;
1061 redo A;
1062 }
1063 } elsif ($self->{next_input_character} == 0x0044 or # D
1064 $self->{next_input_character} == 0x0064) { # d
1065 !!!next-input-character;
1066 push @next_char, $self->{next_input_character};
1067 if ($self->{next_input_character} == 0x004F or # O
1068 $self->{next_input_character} == 0x006F) { # o
1069 !!!next-input-character;
1070 push @next_char, $self->{next_input_character};
1071 if ($self->{next_input_character} == 0x0043 or # C
1072 $self->{next_input_character} == 0x0063) { # c
1073 !!!next-input-character;
1074 push @next_char, $self->{next_input_character};
1075 if ($self->{next_input_character} == 0x0054 or # T
1076 $self->{next_input_character} == 0x0074) { # t
1077 !!!next-input-character;
1078 push @next_char, $self->{next_input_character};
1079 if ($self->{next_input_character} == 0x0059 or # Y
1080 $self->{next_input_character} == 0x0079) { # y
1081 !!!next-input-character;
1082 push @next_char, $self->{next_input_character};
1083 if ($self->{next_input_character} == 0x0050 or # P
1084 $self->{next_input_character} == 0x0070) { # p
1085 !!!next-input-character;
1086 push @next_char, $self->{next_input_character};
1087 if ($self->{next_input_character} == 0x0045 or # E
1088 $self->{next_input_character} == 0x0065) { # e
1089 ## ISSUE: What a stupid code this is!
1090 $self->{state} = DOCTYPE_STATE;
1091 !!!next-input-character;
1092 redo A;
1093 }
1094 }
1095 }
1096 }
1097 }
1098 }
1099 }
1100
1101 !!!parse-error (type => 'bogus comment');
1102 $self->{next_input_character} = shift @next_char;
1103 !!!back-next-input-character (@next_char);
1104 $self->{state} = BOGUS_COMMENT_STATE;
1105 redo A;
1106
1107 ## ISSUE: typos in spec: chacacters, is is a parse error
1108 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1109 } elsif ($self->{state} == COMMENT_START_STATE) {
1110 if ($self->{next_input_character} == 0x002D) { # -
1111 $self->{state} = COMMENT_START_DASH_STATE;
1112 !!!next-input-character;
1113 redo A;
1114 } elsif ($self->{next_input_character} == 0x003E) { # >
1115 !!!parse-error (type => 'bogus comment');
1116 $self->{state} = DATA_STATE;
1117 !!!next-input-character;
1118
1119 !!!emit ($self->{current_token}); # comment
1120
1121 redo A;
1122 } elsif ($self->{next_input_character} == -1) {
1123 !!!parse-error (type => 'unclosed comment');
1124 $self->{state} = DATA_STATE;
1125 ## reconsume
1126
1127 !!!emit ($self->{current_token}); # comment
1128
1129 redo A;
1130 } else {
1131 $self->{current_token}->{data} # comment
1132 .= chr ($self->{next_input_character});
1133 $self->{state} = COMMENT_STATE;
1134 !!!next-input-character;
1135 redo A;
1136 }
1137 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1138 if ($self->{next_input_character} == 0x002D) { # -
1139 $self->{state} = COMMENT_END_STATE;
1140 !!!next-input-character;
1141 redo A;
1142 } elsif ($self->{next_input_character} == 0x003E) { # >
1143 !!!parse-error (type => 'bogus comment');
1144 $self->{state} = DATA_STATE;
1145 !!!next-input-character;
1146
1147 !!!emit ($self->{current_token}); # comment
1148
1149 redo A;
1150 } elsif ($self->{next_input_character} == -1) {
1151 !!!parse-error (type => 'unclosed comment');
1152 $self->{state} = DATA_STATE;
1153 ## reconsume
1154
1155 !!!emit ($self->{current_token}); # comment
1156
1157 redo A;
1158 } else {
1159 $self->{current_token}->{data} # comment
1160 .= '-' . chr ($self->{next_input_character});
1161 $self->{state} = COMMENT_STATE;
1162 !!!next-input-character;
1163 redo A;
1164 }
1165 } elsif ($self->{state} == COMMENT_STATE) {
1166 if ($self->{next_input_character} == 0x002D) { # -
1167 $self->{state} = COMMENT_END_DASH_STATE;
1168 !!!next-input-character;
1169 redo A;
1170 } elsif ($self->{next_input_character} == -1) {
1171 !!!parse-error (type => 'unclosed comment');
1172 $self->{state} = DATA_STATE;
1173 ## reconsume
1174
1175 !!!emit ($self->{current_token}); # comment
1176
1177 redo A;
1178 } else {
1179 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1180 ## Stay in the state
1181 !!!next-input-character;
1182 redo A;
1183 }
1184 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1185 if ($self->{next_input_character} == 0x002D) { # -
1186 $self->{state} = COMMENT_END_STATE;
1187 !!!next-input-character;
1188 redo A;
1189 } elsif ($self->{next_input_character} == -1) {
1190 !!!parse-error (type => 'unclosed comment');
1191 $self->{state} = DATA_STATE;
1192 ## reconsume
1193
1194 !!!emit ($self->{current_token}); # comment
1195
1196 redo A;
1197 } else {
1198 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1199 $self->{state} = COMMENT_STATE;
1200 !!!next-input-character;
1201 redo A;
1202 }
1203 } elsif ($self->{state} == COMMENT_END_STATE) {
1204 if ($self->{next_input_character} == 0x003E) { # >
1205 $self->{state} = DATA_STATE;
1206 !!!next-input-character;
1207
1208 !!!emit ($self->{current_token}); # comment
1209
1210 redo A;
1211 } elsif ($self->{next_input_character} == 0x002D) { # -
1212 !!!parse-error (type => 'dash in comment');
1213 $self->{current_token}->{data} .= '-'; # comment
1214 ## Stay in the state
1215 !!!next-input-character;
1216 redo A;
1217 } elsif ($self->{next_input_character} == -1) {
1218 !!!parse-error (type => 'unclosed comment');
1219 $self->{state} = DATA_STATE;
1220 ## reconsume
1221
1222 !!!emit ($self->{current_token}); # comment
1223
1224 redo A;
1225 } else {
1226 !!!parse-error (type => 'dash in comment');
1227 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1228 $self->{state} = COMMENT_STATE;
1229 !!!next-input-character;
1230 redo A;
1231 }
1232 } elsif ($self->{state} == DOCTYPE_STATE) {
1233 if ($self->{next_input_character} == 0x0009 or # HT
1234 $self->{next_input_character} == 0x000A or # LF
1235 $self->{next_input_character} == 0x000B or # VT
1236 $self->{next_input_character} == 0x000C or # FF
1237 $self->{next_input_character} == 0x0020) { # SP
1238 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1239 !!!next-input-character;
1240 redo A;
1241 } else {
1242 !!!parse-error (type => 'no space before DOCTYPE name');
1243 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1244 ## reconsume
1245 redo A;
1246 }
1247 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1248 if ($self->{next_input_character} == 0x0009 or # HT
1249 $self->{next_input_character} == 0x000A or # LF
1250 $self->{next_input_character} == 0x000B or # VT
1251 $self->{next_input_character} == 0x000C or # FF
1252 $self->{next_input_character} == 0x0020) { # SP
1253 ## Stay in the state
1254 !!!next-input-character;
1255 redo A;
1256 } elsif ($self->{next_input_character} == 0x003E) { # >
1257 !!!parse-error (type => 'no DOCTYPE name');
1258 $self->{state} = DATA_STATE;
1259 !!!next-input-character;
1260
1261 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1262
1263 redo A;
1264 } elsif ($self->{next_input_character} == -1) {
1265 !!!parse-error (type => 'no DOCTYPE name');
1266 $self->{state} = DATA_STATE;
1267 ## reconsume
1268
1269 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1270
1271 redo A;
1272 } else {
1273 $self->{current_token}
1274 = {type => DOCTYPE_TOKEN,
1275 name => chr ($self->{next_input_character}),
1276 correct => 1};
1277 ## ISSUE: "Set the token's name name to the" in the spec
1278 $self->{state} = DOCTYPE_NAME_STATE;
1279 !!!next-input-character;
1280 redo A;
1281 }
1282 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1283 ## ISSUE: Redundant "First," in the spec.
1284 if ($self->{next_input_character} == 0x0009 or # HT
1285 $self->{next_input_character} == 0x000A or # LF
1286 $self->{next_input_character} == 0x000B or # VT
1287 $self->{next_input_character} == 0x000C or # FF
1288 $self->{next_input_character} == 0x0020) { # SP
1289 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1290 !!!next-input-character;
1291 redo A;
1292 } elsif ($self->{next_input_character} == 0x003E) { # >
1293 $self->{state} = DATA_STATE;
1294 !!!next-input-character;
1295
1296 !!!emit ($self->{current_token}); # DOCTYPE
1297
1298 redo A;
1299 } elsif ($self->{next_input_character} == -1) {
1300 !!!parse-error (type => 'unclosed DOCTYPE');
1301 $self->{state} = DATA_STATE;
1302 ## reconsume
1303
1304 delete $self->{current_token}->{correct};
1305 !!!emit ($self->{current_token}); # DOCTYPE
1306
1307 redo A;
1308 } else {
1309 $self->{current_token}->{name}
1310 .= chr ($self->{next_input_character}); # DOCTYPE
1311 ## Stay in the state
1312 !!!next-input-character;
1313 redo A;
1314 }
1315 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1316 if ($self->{next_input_character} == 0x0009 or # HT
1317 $self->{next_input_character} == 0x000A or # LF
1318 $self->{next_input_character} == 0x000B or # VT
1319 $self->{next_input_character} == 0x000C or # FF
1320 $self->{next_input_character} == 0x0020) { # SP
1321 ## Stay in the state
1322 !!!next-input-character;
1323 redo A;
1324 } elsif ($self->{next_input_character} == 0x003E) { # >
1325 $self->{state} = DATA_STATE;
1326 !!!next-input-character;
1327
1328 !!!emit ($self->{current_token}); # DOCTYPE
1329
1330 redo A;
1331 } elsif ($self->{next_input_character} == -1) {
1332 !!!parse-error (type => 'unclosed DOCTYPE');
1333 $self->{state} = DATA_STATE;
1334 ## reconsume
1335
1336 delete $self->{current_token}->{correct};
1337 !!!emit ($self->{current_token}); # DOCTYPE
1338
1339 redo A;
1340 } elsif ($self->{next_input_character} == 0x0050 or # P
1341 $self->{next_input_character} == 0x0070) { # p
1342 !!!next-input-character;
1343 if ($self->{next_input_character} == 0x0055 or # U
1344 $self->{next_input_character} == 0x0075) { # u
1345 !!!next-input-character;
1346 if ($self->{next_input_character} == 0x0042 or # B
1347 $self->{next_input_character} == 0x0062) { # b
1348 !!!next-input-character;
1349 if ($self->{next_input_character} == 0x004C or # L
1350 $self->{next_input_character} == 0x006C) { # l
1351 !!!next-input-character;
1352 if ($self->{next_input_character} == 0x0049 or # I
1353 $self->{next_input_character} == 0x0069) { # i
1354 !!!next-input-character;
1355 if ($self->{next_input_character} == 0x0043 or # C
1356 $self->{next_input_character} == 0x0063) { # c
1357 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1358 !!!next-input-character;
1359 redo A;
1360 }
1361 }
1362 }
1363 }
1364 }
1365
1366 #
1367 } elsif ($self->{next_input_character} == 0x0053 or # S
1368 $self->{next_input_character} == 0x0073) { # s
1369 !!!next-input-character;
1370 if ($self->{next_input_character} == 0x0059 or # Y
1371 $self->{next_input_character} == 0x0079) { # y
1372 !!!next-input-character;
1373 if ($self->{next_input_character} == 0x0053 or # S
1374 $self->{next_input_character} == 0x0073) { # s
1375 !!!next-input-character;
1376 if ($self->{next_input_character} == 0x0054 or # T
1377 $self->{next_input_character} == 0x0074) { # t
1378 !!!next-input-character;
1379 if ($self->{next_input_character} == 0x0045 or # E
1380 $self->{next_input_character} == 0x0065) { # e
1381 !!!next-input-character;
1382 if ($self->{next_input_character} == 0x004D or # M
1383 $self->{next_input_character} == 0x006D) { # m
1384 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1385 !!!next-input-character;
1386 redo A;
1387 }
1388 }
1389 }
1390 }
1391 }
1392
1393 #
1394 } else {
1395 !!!next-input-character;
1396 #
1397 }
1398
1399 !!!parse-error (type => 'string after DOCTYPE name');
1400 $self->{state} = BOGUS_DOCTYPE_STATE;
1401 # next-input-character is already done
1402 redo A;
1403 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1404 if ({
1405 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1406 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1407 }->{$self->{next_input_character}}) {
1408 ## Stay in the state
1409 !!!next-input-character;
1410 redo A;
1411 } elsif ($self->{next_input_character} eq 0x0022) { # "
1412 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1413 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1414 !!!next-input-character;
1415 redo A;
1416 } elsif ($self->{next_input_character} eq 0x0027) { # '
1417 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1418 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1419 !!!next-input-character;
1420 redo A;
1421 } elsif ($self->{next_input_character} eq 0x003E) { # >
1422 !!!parse-error (type => 'no PUBLIC literal');
1423
1424 $self->{state} = DATA_STATE;
1425 !!!next-input-character;
1426
1427 delete $self->{current_token}->{correct};
1428 !!!emit ($self->{current_token}); # DOCTYPE
1429
1430 redo A;
1431 } elsif ($self->{next_input_character} == -1) {
1432 !!!parse-error (type => 'unclosed DOCTYPE');
1433
1434 $self->{state} = DATA_STATE;
1435 ## reconsume
1436
1437 delete $self->{current_token}->{correct};
1438 !!!emit ($self->{current_token}); # DOCTYPE
1439
1440 redo A;
1441 } else {
1442 !!!parse-error (type => 'string after PUBLIC');
1443 $self->{state} = BOGUS_DOCTYPE_STATE;
1444 !!!next-input-character;
1445 redo A;
1446 }
1447 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1448 if ($self->{next_input_character} == 0x0022) { # "
1449 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1450 !!!next-input-character;
1451 redo A;
1452 } elsif ($self->{next_input_character} == -1) {
1453 !!!parse-error (type => 'unclosed PUBLIC literal');
1454
1455 $self->{state} = DATA_STATE;
1456 ## reconsume
1457
1458 delete $self->{current_token}->{correct};
1459 !!!emit ($self->{current_token}); # DOCTYPE
1460
1461 redo A;
1462 } else {
1463 $self->{current_token}->{public_identifier} # DOCTYPE
1464 .= chr $self->{next_input_character};
1465 ## Stay in the state
1466 !!!next-input-character;
1467 redo A;
1468 }
1469 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1470 if ($self->{next_input_character} == 0x0027) { # '
1471 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1472 !!!next-input-character;
1473 redo A;
1474 } elsif ($self->{next_input_character} == -1) {
1475 !!!parse-error (type => 'unclosed PUBLIC literal');
1476
1477 $self->{state} = DATA_STATE;
1478 ## reconsume
1479
1480 delete $self->{current_token}->{correct};
1481 !!!emit ($self->{current_token}); # DOCTYPE
1482
1483 redo A;
1484 } else {
1485 $self->{current_token}->{public_identifier} # DOCTYPE
1486 .= chr $self->{next_input_character};
1487 ## Stay in the state
1488 !!!next-input-character;
1489 redo A;
1490 }
1491 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1492 if ({
1493 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1494 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1495 }->{$self->{next_input_character}}) {
1496 ## Stay in the state
1497 !!!next-input-character;
1498 redo A;
1499 } elsif ($self->{next_input_character} == 0x0022) { # "
1500 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1501 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1502 !!!next-input-character;
1503 redo A;
1504 } elsif ($self->{next_input_character} == 0x0027) { # '
1505 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1506 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1507 !!!next-input-character;
1508 redo A;
1509 } elsif ($self->{next_input_character} == 0x003E) { # >
1510 $self->{state} = DATA_STATE;
1511 !!!next-input-character;
1512
1513 !!!emit ($self->{current_token}); # DOCTYPE
1514
1515 redo A;
1516 } elsif ($self->{next_input_character} == -1) {
1517 !!!parse-error (type => 'unclosed DOCTYPE');
1518
1519 $self->{state} = DATA_STATE;
1520 ## reconsume
1521
1522 delete $self->{current_token}->{correct};
1523 !!!emit ($self->{current_token}); # DOCTYPE
1524
1525 redo A;
1526 } else {
1527 !!!parse-error (type => 'string after PUBLIC literal');
1528 $self->{state} = BOGUS_DOCTYPE_STATE;
1529 !!!next-input-character;
1530 redo A;
1531 }
1532 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1533 if ({
1534 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1535 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1536 }->{$self->{next_input_character}}) {
1537 ## Stay in the state
1538 !!!next-input-character;
1539 redo A;
1540 } elsif ($self->{next_input_character} == 0x0022) { # "
1541 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1542 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1543 !!!next-input-character;
1544 redo A;
1545 } elsif ($self->{next_input_character} == 0x0027) { # '
1546 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1547 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1548 !!!next-input-character;
1549 redo A;
1550 } elsif ($self->{next_input_character} == 0x003E) { # >
1551 !!!parse-error (type => 'no SYSTEM literal');
1552 $self->{state} = DATA_STATE;
1553 !!!next-input-character;
1554
1555 delete $self->{current_token}->{correct};
1556 !!!emit ($self->{current_token}); # DOCTYPE
1557
1558 redo A;
1559 } elsif ($self->{next_input_character} == -1) {
1560 !!!parse-error (type => 'unclosed DOCTYPE');
1561
1562 $self->{state} = DATA_STATE;
1563 ## reconsume
1564
1565 delete $self->{current_token}->{correct};
1566 !!!emit ($self->{current_token}); # DOCTYPE
1567
1568 redo A;
1569 } else {
1570 !!!parse-error (type => 'string after SYSTEM');
1571 $self->{state} = BOGUS_DOCTYPE_STATE;
1572 !!!next-input-character;
1573 redo A;
1574 }
1575 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1576 if ($self->{next_input_character} == 0x0022) { # "
1577 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1578 !!!next-input-character;
1579 redo A;
1580 } elsif ($self->{next_input_character} == -1) {
1581 !!!parse-error (type => 'unclosed SYSTEM literal');
1582
1583 $self->{state} = DATA_STATE;
1584 ## reconsume
1585
1586 delete $self->{current_token}->{correct};
1587 !!!emit ($self->{current_token}); # DOCTYPE
1588
1589 redo A;
1590 } else {
1591 $self->{current_token}->{system_identifier} # DOCTYPE
1592 .= chr $self->{next_input_character};
1593 ## Stay in the state
1594 !!!next-input-character;
1595 redo A;
1596 }
1597 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
1598 if ($self->{next_input_character} == 0x0027) { # '
1599 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1600 !!!next-input-character;
1601 redo A;
1602 } elsif ($self->{next_input_character} == -1) {
1603 !!!parse-error (type => 'unclosed SYSTEM literal');
1604
1605 $self->{state} = DATA_STATE;
1606 ## reconsume
1607
1608 delete $self->{current_token}->{correct};
1609 !!!emit ($self->{current_token}); # DOCTYPE
1610
1611 redo A;
1612 } else {
1613 $self->{current_token}->{system_identifier} # DOCTYPE
1614 .= chr $self->{next_input_character};
1615 ## Stay in the state
1616 !!!next-input-character;
1617 redo A;
1618 }
1619 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1620 if ({
1621 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1622 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1623 }->{$self->{next_input_character}}) {
1624 ## Stay in the state
1625 !!!next-input-character;
1626 redo A;
1627 } elsif ($self->{next_input_character} == 0x003E) { # >
1628 $self->{state} = DATA_STATE;
1629 !!!next-input-character;
1630
1631 !!!emit ($self->{current_token}); # DOCTYPE
1632
1633 redo A;
1634 } elsif ($self->{next_input_character} == -1) {
1635 !!!parse-error (type => 'unclosed DOCTYPE');
1636
1637 $self->{state} = DATA_STATE;
1638 ## reconsume
1639
1640 delete $self->{current_token}->{correct};
1641 !!!emit ($self->{current_token}); # DOCTYPE
1642
1643 redo A;
1644 } else {
1645 !!!parse-error (type => 'string after SYSTEM literal');
1646 $self->{state} = BOGUS_DOCTYPE_STATE;
1647 !!!next-input-character;
1648 redo A;
1649 }
1650 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
1651 if ($self->{next_input_character} == 0x003E) { # >
1652 $self->{state} = DATA_STATE;
1653 !!!next-input-character;
1654
1655 delete $self->{current_token}->{correct};
1656 !!!emit ($self->{current_token}); # DOCTYPE
1657
1658 redo A;
1659 } elsif ($self->{next_input_character} == -1) {
1660 !!!parse-error (type => 'unclosed DOCTYPE');
1661 $self->{state} = DATA_STATE;
1662 ## reconsume
1663
1664 delete $self->{current_token}->{correct};
1665 !!!emit ($self->{current_token}); # DOCTYPE
1666
1667 redo A;
1668 } else {
1669 ## Stay in the state
1670 !!!next-input-character;
1671 redo A;
1672 }
1673 } else {
1674 die "$0: $self->{state}: Unknown state";
1675 }
1676 } # A
1677
1678 die "$0: _get_next_token: unexpected case";
1679 } # _get_next_token
1680
## _tokenize_attempt_to_consume_an_entity ($self, $in_attr)
##
## Tries to read one character reference beginning at the current
## |$self->{next_input_character}|.  The attribute-value caller in this
## file steps past the "&" before calling, so the "&" itself is not
## examined here.
##
## Arguments:
##   $in_attr - true when the reference occurs in an attribute value.  It
##              only matters for a named reference that matched without a
##              ";" and was then followed by further name characters.
##
## Returns:
##   A hash reference {type => CHARACTER_TOKEN, data => $string}, or
##   |undef| when no reference could be read (the caller is then expected
##   to treat the "&" literally).
##
## NOTE: |!!!next-input-character|, |!!!back-next-input-character| and
## |!!!parse-error| are preprocessor macros defined outside this part of
## the file; presumably they advance the input, push characters back onto
## it, and report an error, respectively.
1681 sub _tokenize_attempt_to_consume_an_entity ($$) {
1682 my ($self, $in_attr) = @_;
1683
1684 if ({
1685 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1686 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &, EOF (-1); 0x000D (CR) is not listed
1687 }->{$self->{next_input_character}}) {
1688 ## Don't consume
1689 ## No error
1690 return undef;
1691 } elsif ($self->{next_input_character} == 0x0023) { # #
## Numeric character reference: "&#" followed by "x"/"X" and hexadecimal
## digits, or by decimal digits.
1692 !!!next-input-character;
1693 if ($self->{next_input_character} == 0x0078 or # x
1694 $self->{next_input_character} == 0x0058) { # X
1695 my $code;
## Each pass of block X reads one more character; |redo X| is used for
## every hexadecimal digit that is accepted.
1696 X: {
## On the first pass |$x_char| is the "x"/"X" itself, kept so that it
## can be pushed back if no digit follows.  On later passes it holds
## the previous digit, but it is only read while |$code| is undefined.
1697 my $x_char = $self->{next_input_character};
1698 !!!next-input-character;
1699 if (0x0030 <= $self->{next_input_character} and
1700 $self->{next_input_character} <= 0x0039) { # 0..9
1701 $code ||= 0;
1702 $code *= 0x10;
1703 $code += $self->{next_input_character} - 0x0030;
1704 redo X;
1705 } elsif (0x0061 <= $self->{next_input_character} and
1706 $self->{next_input_character} <= 0x0066) { # a..f
1707 $code ||= 0;
1708 $code *= 0x10;
1709 $code += $self->{next_input_character} - 0x0060 + 9; # "a" (0x61) => 10 ... "f" => 15
1710 redo X;
1711 } elsif (0x0041 <= $self->{next_input_character} and
1712 $self->{next_input_character} <= 0x0046) { # A..F
1713 $code ||= 0;
1714 $code *= 0x10;
1715 $code += $self->{next_input_character} - 0x0040 + 9; # "A" (0x41) => 10 ... "F" => 15
1716 redo X;
1717 } elsif (not defined $code) { # no hexadecimal digit
## "&#x" without any digit: hand the "x" and the character after it
## back to the input and make "#" the next input character again.
1718 !!!parse-error (type => 'bare hcro');
1719 !!!back-next-input-character ($x_char, $self->{next_input_character});
1720 $self->{next_input_character} = 0x0023; # #
1721 return undef;
1722 } elsif ($self->{next_input_character} == 0x003B) { # ;
1723 !!!next-input-character;
1724 } else {
## Digits were read but the terminating ";" is missing; the character
## that ended the digits is left as the next input character.
1725 !!!parse-error (type => 'no refc');
1726 }
1727
## Replace code points that must not be produced: zero and surrogates
## become U+FFFD, values above U+10FFFF become U+FFFD, CR becomes LF,
## and 0x80..0x9F are mapped through |$c1_entity_char|.
1728 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1729 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1730 $code = 0xFFFD;
1731 } elsif ($code > 0x10FFFF) {
1732 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1733 $code = 0xFFFD;
1734 } elsif ($code == 0x000D) {
1735 !!!parse-error (type => 'CR character reference');
1736 $code = 0x000A;
1737 } elsif (0x80 <= $code and $code <= 0x9F) {
1738 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1739 $code = $c1_entity_char->{$code};
1740 }
1741
1742 return {type => CHARACTER_TOKEN, data => chr $code};
1743 } # X
1744 } elsif (0x0030 <= $self->{next_input_character} and
1745 $self->{next_input_character} <= 0x0039) { # 0..9
## Decimal form: accumulate digits until a non-digit is seen.
1746 my $code = $self->{next_input_character} - 0x0030;
1747 !!!next-input-character;
1748
1749 while (0x0030 <= $self->{next_input_character} and
1750 $self->{next_input_character} <= 0x0039) { # 0..9
1751 $code *= 10;
1752 $code += $self->{next_input_character} - 0x0030;
1753
1754 !!!next-input-character;
1755 }
1756
1757 if ($self->{next_input_character} == 0x003B) { # ;
1758 !!!next-input-character;
1759 } else {
1760 !!!parse-error (type => 'no refc');
1761 }
1762
## Same code point replacements as in the hexadecimal branch.
1763 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1764 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1765 $code = 0xFFFD;
1766 } elsif ($code > 0x10FFFF) {
1767 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1768 $code = 0xFFFD;
1769 } elsif ($code == 0x000D) {
1770 !!!parse-error (type => 'CR character reference');
1771 $code = 0x000A;
1772 } elsif (0x80 <= $code and $code <= 0x9F) {
1773 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1774 $code = $c1_entity_char->{$code};
1775 }
1776
1777 return {type => CHARACTER_TOKEN, data => chr $code};
1778 } else {
## "&#" followed by neither "x"/"X" nor a digit: push the offending
## character back and make "#" the next input character again.
1779 !!!parse-error (type => 'bare nero');
1780 !!!back-next-input-character ($self->{next_input_character});
1781 $self->{next_input_character} = 0x0023; # #
1782 return undef;
1783 }
1784 } elsif ((0x0041 <= $self->{next_input_character} and
1785 $self->{next_input_character} <= 0x005A) or
1786 (0x0061 <= $self->{next_input_character} and
1787 $self->{next_input_character} <= 0x007A)) {
## Named reference: starts with an ASCII letter (A..Z or a..z).
1788 my $entity_name = chr $self->{next_input_character};
1789 !!!next-input-character;
1790
## |$value| is the text to return: the raw characters read so far, or
## the replacement taken from |$EntityChar| once a name has matched.
## |$match| is 0 while nothing has matched, 1 for a match ending in ";",
## and -1 for a match without ";".  Every later non-matching character
## doubles it, so a value below -1 means that more name characters
## followed the last match.
1791 my $value = $entity_name;
1792 my $match = 0;
## NOTE(review): |$EntityChar| is presumably filled in by
## Whatpm::_NamedEntityList; its keys appear to include names both with
## and without the trailing ";" -- confirm against that module.
1793 require Whatpm::_NamedEntityList;
1794 our $EntityChar;
1795
1796 while (length $entity_name < 10 and
1797 ## NOTE: Some number greater than the maximum length of entity name
1798 ((0x0041 <= $self->{next_input_character} and # A
1799 $self->{next_input_character} <= 0x005A) or # Z
1800 (0x0061 <= $self->{next_input_character} and # a
1801 $self->{next_input_character} <= 0x007A) or # z
1802 (0x0030 <= $self->{next_input_character} and # 0
1803 $self->{next_input_character} <= 0x0039) or # 9
1804 $self->{next_input_character} == 0x003B)) { # ;
1805 $entity_name .= chr $self->{next_input_character};
1806 if (defined $EntityChar->{$entity_name}) {
1807 if ($self->{next_input_character} == 0x003B) { # ;
## Properly terminated name: stop reading.
1808 $value = $EntityChar->{$entity_name};
1809 $match = 1;
1810 !!!next-input-character;
1811 last;
1812 } else {
## The name matches without ";"; keep reading in case a longer name
## also matches.
1813 $value = $EntityChar->{$entity_name};
1814 $match = -1;
1815 !!!next-input-character;
1816 }
1817 } else {
1818 $value .= chr $self->{next_input_character};
1819 $match *= 2;
1820 !!!next-input-character;
1821 }
1822 }
1823
1824 if ($match > 0) {
1825 return {type => CHARACTER_TOKEN, data => $value};
1826 } elsif ($match < 0) {
1827 !!!parse-error (type => 'no refc');
1828 if ($in_attr and $match < -1) {
## In an attribute value, a matched name followed by more name
## characters is returned as the literal text that was read.
1829 return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
1830 } else {
1831 return {type => CHARACTER_TOKEN, data => $value};
1832 }
1833 } else {
## Nothing matched: return "&" plus every character that was read.
1834 !!!parse-error (type => 'bare ero');
1835 ## NOTE: No characters are consumed in the spec.
1836 return {type => CHARACTER_TOKEN, data => '&'.$value};
1837 }
1838 } else {
1839 ## no characters are consumed
1840 !!!parse-error (type => 'bare ero');
1841 return undef;
1842 }
1843 } # _tokenize_attempt_to_consume_an_entity
1844
## Prepares the target document for the tree construction stage.
##
## The parser object ($self) must already carry the document in
## $self->{document} when this is called.  Two calls are made on it, in
## this order: strict_error_checking (0) and manakai_is_html (1).  The
## value of the last call is what the method returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document}; ## NOTE: MUST have been set by the caller

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
1853
## Counterpart of the tree constructor initialisation: calls
## strict_error_checking (1) on $self->{document} once tree construction
## is over.  The value of that call is returned.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};
  ## TODO: Turn mutation events on
  $doc->strict_error_checking (1);
} # _terminate_tree_constructor
1859
1860 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1861
1862 { # tree construction stage
1863 my $token;
1864
## Entry point of the tree construction stage.
##
## Fetches the first token, resets the per-parse tree construction state
## kept on $self, and then runs the three phases in order: initial,
## root element, main.  Returns whatever the main phase returns.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA makes
  ## $self->{document} available to the user, nor when it begins to
  ## accept user input.

  ## NOTE: "Append a character" means: collect the character and all
  ## consecutive characters that follow it, and insert a single Text
  ## node whose data is the concatenation of them. # MUST

  !!!next-token;

  ## Fresh tree construction state.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  $self->_tree_construction_initial (); # MUST
  $self->_tree_construction_root_element ();
  $self->_tree_construction_main ();
} # _construct_tree
1888
## The "initial" phase of tree construction.
##
## Consumes tokens (through the file-scoped $token) until the phase is
## over:
##
##   - DOCTYPE token: reports non-HTML5 doctypes, appends a document
##     type node to $self->{document}, selects the document's compat
##     mode ('quirks' / 'limited quirks' / left untouched) from the
##     doctype name, public identifier and system identifier, then
##     advances to the next token and returns.
##   - Start tag, end tag, end-of-file, or a character token with
##     non-whitespace data: reports 'no DOCTYPE', selects 'quirks' and
##     returns WITHOUT consuming the token (it is reprocessed by the
##     next phase).
##   - Leading whitespace and comments are handled in place (whitespace
##     is dropped, comments are appended to the document) and the phase
##     continues.
##
## Dies on a token of unknown type.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared case-insensitively by
        ## upper-casing it first, so every literal it is compared with
        ## below MUST be written in upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## NOTE(review): the "EXPERIMETNAL" spelling looks deliberate
          ## (the quirks list in the HTML parsing spec spells it this
          ## way) - confirm against the spec before "correcting" it.
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## Per the HTML parsing spec, these two identifiers select
          ## quirks mode only when the system identifier is MISSING;
          ## with a system identifier they select limited quirks mode.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          ## NOTE: Literals are upper case because $pubid has been
          ## upper-cased above; mixed-case literals would never match.
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared case-insensitively too,
        ## this time by lower-casing it.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      ## Non-whitespace character data remains: there is no DOCTYPE.
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL
} # _tree_construction_initial
2056
## The "root element" phase of tree construction.
##
## Skips what may precede the root element - DOCTYPE tokens (reported
## as a parse error and ignored), comments (appended to the document)
## and leading whitespace in character tokens - and then, without
## consuming the current token, creates the |html| element, appends it
## to $self->{document} and pushes it onto the stack of open elements.
##
## Dies on a token of unknown type.  Returns nothing.
sub _tree_construction_root_element ($) {
  my $self = shift;

  PRELUDE: {
    my $type = $token->{type};

    if ($type == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## The token is ignored; the phase does not change.
      !!!next-token;
      redo PRELUDE;
    }

    if ($type == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## The phase does not change.
      !!!next-token;
      redo PRELUDE;
    }

    if ($type == CHARACTER_TOKEN) {
      ## Leading white space (\x0D is not in the set) is dropped.  A
      ## token that consisted of white space only is ignored as a whole.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//
          and not length $token->{data}) {
        !!!next-token;
        redo PRELUDE;
      }
    } else {
      ## ISSUE: There is an issue in the spec
      my $known = {
        START_TAG_TOKEN, 1,
        END_TAG_TOKEN, 1,
        END_OF_FILE_TOKEN, 1,
      };
      die "$0: $token->{type}: Unknown token type"
          unless $known->{$type};
    }
  } # PRELUDE

  ## The current token is left in place; it is reprocessed in the main
  ## phase.
  my $root_element;
  !!!create-element ($root_element, 'html');
  $self->{document}->append_child ($root_element);
  push @{$self->{open_elements}}, [$root_element, 'html'];
  return; ## Go to the main phase.
} # _tree_construction_root_element
2102
## "Reset the insertion mode appropriately".
##
## Walks the stack of open elements ($self->{open_elements}, whose
## entries are [node, local name] pairs) from the current node towards
## the first one and sets $self->{insertion_mode} according to the
## first element name that determines a mode.  In the fragment case
## ($self->{inner_html_node} defined) the context element is examined
## in place of the first node of the stack, unless it is a |td| or
## |th| element.  Returns nothing.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Element name -> insertion mode (steps 4..13).  The table does not
  ## depend on the node being examined, so it is built only once.
  my $mode_for = {
    select => IN_SELECT_IM,
    td => IN_CELL_IM,
    th => IN_CELL_IM,
    tr => IN_ROW_IM,
    tbody => IN_TABLE_BODY_IM,
    thead => IN_TABLE_BODY_IM,
    tfoot => IN_TABLE_BODY_IM,
    caption => IN_CAPTION_IM,
    colgroup => IN_COLUMN_GROUP_IM,
    table => IN_TABLE_IM,
    head => IN_BODY_IM, # not in head!
    body => IN_BODY_IM,
    frameset => IN_FRAMESET_IM,
  };

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          #
        } else {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    ## NOTE: Assignment and return are separate statements on purpose,
    ## so that returning does not depend on the assigned insertion mode
    ## constant being a true value.
    my $new_mode = $mode_for->{$node->[1]};
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
2171
2172 sub _tree_construction_main ($) {
2173 my $self = shift;
2174
2175 my $active_formatting_elements = [];
2176
2177 my $reconstruct_active_formatting_elements = sub { # MUST
2178 my $insert = shift;
2179
2180 ## Step 1
2181 return unless @$active_formatting_elements;
2182
2183 ## Step 3
2184 my $i = -1;
2185 my $entry = $active_formatting_elements->[$i];
2186
2187 ## Step 2
2188 return if $entry->[0] eq '#marker';
2189 for (@{$self->{open_elements}}) {
2190 if ($entry->[0] eq $_->[0]) {
2191 return;
2192 }
2193 }
2194
2195 S4: {
2196 ## Step 4
2197 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2198
2199 ## Step 5
2200 $i--;
2201 $entry = $active_formatting_elements->[$i];
2202
2203 ## Step 6
2204 if ($entry->[0] eq '#marker') {
2205 #
2206 } else {
2207 my $in_open_elements;
2208 OE: for (@{$self->{open_elements}}) {
2209 if ($entry->[0] eq $_->[0]) {
2210 $in_open_elements = 1;
2211 last OE;
2212 }
2213 }
2214 if ($in_open_elements) {
2215 #
2216 } else {
2217 redo S4;
2218 }
2219 }
2220
2221 ## Step 7
2222 $i++;
2223 $entry = $active_formatting_elements->[$i];
2224 } # S4
2225
2226 S7: {
2227 ## Step 8
2228 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2229
2230 ## Step 9
2231 $insert->($clone->[0]);
2232 push @{$self->{open_elements}}, $clone;
2233
2234 ## Step 10
2235 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2236
2237 ## Step 11
2238 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2239 ## Step 7'
2240 $i++;
2241 $entry = $active_formatting_elements->[$i];
2242
2243 redo S7;
2244 }
2245 } # S7
2246 }; # $reconstruct_active_formatting_elements
2247
2248 my $clear_up_to_marker = sub {
2249 for (reverse 0..$#$active_formatting_elements) {
2250 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
2251 splice @$active_formatting_elements, $_;
2252 return;
2253 }
2254 }
2255 }; # $clear_up_to_marker
2256
2257 my $parse_rcdata = sub ($$) {
2258 my ($content_model_flag, $insert) = @_;
2259
2260 ## Step 1
2261 my $start_tag_name = $token->{tag_name};
2262 my $el;
2263 !!!create-element ($el, $start_tag_name, $token->{attributes});
2264
2265 ## Step 2
2266 $insert->($el); # /context node/->append_child ($el)
2267
2268 ## Step 3
2269 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
2270 delete $self->{escape}; # MUST
2271
2272 ## Step 4
2273 my $text = '';
2274 !!!next-token;
2275 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
2276 $text .= $token->{data};
2277 !!!next-token;
2278 }
2279
2280 ## Step 5
2281 if (length $text) {
2282 my $text = $self->{document}->create_text_node ($text);
2283 $el->append_child ($text);
2284 }
2285
2286 ## Step 6
2287 $self->{content_model} = PCDATA_CONTENT_MODEL;
2288
2289 ## Step 7
2290 if ($token->{type} == END_TAG_TOKEN and $token->{tag_name} eq $start_tag_name) {
2291 ## Ignore the token
2292 } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
2293 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2294 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
2295 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2296 } else {
2297 die "$0: $content_model_flag in parse_rcdata";
2298 }
2299 !!!next-token;
2300 }; # $parse_rcdata
2301
2302 my $script_start_tag = sub ($) {
2303 my $insert = $_[0];
2304 my $script_el;
2305 !!!create-element ($script_el, 'script', $token->{attributes});
2306 ## TODO: mark as "parser-inserted"
2307
2308 $self->{content_model} = CDATA_CONTENT_MODEL;
2309 delete $self->{escape}; # MUST
2310
2311 my $text = '';
2312 !!!next-token;
2313 while ($token->{type} == CHARACTER_TOKEN) {
2314 $text .= $token->{data};
2315 !!!next-token;
2316 } # stop if non-character token or tokenizer stops tokenising
2317 if (length $text) {
2318 $script_el->manakai_append_text ($text);
2319 }
2320
2321 $self->{content_model} = PCDATA_CONTENT_MODEL;
2322
2323 if ($token->{type} == END_TAG_TOKEN and
2324 $token->{tag_name} eq 'script') {
2325 ## Ignore the token
2326 } else {
2327 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2328 ## ISSUE: And ignore?
2329 ## TODO: mark as "already executed"
2330 }
2331
2332 if (defined $self->{inner_html_node}) {
2333 ## TODO: mark as "already executed"
2334 } else {
2335 ## TODO: $old_insertion_point = current insertion point
2336 ## TODO: insertion point = just before the next input character
2337
2338 $insert->($script_el);
2339
2340 ## TODO: insertion point = $old_insertion_point (might be "undefined")
2341
2342 ## TODO: if there is a script that will execute as soon as the parser resume, then...
2343 }
2344
2345 !!!next-token;
2346 }; # $script_start_tag
2347
2348 my $formatting_end_tag = sub {
2349 my $tag_name = shift;
2350
2351 FET: {
2352 ## Step 1
2353 my $formatting_element;
2354 my $formatting_element_i_in_active;
2355 AFE: for (reverse 0..$#$active_formatting_elements) {
2356 if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
2357 $formatting_element = $active_formatting_elements->[$_];
2358 $formatting_element_i_in_active = $_;
2359 last AFE;
2360 } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
2361 last AFE;
2362 }
2363 } # AFE
2364 unless (defined $formatting_element) {
2365 !!!parse-error (type => 'unmatched end tag:'.$tag_name);
2366 ## Ignore the token
2367 !!!next-token;
2368 return;
2369 }
2370 ## has an element in scope
2371 my $in_scope = 1;
2372 my $formatting_element_i_in_open;
2373 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2374 my $node = $self->{open_elements}->[$_];
2375 if ($node->[0] eq $formatting_element->[0]) {
2376 if ($in_scope) {
2377 $formatting_element_i_in_open = $_;
2378 last INSCOPE;
2379 } else { # in open elements but not in scope
2380 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2381 ## Ignore the token
2382 !!!next-token;
2383 return;
2384 }
2385 } elsif ({
2386 table => 1, caption => 1, td => 1, th => 1,
2387 button => 1, marquee => 1, object => 1, html => 1,
2388 }->{$node->[1]}) {
2389 $in_scope = 0;
2390 }
2391 } # INSCOPE
2392 unless (defined $formatting_element_i_in_open) {
2393 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2394 pop @$active_formatting_elements; # $formatting_element
2395 !!!next-token; ## TODO: ok?
2396 return;
2397 }
2398 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
2399 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2400 }
2401
2402 ## Step 2
2403 my $furthest_block;
2404 my $furthest_block_i_in_open;
2405 OE: for (reverse 0..$#{$self->{open_elements}}) {
2406 my $node = $self->{open_elements}->[$_];
2407 if (not $formatting_category->{$node->[1]} and
2408 #not $phrasing_category->{$node->[1]} and
2409 ($special_category->{$node->[1]} or
2410 $scoping_category->{$node->[1]})) {
2411 $furthest_block = $node;
2412 $furthest_block_i_in_open = $_;
2413 } elsif ($node->[0] eq $formatting_element->[0]) {
2414 last OE;
2415 }
2416 } # OE
2417
2418 ## Step 3
2419 unless (defined $furthest_block) { # MUST
2420 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
2421 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
2422 !!!next-token;
2423 return;
2424 }
2425
2426 ## Step 4
2427 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
2428
2429 ## Step 5
2430 my $furthest_block_parent = $furthest_block->[0]->parent_node;
2431 if (defined $furthest_block_parent) {
2432 $furthest_block_parent->remove_child ($furthest_block->[0]);
2433 }
2434
2435 ## Step 6
2436 my $bookmark_prev_el
2437 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
2438 ->[0];
2439
2440 ## Step 7
2441 my $node = $furthest_block;
2442 my $node_i_in_open = $furthest_block_i_in_open;
2443 my $last_node = $furthest_block;
2444 S7: {
2445 ## Step 1
2446 $node_i_in_open--;
2447 $node = $self->{open_elements}->[$node_i_in_open];
2448
2449 ## Step 2
2450 my $node_i_in_active;
2451 S7S2: {
2452 for (reverse 0..$#$active_formatting_elements) {
2453 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2454 $node_i_in_active = $_;
2455 last S7S2;
2456 }
2457 }
2458 splice @{$self->{open_elements}}, $node_i_in_open, 1;
2459 redo S7;
2460 } # S7S2
2461
2462 ## Step 3
2463 last S7 if $node->[0] eq $formatting_element->[0];
2464
2465 ## Step 4
2466 if ($last_node->[0] eq $furthest_block->[0]) {
2467 $bookmark_prev_el = $node->[0];
2468 }
2469
2470 ## Step 5
2471 if ($node->[0]->has_child_nodes ()) {
2472 my $clone = [$node->[0]->clone_node (0), $node->[1]];
2473 $active_formatting_elements->[$node_i_in_active] = $clone;
2474 $self->{open_elements}->[$node_i_in_open] = $clone;
2475 $node = $clone;
2476 }
2477
2478 ## Step 6
2479 $node->[0]->append_child ($last_node->[0]);
2480
2481 ## Step 7
2482 $last_node = $node;
2483
2484 ## Step 8
2485 redo S7;
2486 } # S7
2487
2488 ## Step 8
2489 $common_ancestor_node->[0]->append_child ($last_node->[0]);
2490
2491 ## Step 9
2492 my $clone = [$formatting_element->[0]->clone_node (0),
2493 $formatting_element->[1]];
2494
2495 ## Step 10
2496 my @cn = @{$furthest_block->[0]->child_nodes};
2497 $clone->[0]->append_child ($_) for @cn;
2498
2499 ## Step 11
2500 $furthest_block->[0]->append_child ($clone->[0]);
2501
2502 ## Step 12
2503 my $i;
2504 AFE: for (reverse 0..$#$active_formatting_elements) {
2505 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
2506 splice @$active_formatting_elements, $_, 1;
2507 $i-- and last AFE if defined $i;
2508 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
2509 $i = $_;
2510 }
2511 } # AFE
2512 splice @$active_formatting_elements, $i + 1, 0, $clone;
2513
2514 ## Step 13
2515 undef $i;
2516 OE: for (reverse 0..$#{$self->{open_elements}}) {
2517 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
2518 splice @{$self->{open_elements}}, $_, 1;
2519 $i-- and last OE if defined $i;
2520 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
2521 $i = $_;
2522 }
2523 } # OE
2524 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
2525
2526 ## Step 14
2527 redo FET;
2528 } # FET
2529 }; # $formatting_end_tag
2530
2531 my $insert_to_current = sub {
2532 $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
2533 }; # $insert_to_current
2534
2535 my $insert_to_foster = sub {
2536 my $child = shift;
2537 if ({
2538 table => 1, tbody => 1, tfoot => 1,
2539 thead => 1, tr => 1,
2540 }->{$self->{open_elements}->[-1]->[1]}) {
2541 # MUST
2542 my $foster_parent_element;
2543 my $next_sibling;
2544 OE: for (reverse 0..$#{$self->{open_elements}}) {
2545 if ($self->{open_elements}->[$_]->[1] eq 'table') {
2546 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
2547 if (defined $parent and $parent->node_type == 1) {
2548 $foster_parent_element = $parent;
2549 $next_sibling = $self->{open_elements}->[$_]->[0];
2550 } else {
2551 $foster_parent_element
2552 = $self->{open_elements}->[$_ - 1]->[0];
2553 }
2554 last OE;
2555 }
2556 } # OE
2557 $foster_parent_element = $self->{open_elements}->[0]->[0]
2558 unless defined $foster_parent_element;
2559 $foster_parent_element->insert_before
2560 ($child, $next_sibling);
2561 } else {
2562 $self->{open_elements}->[-1]->[0]->append_child ($child);
2563 }
2564 }; # $insert_to_foster
2565
2566 my $insert;
2567
2568 B: {
2569 if ($token->{type} == DOCTYPE_TOKEN) {
2570 !!!parse-error (type => 'DOCTYPE in the middle');
2571 ## Ignore the token
2572 ## Stay in the phase
2573 !!!next-token;
2574 redo B;
2575 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
2576 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2577 #
2578 } else {
2579 ## Generate implied end tags
2580 if ({
2581 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2582 tbody => 1, tfoot=> 1, thead => 1,
2583 }->{$self->{open_elements}->[-1]->[1]}) {
2584 !!!back-token;
2585 $token = {type => END_TAG_TOKEN, tag_name => $self->{open_elements}->[-1]->[1]};
2586 redo B;
2587 }
2588
2589 if (@{$self->{open_elements}} > 2 or
2590 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
2591 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2592 } elsif (defined $self->{inner_html_node} and
2593 @{$self->{open_elements}} > 1 and
2594 $self->{open_elements}->[1]->[1] ne 'body') {
2595 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2596 }
2597
2598 ## ISSUE: There is an issue in the spec.
2599 }
2600
2601 ## Stop parsing
2602 last B;
2603 } elsif ($token->{type} == START_TAG_TOKEN and
2604 $token->{tag_name} eq 'html') {
2605 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
2606 ## Turn into the main phase
2607 !!!parse-error (type => 'after html:html');
2608 $self->{insertion_mode} = AFTER_BODY_IM;
2609 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
2610 ## Turn into the main phase
2611 !!!parse-error (type => 'after html:html');
2612 $self->{insertion_mode} = AFTER_FRAMESET_IM;
2613 }
2614
2615 ## ISSUE: "aa<html>" is not a parse error.
2616 ## ISSUE: "<html>" in fragment is not a parse error.
2617 unless ($token->{first_start_tag}) {
2618 !!!parse-error (type => 'not first start tag');
2619 }
2620 my $top_el = $self->{open_elements}->[0]->[0];
2621 for my $attr_name (keys %{$token->{attributes}}) {
2622 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2623 $top_el->set_attribute_ns
2624 (undef, [undef, $attr_name],
2625 $token->{attributes}->{$attr_name}->{value});
2626 }
2627 }
2628 !!!next-token;
2629 redo B;
2630 } elsif ($token->{type} == COMMENT_TOKEN) {
2631 my $comment = $self->{document}->create_comment ($token->{data});
2632 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2633 $self->{document}->append_child ($comment);
2634 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
2635 $self->{open_elements}->[0]->[0]->append_child ($comment);
2636 } else {
2637 $self->{open_elements}->[-1]->[0]->append_child ($comment);
2638 }
2639 !!!next-token;
2640 redo B;
2641 } elsif ($self->{insertion_mode} & HEAD_IMS) {
2642 if ($token->{type} == CHARACTER_TOKEN) {
2643 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2644 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
2645 unless (length $token->{data}) {
2646 !!!next-token;
2647 redo B;
2648 }
2649 }
2650
2651 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2652 ## As if <head>
2653 !!!create-element ($self->{head_element}, 'head');
2654 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2655 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2656
2657 ## Reprocess in the "in head" insertion mode...
2658 pop @{$self->{open_elements}};
2659
2660 ## Reprocess in the "after head" insertion mode...
2661 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2662 ## As if </noscript>
2663 pop @{$self->{open_elements}};
2664 !!!parse-error (type => 'in noscript:#character');
2665
2666 ## Reprocess in the "in head" insertion mode...
2667 ## As if </head>
2668 pop @{$self->{open_elements}};
2669
2670 ## Reprocess in the "after head" insertion mode...
2671 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2672 pop @{$self->{open_elements}};
2673
2674 ## Reprocess in the "after head" insertion mode...
2675 }
2676
2677 ## "after head" insertion mode
2678 ## As if <body>
2679 !!!insert-element ('body');
2680 $self->{insertion_mode} = IN_BODY_IM;
2681 ## reprocess
2682 redo B;
2683 } elsif ($token->{type} == START_TAG_TOKEN) {
2684 if ($token->{tag_name} eq 'head') {
2685 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2686 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
2687 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2688 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
2689 $self->{insertion_mode} = IN_HEAD_IM;
2690 !!!next-token;
2691 redo B;
2692 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2693 #
2694 } else {
2695 !!!parse-error (type => 'in head:head'); # or in head noscript
2696 ## Ignore the token
2697 !!!next-token;
2698 redo B;
2699 }
2700 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2701 ## As if <head>
2702 !!!create-element ($self->{head_element}, 'head');
2703 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2704 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2705
2706 $self->{insertion_mode} = IN_HEAD_IM;
2707 ## Reprocess in the "in head" insertion mode...
2708 }
2709
2710 if ($token->{tag_name} eq 'base') {
2711 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2712 ## As if </noscript>
2713 pop @{$self->{open_elements}};
2714 !!!parse-error (type => 'in noscript:base');
2715
2716 $self->{insertion_mode} = IN_HEAD_IM;
2717 ## Reprocess in the "in head" insertion mode...
2718 }
2719
2720 ## NOTE: There is a "as if in head" code clone.
2721 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2722 !!!parse-error (type => 'after head:'.$token->{tag_name});
2723 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2724 }
2725 !!!insert-element ($token->{tag_name}, $token->{attributes});
2726 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2727 pop @{$self->{open_elements}}
2728 if $self->{insertion_mode} == AFTER_HEAD_IM;
2729 !!!next-token;
2730 redo B;
2731 } elsif ($token->{tag_name} eq 'link') {
2732 ## NOTE: There is a "as if in head" code clone.
2733 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2734 !!!parse-error (type => 'after head:'.$token->{tag_name});
2735 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2736 }
2737 !!!insert-element ($token->{tag_name}, $token->{attributes});
2738 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2739 pop @{$self->{open_elements}}
2740 if $self->{insertion_mode} == AFTER_HEAD_IM;
2741 !!!next-token;
2742 redo B;
2743 } elsif ($token->{tag_name} eq 'meta') {
2744 ## NOTE: There is a "as if in head" code clone.
2745 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2746 !!!parse-error (type => 'after head:'.$token->{tag_name});
2747 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2748 }
2749 !!!insert-element ($token->{tag_name}, $token->{attributes});
2750 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2751
2752 unless ($self->{confident}) {
2753 my $charset;
2754 if ($token->{attributes}->{charset}) { ## TODO: And if supported
2755 $charset = $token->{attributes}->{charset}->{value};
2756 }
2757 if ($token->{attributes}->{'http-equiv'}) {
2758 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2759 if ($token->{attributes}->{'http-equiv'}->{value}
2760 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2761 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2762 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2763 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
2764 } ## TODO: And if supported
2765 }
2766 ## TODO: Change the encoding
2767 }
2768
2769 ## TODO: Extracting |charset| from |meta|.
2770 pop @{$self->{open_elements}}
2771 if $self->{insertion_mode} == AFTER_HEAD_IM;
2772 !!!next-token;
2773 redo B;
2774 } elsif ($token->{tag_name} eq 'title') {
2775 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2776 ## As if </noscript>
2777 pop @{$self->{open_elements}};
2778 !!!parse-error (type => 'in noscript:title');
2779
2780 $self->{insertion_mode} = IN_HEAD_IM;
2781 ## Reprocess in the "in head" insertion mode...
2782 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2783 !!!parse-error (type => 'after head:'.$token->{tag_name});
2784 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2785 }
2786
2787 ## NOTE: There is a "as if in head" code clone.
2788 my $parent = defined $self->{head_element} ? $self->{head_element}
2789 : $self->{open_elements}->[-1]->[0];
2790 $parse_rcdata->(RCDATA_CONTENT_MODEL,
2791 sub { $parent->append_child ($_[0]) });
2792 pop @{$self->{open_elements}}
2793 if $self->{insertion_mode} == AFTER_HEAD_IM;
2794 redo B;
2795 } elsif ($token->{tag_name} eq 'style') {
2796 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
2797 ## insertion mode IN_HEAD_IM)
2798 ## NOTE: There is a "as if in head" code clone.
2799 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2800 !!!parse-error (type => 'after head:'.$token->{tag_name});
2801 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2802 }
2803 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
2804 pop @{$self->{open_elements}}
2805 if $self->{insertion_mode} == AFTER_HEAD_IM;
2806 redo B;
2807 } elsif ($token->{tag_name} eq 'noscript') {
2808 if ($self->{insertion_mode} == IN_HEAD_IM) {
2809 ## NOTE: and scripting is disabled
2810 !!!insert-element ($token->{tag_name}, $token->{attributes});
2811 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
2812 !!!next-token;
2813 redo B;
2814 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2815 !!!parse-error (type => 'in noscript:noscript');
2816 ## Ignore the token
2817 !!!next-token;
2818 redo B;
2819 } else {
2820 #
2821 }
2822 } elsif ($token->{tag_name} eq 'script') {
2823 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2824 ## As if </noscript>
2825 pop @{$self->{open_elements}};
2826 !!!parse-error (type => 'in noscript:script');
2827
2828 $self->{insertion_mode} = IN_HEAD_IM;
2829 ## Reprocess in the "in head" insertion mode...
2830 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2831 !!!parse-error (type => 'after head:'.$token->{tag_name});
2832 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2833 }
2834
2835 ## NOTE: There is a "as if in head" code clone.
2836 $script_start_tag->($insert_to_current);
2837 pop @{$self->{open_elements}}
2838 if $self->{insertion_mode} == AFTER_HEAD_IM;
2839 redo B;
2840 } elsif ($token->{tag_name} eq 'body' or
2841 $token->{tag_name} eq 'frameset') {
2842 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2843 ## As if </noscript>
2844 pop @{$self->{open_elements}};
2845 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
2846
2847 ## Reprocess in the "in head" insertion mode...
2848 ## As if </head>
2849 pop @{$self->{open_elements}};
2850
2851 ## Reprocess in the "after head" insertion mode...
2852 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2853 pop @{$self->{open_elements}};
2854
2855 ## Reprocess in the "after head" insertion mode...
2856 }
2857
2858 ## "after head" insertion mode
2859 !!!insert-element ($token->{tag_name}, $token->{attributes});
2860 if ($token->{tag_name} eq 'body') {
2861 $self->{insertion_mode} = IN_BODY_IM;
2862 } elsif ($token->{tag_name} eq 'frameset') {
2863 $self->{insertion_mode} = IN_FRAMESET_IM;
2864 } else {
2865 die "$0: tag name: $self->{tag_name}";
2866 }
2867 !!!next-token;
2868 redo B;
2869 } else {
2870 #
2871 }
2872
2873 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2874 ## As if </noscript>
2875 pop @{$self->{open_elements}};
2876 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
2877
2878 ## Reprocess in the "in head" insertion mode...
2879 ## As if </head>
2880 pop @{$self->{open_elements}};
2881
2882 ## Reprocess in the "after head" insertion mode...
2883 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2884 ## As if </head>
2885 pop @{$self->{open_elements}};
2886
2887 ## Reprocess in the "after head" insertion mode...
2888 }
2889
2890 ## "after head" insertion mode
2891 ## As if <body>
2892 !!!insert-element ('body');
2893 $self->{insertion_mode} = IN_BODY_IM;
2894 ## reprocess
2895 redo B;
2896 } elsif ($token->{type} == END_TAG_TOKEN) {
2897 if ($token->{tag_name} eq 'head') {
2898 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2899 ## As if <head>
2900 !!!create-element ($self->{head_element}, 'head');
2901 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2902 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2903
2904 ## Reprocess in the "in head" insertion mode...
2905 pop @{$self->{open_elements}};
2906 $self->{insertion_mode} = AFTER_HEAD_IM;
2907 !!!next-token;
2908 redo B;
2909 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2910 ## As if </noscript>
2911 pop @{$self->{open_elements}};
2912 !!!parse-error (type => 'in noscript:script');
2913
2914 ## Reprocess in the "in head" insertion mode...
2915 pop @{$self->{open_elements}};
2916 $self->{insertion_mode} = AFTER_HEAD_IM;
2917 !!!next-token;
2918 redo B;
2919 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2920 pop @{$self->{open_elements}};
2921 $self->{insertion_mode} = AFTER_HEAD_IM;
2922 !!!next-token;
2923 redo B;
2924 } else {
2925 #
2926 }
2927 } elsif ($token->{tag_name} eq 'noscript') {
2928 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2929 pop @{$self->{open_elements}};
2930 $self->{insertion_mode} = IN_HEAD_IM;
2931 !!!next-token;
2932 redo B;
2933 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2934 !!!parse-error (type => 'unmatched end tag:noscript');
2935 ## Ignore the token ## ISSUE: An issue in the spec.
2936 !!!next-token;
2937 redo B;
2938 } else {
2939 #
2940 }
2941 } elsif ({
2942 body => 1, html => 1,
2943 }->{$token->{tag_name}}) {
2944 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2945 ## As if <head>
2946 !!!create-element ($self->{head_element}, 'head');
2947 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2948 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2949
2950 $self->{insertion_mode} = IN_HEAD_IM;
2951 ## Reprocess in the "in head" insertion mode...
2952 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2953 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2954 ## Ignore the token
2955 !!!next-token;
2956 redo B;
2957 }
2958
2959 #
2960 } elsif ({
2961 p => 1, br => 1,
2962 }->{$token->{tag_name}}) {
2963 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2964 ## As if <head>
2965 !!!create-element ($self->{head_element}, 'head');
2966 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2967 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2968
2969 $self->{insertion_mode} = IN_HEAD_IM;
2970 ## Reprocess in the "in head" insertion mode...
2971 }
2972
2973 #
2974 } else {
2975 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2976 #
2977 } else {
2978 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2979 ## Ignore the token
2980 !!!next-token;
2981 redo B;
2982 }
2983 }
2984
2985 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2986 ## As if </noscript>
2987 pop @{$self->{open_elements}};
2988 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
2989
2990 ## Reprocess in the "in head" insertion mode...
2991 ## As if </head>
2992 pop @{$self->{open_elements}};
2993
2994 ## Reprocess in the "after head" insertion mode...
2995 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2996 ## As if </head>
2997 pop @{$self->{open_elements}};
2998
2999 ## Reprocess in the "after head" insertion mode...
3000 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3001 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3002 ## Ignore the token ## ISSUE: An issue in the spec.
3003 !!!next-token;
3004 redo B;
3005 }
3006
3007 ## "after head" insertion mode
3008 ## As if <body>
3009 !!!insert-element ('body');
3010 $self->{insertion_mode} = IN_BODY_IM;
3011 ## reprocess
3012 redo B;
3013 } else {
3014 die "$0: $token->{type}: Unknown token type";
3015 }
3016
3017 ## ISSUE: An issue in the spec.
3018 } elsif ($self->{insertion_mode} & BODY_IMS) {
3019 if ($token->{type} == CHARACTER_TOKEN) {
3020 ## NOTE: There is a code clone of "character in body".
3021 $reconstruct_active_formatting_elements->($insert_to_current);
3022
3023 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3024
3025 !!!next-token;
3026 redo B;
3027 } elsif ($token->{type} == START_TAG_TOKEN) {
3028 if ({
3029 caption => 1, col => 1, colgroup => 1, tbody => 1,
3030 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3031 }->{$token->{tag_name}}) {
3032 if ($self->{insertion_mode} == IN_CELL_IM) {
3033 ## have an element in table scope
3034 my $tn;
3035 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3036 my $node = $self->{open_elements}->[$_];
3037 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3038 $tn = $node->[1];
3039 last INSCOPE;
3040 } elsif ({
3041 table => 1, html => 1,
3042 }->{$node->[1]}) {
3043 last INSCOPE;
3044 }
3045 } # INSCOPE
3046 unless (defined $tn) {
3047 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3048 ## Ignore the token
3049 !!!next-token;
3050 redo B;
3051 }
3052
3053 ## Close the cell
3054 !!!back-token; # <?>
3055 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3056 redo B;
3057 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3058 !!!parse-error (type => 'not closed:caption');
3059
3060 ## As if </caption>
3061 ## have a table element in table scope
3062 my $i;
3063 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3064 my $node = $self->{open_elements}->[$_];
3065 if ($node->[1] eq 'caption') {
3066 $i = $_;
3067 last INSCOPE;
3068 } elsif ({
3069 table => 1, html => 1,
3070 }->{$node->[1]}) {
3071 last INSCOPE;
3072 }
3073 } # INSCOPE
3074 unless (defined $i) {
3075 !!!parse-error (type => 'unmatched end tag:caption');
3076 ## Ignore the token
3077 !!!next-token;
3078 redo B;
3079 }
3080
3081 ## generate implied end tags
3082 if ({
3083 dd => 1, dt => 1, li => 1, p => 1,
3084 td => 1, th => 1, tr => 1,
3085 tbody => 1, tfoot=> 1, thead => 1,
3086 }->{$self->{open_elements}->[-1]->[1]}) {
3087 !!!back-token; # <?>
3088 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3089 !!!back-token;
3090 $token = {type => END_TAG_TOKEN,
3091 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3092 redo B;
3093 }
3094
3095 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3096 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3097 }
3098
3099 splice @{$self->{open_elements}}, $i;
3100
3101 $clear_up_to_marker->();
3102
3103 $self->{insertion_mode} = IN_TABLE_IM;
3104
3105 ## reprocess
3106 redo B;
3107 } else {
3108 #
3109 }
3110 } else {
3111 #
3112 }
3113 } elsif ($token->{type} == END_TAG_TOKEN) {
3114 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3115 if ($self->{insertion_mode} == IN_CELL_IM) {
3116 ## have an element in table scope
3117 my $i;
3118 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3119 my $node = $self->{open_elements}->[$_];
3120 if ($node->[1] eq $token->{tag_name}) {
3121 $i = $_;
3122 last INSCOPE;
3123 } elsif ({
3124 table => 1, html => 1,
3125 }->{$node->[1]}) {
3126 last INSCOPE;
3127 }
3128 } # INSCOPE
3129 unless (defined $i) {
3130 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3131 ## Ignore the token
3132 !!!next-token;
3133 redo B;
3134 }
3135
3136 ## generate implied end tags
3137 if ({
3138 dd => 1, dt => 1, li => 1, p => 1,
3139 td => ($token->{tag_name} eq 'th'),
3140 th => ($token->{tag_name} eq 'td'),
3141 tr => 1,
3142 tbody => 1, tfoot=> 1, thead => 1,
3143 }->{$self->{open_elements}->[-1]->[1]}) {
3144 !!!back-token;
3145 $token = {type => END_TAG_TOKEN,
3146 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3147 redo B;
3148 }
3149
3150 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3151 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3152 }
3153
3154 splice @{$self->{open_elements}}, $i;
3155
3156 $clear_up_to_marker->();
3157
3158 $self->{insertion_mode} = IN_ROW_IM;
3159
3160 !!!next-token;
3161 redo B;
3162 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3163 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3164 ## Ignore the token
3165 !!!next-token;
3166 redo B;
3167 } else {
3168 #
3169 }
3170 } elsif ($token->{tag_name} eq 'caption') {
3171 if ($self->{insertion_mode} == IN_CAPTION_IM) {
3172 ## have a table element in table scope
3173 my $i;
3174 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3175 my $node = $self->{open_elements}->[$_];
3176 if ($node->[1] eq $token->{tag_name}) {
3177 $i = $_;
3178 last INSCOPE;
3179 } elsif ({
3180 table => 1, html => 1,
3181 }->{$node->[1]}) {
3182 last INSCOPE;
3183 }
3184 } # INSCOPE
3185 unless (defined $i) {
3186 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3187 ## Ignore the token
3188 !!!next-token;
3189 redo B;
3190 }
3191
3192 ## generate implied end tags
3193 if ({
3194 dd => 1, dt => 1, li => 1, p => 1,
3195 td => 1, th => 1, tr => 1,
3196 tbody => 1, tfoot=> 1, thead => 1,
3197 }->{$self->{open_elements}->[-1]->[1]}) {
3198 !!!back-token;
3199 $token = {type => END_TAG_TOKEN,
3200 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3201 redo B;
3202 }
3203
3204 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3205 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3206 }
3207
3208 splice @{$self->{open_elements}}, $i;
3209
3210 $clear_up_to_marker->();
3211
3212 $self->{insertion_mode} = IN_TABLE_IM;
3213
3214 !!!next-token;
3215 redo B;
3216 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
3217 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3218 ## Ignore the token
3219 !!!next-token;
3220 redo B;
3221 } else {
3222 #
3223 }
3224 } elsif ({
3225 table => 1, tbody => 1, tfoot => 1,
3226 thead => 1, tr => 1,
3227 }->{$token->{tag_name}} and
3228 $self->{insertion_mode} == IN_CELL_IM) {
3229 ## have an element in table scope
3230 my $i;
3231 my $tn;
3232 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3233 my $node = $self->{open_elements}->[$_];
3234 if ($node->[1] eq $token->{tag_name}) {
3235 $i = $_;
3236 last INSCOPE;
3237 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3238 $tn = $node->[1];
3239 ## NOTE: There is exactly one |td| or |th| element
3240 ## in scope in the stack of open elements by definition.
3241 } elsif ({
3242 table => 1, html => 1,
3243 }->{$node->[1]}) {
3244 last INSCOPE;
3245 }
3246 } # INSCOPE
3247 unless (defined $i) {
3248 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3249 ## Ignore the token
3250 !!!next-token;
3251 redo B;
3252 }
3253
3254 ## Close the cell
3255 !!!back-token; # </?>
3256 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3257 redo B;
3258 } elsif ($token->{tag_name} eq 'table' and
3259 $self->{insertion_mode} == IN_CAPTION_IM) {
3260 !!!parse-error (type => 'not closed:caption');
3261
3262 ## As if </caption>
3263 ## have a table element in table scope
3264 my $i;
3265 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3266 my $node = $self->{open_elements}->[$_];
3267 if ($node->[1] eq 'caption') {
3268 $i = $_;
3269 last INSCOPE;
3270 } elsif ({
3271 table => 1, html => 1,
3272 }->{$node->[1]}) {
3273 last INSCOPE;
3274 }
3275 } # INSCOPE
3276 unless (defined $i) {
3277 !!!parse-error (type => 'unmatched end tag:caption');
3278 ## Ignore the token
3279 !!!next-token;
3280 redo B;
3281 }
3282
3283 ## generate implied end tags
3284 if ({
3285 dd => 1, dt => 1, li => 1, p => 1,
3286 td => 1, th => 1, tr => 1,
3287 tbody => 1, tfoot=> 1, thead => 1,
3288 }->{$self->{open_elements}->[-1]->[1]}) {
3289 !!!back-token; # </table>
3290 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3291 !!!back-token;
3292 $token = {type => END_TAG_TOKEN,
3293 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3294 redo B;
3295 }
3296
3297 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3298 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3299 }
3300
3301 splice @{$self->{open_elements}}, $i;
3302
3303 $clear_up_to_marker->();
3304
3305 $self->{insertion_mode} = IN_TABLE_IM;
3306
3307 ## reprocess
3308 redo B;
3309 } elsif ({
3310 body => 1, col => 1, colgroup => 1, html => 1,
3311 }->{$token->{tag_name}}) {
3312 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
3313 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3314 ## Ignore the token
3315 !!!next-token;
3316 redo B;
3317 } else {
3318 #
3319 }
3320 } elsif ({
3321 tbody => 1, tfoot => 1,
3322 thead => 1, tr => 1,
3323 }->{$token->{tag_name}} and
3324 $self->{insertion_mode} == IN_CAPTION_IM) {
3325 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3326 ## Ignore the token
3327 !!!next-token;
3328 redo B;
3329 } else {
3330 #
3331 }
3332 } else {
3333 die "$0: $token->{type}: Unknown token type";
3334 }
3335
3336 $insert = $insert_to_current;
3337 #
3338 } elsif ($self->{insertion_mode} & TABLE_IMS) {
3339 if ($token->{type} == CHARACTER_TOKEN) {
3340 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3341 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3342
3343 unless (length $token->{data}) {
3344 !!!next-token;
3345 redo B;
3346 }
3347 }
3348
3349 !!!parse-error (type => 'in table:#character');
3350
3351 ## As if in body, but insert into foster parent element
3352 ## ISSUE: Spec says that "whenever a node would be inserted
3353 ## into the current node" while characters might not
3354 ## result in a new Text node.
3355 $reconstruct_active_formatting_elements->($insert_to_foster);
3356
3357 if ({
3358 table => 1, tbody => 1, tfoot => 1,
3359 thead => 1, tr => 1,
3360 }->{$self->{open_elements}->[-1]->[1]}) {
3361 # MUST
3362 my $foster_parent_element;
3363 my $next_sibling;
3364 my $prev_sibling;
3365 OE: for (reverse 0..$#{$self->{open_elements}}) {
3366 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3367 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3368 if (defined $parent and $parent->node_type == 1) {
3369 $foster_parent_element = $parent;
3370 $next_sibling = $self->{open_elements}->[$_]->[0];
3371 $prev_sibling = $next_sibling->previous_sibling;
3372 } else {
3373 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3374 $prev_sibling = $foster_parent_element->last_child;
3375 }
3376 last OE;
3377 }
3378 } # OE
3379 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3380 $prev_sibling = $foster_parent_element->last_child
3381 unless defined $foster_parent_element;
3382 if (defined $prev_sibling and
3383 $prev_sibling->node_type == 3) {
3384 $prev_sibling->manakai_append_text ($token->{data});
3385 } else {
3386 $foster_parent_element->insert_before
3387 ($self->{document}->create_text_node ($token->{data}),
3388 $next_sibling);
3389 }
3390 } else {
3391 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3392 }
3393
3394 !!!next-token;
3395 redo B;
3396 } elsif ($token->{type} == START_TAG_TOKEN) {
3397 if ({
3398 tr => ($self->{insertion_mode} != IN_ROW_IM),
3399 th => 1, td => 1,
3400 }->{$token->{tag_name}}) {
3401 if ($self->{insertion_mode} == IN_TABLE_IM) {
3402 ## Clear back to table context
3403 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3404 $self->{open_elements}->[-1]->[1] ne 'html') {
3405 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3406 pop @{$self->{open_elements}};
3407 }
3408
3409 !!!insert-element ('tbody');
3410 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3411 ## reprocess in the "in table body" insertion mode...
3412 }
3413
3414 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3415 unless ($token->{tag_name} eq 'tr') {
3416 !!!parse-error (type => 'missing start tag:tr');
3417 }
3418
3419 ## Clear back to table body context
3420 while (not {
3421 tbody => 1, tfoot => 1, thead => 1, html => 1,
3422 }->{$self->{open_elements}->[-1]->[1]}) {
3423 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3424 pop @{$self->{open_elements}};
3425 }
3426
3427 $self->{insertion_mode} = IN_ROW_IM;
3428 if ($token->{tag_name} eq 'tr') {
3429 !!!insert-element ($token->{tag_name}, $token->{attributes});
3430 !!!next-token;
3431 redo B;
3432 } else {
3433 !!!insert-element ('tr');
3434 ## reprocess in the "in row" insertion mode
3435 }
3436 }
3437
3438 ## Clear back to table row context
3439 while (not {
3440 tr => 1, html => 1,
3441 }->{$self->{open_elements}->[-1]->[1]}) {
3442 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3443 pop @{$self->{open_elements}};
3444 }
3445
3446 !!!insert-element ($token->{tag_name}, $token->{attributes});
3447 $self->{insertion_mode} = IN_CELL_IM;
3448
3449 push @$active_formatting_elements, ['#marker', ''];
3450
3451 !!!next-token;
3452 redo B;
3453 } elsif ({
3454 caption => 1, col => 1, colgroup => 1,
3455 tbody => 1, tfoot => 1, thead => 1,
3456 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3457 }->{$token->{tag_name}}) {
3458 if ($self->{insertion_mode} == IN_ROW_IM) {
3459 ## As if </tr>
3460 ## have an element in table scope
3461 my $i;
3462 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3463 my $node = $self->{open_elements}->[$_];
3464 if ($node->[1] eq 'tr') {
3465 $i = $_;
3466 last INSCOPE;
3467 } elsif ({
3468 table => 1, html => 1,
3469 }->{$node->[1]}) {
3470 last INSCOPE;
3471 }
3472 } # INSCOPE
3473 unless (defined $i) {
3474 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
3475 ## Ignore the token
3476 !!!next-token;
3477 redo B;
3478 }
3479
3480 ## Clear back to table row context
3481 while (not {
3482 tr => 1, html => 1,
3483 }->{$self->{open_elements}->[-1]->[1]}) {
3484 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3485 pop @{$self->{open_elements}};
3486 }
3487
3488 pop @{$self->{open_elements}}; # tr
3489 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3490 if ($token->{tag_name} eq 'tr') {
3491 ## reprocess
3492 redo B;
3493 } else {
3494 ## reprocess in the "in table body" insertion mode...
3495 }
3496 }
3497
3498 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3499 ## have an element in table scope
3500 my $i;
3501 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3502 my $node = $self->{open_elements}->[$_];
3503 if ({
3504 tbody => 1, thead => 1, tfoot => 1,
3505 }->{$node->[1]}) {
3506 $i = $_;
3507 last INSCOPE;
3508 } elsif ({
3509 table => 1, html => 1,
3510 }->{$node->[1]}) {
3511 last INSCOPE;
3512 }
3513 } # INSCOPE
3514 unless (defined $i) {
3515 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3516 ## Ignore the token
3517 !!!next-token;
3518 redo B;
3519 }
3520
3521 ## Clear back to table body context
3522 while (not {
3523 tbody => 1, tfoot => 1, thead => 1, html => 1,
3524 }->{$self->{open_elements}->[-1]->[1]}) {
3525 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3526 pop @{$self->{open_elements}};
3527 }
3528
3529 ## As if <{current node}>
3530 ## have an element in table scope
3531 ## true by definition
3532
3533 ## Clear back to table body context
3534 ## nop by definition
3535
3536 pop @{$self->{open_elements}};
3537 $self->{insertion_mode} = IN_TABLE_IM;
3538 ## reprocess in "in table" insertion mode...
3539 }
3540
3541 if ($token->{tag_name} eq 'col') {
3542 ## Clear back to table context
3543 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3544 $self->{open_elements}->[-1]->[1] ne 'html') {
3545 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3546 pop @{$self->{open_elements}};
3547 }
3548
3549 !!!insert-element ('colgroup');
3550 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
3551 ## reprocess
3552 redo B;
3553 } elsif ({
3554 caption => 1,
3555 colgroup => 1,
3556 tbody => 1, tfoot => 1, thead => 1,
3557 }->{$token->{tag_name}}) {
3558 ## Clear back to table context
3559 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3560 $self->{open_elements}->[-1]->[1] ne 'html') {
3561 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3562 pop @{$self->{open_elements}};
3563 }
3564
3565 push @$active_formatting_elements, ['#marker', '']
3566 if $token->{tag_name} eq 'caption';
3567
3568 !!!insert-element ($token->{tag_name}, $token->{attributes});
3569 $self->{insertion_mode} = {
3570 caption => IN_CAPTION_IM,
3571 colgroup => IN_COLUMN_GROUP_IM,
3572 tbody => IN_TABLE_BODY_IM,
3573 tfoot => IN_TABLE_BODY_IM,
3574 thead => IN_TABLE_BODY_IM,
3575 }->{$token->{tag_name}};
3576 !!!next-token;
3577 redo B;
3578 } else {
3579 die "$0: in table: <>: $token->{tag_name}";
3580 }
3581 } elsif ($token->{tag_name} eq 'table') {
3582 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3583
3584 ## As if </table>
3585 ## have a table element in table scope
3586 my $i;
3587 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3588 my $node = $self->{open_elements}->[$_];
3589 if ($node->[1] eq 'table') {
3590 $i = $_;
3591 last INSCOPE;
3592 } elsif ({
3593 table => 1, html => 1,
3594 }->{$node->[1]}) {
3595 last INSCOPE;
3596 }
3597 } # INSCOPE
3598 unless (defined $i) {
3599 !!!parse-error (type => 'unmatched end tag:table');
3600 ## Ignore tokens </table><table>
3601 !!!next-token;
3602 redo B;
3603 }
3604
3605 ## generate implied end tags
3606 if ({
3607 dd => 1, dt => 1, li => 1, p => 1,
3608 td => 1, th => 1, tr => 1,
3609 tbody => 1, tfoot=> 1, thead => 1,
3610 }->{$self->{open_elements}->[-1]->[1]}) {
3611 !!!back-token; # <table>
3612 $token = {type => END_TAG_TOKEN, tag_name => 'table'};
3613 !!!back-token;
3614 $token = {type => END_TAG_TOKEN,
3615 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3616 redo B;
3617 }
3618
3619 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3620 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3621 }
3622
3623 splice @{$self->{open_elements}}, $i;
3624
3625 $self->_reset_insertion_mode;
3626
3627 ## reprocess
3628 redo B;
3629 } else {
3630 !!!parse-error (type => 'in table:'.$token->{tag_name});
3631
3632 $insert = $insert_to_foster;
3633 #
3634 }
3635 } elsif ($token->{type} == END_TAG_TOKEN) {
3636 if ($token->{tag_name} eq 'tr' and
3637 $self->{insertion_mode} == IN_ROW_IM) {
3638 ## have an element in table scope
3639 my $i;
3640 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3641 my $node = $self->{open_elements}->[$_];
3642 if ($node->[1] eq $token->{tag_name}) {
3643 $i = $_;
3644 last INSCOPE;
3645 } elsif ({
3646 table => 1, html => 1,
3647 }->{$node->[1]}) {
3648 last INSCOPE;
3649 }
3650 } # INSCOPE
3651 unless (defined $i) {
3652 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3653 ## Ignore the token
3654 !!!next-token;
3655 redo B;
3656 }
3657
3658 ## Clear back to table row context
3659 while (not {
3660 tr => 1, html => 1,
3661 }->{$self->{open_elements}->[-1]->[1]}) {
3662 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3663 pop @{$self->{open_elements}};
3664 }
3665
3666 pop @{$self->{open_elements}}; # tr
3667 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3668 !!!next-token;
3669 redo B;
3670 } elsif ($token->{tag_name} eq 'table') {
3671 if ($self->{insertion_mode} == IN_ROW_IM) {
3672 ## As if </tr>
3673 ## have an element in table scope
3674 my $i;
3675 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3676 my $node = $self->{open_elements}->[$_];
3677 if ($node->[1] eq 'tr') {
3678 $i = $_;
3679 last INSCOPE;
3680 } elsif ({
3681 table => 1, html => 1,
3682 }->{$node->[1]}) {
3683 last INSCOPE;
3684 }
3685 } # INSCOPE
3686 unless (defined $i) {
3687 !!!parse-error (type => 'unmatched end tag:'.$token->{type});
3688 ## Ignore the token
3689 !!!next-token;
3690 redo B;
3691 }
3692
3693 ## Clear back to table row context
3694 while (not {
3695 tr => 1, html => 1,
3696 }->{$self->{open_elements}->[-1]->[1]}) {
3697 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3698 pop @{$self->{open_elements}};
3699 }
3700
3701 pop @{$self->{open_elements}}; # tr
3702 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3703 ## reprocess in the "in table body" insertion mode...
3704 }
3705
3706 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3707 ## have an element in table scope
3708 my $i;
3709 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3710 my $node = $self->{open_elements}->[$_];
3711 if ({
3712 tbody => 1, thead => 1, tfoot => 1,
3713 }->{$node->[1]}) {
3714 $i = $_;
3715 last INSCOPE;
3716 } elsif ({
3717 table => 1, html => 1,
3718 }->{$node->[1]}) {
3719 last INSCOPE;
3720 }
3721 } # INSCOPE
3722 unless (defined $i) {
3723 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3724 ## Ignore the token
3725 !!!next-token;
3726 redo B;
3727 }
3728
3729 ## Clear back to table body context
3730 while (not {
3731 tbody => 1, tfoot => 1, thead => 1, html => 1,
3732 }->{$self->{open_elements}->[-1]->[1]}) {
3733 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3734 pop @{$self->{open_elements}};
3735 }
3736
3737 ## As if <{current node}>
3738 ## have an element in table scope
3739 ## true by definition
3740
3741 ## Clear back to table body context
3742 ## nop by definition
3743
3744 pop @{$self->{open_elements}};
3745 $self->{insertion_mode} = IN_TABLE_IM;
3746 ## reprocess in the "in table" insertion mode...
3747 }
3748
3749 ## have a table element in table scope
3750 my $i;
3751 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3752 my $node = $self->{open_elements}->[$_];
3753 if ($node->[1] eq $token->{tag_name}) {
3754 $i = $_;
3755 last INSCOPE;
3756 } elsif ({
3757 table => 1, html => 1,
3758 }->{$node->[1]}) {
3759 last INSCOPE;
3760 }
3761 } # INSCOPE
3762 unless (defined $i) {
3763 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3764 ## Ignore the token
3765 !!!next-token;
3766 redo B;
3767 }
3768
3769 ## generate implied end tags
3770 if ({
3771 dd => 1, dt => 1, li => 1, p => 1,
3772 td => 1, th => 1, tr => 1,
3773 tbody => 1, tfoot=> 1, thead => 1,
3774 }->{$self->{open_elements}->[-1]->[1]}) {
3775 !!!back-token;
3776 $token = {type => END_TAG_TOKEN,
3777 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3778 redo B;
3779 }
3780
3781 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3782 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3783 }
3784
3785 splice @{$self->{open_elements}}, $i;
3786
3787 $self->_reset_insertion_mode;
3788
3789 !!!next-token;
3790 redo B;
3791 } elsif ({
3792 tbody => 1, tfoot => 1, thead => 1,
3793 }->{$token->{tag_name}} and
3794 $self->{insertion_mode} & ROW_IMS) {
3795 if ($self->{insertion_mode} == IN_ROW_IM) {
3796 ## have an element in table scope
3797 my $i;
3798 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3799 my $node = $self->{open_elements}->[$_];
3800 if ($node->[1] eq $token->{tag_name}) {
3801 $i = $_;
3802 last INSCOPE;
3803 } elsif ({
3804 table => 1, html => 1,
3805 }->{$node->[1]}) {
3806 last INSCOPE;
3807 }
3808 } # INSCOPE
3809 unless (defined $i) {
3810 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3811 ## Ignore the token
3812 !!!next-token;
3813 redo B;
3814 }
3815
3816 ## As if </tr>
3817 ## have an element in table scope
3818 undef $i; ## reset the index declared for the scope check above instead of redeclaring it in the same block
3819 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { ## walk the stack of open elements from the current node downwards
3820 my $node = $self->{open_elements}->[$_];
3821 if ($node->[1] eq 'tr') {
3822 $i = $_; ## |tr| found before a scope boundary
3823 last INSCOPE;
3824 } elsif ({
3825 table => 1, html => 1,
3826 }->{$node->[1]}) {
3827 last INSCOPE; ## table scope boundary reached; $i stays undefined
3828 }
3829 } # INSCOPE
3830 unless (defined $i) {
3831 !!!parse-error (type => 'unmatched end tag:tr');
3832 ## Ignore the token
3833 !!!next-token;
3834 redo B;
3835 }
3836
3837 ## Clear back to table row context
3838 while (not {
3839 tr => 1, html => 1,
3840 }->{$self->{open_elements}->[-1]->[1]}) {
3841 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3842 pop @{$self->{open_elements}};
3843 }
3844
3845 pop @{$self->{open_elements}}; # tr
3846 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3847 ## reprocess in the "in table body" insertion mode...
3848 }
3849
3850 ## have an element in table scope
3851 my $i;
3852 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3853 my $node = $self->{open_elements}->[$_];
3854 if ($node->[1] eq $token->{tag_name}) {
3855 $i = $_;
3856 last INSCOPE;
3857 } elsif ({
3858 table => 1, html => 1,
3859 }->{$node->[1]}) {
3860 last INSCOPE;
3861 }
3862 } # INSCOPE
3863 unless (defined $i) {
3864 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3865 ## Ignore the token
3866 !!!next-token;
3867 redo B;
3868 }
3869
3870 ## Clear back to table body context
3871 while (not {
3872 tbody => 1, tfoot => 1, thead => 1, html => 1,
3873 }->{$self->{open_elements}->[-1]->[1]}) {
3874 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3875 pop @{$self->{open_elements}};
3876 }
3877
3878 pop @{$self->{open_elements}};
3879 $self->{insertion_mode} = IN_TABLE_IM;
3880 !!!next-token;
3881 redo B;
3882 } elsif ({
3883 body => 1, caption => 1, col => 1, colgroup => 1,
3884 html => 1, td => 1, th => 1,
3885 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3886 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
3887 }->{$token->{tag_name}}) {
3888 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3889 ## Ignore the token
3890 !!!next-token;
3891 redo B;
3892 } else {
3893 !!!parse-error (type => 'in table:/'.$token->{tag_name});
3894
3895 $insert = $insert_to_foster;
3896 #
3897 }
3898 } else {
3899 die "$0: $token->{type}: Unknown token type";
3900 }
3901 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
3902 if ($token->{type} == CHARACTER_TOKEN) {
3903 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3904 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3905 unless (length $token->{data}) {
3906 !!!next-token;
3907 redo B;
3908 }
3909 }
3910
3911 #
3912 } elsif ($token->{type} == START_TAG_TOKEN) {
3913 if ($token->{tag_name} eq 'col') {
3914 !!!insert-element ($token->{tag_name}, $token->{attributes});
3915 pop @{$self->{open_elements}};
3916 !!!next-token;
3917 redo B;
3918 } else {
3919 #
3920 }
3921 } elsif ($token->{type} == END_TAG_TOKEN) {
3922 if ($token->{tag_name} eq 'colgroup') {
3923 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3924 !!!parse-error (type => 'unmatched end tag:colgroup');
3925 ## Ignore the token
3926 !!!next-token;
3927 redo B;
3928 } else {
3929 pop @{$self->{open_elements}}; # colgroup
3930 $self->{insertion_mode} = IN_TABLE_IM;
3931 !!!next-token;
3932 redo B;
3933 }
3934 } elsif ($token->{tag_name} eq 'col') {
3935 !!!parse-error (type => 'unmatched end tag:col');
3936 ## Ignore the token
3937 !!!next-token;
3938 redo B;
3939 } else {
3940 #
3941 }
3942 } else {
3943 #
3944 }
3945
3946 ## As if </colgroup>
3947 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3948 !!!parse-error (type => 'unmatched end tag:colgroup');
3949 ## Ignore the token
3950 !!!next-token;
3951 redo B;
3952 } else {
3953 pop @{$self->{open_elements}}; # colgroup
3954 $self->{insertion_mode} = IN_TABLE_IM;
3955 ## reprocess
3956 redo B;
3957 }
3958 } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
3959 if ($token->{type} == CHARACTER_TOKEN) {
3960 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3961 !!!next-token;
3962 redo B;
3963 } elsif ($token->{type} == START_TAG_TOKEN) {
3964 if ($token->{tag_name} eq 'option') {
3965 if ($self->{open_elements}->[-1]->[1] eq 'option') {
3966 ## As if </option>
3967 pop @{$self->{open_elements}};
3968 }
3969
3970 !!!insert-element ($token->{tag_name}, $token->{attributes});
3971 !!!next-token;
3972 redo B;
3973 } elsif ($token->{tag_name} eq 'optgroup') {
3974 if ($self->{open_elements}->[-1]->[1] eq 'option') {
3975 ## As if </option>
3976 pop @{$self->{open_elements}};
3977 }
3978
3979 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
3980 ## As if </optgroup>
3981 pop @{$self->{open_elements}};
3982 }
3983
3984 !!!insert-element ($token->{tag_name}, $token->{attributes});
3985 !!!next-token;
3986 redo B;
3987 } elsif ($token->{tag_name} eq 'select') {
3988 !!!parse-error (type => 'not closed:select');
3989 ## As if </select> instead
3990 ## have an element in table scope
3991 my $i;
3992 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3993 my $node = $self->{open_elements}->[$_];
3994 if ($node->[1] eq $token->{tag_name}) {
3995 $i = $_;
3996 last INSCOPE;
3997 } elsif ({
3998 table => 1, html => 1,
3999 }->{$node->[1]}) {
4000 last INSCOPE;
4001 }
4002 } # INSCOPE
4003 unless (defined $i) {
4004 !!!parse-error (type => 'unmatched end tag:select');
4005 ## Ignore the token
4006 !!!next-token;
4007 redo B;
4008 }
4009
4010 splice @{$self->{open_elements}}, $i;
4011
4012 $self->_reset_insertion_mode;
4013
4014 !!!next-token;
4015 redo B;
4016 } else {
4017 !!!parse-error (type => 'in select:'.$token->{tag_name});
4018 ## Ignore the token
4019 !!!next-token;
4020 redo B;
4021 }
4022 } elsif ($token->{type} == END_TAG_TOKEN) {
4023 if ($token->{tag_name} eq 'optgroup') {
4024 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4025 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4026 ## As if </option>
4027 splice @{$self->{open_elements}}, -2;
4028 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4029 pop @{$self->{open_elements}};
4030 } else {
4031 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4032 ## Ignore the token
4033 }
4034 !!!next-token;
4035 redo B;
4036 } elsif ($token->{tag_name} eq 'option') {
4037 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4038 pop @{$self->{open_elements}};
4039 } else {
4040 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4041 ## Ignore the token
4042 }
4043 !!!next-token;
4044 redo B;
4045 } elsif ($token->{tag_name} eq 'select') {
4046 ## have an element in table scope
4047 my $i;
4048 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4049 my $node = $self->{open_elements}->[$_];
4050 if ($node->[1] eq $token->{tag_name}) {
4051 $i = $_;
4052 last INSCOPE;
4053 } elsif ({
4054 table => 1, html => 1,
4055 }->{$node->[1]}) {
4056 last INSCOPE;
4057 }
4058 } # INSCOPE
4059 unless (defined $i) {
4060 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4061 ## Ignore the token
4062 !!!next-token;
4063 redo B;
4064 }
4065
4066 splice @{$self->{open_elements}}, $i;
4067
4068 $self->_reset_insertion_mode;
4069
4070 !!!next-token;
4071 redo B;
4072 } elsif ({
4073 caption => 1, table => 1, tbody => 1,
4074 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4075 }->{$token->{tag_name}}) {
4076 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4077
4078 ## have an element in table scope
4079 my $i;
4080 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4081 my $node = $self->{open_elements}->[$_];
4082 if ($node->[1] eq $token->{tag_name}) {
4083 $i = $_;
4084 last INSCOPE;
4085 } elsif ({
4086 table => 1, html => 1,
4087 }->{$node->[1]}) {
4088 last INSCOPE;
4089 }
4090 } # INSCOPE
4091 unless (defined $i) {
4092 ## Ignore the token
4093 !!!next-token;
4094 redo B;
4095 }
4096
4097 ## As if </select>
4098 ## have an element in table scope
4099 undef $i;
4100 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4101 my $node = $self->{open_elements}->[$_];
4102 if ($node->[1] eq 'select') {
4103 $i = $_;
4104 last INSCOPE;
4105 } elsif ({
4106 table => 1, html => 1,
4107 }->{$node->[1]}) {
4108 last INSCOPE;
4109 }
4110 } # INSCOPE
4111 unless (defined $i) {
4112 !!!parse-error (type => 'unmatched end tag:select');
4113 ## Ignore the </select> token
4114 !!!next-token; ## TODO: ok?
4115 redo B;
4116 }
4117
4118 splice @{$self->{open_elements}}, $i;
4119
4120 $self->_reset_insertion_mode;
4121
4122 ## reprocess
4123 redo B;
4124 } else {
4125 !!!parse-error (type => 'in select:/'.$token->{tag_name});
4126 ## Ignore the token
4127 !!!next-token;
4128 redo B;
4129 }
4130 } else {
4131 die "$0: $token->{type}: Unknown token type";
4132 }
4133 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4134 if ($token->{type} == CHARACTER_TOKEN) {
4135 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { ## strip leading white space from the character token
4136 my $data = $1; ## keep a private copy of the stripped white space before any other code runs
4137 ## As if in body
4138 $reconstruct_active_formatting_elements->($insert_to_current);
4139
4140 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data); ## append the saved copy rather than re-reading $1 after the call above
4141
4142 unless (length $token->{data}) { ## the token was white space only; nothing left to reprocess
4143 !!!next-token;
4144 redo B;
4145 }
4146 }
4147
4148 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4149 !!!parse-error (type => 'after html:#character');
4150
4151 ## Reprocess in the "main" phase, "after body" insertion mode...
4152 }
4153
4154 ## "after body" insertion mode
4155 !!!parse-error (type => 'after body:#character');
4156
4157 $self->{insertion_mode} = IN_BODY_IM;
4158 ## reprocess
4159 redo B;
4160 } elsif ($token->{type} == START_TAG_TOKEN) {
4161 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4162 !!!parse-error (type => 'after html:'.$token->{tag_name});
4163
4164 ## Reprocess in the "main" phase, "after body" insertion mode...
4165 }
4166
4167 ## "after body" insertion mode
4168 !!!parse-error (type => 'after body:'.$token->{tag_name});
4169
4170 $self->{insertion_mode} = IN_BODY_IM;
4171 ## reprocess
4172 redo B;
4173 } elsif ($token->{type} == END_TAG_TOKEN) {
4174 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4175 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4176
4177 $self->{insertion_mode} = AFTER_BODY_IM;
4178 ## Reprocess in the "main" phase, "after body" insertion mode...
4179 }
4180
4181 ## "after body" insertion mode
4182 if ($token->{tag_name} eq 'html') {
4183 if (defined $self->{inner_html_node}) {
4184 !!!parse-error (type => 'unmatched end tag:html');
4185 ## Ignore the token
4186 !!!next-token;
4187 redo B;
4188 } else {
4189 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
4190 !!!next-token;
4191 redo B;
4192 }
4193 } else {
4194 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4195
4196 $self->{insertion_mode} = IN_BODY_IM;
4197 ## reprocess
4198 redo B;
4199 }
4200 } else {
4201 die "$0: $token->{type}: Unknown token type";
4202 }
4203 } elsif ($self->{insertion_mode} & FRAME_IMS) {
4204 if ($token->{type} == CHARACTER_TOKEN) {
4205 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4206 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4207
4208 unless (length $token->{data}) {
4209 !!!next-token;
4210 redo B;
4211 }
4212 }
4213
4214 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4215 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4216 !!!parse-error (type => 'in frameset:#character');
4217 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
4218 !!!parse-error (type => 'after frameset:#character');
4219 } else { # "after html frameset"
4220 !!!parse-error (type => 'after html:#character');
4221
4222 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4223 ## Reprocess in the "main" phase, "after frameset"...
4224 !!!parse-error (type => 'after frameset:#character');
4225 }
4226
4227 ## Ignore the token.
4228 if (length $token->{data}) {
4229 ## reprocess the rest of characters
4230 } else {
4231 !!!next-token;
4232 }
4233 redo B;
4234 }
4235
4236 die qq[$0: Character "$token->{data}"];
4237 } elsif ($token->{type} == START_TAG_TOKEN) {
4238 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4239 !!!parse-error (type => 'after html:'.$token->{tag_name});
4240
4241 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4242 ## Process in the "main" phase, "after frameset" insertion mode...
4243 }
4244
4245 if ($token->{tag_name} eq 'frameset' and
4246 $self->{insertion_mode} == IN_FRAMESET_IM) {
4247 !!!insert-element ($token->{tag_name}, $token->{attributes});
4248 !!!next-token;
4249 redo B;
4250 } elsif ($token->{tag_name} eq 'frame' and
4251 $self->{insertion_mode} == IN_FRAMESET_IM) {
4252 !!!insert-element ($token->{tag_name}, $token->{attributes});
4253 pop @{$self->{open_elements}};
4254 !!!next-token;
4255 redo B;
4256 } elsif ($token->{tag_name} eq 'noframes') {
4257 ## NOTE: As if in body.
4258 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4259 redo B;
4260 } else {
4261 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4262 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4263 } else {
4264 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4265 }
4266 ## Ignore the token
4267 !!!next-token;
4268 redo B;
4269 }
4270 } elsif ($token->{type} == END_TAG_TOKEN) {
4271 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4272 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4273
4274 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4275 ## Process in the "main" phase, "after frameset" insertion mode...
4276 }
4277
4278 if ($token->{tag_name} eq 'frameset' and
4279 $self->{insertion_mode} == IN_FRAMESET_IM) {
4280 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4281 @{$self->{open_elements}} == 1) {
4282 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4283 ## Ignore the token
4284 !!!next-token;
4285 } else {
4286 pop @{$self->{open_elements}};
4287 !!!next-token;
4288 }
4289
4290 if (not defined $self->{inner_html_node} and
4291 $self->{open_elements}->[-1]->[1] ne 'frameset') {
4292 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4293 }
4294 redo B;
4295 } elsif ($token->{tag_name} eq 'html' and
4296 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
4297 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
4298 !!!next-token;
4299 redo B;
4300 } else {
4301 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4302 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4303 } else {
4304 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4305 }
4306 ## Ignore the token
4307 !!!next-token;
4308 redo B;
4309 }
4310 } else {
4311 die "$0: $token->{type}: Unknown token type";
4312 }
4313
4314 ## ISSUE: An issue in spec here
4315 } else {
4316 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4317 }
4318
4319 ## "in body" insertion mode
4320 if ($token->{type} == START_TAG_TOKEN) {
4321 if ($token->{tag_name} eq 'script') {
4322 ## NOTE: This is an "as if in head" code clone
4323 $script_start_tag->($insert);
4324 redo B;
4325 } elsif ($token->{tag_name} eq 'style') {
4326 ## NOTE: This is an "as if in head" code clone
4327 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4328 redo B;
4329 } elsif ({
4330 base => 1, link => 1,
4331 }->{$token->{tag_name}}) {
4332 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4333 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4334 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4335 !!!next-token;
4336 redo B;
4337 } elsif ($token->{tag_name} eq 'meta') {
4338 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4339 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4340 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4341
4342 unless ($self->{confident}) {
4343 my $charset;
4344 if ($token->{attributes}->{charset}) { ## TODO: And if supported
4345 $charset = $token->{attributes}->{charset}->{value};
4346 }
4347 if ($token->{attributes}->{'http-equiv'}) {
4348 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4349 if ($token->{attributes}->{'http-equiv'}->{value}
4350 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
4351 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4352 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4353 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
4354 } ## TODO: And if supported
4355 }
4356 ## TODO: Change the encoding
4357 }
4358
4359 !!!next-token;
4360 redo B;
4361 } elsif ($token->{tag_name} eq 'title') {
4362 !!!parse-error (type => 'in body:title');
4363 ## NOTE: This is an "as if in head" code clone
4364 $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4365 if (defined $self->{head_element}) {
4366 $self->{head_element}->append_child ($_[0]);
4367 } else {
4368 $insert->($_[0]);
4369 }
4370 });
4371 redo B;
4372 } elsif ($token->{tag_name} eq 'body') {
4373 !!!parse-error (type => 'in body:body');
4374
4375 if (@{$self->{open_elements}} == 1 or
4376 $self->{open_elements}->[1]->[1] ne 'body') {
4377 ## Ignore the token
4378 } else {
4379 my $body_el = $self->{open_elements}->[1]->[0];
4380 for my $attr_name (keys %{$token->{attributes}}) {
4381 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4382 $body_el->set_attribute_ns
4383 (undef, [undef, $attr_name],
4384 $token->{attributes}->{$attr_name}->{value});
4385 }
4386 }
4387 }
4388 !!!next-token;
4389 redo B;
4390 } elsif ({
4391 address => 1, blockquote => 1, center => 1, dir => 1,
4392 div => 1, dl => 1, fieldset => 1, listing => 1,
4393 menu => 1, ol => 1, p => 1, ul => 1,
4394 pre => 1,
4395 }->{$token->{tag_name}}) {
4396 ## has a p element in scope
4397 INSCOPE: for (reverse @{$self->{open_elements}}) {
4398 if ($_->[1] eq 'p') {
4399 !!!back-token;
4400 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4401 redo B;
4402 } elsif ({
4403 table => 1, caption => 1, td => 1, th => 1,
4404 button => 1, marquee => 1, object => 1, html => 1,
4405 }->{$_->[1]}) {
4406 last INSCOPE;
4407 }
4408 } # INSCOPE
4409
4410 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4411 if ($token->{tag_name} eq 'pre') {
4412 !!!next-token;
4413 if ($token->{type} == CHARACTER_TOKEN) {
4414 $token->{data} =~ s/^\x0A//;
4415 unless (length $token->{data}) {
4416 !!!next-token;
4417 }
4418 }
4419 } else {
4420 !!!next-token;
4421 }
4422 redo B;
4423 } elsif ($token->{tag_name} eq 'form') {
4424 if (defined $self->{form_element}) {
4425 !!!parse-error (type => 'in form:form');
4426 ## Ignore the token
4427 !!!next-token;
4428 redo B;
4429 } else {
4430 ## has a p element in scope
4431 INSCOPE: for (reverse @{$self->{open_elements}}) {
4432 if ($_->[1] eq 'p') {
4433 !!!back-token;
4434 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4435 redo B;
4436 } elsif ({
4437 table => 1, caption => 1, td => 1, th => 1,
4438 button => 1, marquee => 1, object => 1, html => 1,
4439 }->{$_->[1]}) {
4440 last INSCOPE;
4441 }
4442 } # INSCOPE
4443
4444 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4445 $self->{form_element} = $self->{open_elements}->[-1]->[0];
4446 !!!next-token;
4447 redo B;
4448 }
4449 } elsif ($token->{tag_name} eq 'li') {
4450 ## has a p element in scope
4451 INSCOPE: for (reverse @{$self->{open_elements}}) {
4452 if ($_->[1] eq 'p') {
4453 !!!back-token;
4454 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4455 redo B;
4456 } elsif ({
4457 table => 1, caption => 1, td => 1, th => 1,
4458 button => 1, marquee => 1, object => 1, html => 1,
4459 }->{$_->[1]}) {
4460 last INSCOPE;
4461 }
4462 } # INSCOPE
4463
4464 ## Step 1
4465 my $i = -1;
4466 my $node = $self->{open_elements}->[$i];
4467 LI: {
4468 ## Step 2
4469 if ($node->[1] eq 'li') {
4470 if ($i != -1) {
4471 !!!parse-error (type => 'end tag missing:'.
4472 $self->{open_elements}->[-1]->[1]);
4473 }
4474 splice @{$self->{open_elements}}, $i;
4475 last LI;
4476 }
4477
4478 ## Step 3
4479 if (not $formatting_category->{$node->[1]} and
4480 #not $phrasing_category->{$node->[1]} and
4481 ($special_category->{$node->[1]} or
4482 $scoping_category->{$node->[1]}) and
4483 $node->[1] ne 'address' and $node->[1] ne 'div') {
4484 last LI;
4485 }
4486
4487 ## Step 4
4488 $i--;
4489 $node = $self->{open_elements}->[$i];
4490 redo LI;
4491 } # LI
4492
4493 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4494 !!!next-token;
4495 redo B;
4496 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
4497 ## has a p element in scope
4498 INSCOPE: for (reverse @{$self->{open_elements}}) {
4499 if ($_->[1] eq 'p') {
4500 !!!back-token;
4501 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4502 redo B;
4503 } elsif ({
4504 table => 1, caption => 1, td => 1, th => 1,
4505 button => 1, marquee => 1, object => 1, html => 1,
4506 }->{$_->[1]}) {
4507 last INSCOPE;
4508 }
4509 } # INSCOPE
4510
4511 ## Step 1
4512 my $i = -1;
4513 my $node = $self->{open_elements}->[$i];
4514 LI: {
4515 ## Step 2
4516 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
4517 if ($i != -1) {
4518 !!!parse-error (type => 'end tag missing:'.
4519 $self->{open_elements}->[-1]->[1]);
4520 }
4521 splice @{$self->{open_elements}}, $i;
4522 last LI;
4523 }
4524
4525 ## Step 3
4526 if (not $formatting_category->{$node->[1]} and
4527 #not $phrasing_category->{$node->[1]} and
4528 ($special_category->{$node->[1]} or
4529 $scoping_category->{$node->[1]}) and
4530 $node->[1] ne 'address' and $node->[1] ne 'div') {
4531 last LI;
4532 }
4533
4534 ## Step 4
4535 $i--;
4536 $node = $self->{open_elements}->[$i];
4537 redo LI;
4538 } # LI
4539
4540 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4541 !!!next-token;
4542 redo B;
4543 } elsif ($token->{tag_name} eq 'plaintext') {
4544 ## has a p element in scope
4545 INSCOPE: for (reverse @{$self->{open_elements}}) {
4546 if ($_->[1] eq 'p') {
4547 !!!back-token;
4548 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4549 redo B;
4550 } elsif ({
4551 table => 1, caption => 1, td => 1, th => 1,
4552 button => 1, marquee => 1, object => 1, html => 1,
4553 }->{$_->[1]}) {
4554 last INSCOPE;
4555 }
4556 } # INSCOPE
4557
4558 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4559
4560 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
4561
4562 !!!next-token;
4563 redo B;
4564 } elsif ({
4565 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4566 }->{$token->{tag_name}}) {
4567 ## has a p element in scope
4568 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4569 my $node = $self->{open_elements}->[$_];
4570 if ($node->[1] eq 'p') {
4571 !!!back-token;
4572 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4573 redo B;
4574 } elsif ({
4575 table => 1, caption => 1, td => 1, th => 1,
4576 button => 1, marquee => 1, object => 1, html => 1,
4577 }->{$node->[1]}) {
4578 last INSCOPE;
4579 }
4580 } # INSCOPE
4581
4582 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
4583 ## has an element in scope
4584 #my $i;
4585 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4586 # my $node = $self->{open_elements}->[$_];
4587 # if ({
4588 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4589 # }->{$node->[1]}) {
4590 # $i = $_;
4591 # last INSCOPE;
4592 # } elsif ({
4593 # table => 1, caption => 1, td => 1, th => 1,
4594 # button => 1, marquee => 1, object => 1, html => 1,
4595 # }->{$node->[1]}) {
4596 # last INSCOPE;
4597 # }
4598 #} # INSCOPE
4599 #
4600 #if (defined $i) {
4601 # !!! parse-error (type => 'in hn:hn');
4602 # splice @{$self->{open_elements}}, $i;
4603 #}
4604
4605 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4606
4607 !!!next-token;
4608 redo B;
4609 } elsif ($token->{tag_name} eq 'a') {
4610 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
4611 my $node = $active_formatting_elements->[$i];
4612 if ($node->[1] eq 'a') {
4613 !!!parse-error (type => 'in a:a');
4614
4615 !!!back-token;
4616 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
4617 $formatting_end_tag->($token->{tag_name});
4618
4619 AFE2: for (reverse 0..$#$active_formatting_elements) {
4620 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4621 splice @$active_formatting_elements, $_, 1;
4622 last AFE2;
4623 }
4624 } # AFE2
4625 OE: for (reverse 0..$#{$self->{open_elements}}) {
4626 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
4627 splice @{$self->{open_elements}}, $_, 1;
4628 last OE;
4629 }
4630 } # OE
4631 last AFE;
4632 } elsif ($node->[0] eq '#marker') {
4633 last AFE;
4634 }
4635 } # AFE
4636
4637 $reconstruct_active_formatting_elements->($insert_to_current);
4638
4639 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4640 push @$active_formatting_elements, $self->{open_elements}->[-1];
4641
4642 !!!next-token;
4643 redo B;
4644 } elsif ({ ## start tag of a formatting element
4645 b => 1, big => 1, em => 1, font => 1, i => 1,
4646 s => 1, small => 1, strike => 1, ## NOTE: |strike| has to be listed here, otherwise it is handled by the generic branch and never recorded below
4647 strong => 1, tt => 1, u => 1,
4648 }->{$token->{tag_name}}) {
4649 $reconstruct_active_formatting_elements->($insert_to_current);
4650
4651 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4652 push @$active_formatting_elements, $self->{open_elements}->[-1]; ## record the element just pushed onto the stack of open elements as an active formatting element
4653
4654 !!!next-token;
4655 redo B;
4656 } elsif ($token->{tag_name} eq 'nobr') {
4657 $reconstruct_active_formatting_elements->($insert_to_current);
4658
4659 ## has a |nobr| element in scope
4660 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4661 my $node = $self->{open_elements}->[$_];
4662 if ($node->[1] eq 'nobr') {
4663 !!!parse-error (type => 'in nobr:nobr');
4664 !!!back-token;
4665 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
4666 redo B;
4667 } elsif ({
4668 table => 1, caption => 1, td => 1, th => 1,
4669 button => 1, marquee => 1, object => 1, html => 1,
4670 }->{$node->[1]}) {
4671 last INSCOPE;
4672 }
4673 } # INSCOPE
4674
4675 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4676 push @$active_formatting_elements, $self->{open_elements}->[-1];
4677
4678 !!!next-token;
4679 redo B;
4680 } elsif ($token->{tag_name} eq 'button') {
4681 ## has a button element in scope
4682 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4683 my $node = $self->{open_elements}->[$_];
4684 if ($node->[1] eq 'button') {
4685 !!!parse-error (type => 'in button:button');
4686 !!!back-token;
4687 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
4688 redo B;
4689 } elsif ({
4690 table => 1, caption => 1, td => 1, th => 1,
4691 button => 1, marquee => 1, object => 1, html => 1,
4692 }->{$node->[1]}) {
4693 last INSCOPE;
4694 }
4695 } # INSCOPE
4696
4697 $reconstruct_active_formatting_elements->($insert_to_current);
4698
4699 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4700 push @$active_formatting_elements, ['#marker', ''];
4701
4702 !!!next-token;
4703 redo B;
4704 } elsif ($token->{tag_name} eq 'marquee' or
4705 $token->{tag_name} eq 'object') {
4706 $reconstruct_active_formatting_elements->($insert_to_current);
4707
4708 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4709 push @$active_formatting_elements, ['#marker', ''];
4710
4711 !!!next-token;
4712 redo B;
4713 } elsif ($token->{tag_name} eq 'xmp') {
4714 $reconstruct_active_formatting_elements->($insert_to_current);
4715 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4716 redo B;
4717 } elsif ($token->{tag_name} eq 'table') {
4718 ## has a p element in scope
4719 INSCOPE: for (reverse @{$self->{open_elements}}) {
4720 if ($_->[1] eq 'p') {
4721 !!!back-token;
4722 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4723 redo B;
4724 } elsif ({
4725 table => 1, caption => 1, td => 1, th => 1,
4726 button => 1, marquee => 1, object => 1, html => 1,
4727 }->{$_->[1]}) {
4728 last INSCOPE;
4729 }
4730 } # INSCOPE
4731
4732 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4733
4734 $self->{insertion_mode} = IN_TABLE_IM;
4735
4736 !!!next-token;
4737 redo B;
4738 } elsif ({
4739 area => 1, basefont => 1, bgsound => 1, br => 1,
4740 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
4741 image => 1,
4742 }->{$token->{tag_name}}) {
4743 if ($token->{tag_name} eq 'image') {
4744 !!!parse-error (type => 'image');
4745 $token->{tag_name} = 'img';
4746 }
4747
4748 ## NOTE: There is an "as if <br>" code clone.
4749 $reconstruct_active_formatting_elements->($insert_to_current);
4750
4751 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4752 pop @{$self->{open_elements}};
4753
4754 !!!next-token;
4755 redo B;
4756 } elsif ($token->{tag_name} eq 'hr') {
4757 ## has a p element in scope
4758 INSCOPE: for (reverse @{$self->{open_elements}}) {
4759 if ($_->[1] eq 'p') {
4760 !!!back-token;
4761 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4762 redo B;
4763 } elsif ({
4764 table => 1, caption => 1, td => 1, th => 1,
4765 button => 1, marquee => 1, object => 1, html => 1,
4766 }->{$_->[1]}) {
4767 last INSCOPE;
4768 }
4769 } # INSCOPE
4770
4771 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4772 pop @{$self->{open_elements}};
4773
4774 !!!next-token;
4775 redo B;
4776 } elsif ($token->{tag_name} eq 'input') {
4777 $reconstruct_active_formatting_elements->($insert_to_current);
4778
4779 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4780 ## TODO: associate with $self->{form_element} if defined
4781 pop @{$self->{open_elements}};
4782
4783 !!!next-token;
4784 redo B;
4785 } elsif ($token->{tag_name} eq 'isindex') {
4786 !!!parse-error (type => 'isindex');
4787
4788 if (defined $self->{form_element}) {
4789 ## Ignore the token
4790 !!!next-token;
4791 redo B;
4792 } else {
4793 my $at = $token->{attributes};
4794 my $form_attrs;
4795 $form_attrs->{action} = $at->{action} if $at->{action};
4796 my $prompt_attr = $at->{prompt};
4797 $at->{name} = {name => 'name', value => 'isindex'};
4798 delete $at->{action};
4799 delete $at->{prompt};
4800 my @tokens = (
4801 {type => START_TAG_TOKEN, tag_name => 'form',
4802 attributes => $form_attrs},
4803 {type => START_TAG_TOKEN, tag_name => 'hr'},
4804 {type => START_TAG_TOKEN, tag_name => 'p'},
4805 {type => START_TAG_TOKEN, tag_name => 'label'},
4806 );
4807 if ($prompt_attr) {
4808 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
4809 } else {
4810 push @tokens, {type => CHARACTER_TOKEN,
4811 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
4812 ## TODO: make this configurable
4813 }
4814 push @tokens,
4815 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
4816 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
4817 {type => END_TAG_TOKEN, tag_name => 'label'},
4818 {type => END_TAG_TOKEN, tag_name => 'p'},
4819 {type => START_TAG_TOKEN, tag_name => 'hr'},
4820 {type => END_TAG_TOKEN, tag_name => 'form'};
4821 $token = shift @tokens;
4822 !!!back-token (@tokens);
4823 redo B;
4824 }
4825 } elsif ($token->{tag_name} eq 'textarea') {
4826 my $tag_name = $token->{tag_name};
4827 my $el;
4828 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
4829
4830 ## TODO: $self->{form_element} if defined
4831 $self->{content_model} = RCDATA_CONTENT_MODEL;
4832 delete $self->{escape}; # MUST
4833
4834 $insert->($el);
4835
4836 my $text = '';
4837 !!!next-token;
4838 if ($token->{type} == CHARACTER_TOKEN) {
4839 $token->{data} =~ s/^\x0A//;
4840 unless (length $token->{data}) {
4841 !!!next-token;
4842 }
4843 }
4844 while ($token->{type} == CHARACTER_TOKEN) {
4845 $text .= $token->{data};
4846 !!!next-token;
4847 }
4848 if (length $text) {
4849 $el->manakai_append_text ($text);
4850 }
4851
4852 $self->{content_model} = PCDATA_CONTENT_MODEL;
4853
4854 if ($token->{type} == END_TAG_TOKEN and
4855 $token->{tag_name} eq $tag_name) {
4856 ## Ignore the token
4857 } else {
4858 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
4859 }
4860 !!!next-token;
4861 redo B;
4862 } elsif ({
4863 iframe => 1,
4864 noembed => 1,
4865 noframes => 1,
4866 noscript => 0, ## TODO: 1 if scripting is enabled
4867 }->{$token->{tag_name}}) {
4868 ## NOTE: There is an "as if in body" code clone.
4869 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4870 redo B;
4871 } elsif ($token->{tag_name} eq 'select') {
4872 $reconstruct_active_formatting_elements->($insert_to_current);
4873
4874 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4875
4876 $self->{insertion_mode} = IN_SELECT_IM;
4877 !!!next-token;
4878 redo B;
4879 } elsif ({
4880 caption => 1, col => 1, colgroup => 1, frame => 1,
4881 frameset => 1, head => 1, option => 1, optgroup => 1,
4882 tbody => 1, td => 1, tfoot => 1, th => 1,
4883 thead => 1, tr => 1,
4884 }->{$token->{tag_name}}) {
4885 !!!parse-error (type => 'in body:'.$token->{tag_name});
4886 ## Ignore the token
4887 !!!next-token;
4888 redo B;
4889
4890 ## ISSUE: An issue on HTML5 new elements in the spec.
4891 } else {
4892 $reconstruct_active_formatting_elements->($insert_to_current);
4893
4894 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4895
4896 !!!next-token;
4897 redo B;
4898 }
4899 } elsif ($token->{type} == END_TAG_TOKEN) {
4900 if ($token->{tag_name} eq 'body') {
4901 if (@{$self->{open_elements}} > 1 and
4902 $self->{open_elements}->[1]->[1] eq 'body') {
4903 for (@{$self->{open_elements}}) {
4904 unless ({
4905 dd => 1, dt => 1, li => 1, p => 1, td => 1,
4906 th => 1, tr => 1, body => 1, html => 1,
4907 tbody => 1, tfoot => 1, thead => 1,
4908 }->{$_->[1]}) {
4909 !!!parse-error (type => 'not closed:'.$_->[1]);
4910 }
4911 }
4912
4913 $self->{insertion_mode} = AFTER_BODY_IM;
4914 !!!next-token;
4915 redo B;
4916 } else {
4917 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4918 ## Ignore the token
4919 !!!next-token;
4920 redo B;
4921 }
4922 } elsif ($token->{tag_name} eq 'html') {
4923 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
4924 ## ISSUE: There is an issue in the spec.
4925 if ($self->{open_elements}->[-1]->[1] ne 'body') {
4926 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
4927 }
4928 $self->{insertion_mode} = AFTER_BODY_IM;
4929 ## reprocess
4930 redo B;
4931 } else {
4932 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4933 ## Ignore the token
4934 !!!next-token;
4935 redo B;
4936 }
4937 } elsif ({
4938 address => 1, blockquote => 1, center => 1, dir => 1,
4939 div => 1, dl => 1, fieldset => 1, listing => 1,
4940 menu => 1, ol => 1, pre => 1, ul => 1,
4941 p => 1,
4942 dd => 1, dt => 1, li => 1,
4943 button => 1, marquee => 1, object => 1,
4944 }->{$token->{tag_name}}) {
4945 ## has an element in scope
4946 my $i;
4947 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4948 my $node = $self->{open_elements}->[$_];
4949 if ($node->[1] eq $token->{tag_name}) {
4950 ## generate implied end tags
4951 if ({
4952 dd => ($token->{tag_name} ne 'dd'),
4953 dt => ($token->{tag_name} ne 'dt'),
4954 li => ($token->{tag_name} ne 'li'),
4955 p => ($token->{tag_name} ne 'p'),
4956 td => 1, th => 1, tr => 1,
4957 tbody => 1, tfoot=> 1, thead => 1,
4958 }->{$self->{open_elements}->[-1]->[1]}) {
4959 !!!back-token;
4960 $token = {type => END_TAG_TOKEN,
4961 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4962 redo B;
4963 }
4964 $i = $_;
4965 last INSCOPE unless $token->{tag_name} eq 'p';
4966 } elsif ({
4967 table => 1, caption => 1, td => 1, th => 1,
4968 button => 1, marquee => 1, object => 1, html => 1,
4969 }->{$node->[1]}) {
4970 last INSCOPE;
4971 }
4972 } # INSCOPE
4973
4974 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4975 if (defined $i) {
4976 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4977 } else {
4978 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4979 }
4980 }
4981
4982 if (defined $i) {
4983 splice @{$self->{open_elements}}, $i;
4984 } elsif ($token->{tag_name} eq 'p') {
4985 ## As if <p>, then reprocess the current token
4986 my $el;
4987 !!!create-element ($el, 'p');
4988 $insert->($el);
4989 }
4990 $clear_up_to_marker->()
4991 if {
4992 button => 1, marquee => 1, object => 1,
4993 }->{$token->{tag_name}};
4994 !!!next-token;
4995 redo B;
4996 } elsif ($token->{tag_name} eq 'form') {
4997 ## has an element in scope
4998 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4999 my $node = $self->{open_elements}->[$_];
5000 if ($node->[1] eq $token->{tag_name}) {
5001 ## generate implied end tags
5002 if ({
5003 dd => 1, dt => 1, li => 1, p => 1,
5004 td => 1, th => 1, tr => 1,
5005 tbody => 1, tfoot=> 1, thead => 1,
5006 }->{$self->{open_elements}->[-1]->[1]}) {
5007 !!!back-token;
5008 $token = {type => END_TAG_TOKEN,
5009 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5010 redo B;
5011 }
5012 last INSCOPE;
5013 } elsif ({
5014 table => 1, caption => 1, td => 1, th => 1,
5015 button => 1, marquee => 1, object => 1, html => 1,
5016 }->{$node->[1]}) {
5017 last INSCOPE;
5018 }
5019 } # INSCOPE
5020
5021 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5022 pop @{$self->{open_elements}};
5023 } else {
5024 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5025 }
5026
5027 undef $self->{form_element};
5028 !!!next-token;
5029 redo B;
5030 } elsif ({
5031 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5032 }->{$token->{tag_name}}) {
5033 ## has an element in scope
5034 my $i;
5035 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5036 my $node = $self->{open_elements}->[$_];
5037 if ({
5038 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5039 }->{$node->[1]}) {
5040 ## generate implied end tags
5041 if ({
5042 dd => 1, dt => 1, li => 1, p => 1,
5043 td => 1, th => 1, tr => 1,
5044 tbody => 1, tfoot=> 1, thead => 1,
5045 }->{$self->{open_elements}->[-1]->[1]}) {
5046 !!!back-token;
5047 $token = {type => END_TAG_TOKEN,
5048 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5049 redo B;
5050 }
5051 $i = $_;
5052 last INSCOPE;
5053 } elsif ({
5054 table => 1, caption => 1, td => 1, th => 1,
5055 button => 1, marquee => 1, object => 1, html => 1,
5056 }->{$node->[1]}) {
5057 last INSCOPE;
5058 }
5059 } # INSCOPE
5060
5061 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5062 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5063 }
5064
5065 splice @{$self->{open_elements}}, $i if defined $i;
5066 !!!next-token;
5067 redo B;
5068 } elsif ({
5069 a => 1,
5070 b => 1, big => 1, em => 1, font => 1, i => 1,
5071 nobr => 1, s => 1, small => 1, strile => 1,
5072 strong => 1, tt => 1, u => 1,
5073 }->{$token->{tag_name}}) {
5074 $formatting_end_tag->($token->{tag_name});
5075 redo B;
5076 } elsif ($token->{tag_name} eq 'br') {
5077 !!!parse-error (type => 'unmatched end tag:br');
5078
5079 ## As if <br>
5080 $reconstruct_active_formatting_elements->($insert_to_current);
5081
5082 my $el;
5083 !!!create-element ($el, 'br');
5084 $insert->($el);
5085
5086 ## Ignore the token.
5087 !!!next-token;
5088 redo B;
5089 } elsif ({
5090 caption => 1, col => 1, colgroup => 1, frame => 1,
5091 frameset => 1, head => 1, option => 1, optgroup => 1,
5092 tbody => 1, td => 1, tfoot => 1, th => 1,
5093 thead => 1, tr => 1,
5094 area => 1, basefont => 1, bgsound => 1,
5095 embed => 1, hr => 1, iframe => 1, image => 1,
5096 img => 1, input => 1, isindex => 1, noembed => 1,
5097 noframes => 1, param => 1, select => 1, spacer => 1,
5098 table => 1, textarea => 1, wbr => 1,
5099 noscript => 0, ## TODO: if scripting is enabled
5100 }->{$token->{tag_name}}) {
5101 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5102 ## Ignore the token
5103 !!!next-token;
5104 redo B;
5105
5106 ## ISSUE: Issue on HTML5 new elements in spec
5107
5108 } else {
5109 ## Step 1
5110 my $node_i = -1;
5111 my $node = $self->{open_elements}->[$node_i];
5112
5113 ## Step 2
5114 S2: {
5115 if ($node->[1] eq $token->{tag_name}) {
5116 ## Step 1
5117 ## generate implied end tags
5118 if ({
5119 dd => 1, dt => 1, li => 1, p => 1,
5120 td => 1, th => 1, tr => 1,
5121 tbody => 1, tfoot => 1, thead => 1,
5122 }->{$self->{open_elements}->[-1]->[1]}) {
5123 !!!back-token;
5124 $token = {type => END_TAG_TOKEN,
5125 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5126 redo B;
5127 }
5128
5129 ## Step 2
5130 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5131 ## NOTE: <x><y></x>
5132 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5133 }
5134
5135 ## Step 3
5136 splice @{$self->{open_elements}}, $node_i;
5137
5138 !!!next-token;
5139 last S2;
5140 } else {
5141 ## Step 3
5142 if (not $formatting_category->{$node->[1]} and
5143 #not $phrasing_category->{$node->[1]} and
5144 ($special_category->{$node->[1]} or
5145 $scoping_category->{$node->[1]})) {
5146 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5147 ## Ignore the token
5148 !!!next-token;
5149 last S2;
5150 }
5151 }
5152
5153 ## Step 4
5154 $node_i--;
5155 $node = $self->{open_elements}->[$node_i];
5156
5157 ## Step 5;
5158 redo S2;
5159 } # S2
5160 redo B;
5161 }
5162 }
5163 redo B;
5164 } # B
5165
5166 ## NOTE: The "trailing end" phase in HTML5 is split into
5167 ## two insertion modes: "after html body" and "after html frameset".
5168 ## NOTE: States in the main stage are preserved while
5169 ## the parser stays in the trailing end phase. # MUST
5170
5171 ## Stop parsing # MUST
5172
5173 ## TODO: script stuffs
5174 } # _tree_construct_main
5175
## Replaces the children of |$node| with the result of parsing a string
## as HTML (the setter side of |innerHTML|).
##
## Called as |$class->set_inner_html ($node, $string, $onerror)|:
##
##   $class    Invocant; |parse_string| and |new| are called on it.
##   $node     Target node: a Document (node type 9) or an Element
##             (node type 1).
##   $string   The markup to parse.  It is read through a reference to
##             the caller's argument (|\$_[0]|), not through a copy.
##   $onerror  Optional code reference.  In the Document case it is
##             handed to |parse_string| unchanged.  In the Element case
##             it is called with the parse error's own arguments
##             followed by |line => ..., column => ...|; when it is
##             false, errors are reported with |warn| instead.
##
## Dies if |$node| is neither a Document nor an Element.
5176 sub set_inner_html ($$$) {
5177 my $class = shift;
5178 my $node = shift;
5179 my $s = \$_[0];
5180 my $onerror = $_[1];
5181
5182 my $nt = $node->node_type;
5183 if ($nt == 9) {
## Document: drop every existing child, then parse the string directly
## into the document.
5184 # MUST
5185
5186 ## Step 1 # MUST
5187 ## TODO: If the document has an active parser, ...
5188 ## ISSUE: There is an issue in the spec.
5189
5190 ## Step 2 # MUST
## The child list is copied into |@cn| first, so the loop iterates over
## the copy while |remove_child| is called on |$node|.
5191 my @cn = @{$node->child_nodes};
5192 for (@cn) {
5193 $node->remove_child ($_);
5194 }
5195
5196 ## Step 3, 4, 5 # MUST
5197 $class->parse_string ($$s => $node, $onerror);
5198 } elsif ($nt == 1) {
## Element: parse the string in a freshly created scratch document whose
## root is an |html| element, then move the root's children into |$node|.
5199 ## TODO: If non-html element
5200
5201 ## NOTE: Most of this code is copied from |parse_string|
5202
5203 ## Step 1 # MUST
5204 my $this_doc = $node->owner_document;
5205 my $doc = $this_doc->implementation->create_document;
5206 $doc->manakai_is_html (1);
5207 my $p = $class->new;
5208 $p->{document} = $doc;
5209
5210 ## Step 9 # MUST
## |$i| is the read offset into the string; |$line| and |$column| track
## the position that is attached to reported parse errors.
5211 my $i = 0;
5212 my $line = 1;
5213 my $column = 0;
## Input callback: shifts the previous character into the
## |prev_input_character| history and stores the next character's code
## point (or -1 at end of input) in |next_input_character|.
5214 $p->{set_next_input_character} = sub {
5215 my $self = shift;
5216
5217 pop @{$self->{prev_input_character}};
5218 unshift @{$self->{prev_input_character}}, $self->{next_input_character};
5219
## The assignment yields -1, which is true, so |return| always runs when
## the input is exhausted.
5220 $self->{next_input_character} = -1 and return if $i >= length $$s;
5221 $self->{next_input_character} = ord substr $$s, $i++, 1;
5222 $column++;
5223
5224 if ($self->{next_input_character} == 0x000A) { # LF
5225 $line++;
5226 $column = 0;
5227 } elsif ($self->{next_input_character} == 0x000D) { # CR
## A CR, or a CR immediately followed by LF, is delivered as one LF.
5228 $i++ if substr ($$s, $i, 1) eq "\x0A";
5229 $self->{next_input_character} = 0x000A; # LF # MUST
5230 $line++;
5231 $column = 0;
5232 } elsif ($self->{next_input_character} > 0x10FFFF) {
5233 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5234 } elsif ($self->{next_input_character} == 0x0000) { # NULL
5235 !!!parse-error (type => 'NULL');
5236 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5237 }
5238 };
5239 $p->{prev_input_character} = [-1, -1, -1];
5240 $p->{next_input_character} = -1;
5241
## Error reporting: use the caller's handler if one was given, otherwise
## |warn|; either way the current line and column are appended.
5242 my $ponerror = $onerror || sub {
5243 my (%opt) = @_;
5244 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
5245 };
5246 $p->{parse_error} = sub {
5247 $ponerror->(@_, line => $line, column => $column);
5248 };
5249
5250 $p->_initialize_tokenizer;
5251 $p->_initialize_tree_constructor;
5252
5253 ## Step 2
## The initial content model is selected by the local name of the
## context element; names not listed fall back to PCDATA below.
5254 my $node_ln = $node->local_name;
5255 $p->{content_model} = {
5256 title => RCDATA_CONTENT_MODEL,
5257 textarea => RCDATA_CONTENT_MODEL,
5258 style => CDATA_CONTENT_MODEL,
5259 script => CDATA_CONTENT_MODEL,
5260 xmp => CDATA_CONTENT_MODEL,
5261 iframe => CDATA_CONTENT_MODEL,
5262 noembed => CDATA_CONTENT_MODEL,
5263 noframes => CDATA_CONTENT_MODEL,
5264 noscript => CDATA_CONTENT_MODEL,
5265 plaintext => PLAINTEXT_CONTENT_MODEL,
5266 }->{$node_ln};
5267 $p->{content_model} = PCDATA_CONTENT_MODEL
5268 unless defined $p->{content_model};
5269 ## ISSUE: What is "the name of the element"? local name?
5270
## Record the context element and its local name on the parser.
5271 $p->{inner_html_node} = [$node, $node_ln];
5272
5273 ## Step 4
5274 my $root = $doc->create_element_ns
5275 ('http://www.w3.org/1999/xhtml', [undef, 'html']);
5276
5277 ## Step 5 # MUST
5278 $doc->append_child ($root);
5279
5280 ## Step 6 # MUST
5281 push @{$p->{open_elements}}, [$root, 'html'];
5282
5283 undef $p->{head_element};
5284
5285 ## Step 7 # MUST
5286 $p->_reset_insertion_mode;
5287
5288 ## Step 8 # MUST
## Walk from the context element up through its ancestors; the first
## element in the XHTML namespace with local name |form| becomes the
## parser's |form_element|.
5289 my $anode = $node;
5290 AN: while (defined $anode) {
5291 if ($anode->node_type == 1) {
5292 my $nsuri = $anode->namespace_uri;
5293 if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
5294 if ($anode->local_name eq 'form') { ## TODO: case?
5295 $p->{form_element} = $anode;
5296 last AN;
5297 }
5298 }
5299 }
5300 $anode = $anode->parent_node;
5301 } # AN
5302
5303 ## Step 3 # MUST
5304 ## Step 10 # MUST
## NOTE(review): |$self| is aliased to |$p| only for this block,
## presumably because the |!!!next-token| macro expands to code that
## refers to |$self| -- confirm against the macro definitions.
5305 {
5306 my $self = $p;
5307 !!!next-token;
5308 }
5309 $p->_tree_construction_main;
5310
5311 ## Step 11 # MUST
## Empty the context element; iterate over a copy of its child list.
5312 my @cn = @{$node->child_nodes};
5313 for (@cn) {
5314 $node->remove_child ($_);
5315 }
5316 ## ISSUE: mutation events? read-only?
5317
5318 ## Step 12 # MUST
## Move each child of the scratch root into the context element, adopting
## it into the context element's owner document first.
5319 @cn = @{$root->child_nodes};
5320 for (@cn) {
5321 $this_doc->adopt_node ($_);
5322 $node->append_child ($_);
5323 }
5324 ## ISSUE: mutation events?
5325
5326 $p->_terminate_tree_constructor;
5327 } else {
5328 die "$0: |set_inner_html| is not defined for node of type $nt";
5329 }
5330 } # set_inner_html
5331
5332 } # tree construction stage
5333
## Serializes the children of |$node| as an HTML fragment (the getter
## side of |innerHTML|).
##
## Called as |$class->get_inner_html ($node, $on_error)|:
##
##   (invocant)  Ignored.
##   $node       Node whose child nodes are serialized.
##   $on_error   Optional code reference; called with each child node
##               whose node type is not handled here (anything other
##               than element, text, CDATA section, comment, document
##               type or entity reference).
##
## Returns a reference to the serialized string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is emitted literally (no escaping).
  ## The same table is used for the ancestors of |$node| and for the
  ## elements met while serializing, so that the two cannot diverge.
  my %is_raw_text_element = map { $_ => 1 } qw(
    style script xmp iframe noembed noframes noscript plaintext
  );

  ## Elements serialized as a start tag only: no children, no end tag.
  my %is_void_element = map { $_ => 1 } qw(
    area base basefont bgsound br col embed frame hr
    img input link meta param spacer wbr
  );

  ## Escapes the four characters that are significant in markup.
  ## |&| has to be replaced first so that the other references are not
  ## escaped twice.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## Text is emitted literally from the start if |$node| itself or one
  ## of its ancestors is a raw text element in the XHTML namespace.
  my $in_cdata;
  ANCESTOR: for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next ANCESTOR unless $anc->node_type == 1;
    my $nsuri = $anc->namespace_uri; # might be undef (null namespace)
    next ANCESTOR
        unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    if ($is_raw_text_element{$anc->local_name}) { ## TODO: case thingy
      $in_cdata = 1;
      last ANCESTOR;
    }
  }

  ## Step 2
  ## |@node| is a work queue kept in document order.  Besides node
  ## objects it holds plain strings: an end tag waiting to be emitted,
  ## or the marker 'cdata-out', which ends a raw text section.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
      $s .= '<' . $tag_name;
      ## NOTE: Non-HTML case:
      ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = $attr->name; ## TODO: manakai_name
        my $attr_value = $attr->value;
        $s .= ' ' . $attr_name . '="' . $escape->($attr_value) . '"';
      }
      $s .= '>';

      next C if $is_void_element{$tag_name};

      $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';

      ## The marker is queued before the children and the end tag are
      ## put in front of it, so it is reached right after the end tag.
      if (not $in_cdata and $is_raw_text_element{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text or CDATASection
      my $data = $child->data;
      $s .= $in_cdata ? $data : $escape->($data);
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # DocumentType
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children of an entity reference take its place.  They have
      ## to go to the FRONT of the queue: appending them at the end
      ## would emit them after the following siblings and after the end
      ## tags of the enclosing elements.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5433
5434 1;
5435 # $Date: 2007/08/11 08:08:12 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24