/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download) (as text)
Tue May 1 10:47:37 2007 UTC (17 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.1: +3 -3 lines
File MIME type: application/x-wais-source
s/What::/Whatpm::/ to avoid a name conflict

1 wakaba 1.2 package Whatpm::HTML;
2 wakaba 1.1 use strict;
3 wakaba 1.2 our $VERSION=do{my @r=(q$Revision: 1.1 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5     ## This is an early version of an HTML parser.
6    
## Start tags on which a trailing "/" directly before ">" is tolerated
## without reporting a parse error (looked up by lowercased tag name).
my $permitted_slash_tag_name = +{
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
20    
## Named character references known to the parser.  Maps each
## (case-sensitive) entity name to the one-character string it stands for.
my $entity_char = do {
  ## Entity name => Unicode code point.
  my %code_point_of = (
    AElig => 0x00C6, Aacute => 0x00C1, Acirc => 0x00C2, Agrave => 0x00C0,
    Alpha => 0x0391, Aring => 0x00C5, Atilde => 0x00C3, Auml => 0x00C4,
    Beta => 0x0392, Ccedil => 0x00C7, Chi => 0x03A7, Dagger => 0x2021,
    Delta => 0x0394, ETH => 0x00D0, Eacute => 0x00C9, Ecirc => 0x00CA,
    Egrave => 0x00C8, Epsilon => 0x0395, Eta => 0x0397, Euml => 0x00CB,
    Gamma => 0x0393, Iacute => 0x00CD, Icirc => 0x00CE, Igrave => 0x00CC,
    Iota => 0x0399, Iuml => 0x00CF, Kappa => 0x039A, Lambda => 0x039B,
    Mu => 0x039C, Ntilde => 0x00D1, Nu => 0x039D, OElig => 0x0152,
    Oacute => 0x00D3, Ocirc => 0x00D4, Ograve => 0x00D2, Omega => 0x03A9,
    Omicron => 0x039F, Oslash => 0x00D8, Otilde => 0x00D5, Ouml => 0x00D6,
    Phi => 0x03A6, Pi => 0x03A0, Prime => 0x2033, Psi => 0x03A8,
    Rho => 0x03A1, Scaron => 0x0160, Sigma => 0x03A3, THORN => 0x00DE,
    Tau => 0x03A4, Theta => 0x0398, Uacute => 0x00DA, Ucirc => 0x00DB,
    Ugrave => 0x00D9, Upsilon => 0x03A5, Uuml => 0x00DC, Xi => 0x039E,
    Yacute => 0x00DD, Yuml => 0x0178, Zeta => 0x0396,

    aacute => 0x00E1, acirc => 0x00E2, acute => 0x00B4, aelig => 0x00E6,
    agrave => 0x00E0, alefsym => 0x2135, alpha => 0x03B1, amp => 0x0026,
    AMP => 0x0026, and => 0x2227, ang => 0x2220, apos => 0x0027,
    aring => 0x00E5, asymp => 0x2248, atilde => 0x00E3, auml => 0x00E4,
    bdquo => 0x201E, beta => 0x03B2, brvbar => 0x00A6, bull => 0x2022,
    cap => 0x2229, ccedil => 0x00E7, cedil => 0x00B8, cent => 0x00A2,
    chi => 0x03C7, circ => 0x02C6, clubs => 0x2663, cong => 0x2245,
    copy => 0x00A9, COPY => 0x00A9, crarr => 0x21B5, cup => 0x222A,
    curren => 0x00A4, dArr => 0x21D3, dagger => 0x2020, darr => 0x2193,
    deg => 0x00B0, delta => 0x03B4, diams => 0x2666, divide => 0x00F7,
    eacute => 0x00E9, ecirc => 0x00EA, egrave => 0x00E8, empty => 0x2205,
    emsp => 0x2003, ensp => 0x2002, epsilon => 0x03B5, equiv => 0x2261,
    eta => 0x03B7, eth => 0x00F0, euml => 0x00EB, euro => 0x20AC,
    exist => 0x2203, fnof => 0x0192, forall => 0x2200, frac12 => 0x00BD,
    frac14 => 0x00BC, frac34 => 0x00BE, frasl => 0x2044, gamma => 0x03B3,
    ge => 0x2265, gt => 0x003E, GT => 0x003E, hArr => 0x21D4,
    harr => 0x2194, hearts => 0x2665, hellip => 0x2026, iacute => 0x00ED,
    icirc => 0x00EE, iexcl => 0x00A1, igrave => 0x00EC, image => 0x2111,
    infin => 0x221E, int => 0x222B, iota => 0x03B9, iquest => 0x00BF,
    isin => 0x2208, iuml => 0x00EF, kappa => 0x03BA, lArr => 0x21D0,
    lambda => 0x03BB, lang => 0x2329, laquo => 0x00AB, larr => 0x2190,
    lceil => 0x2308, ldquo => 0x201C, le => 0x2264, lfloor => 0x230A,
    lowast => 0x2217, loz => 0x25CA, lrm => 0x200E, lsaquo => 0x2039,
    lsquo => 0x2018, lt => 0x003C, LT => 0x003C, macr => 0x00AF,
    mdash => 0x2014, micro => 0x00B5, middot => 0x00B7, minus => 0x2212,
    mu => 0x03BC, nabla => 0x2207, nbsp => 0x00A0, ndash => 0x2013,
    ne => 0x2260, ni => 0x220B, not => 0x00AC, notin => 0x2209,
    nsub => 0x2284, ntilde => 0x00F1, nu => 0x03BD, oacute => 0x00F3,
    ocirc => 0x00F4, oelig => 0x0153, ograve => 0x00F2, oline => 0x203E,
    omega => 0x03C9, omicron => 0x03BF, oplus => 0x2295, or => 0x2228,
    ordf => 0x00AA, ordm => 0x00BA, oslash => 0x00F8, otilde => 0x00F5,
    otimes => 0x2297, ouml => 0x00F6, para => 0x00B6, part => 0x2202,
    permil => 0x2030, perp => 0x22A5, phi => 0x03C6, pi => 0x03C0,
    piv => 0x03D6, plusmn => 0x00B1, pound => 0x00A3, prime => 0x2032,
    prod => 0x220F, prop => 0x221D, psi => 0x03C8, quot => 0x0022,
    QUOT => 0x0022, rArr => 0x21D2, radic => 0x221A, rang => 0x232A,
    raquo => 0x00BB, rarr => 0x2192, rceil => 0x2309, rdquo => 0x201D,
    real => 0x211C, reg => 0x00AE, REG => 0x00AE, rfloor => 0x230B,
    rho => 0x03C1, rlm => 0x200F, rsaquo => 0x203A, rsquo => 0x2019,
    sbquo => 0x201A, scaron => 0x0161, sdot => 0x22C5, sect => 0x00A7,
    shy => 0x00AD, sigma => 0x03C3, sigmaf => 0x03C2, sim => 0x223C,
    spades => 0x2660, sub => 0x2282, sube => 0x2286, sum => 0x2211,
    sup => 0x2283, sup1 => 0x00B9, sup2 => 0x00B2, sup3 => 0x00B3,
    supe => 0x2287, szlig => 0x00DF, tau => 0x03C4, there4 => 0x2234,
    theta => 0x03B8, thetasym => 0x03D1, thinsp => 0x2009, thorn => 0x00FE,
    tilde => 0x02DC, times => 0x00D7, trade => 0x2122, uArr => 0x21D1,
    uacute => 0x00FA, uarr => 0x2191, ucirc => 0x00FB, ugrave => 0x00F9,
    uml => 0x00A8, upsih => 0x03D2, upsilon => 0x03C5, uuml => 0x00FC,
    weierp => 0x2118, xi => 0x03BE, yacute => 0x00FD, yen => 0x00A5,
    yuml => 0x00FF, zeta => 0x03B6, zwj => 0x200D, zwnj => 0x200C,
  );

  ## Convert every code point into the corresponding character.
  +{ map { ($_ => chr $code_point_of{$_}) } keys %code_point_of };
};
282    
## Element names belonging to the "special" category (set of tag names,
## each mapped to a true value).
my $special_category = +{
  map { ($_ => 1) } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category (set of tag names,
## each mapped to a true value).
my $scoping_category = +{
  map { ($_ => 1) } qw(
    button caption html marquee object
    table td th
  )
};
## Element names belonging to the "formatting" category (set of tag names,
## each mapped to a true value).
##
## Fixed: the entry for the |strike| element used to be misspelled
## "strile", so <strike> was never recognized as a formatting element.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
# $phrasing_category: all other elements
304    
## Parses a character string as HTML.
##
##   $document = Whatpm::HTML->parse_string ($string, $document, $onerror);
##
## Arguments:
##   $string   - The string to parse.  It is read through a reference,
##               not copied.
##   $document - Stored as |$self->{document}| for the tree constructor
##               and returned when parsing has finished.
##   $onerror  - Optional code reference installed as
##               |$self->{parse_error}|.  When omitted (or false), a
##               handler that |warn|s with the current character offset
##               is used.
##
## Returns |$self->{document}|.
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $s = \$_[0];
  $self->{document} = $_[1];

  ## Offset of the next unread character in |$$s|.  (This used to be
  ## declared twice in a row, which masked the first declaration.)
  my $i = 0;

  ## Input callback: stores the code point of the next input character
  ## in |$self->{next_input_character}|, or -1 at the end of the input.
  $self->{set_next_input_character} = sub {
    my $self = shift;

    if ($i >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    my $code_point = ord substr $$s, $i++, 1;

    if ($code_point == 0x000D) { # CR
      ## A CR LF pair counts as a single newline.  Only peek at the
      ## following character: when it is not an LF it stays in the input
      ## and is normalized by its own call, so that e.g. the second CR
      ## of CR CR or a NULL directly after a CR is not passed on raw.
      $i++ if $i < length $$s and substr ($$s, $i, 1) eq "\x0A";
      $code_point = 0x000A; # LF # MUST
    } elsif ($code_point > 0x10FFFF) {
      $code_point = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code_point == 0x0000) { # NULL
      $code_point = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }

    $self->{next_input_character} = $code_point;
  };

  $self->{parse_error} = $_[2] || sub {
    warn "Parse error at character $i\n"; ## TODO: Report (line, column) pair
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
347    
## Creates a new parser object.
##
##   $parser = Whatpm::HTML->new;
##
## The object is created with two default callbacks:
##   |set_next_input_character| - reports the end of the input by setting
##       |next_input_character| to -1 on the parser;
##   |parse_error| - does nothing.
##
## Returns the blessed parser object.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The callback is stored inside the object it refers to.  Holding
  ## only a weak reference avoids a reference cycle that would prevent
  ## the parser from ever being freed.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);

  $self->{set_next_input_character} = sub {
    $weak_self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
359    
360     ## Implementations MUST act as if they used the state machine in the spec
361    
## Puts the tokenizer into its initial condition: "data" state, PCDATA
## content model, no token or attribute under construction, empty
## |char| and |token| queues.  The first input character is requested
## through the |next-input-character| macro.
##
## NOTE(review): the parser variable is deliberately still called
## |$self|; the |!!!| macros presumably expand to code referring to it
## (the expansion is not visible here -- confirm before renaming).
sub _initialize_tokenizer ($) {
  my ($self) = @_;

  $self->{state} = 'data'; # MUST
  $self->{content_model_flag} = 'PCDATA';

  ## current_token is a start tag, end tag, comment, or DOCTYPE token
  ## while one is being built; nothing is being built yet.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);

  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;
  $self->{token} = [];
} # _initialize_tokenizer
375    
376     ## A token has:
377     ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
378     ## 'character', or 'end-of-file'
379     ## ->{name} (DOCTYPE); ->{tag_name} (start tag, end tag)
380     ## ISSUE: the spec needs s/tagname/tag name/
381     ## ->{error} == 1 or 0 (DOCTYPE)
382     ## ->{attributes} isa HASH (start tag, end tag)
383     ## ->{data} (comment, character)
384    
385     ## Macros
386     ## Macros MUST be preceded by three EXCLAMATION MARKs.
387     ## emit ($token)
388     ## Emits the specified token.
389    
390     ## Emitted token MUST immediately be handled by the tree construction state.
391    
392     ## Before each step, UA MAY check to see if either one of the scripts in
393     ## "list of scripts that will execute as soon as possible" or the first
394     ## script in the "list of scripts that will execute asynchronously",
395     ## has completed loading. If one has, then it MUST be executed
396     ## and removed from the list.
397    
398     sub _get_next_token ($) {
399     my $self = shift;
400     if (@{$self->{token}}) {
401     return shift @{$self->{token}};
402     }
403    
404     A: {
405     if ($self->{state} eq 'data') {
406     if ($self->{next_input_character} == 0x0026) { # &
407     if ($self->{content_model_flag} eq 'PCDATA' or
408     $self->{content_model_flag} eq 'RCDATA') {
409     $self->{state} = 'entity data';
410     !!!next-input-character;
411     redo A;
412     } else {
413     #
414     }
415     } elsif ($self->{next_input_character} == 0x003C) { # <
416     if ($self->{content_model_flag} ne 'PLAINTEXT') {
417     $self->{state} = 'tag open';
418     !!!next-input-character;
419     redo A;
420     } else {
421     #
422     }
423     } elsif ($self->{next_input_character} == -1) {
424     !!!emit ({type => 'end-of-file'});
425     last A; ## TODO: ok?
426     }
427     # Anything else
428     my $token = {type => 'character',
429     data => chr $self->{next_input_character}};
430     ## Stay in the data state
431     !!!next-input-character;
432    
433     !!!emit ($token);
434    
435     redo A;
436     } elsif ($self->{state} eq 'entity data') {
437     ## (cannot happen in CDATA state)
438    
439     my $token = $self->_tokenize_attempt_to_consume_an_entity;
440    
441     $self->{state} = 'data';
442     # next-input-character is already done
443    
444     unless (defined $token) {
445     !!!emit ({type => 'character', data => '&'});
446     } else {
447     !!!emit ($token);
448     }
449    
450     redo A;
451     } elsif ($self->{state} eq 'tag open') {
452     if ($self->{content_model_flag} eq 'RCDATA' or
453     $self->{content_model_flag} eq 'CDATA') {
454     if ($self->{next_input_character} == 0x002F) { # /
455     !!!next-input-character;
456     $self->{state} = 'close tag open';
457     redo A;
458     } else {
459     ## reconsume
460     $self->{state} = 'data';
461    
462     !!!emit ({type => 'character', data => '<'});
463    
464     redo A;
465     }
466     } elsif ($self->{content_model_flag} eq 'PCDATA') {
467     if ($self->{next_input_character} == 0x0021) { # !
468     $self->{state} = 'markup declaration open';
469     !!!next-input-character;
470     redo A;
471     } elsif ($self->{next_input_character} == 0x002F) { # /
472     $self->{state} = 'close tag open';
473     !!!next-input-character;
474     redo A;
475     } elsif (0x0041 <= $self->{next_input_character} and
476     $self->{next_input_character} <= 0x005A) { # A..Z
477     $self->{current_token}
478     = {type => 'start tag',
479     tag_name => chr ($self->{next_input_character} + 0x0020)};
480     $self->{state} = 'tag name';
481     !!!next-input-character;
482     redo A;
483     } elsif (0x0061 <= $self->{next_input_character} and
484     $self->{next_input_character} <= 0x007A) { # a..z
485     $self->{current_token} = {type => 'start tag',
486     tag_name => chr ($self->{next_input_character})};
487     $self->{state} = 'tag name';
488     !!!next-input-character;
489     redo A;
490     } elsif ($self->{next_input_character} == 0x003E) { # >
491     !!!parse-error;
492     $self->{state} = 'data';
493     !!!next-input-character;
494    
495     !!!emit ({type => 'character', data => '<>'});
496    
497     redo A;
498     } elsif ($self->{next_input_character} == 0x003F) { # ?
499     !!!parse-error;
500     $self->{state} = 'bogus comment';
501     ## $self->{next_input_character} is intentionally left as is
502     redo A;
503     } else {
504     !!!parse-error;
505     $self->{state} = 'data';
506     ## reconsume
507    
508     !!!emit ({type => 'character', data => '<'});
509    
510     redo A;
511     }
512     } else {
513     die "$0: $self->{content_model_flag}: Unknown content model flag";
514     }
515     } elsif ($self->{state} eq 'close tag open') {
516     if ($self->{content_model_flag} eq 'RCDATA' or
517     $self->{content_model_flag} eq 'CDATA') {
518     my @next_char;
519     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
520     push @next_char, $self->{next_input_character};
521     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
522     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
523     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
524     !!!next-input-character;
525     next TAGNAME;
526     } else {
527     !!!parse-error;
528     $self->{next_input_character} = shift @next_char; # reconsume
529     !!!back-next-input-character (@next_char);
530     $self->{state} = 'data';
531    
532     !!!emit ({type => 'character', data => '</'});
533    
534     redo A;
535     }
536     }
537     push @next_char, $self->{next_input_character};
538    
539     unless ($self->{next_input_character} == 0x0009 or # HT
540     $self->{next_input_character} == 0x000A or # LF
541     $self->{next_input_character} == 0x000B or # VT
542     $self->{next_input_character} == 0x000C or # FF
543     $self->{next_input_character} == 0x0020 or # SP
544     $self->{next_input_character} == 0x003E or # >
545     $self->{next_input_character} == 0x002F or # /
546     $self->{next_input_character} == 0x003C or # <
547     $self->{next_input_character} == -1) {
548     !!!parse-error;
549     $self->{next_input_character} = shift @next_char; # reconsume
550     !!!back-next-input-character (@next_char);
551     $self->{state} = 'data';
552    
553     !!!emit ({type => 'character', data => '</'});
554    
555     redo A;
556     } else {
557     $self->{next_input_character} = shift @next_char;
558     !!!back-next-input-character (@next_char);
559     # and consume...
560     }
561     }
562    
563     if (0x0041 <= $self->{next_input_character} and
564     $self->{next_input_character} <= 0x005A) { # A..Z
565     $self->{current_token} = {type => 'end tag',
566     tag_name => chr ($self->{next_input_character} + 0x0020)};
567     $self->{state} = 'tag name';
568     !!!next-input-character;
569     redo A;
570     } elsif (0x0061 <= $self->{next_input_character} and
571     $self->{next_input_character} <= 0x007A) { # a..z
572     $self->{current_token} = {type => 'end tag',
573     tag_name => chr ($self->{next_input_character})};
574     $self->{state} = 'tag name';
575     !!!next-input-character;
576     redo A;
577     } elsif ($self->{next_input_character} == 0x003E) { # >
578     !!!parse-error;
579     $self->{state} = 'data';
580     !!!next-input-character;
581     redo A;
582     } elsif ($self->{next_input_character} == -1) {
583     !!!parse-error;
584     $self->{state} = 'data';
585     # reconsume
586    
587     !!!emit ({type => 'character', data => '</'});
588    
589     redo A;
590     } else {
591     !!!parse-error;
592     $self->{state} = 'bogus comment';
593     ## $self->{next_input_character} is intentionally left as is
594     redo A;
595     }
596     } elsif ($self->{state} eq 'tag name') {
597     if ($self->{next_input_character} == 0x0009 or # HT
598     $self->{next_input_character} == 0x000A or # LF
599     $self->{next_input_character} == 0x000B or # VT
600     $self->{next_input_character} == 0x000C or # FF
601     $self->{next_input_character} == 0x0020) { # SP
602     $self->{state} = 'before attribute name';
603     !!!next-input-character;
604     redo A;
605     } elsif ($self->{next_input_character} == 0x003E) { # >
606     if ($self->{current_token}->{type} eq 'start tag') {
607     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
608     } elsif ($self->{current_token}->{type} eq 'end tag') {
609     $self->{content_model_flag} = 'PCDATA'; # MUST
610     if ($self->{current_token}->{attributes}) {
611     !!!parse-error;
612     }
613     } else {
614     die "$0: $self->{current_token}->{type}: Unknown token type";
615     }
616     $self->{state} = 'data';
617     !!!next-input-character;
618    
619     !!!emit ($self->{current_token}); # start tag or end tag
620     undef $self->{current_token};
621    
622     redo A;
623     } elsif (0x0041 <= $self->{next_input_character} and
624     $self->{next_input_character} <= 0x005A) { # A..Z
625     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
626     # start tag or end tag
627     ## Stay in this state
628     !!!next-input-character;
629     redo A;
630     } elsif ($self->{next_input_character} == 0x003C or # <
631     $self->{next_input_character} == -1) {
632     !!!parse-error;
633     if ($self->{current_token}->{type} eq 'start tag') {
634     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
635     } elsif ($self->{current_token}->{type} eq 'end tag') {
636     $self->{content_model_flag} = 'PCDATA'; # MUST
637     if ($self->{current_token}->{attributes}) {
638     !!!parse-error;
639     }
640     } else {
641     die "$0: $self->{current_token}->{type}: Unknown token type";
642     }
643     $self->{state} = 'data';
644     # reconsume
645    
646     !!!emit ($self->{current_token}); # start tag or end tag
647     undef $self->{current_token};
648    
649     redo A;
650     } elsif ($self->{next_input_character} == 0x002F) { # /
651     !!!next-input-character;
652     if ($self->{next_input_character} == 0x003E and # >
653     $self->{current_token}->{type} eq 'start tag' and
654     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
655     # permitted slash
656     #
657     } else {
658     !!!parse-error;
659     }
660     $self->{state} = 'before attribute name';
661     # next-input-character is already done
662     redo A;
663     } else {
664     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
665     # start tag or end tag
666     ## Stay in the state
667     !!!next-input-character;
668     redo A;
669     }
670     } elsif ($self->{state} eq 'before attribute name') {
671     if ($self->{next_input_character} == 0x0009 or # HT
672     $self->{next_input_character} == 0x000A or # LF
673     $self->{next_input_character} == 0x000B or # VT
674     $self->{next_input_character} == 0x000C or # FF
675     $self->{next_input_character} == 0x0020) { # SP
676     ## Stay in the state
677     !!!next-input-character;
678     redo A;
679     } elsif ($self->{next_input_character} == 0x003E) { # >
680     if ($self->{current_token}->{type} eq 'start tag') {
681     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
682     } elsif ($self->{current_token}->{type} eq 'end tag') {
683     $self->{content_model_flag} = 'PCDATA'; # MUST
684     if ($self->{current_token}->{attributes}) {
685     !!!parse-error;
686     }
687     } else {
688     die "$0: $self->{current_token}->{type}: Unknown token type";
689     }
690     $self->{state} = 'data';
691     !!!next-input-character;
692    
693     !!!emit ($self->{current_token}); # start tag or end tag
694     undef $self->{current_token};
695    
696     redo A;
697     } elsif (0x0041 <= $self->{next_input_character} and
698     $self->{next_input_character} <= 0x005A) { # A..Z
699     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
700     value => ''};
701     $self->{state} = 'attribute name';
702     !!!next-input-character;
703     redo A;
704     } elsif ($self->{next_input_character} == 0x002F) { # /
705     !!!next-input-character;
706     if ($self->{next_input_character} == 0x003E and # >
707     $self->{current_token}->{type} eq 'start tag' and
708     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
709     # permitted slash
710     #
711     } else {
712     !!!parse-error;
713     }
714     ## Stay in the state
715     # next-input-character is already done
716     redo A;
717     } elsif ($self->{next_input_character} == 0x003C or # <
718     $self->{next_input_character} == -1) {
719     !!!parse-error;
720     if ($self->{current_token}->{type} eq 'start tag') {
721     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
722     } elsif ($self->{current_token}->{type} eq 'end tag') {
723     $self->{content_model_flag} = 'PCDATA'; # MUST
724     if ($self->{current_token}->{attributes}) {
725     !!!parse-error;
726     }
727     } else {
728     die "$0: $self->{current_token}->{type}: Unknown token type";
729     }
730     $self->{state} = 'data';
731     # reconsume
732    
733     !!!emit ($self->{current_token}); # start tag or end tag
734     undef $self->{current_token};
735    
736     redo A;
737     } else {
738     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
739     value => ''};
740     $self->{state} = 'attribute name';
741     !!!next-input-character;
742     redo A;
743     }
744     } elsif ($self->{state} eq 'attribute name') {
745     my $before_leave = sub {
746     if (exists $self->{current_token}->{attributes} # start tag or end tag
747     ->{$self->{current_attribute}->{name}}) { # MUST
748     !!!parse-error;
749     ## Discard $self->{current_attribute} # MUST
750     } else {
751     $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
752     = $self->{current_attribute};
753     }
754     }; # $before_leave
755    
756     if ($self->{next_input_character} == 0x0009 or # HT
757     $self->{next_input_character} == 0x000A or # LF
758     $self->{next_input_character} == 0x000B or # VT
759     $self->{next_input_character} == 0x000C or # FF
760     $self->{next_input_character} == 0x0020) { # SP
761     $before_leave->();
762     $self->{state} = 'after attribute name';
763     !!!next-input-character;
764     redo A;
765     } elsif ($self->{next_input_character} == 0x003D) { # =
766     $before_leave->();
767     $self->{state} = 'before attribute value';
768     !!!next-input-character;
769     redo A;
770     } elsif ($self->{next_input_character} == 0x003E) { # >
771     $before_leave->();
772     if ($self->{current_token}->{type} eq 'start tag') {
773     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
774     } elsif ($self->{current_token}->{type} eq 'end tag') {
775     $self->{content_model_flag} = 'PCDATA'; # MUST
776     if ($self->{current_token}->{attributes}) {
777     !!!parse-error;
778     }
779     } else {
780     die "$0: $self->{current_token}->{type}: Unknown token type";
781     }
782     $self->{state} = 'data';
783     !!!next-input-character;
784    
785     !!!emit ($self->{current_token}); # start tag or end tag
786     undef $self->{current_token};
787    
788     redo A;
789     } elsif (0x0041 <= $self->{next_input_character} and
790     $self->{next_input_character} <= 0x005A) { # A..Z
791     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
792     ## Stay in the state
793     !!!next-input-character;
794     redo A;
795     } elsif ($self->{next_input_character} == 0x002F) { # /
796     $before_leave->();
797     !!!next-input-character;
798     if ($self->{next_input_character} == 0x003E and # >
799     $self->{current_token}->{type} eq 'start tag' and
800     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
801     # permitted slash
802     #
803     } else {
804     !!!parse-error;
805     }
806     $self->{state} = 'before attribute name';
807     # next-input-character is already done
808     redo A;
809     } elsif ($self->{next_input_character} == 0x003C or # <
810     $self->{next_input_character} == -1) {
811     !!!parse-error;
812     $before_leave->();
813     if ($self->{current_token}->{type} eq 'start tag') {
814     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
815     } elsif ($self->{current_token}->{type} eq 'end tag') {
816     $self->{content_model_flag} = 'PCDATA'; # MUST
817     if ($self->{current_token}->{attributes}) {
818     !!!parse-error;
819     }
820     } else {
821     die "$0: $self->{current_token}->{type}: Unknown token type";
822     }
823     $self->{state} = 'data';
824     # reconsume
825    
826     !!!emit ($self->{current_token}); # start tag or end tag
827     undef $self->{current_token};
828    
829     redo A;
830     } else {
831     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
832     ## Stay in the state
833     !!!next-input-character;
834     redo A;
835     }
836     } elsif ($self->{state} eq 'after attribute name') {
837     if ($self->{next_input_character} == 0x0009 or # HT
838     $self->{next_input_character} == 0x000A or # LF
839     $self->{next_input_character} == 0x000B or # VT
840     $self->{next_input_character} == 0x000C or # FF
841     $self->{next_input_character} == 0x0020) { # SP
842     ## Stay in the state
843     !!!next-input-character;
844     redo A;
845     } elsif ($self->{next_input_character} == 0x003D) { # =
846     $self->{state} = 'before attribute value';
847     !!!next-input-character;
848     redo A;
849     } elsif ($self->{next_input_character} == 0x003E) { # >
850     if ($self->{current_token}->{type} eq 'start tag') {
851     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
852     } elsif ($self->{current_token}->{type} eq 'end tag') {
853     $self->{content_model_flag} = 'PCDATA'; # MUST
854     if ($self->{current_token}->{attributes}) {
855     !!!parse-error;
856     }
857     } else {
858     die "$0: $self->{current_token}->{type}: Unknown token type";
859     }
860     $self->{state} = 'data';
861     !!!next-input-character;
862    
863     !!!emit ($self->{current_token}); # start tag or end tag
864     undef $self->{current_token};
865    
866     redo A;
867     } elsif (0x0041 <= $self->{next_input_character} and
868     $self->{next_input_character} <= 0x005A) { # A..Z
869     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
870     value => ''};
871     $self->{state} = 'attribute name';
872     !!!next-input-character;
873     redo A;
874     } elsif ($self->{next_input_character} == 0x002F) { # /
875     !!!next-input-character;
876     if ($self->{next_input_character} == 0x003E and # >
877     $self->{current_token}->{type} eq 'start tag' and
878     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
879     # permitted slash
880     #
881     } else {
882     !!!parse-error;
883     }
884     $self->{state} = 'before attribute name';
885     # next-input-character is already done
886     redo A;
887     } elsif ($self->{next_input_character} == 0x003C or # <
888     $self->{next_input_character} == -1) {
889     !!!parse-error;
890     if ($self->{current_token}->{type} eq 'start tag') {
891     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
892     } elsif ($self->{current_token}->{type} eq 'end tag') {
893     $self->{content_model_flag} = 'PCDATA'; # MUST
894     if ($self->{current_token}->{attributes}) {
895     !!!parse-error;
896     }
897     } else {
898     die "$0: $self->{current_token}->{type}: Unknown token type";
899     }
900     $self->{state} = 'data';
901     # reconsume
902    
903     !!!emit ($self->{current_token}); # start tag or end tag
904     undef $self->{current_token};
905    
906     redo A;
907     } else {
908     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
909     value => ''};
910     $self->{state} = 'attribute name';
911     !!!next-input-character;
912     redo A;
913     }
914     } elsif ($self->{state} eq 'before attribute value') {
915     if ($self->{next_input_character} == 0x0009 or # HT
916     $self->{next_input_character} == 0x000A or # LF
917     $self->{next_input_character} == 0x000B or # VT
918     $self->{next_input_character} == 0x000C or # FF
919     $self->{next_input_character} == 0x0020) { # SP
920     ## Stay in the state
921     !!!next-input-character;
922     redo A;
923     } elsif ($self->{next_input_character} == 0x0022) { # "
924     $self->{state} = 'attribute value (double-quoted)';
925     !!!next-input-character;
926     redo A;
927     } elsif ($self->{next_input_character} == 0x0026) { # &
928     $self->{state} = 'attribute value (unquoted)';
929     ## reconsume
930     redo A;
931     } elsif ($self->{next_input_character} == 0x0027) { # '
932     $self->{state} = 'attribute value (single-quoted)';
933     !!!next-input-character;
934     redo A;
935     } elsif ($self->{next_input_character} == 0x003E) { # >
936     if ($self->{current_token}->{type} eq 'start tag') {
937     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
938     } elsif ($self->{current_token}->{type} eq 'end tag') {
939     $self->{content_model_flag} = 'PCDATA'; # MUST
940     if ($self->{current_token}->{attributes}) {
941     !!!parse-error;
942     }
943     } else {
944     die "$0: $self->{current_token}->{type}: Unknown token type";
945     }
946     $self->{state} = 'data';
947     !!!next-input-character;
948    
949     !!!emit ($self->{current_token}); # start tag or end tag
950     undef $self->{current_token};
951    
952     redo A;
953     } elsif ($self->{next_input_character} == 0x003C or # <
954     $self->{next_input_character} == -1) {
955     !!!parse-error;
956     if ($self->{current_token}->{type} eq 'start tag') {
957     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
958     } elsif ($self->{current_token}->{type} eq 'end tag') {
959     $self->{content_model_flag} = 'PCDATA'; # MUST
960     if ($self->{current_token}->{attributes}) {
961     !!!parse-error;
962     }
963     } else {
964     die "$0: $self->{current_token}->{type}: Unknown token type";
965     }
966     $self->{state} = 'data';
967     ## reconsume
968    
969     !!!emit ($self->{current_token}); # start tag or end tag
970     undef $self->{current_token};
971    
972     redo A;
973     } else {
974     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
975     $self->{state} = 'attribute value (unquoted)';
976     !!!next-input-character;
977     redo A;
978     }
979     } elsif ($self->{state} eq 'attribute value (double-quoted)') {
980     if ($self->{next_input_character} == 0x0022) { # "
981     $self->{state} = 'before attribute name';
982     !!!next-input-character;
983     redo A;
984     } elsif ($self->{next_input_character} == 0x0026) { # &
985     $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
986     $self->{state} = 'entity in attribute value';
987     !!!next-input-character;
988     redo A;
989     } elsif ($self->{next_input_character} == -1) {
990     !!!parse-error;
991     if ($self->{current_token}->{type} eq 'start tag') {
992     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
993     } elsif ($self->{current_token}->{type} eq 'end tag') {
994     $self->{content_model_flag} = 'PCDATA'; # MUST
995     if ($self->{current_token}->{attributes}) {
996     !!!parse-error;
997     }
998     } else {
999     die "$0: $self->{current_token}->{type}: Unknown token type";
1000     }
1001     $self->{state} = 'data';
1002     ## reconsume
1003    
1004     !!!emit ($self->{current_token}); # start tag or end tag
1005     undef $self->{current_token};
1006    
1007     redo A;
1008     } else {
1009     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1010     ## Stay in the state
1011     !!!next-input-character;
1012     redo A;
1013     }
1014     } elsif ($self->{state} eq 'attribute value (single-quoted)') {
1015     if ($self->{next_input_character} == 0x0027) { # '
1016     $self->{state} = 'before attribute name';
1017     !!!next-input-character;
1018     redo A;
1019     } elsif ($self->{next_input_character} == 0x0026) { # &
1020     $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
1021     $self->{state} = 'entity in attribute value';
1022     !!!next-input-character;
1023     redo A;
1024     } elsif ($self->{next_input_character} == -1) {
1025     !!!parse-error;
1026     if ($self->{current_token}->{type} eq 'start tag') {
1027     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1028     } elsif ($self->{current_token}->{type} eq 'end tag') {
1029     $self->{content_model_flag} = 'PCDATA'; # MUST
1030     if ($self->{current_token}->{attributes}) {
1031     !!!parse-error;
1032     }
1033     } else {
1034     die "$0: $self->{current_token}->{type}: Unknown token type";
1035     }
1036     $self->{state} = 'data';
1037     ## reconsume
1038    
1039     !!!emit ($self->{current_token}); # start tag or end tag
1040     undef $self->{current_token};
1041    
1042     redo A;
1043     } else {
1044     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1045     ## Stay in the state
1046     !!!next-input-character;
1047     redo A;
1048     }
1049     } elsif ($self->{state} eq 'attribute value (unquoted)') {
1050     if ($self->{next_input_character} == 0x0009 or # HT
1051     $self->{next_input_character} == 0x000A or # LF
1052     $self->{next_input_character} == 0x000B or # HT
1053     $self->{next_input_character} == 0x000C or # FF
1054     $self->{next_input_character} == 0x0020) { # SP
1055     $self->{state} = 'before attribute name';
1056     !!!next-input-character;
1057     redo A;
1058     } elsif ($self->{next_input_character} == 0x0026) { # &
1059     $self->{last_attribute_value_state} = 'attribute value (unquoted)';
1060     $self->{state} = 'entity in attribute value';
1061     !!!next-input-character;
1062     redo A;
1063     } elsif ($self->{next_input_character} == 0x003E) { # >
1064     if ($self->{current_token}->{type} eq 'start tag') {
1065     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1066     } elsif ($self->{current_token}->{type} eq 'end tag') {
1067     $self->{content_model_flag} = 'PCDATA'; # MUST
1068     if ($self->{current_token}->{attributes}) {
1069     !!!parse-error;
1070     }
1071     } else {
1072     die "$0: $self->{current_token}->{type}: Unknown token type";
1073     }
1074     $self->{state} = 'data';
1075     !!!next-input-character;
1076    
1077     !!!emit ($self->{current_token}); # start tag or end tag
1078     undef $self->{current_token};
1079    
1080     redo A;
1081     } elsif ($self->{next_input_character} == 0x003C or # <
1082     $self->{next_input_character} == -1) {
1083     !!!parse-error;
1084     if ($self->{current_token}->{type} eq 'start tag') {
1085     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1086     } elsif ($self->{current_token}->{type} eq 'end tag') {
1087     $self->{content_model_flag} = 'PCDATA'; # MUST
1088     if ($self->{current_token}->{attributes}) {
1089     !!!parse-error;
1090     }
1091     } else {
1092     die "$0: $self->{current_token}->{type}: Unknown token type";
1093     }
1094     $self->{state} = 'data';
1095     ## reconsume
1096    
1097     !!!emit ($self->{current_token}); # start tag or end tag
1098     undef $self->{current_token};
1099    
1100     redo A;
1101     } else {
1102     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1103     ## Stay in the state
1104     !!!next-input-character;
1105     redo A;
1106     }
1107     } elsif ($self->{state} eq 'entity in attribute value') {
1108     my $token = $self->_tokenize_attempt_to_consume_an_entity;
1109    
1110     unless (defined $token) {
1111     $self->{current_attribute}->{value} .= '&';
1112     } else {
1113     $self->{current_attribute}->{value} .= $token->{data};
1114     ## ISSUE: spec says "append the returned character token to the current attribute's value"
1115     }
1116    
1117     $self->{state} = $self->{last_attribute_value_state};
1118     # next-input-character is already done
1119     redo A;
1120     } elsif ($self->{state} eq 'bogus comment') {
1121     ## (only happen if PCDATA state)
1122    
1123     my $token = {type => 'comment', data => ''};
1124    
1125     BC: {
1126     if ($self->{next_input_character} == 0x003E) { # >
1127     $self->{state} = 'data';
1128     !!!next-input-character;
1129    
1130     !!!emit ($token);
1131    
1132     redo A;
1133     } elsif ($self->{next_input_character} == -1) {
1134     $self->{state} = 'data';
1135     ## reconsume
1136    
1137     !!!emit ($token);
1138    
1139     redo A;
1140     } else {
1141     $token->{data} .= chr ($self->{next_input_character});
1142     !!!next-input-character;
1143     redo BC;
1144     }
1145     } # BC
1146     } elsif ($self->{state} eq 'markup declaration open') {
1147     ## (only happen if PCDATA state)
1148    
1149     my @next_char;
1150     push @next_char, $self->{next_input_character};
1151    
1152     if ($self->{next_input_character} == 0x002D) { # -
1153     !!!next-input-character;
1154     push @next_char, $self->{next_input_character};
1155     if ($self->{next_input_character} == 0x002D) { # -
1156     $self->{current_token} = {type => 'comment', data => ''};
1157     $self->{state} = 'comment';
1158     !!!next-input-character;
1159     redo A;
1160     }
1161     } elsif ($self->{next_input_character} == 0x0044 or # D
1162     $self->{next_input_character} == 0x0064) { # d
1163     !!!next-input-character;
1164     push @next_char, $self->{next_input_character};
1165     if ($self->{next_input_character} == 0x004F or # O
1166     $self->{next_input_character} == 0x006F) { # o
1167     !!!next-input-character;
1168     push @next_char, $self->{next_input_character};
1169     if ($self->{next_input_character} == 0x0043 or # C
1170     $self->{next_input_character} == 0x0063) { # c
1171     !!!next-input-character;
1172     push @next_char, $self->{next_input_character};
1173     if ($self->{next_input_character} == 0x0054 or # T
1174     $self->{next_input_character} == 0x0074) { # t
1175     !!!next-input-character;
1176     push @next_char, $self->{next_input_character};
1177     if ($self->{next_input_character} == 0x0059 or # Y
1178     $self->{next_input_character} == 0x0079) { # y
1179     !!!next-input-character;
1180     push @next_char, $self->{next_input_character};
1181     if ($self->{next_input_character} == 0x0050 or # P
1182     $self->{next_input_character} == 0x0070) { # p
1183     !!!next-input-character;
1184     push @next_char, $self->{next_input_character};
1185     if ($self->{next_input_character} == 0x0045 or # E
1186     $self->{next_input_character} == 0x0065) { # e
1187     ## ISSUE: What a stupid code this is!
1188     $self->{state} = 'DOCTYPE';
1189     !!!next-input-character;
1190     redo A;
1191     }
1192     }
1193     }
1194     }
1195     }
1196     }
1197     }
1198    
1199     !!!parse-error;
1200     $self->{next_input_character} = shift @next_char;
1201     !!!back-next-input-character (@next_char);
1202     $self->{state} = 'bogus comment';
1203     redo A;
1204    
1205     ## ISSUE: typos in spec: chacacters, is is a parse error
1206     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1207     } elsif ($self->{state} eq 'comment') {
1208     if ($self->{next_input_character} == 0x002D) { # -
1209     $self->{state} = 'comment dash';
1210     !!!next-input-character;
1211     redo A;
1212     } elsif ($self->{next_input_character} == -1) {
1213     !!!parse-error;
1214     $self->{state} = 'data';
1215     ## reconsume
1216    
1217     !!!emit ($self->{current_token}); # comment
1218     undef $self->{current_token};
1219    
1220     redo A;
1221     } else {
1222     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1223     ## Stay in the state
1224     !!!next-input-character;
1225     redo A;
1226     }
1227     } elsif ($self->{state} eq 'comment dash') {
1228     if ($self->{next_input_character} == 0x002D) { # -
1229     $self->{state} = 'comment end';
1230     !!!next-input-character;
1231     redo A;
1232     } elsif ($self->{next_input_character} == -1) {
1233     !!!parse-error;
1234     $self->{state} = 'data';
1235     ## reconsume
1236    
1237     !!!emit ($self->{current_token}); # comment
1238     undef $self->{current_token};
1239    
1240     redo A;
1241     } else {
1242     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1243     $self->{state} = 'comment';
1244     !!!next-input-character;
1245     redo A;
1246     }
1247     } elsif ($self->{state} eq 'comment end') {
1248     if ($self->{next_input_character} == 0x003E) { # >
1249     $self->{state} = 'data';
1250     !!!next-input-character;
1251    
1252     !!!emit ($self->{current_token}); # comment
1253     undef $self->{current_token};
1254    
1255     redo A;
1256     } elsif ($self->{next_input_character} == 0x002D) { # -
1257     !!!parse-error;
1258     $self->{current_token}->{data} .= '-'; # comment
1259     ## Stay in the state
1260     !!!next-input-character;
1261     redo A;
1262     } elsif ($self->{next_input_character} == -1) {
1263     !!!parse-error;
1264     $self->{state} = 'data';
1265     ## reconsume
1266    
1267     !!!emit ($self->{current_token}); # comment
1268     undef $self->{current_token};
1269    
1270     redo A;
1271     } else {
1272     !!!parse-error;
1273     $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1274     $self->{state} = 'comment';
1275     !!!next-input-character;
1276     redo A;
1277     }
1278     } elsif ($self->{state} eq 'DOCTYPE') {
1279     if ($self->{next_input_character} == 0x0009 or # HT
1280     $self->{next_input_character} == 0x000A or # LF
1281     $self->{next_input_character} == 0x000B or # VT
1282     $self->{next_input_character} == 0x000C or # FF
1283     $self->{next_input_character} == 0x0020) { # SP
1284     $self->{state} = 'before DOCTYPE name';
1285     !!!next-input-character;
1286     redo A;
1287     } else {
1288     !!!parse-error;
1289     $self->{state} = 'before DOCTYPE name';
1290     ## reconsume
1291     redo A;
1292     }
1293     } elsif ($self->{state} eq 'before DOCTYPE name') {
1294     if ($self->{next_input_character} == 0x0009 or # HT
1295     $self->{next_input_character} == 0x000A or # LF
1296     $self->{next_input_character} == 0x000B or # VT
1297     $self->{next_input_character} == 0x000C or # FF
1298     $self->{next_input_character} == 0x0020) { # SP
1299     ## Stay in the state
1300     !!!next-input-character;
1301     redo A;
1302     } elsif (0x0061 <= $self->{next_input_character} and
1303     $self->{next_input_character} <= 0x007A) { # a..z
1304     $self->{current_token} = {type => 'DOCTYPE',
1305     name => chr ($self->{next_input_character} - 0x0020),
1306     error => 1};
1307     $self->{state} = 'DOCTYPE name';
1308     !!!next-input-character;
1309     redo A;
1310     } elsif ($self->{next_input_character} == 0x003E) { # >
1311     !!!parse-error;
1312     $self->{state} = 'data';
1313     !!!next-input-character;
1314    
1315     !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1316    
1317     redo A;
1318     } elsif ($self->{next_input_character} == -1) {
1319     !!!parse-error;
1320     $self->{state} = 'data';
1321     ## reconsume
1322    
1323     !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1324    
1325     redo A;
1326     } else {
1327     $self->{current_token} = {type => 'DOCTYPE',
1328     name => chr ($self->{next_input_character}),
1329     error => 1};
1330     $self->{state} = 'DOCTYPE name';
1331     !!!next-input-character;
1332     redo A;
1333     }
1334     } elsif ($self->{state} eq 'DOCTYPE name') {
1335     if ($self->{next_input_character} == 0x0009 or # HT
1336     $self->{next_input_character} == 0x000A or # LF
1337     $self->{next_input_character} == 0x000B or # VT
1338     $self->{next_input_character} == 0x000C or # FF
1339     $self->{next_input_character} == 0x0020) { # SP
1340     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1341     $self->{state} = 'after DOCTYPE name';
1342     !!!next-input-character;
1343     redo A;
1344     } elsif ($self->{next_input_character} == 0x003E) { # >
1345     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1346     $self->{state} = 'data';
1347     !!!next-input-character;
1348    
1349     !!!emit ($self->{current_token}); # DOCTYPE
1350     undef $self->{current_token};
1351    
1352     redo A;
1353     } elsif (0x0061 <= $self->{next_input_character} and
1354     $self->{next_input_character} <= 0x007A) { # a..z
1355     $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1356     #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1357     ## Stay in the state
1358     !!!next-input-character;
1359     redo A;
1360     } elsif ($self->{next_input_character} == -1) {
1361     !!!parse-error;
1362     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1363     $self->{state} = 'data';
1364     ## reconsume
1365    
1366     !!!emit ($self->{current_token});
1367     undef $self->{current_token};
1368    
1369     redo A;
1370     } else {
1371     $self->{current_token}->{name}
1372     .= chr ($self->{next_input_character}); # DOCTYPE
1373     #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1374     ## Stay in the state
1375     !!!next-input-character;
1376     redo A;
1377     }
1378     } elsif ($self->{state} eq 'after DOCTYPE name') {
1379     if ($self->{next_input_character} == 0x0009 or # HT
1380     $self->{next_input_character} == 0x000A or # LF
1381     $self->{next_input_character} == 0x000B or # VT
1382     $self->{next_input_character} == 0x000C or # FF
1383     $self->{next_input_character} == 0x0020) { # SP
1384     ## Stay in the state
1385     !!!next-input-character;
1386     redo A;
1387     } elsif ($self->{next_input_character} == 0x003E) { # >
1388     $self->{state} = 'data';
1389     !!!next-input-character;
1390    
1391     !!!emit ($self->{current_token}); # DOCTYPE
1392     undef $self->{current_token};
1393    
1394     redo A;
1395     } elsif ($self->{next_input_character} == -1) {
1396     !!!parse-error;
1397     $self->{state} = 'data';
1398     ## reconsume
1399    
1400     !!!emit ($self->{current_token}); # DOCTYPE
1401     undef $self->{current_token};
1402    
1403     redo A;
1404     } else {
1405     !!!parse-error;
1406     $self->{current_token}->{error} = 1; # DOCTYPE
1407     $self->{state} = 'bogus DOCTYPE';
1408     !!!next-input-character;
1409     redo A;
1410     }
1411     } elsif ($self->{state} eq 'bogus DOCTYPE') {
1412     if ($self->{next_input_character} == 0x003E) { # >
1413     $self->{state} = 'data';
1414     !!!next-input-character;
1415    
1416     !!!emit ($self->{current_token}); # DOCTYPE
1417     undef $self->{current_token};
1418    
1419     redo A;
1420     } elsif ($self->{next_input_character} == -1) {
1421     !!!parse-error;
1422     $self->{state} = 'data';
1423     ## reconsume
1424    
1425     !!!emit ($self->{current_token}); # DOCTYPE
1426     undef $self->{current_token};
1427    
1428     redo A;
1429     } else {
1430     ## Stay in the state
1431     !!!next-input-character;
1432     redo A;
1433     }
1434     } else {
1435     die "$0: $self->{state}: Unknown state";
1436     }
1437     } # A
1438    
1439     die "$0: _get_next_token: unexpected case";
1440     } # _get_next_token
1441    
## Attempt to consume an entity.  This is invoked with
## |$self->{next_input_character}| being the character that follows
## the "&".
##
## Returns a character token, i.e. a hash reference of the form
## |{type => 'character', data => $string}|, if a numeric or named
## entity is recognized.  Returns |undef| otherwise, in which case:
##
##   - "&#" or "&#x" not followed by a digit: "#" becomes the next
##     input character again and every character read after it is
##     pushed back, in input order;
##   - a name that matches no entry of |$entity_char|: the characters
##     read so far are pushed back as a single character token;
##   - anything else: no character is consumed.
##
## A parse error is reported in every |undef| case and whenever the
## terminating ";" is missing.
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  if ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    my $num;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      X: {
        ## On the first pass |$x_char| is the "x" or "X" itself; on
        ## later passes (after |redo X|) it is the digit just
        ## accumulated, which is consumed by the macro below.
        my $x_char = $self->{next_input_character};
        !!!next-input-character;
        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0030;
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0060 + 9;
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0040 + 9;
          redo X;
        } elsif (not defined $num) { # no hexadecimal digit
          !!!parse-error;
          ## Un-read both the "x"/"X" and the character that follows
          ## it (in input order) before making "#" the next input
          ## character again.  Pushing back only |$x_char| would drop
          ## the character after it from the input stream.
          !!!back-next-input-character ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error;
        }

        ## TODO: check the definition for |a valid Unicode character|.
        if ($num > 1114111 or $num == 0) {
          $num = 0xFFFD; # REPLACEMENT CHARACTER
          ## ISSUE: Why this is not an error?
        }

        return {type => 'character', data => chr $num};
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error;
      }

      ## TODO: check the definition for |a valid Unicode character|.
      if ($code > 1114111 or $code == 0) {
        $code = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      }

      return {type => 'character', data => chr $code};
    } else {
      ## "&#" followed by neither "x"/"X" nor a decimal digit: un-read
      ## the current character and restore "#".
      !!!parse-error;
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## |$value| is what gets returned (or pushed back): the replacement
    ## text of the most recent match followed by any characters read
    ## after it, or the raw name if nothing has matched.
    my $value = $entity_name;
    my $match;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x005A) or
            (0x0061 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x007A) or
            (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039))) {
      $entity_name .= chr $self->{next_input_character};
      if (defined $entity_char->{$entity_name}) {
        $value = $entity_char->{$entity_name};
        $match = 1;
      } else {
        ## NOTE(review): characters read after the last match are
        ## appended to the returned data rather than pushed back to
        ## the input; confirm this is the intended reading of the
        ## spec's "maximum number of characters" rule.
        $value .= chr $self->{next_input_character};
      }
      !!!next-input-character;
    }

    if ($match) {
      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error;
      }

      return {type => 'character', data => $value};
    } else {
      !!!parse-error;
      ## NOTE: No characters are consumed in the spec.
      !!!back-token ({type => 'character', data => $value});
      return undef;
    }
  } else {
    ## no characters are consumed
    !!!parse-error;
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
1572    
## Prepare the document for the tree construction stage by turning
## strict error checking off on |$self->{document}|.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  ## NOTE: $self->{document} MUST be specified before this method is called
  my $doc = $self->{document};

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  ## TODO: Mark the Document as an HTML document # MUST
  $doc->strict_error_checking (0);
} # _initialize_tree_constructor
1581    
## Finish the tree construction stage by turning strict error
## checking back on for |$self->{document}|.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;

  my $doc = $self->{document};

  ## TODO: Turn mutation events on
  $doc->strict_error_checking (1);
} # _terminate_tree_constructor
1587    
1588     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1589    
1590     sub _construct_tree ($) {
1591     my ($self) = @_;
1592    
1593     ## When an interactive UA render the $self->{document} available
1594     ## to the user, or when it begin accepting user input, are
1595     ## not defined.
1596    
1597     ## Append a character: collect it and all subsequent consecutive
1598     ## characters and insert one Text node whose data is concatenation
1599     ## of all those characters. # MUST
1600    
1601     my $token;
1602     !!!next-token;
1603    
1604     my $phase = 'initial'; # MUST
1605    
1606     my $open_elements = [];
1607     my $active_formatting_elements = [];
1608     my $head_element;
1609     my $form_element;
1610     my $insertion_mode = 'before head';
1611    
1612     my $reconstruct_active_formatting_elements = sub { # MUST
1613     my $insert = shift;
1614    
1615     ## Step 1
1616     return unless @$active_formatting_elements;
1617    
1618     ## Step 3
1619     my $i = -1;
1620     my $entry = $active_formatting_elements->[$i];
1621    
1622     ## Step 2
1623     return if $entry->[0] eq '#marker';
1624     for (@$open_elements) {
1625     if ($entry->[0] eq $_->[0]) {
1626     return;
1627     }
1628     }
1629    
1630     S4: {
1631     ## Step 4
1632     last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
1633    
1634     ## Step 5
1635     $i--;
1636     $entry = $active_formatting_elements->[$i];
1637    
1638     ## Step 6
1639     if ($entry->[0] eq '#marker') {
1640     #
1641     } else {
1642     my $in_open_elements;
1643     OE: for (@$open_elements) {
1644     if ($entry->[0] eq $_->[0]) {
1645     $in_open_elements = 1;
1646     last OE;
1647     }
1648     }
1649     if ($in_open_elements) {
1650     #
1651     } else {
1652     redo S4;
1653     }
1654     }
1655    
1656     ## Step 7
1657     $i++;
1658     $entry = $active_formatting_elements->[$i];
1659     } # S4
1660    
1661     S7: {
1662     ## Step 8
1663     my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
1664    
1665     ## Step 9
1666     $insert->($clone->[0]);
1667     push @$open_elements, $clone;
1668    
1669     ## Step 10
1670     $active_formatting_elements->[$i] = $open_elements->[-1];
1671    
1672     ## Step 11
1673     unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
1674     ## Step 7'
1675     $i++;
1676     $entry = $active_formatting_elements->[$i];
1677    
1678     redo S7;
1679     }
1680     } # S7
1681     }; # $reconstruct_active_formatting_elements
1682    
1683     my $clear_up_to_marker = sub {
1684     for (reverse 0..$#$active_formatting_elements) {
1685     if ($active_formatting_elements->[$_]->[0] eq '#marker') {
1686     splice @$active_formatting_elements, $_;
1687     return;
1688     }
1689     }
1690     }; # $clear_up_to_marker
1691    
1692     my $reset_insertion_mode = sub {
1693     ## Step 1
1694     my $last;
1695    
1696     ## Step 2
1697     my $i = -1;
1698     my $node = $open_elements->[$i];
1699    
1700     ## Step 3
1701     S3: {
1702     $last = 1 if $open_elements->[0]->[0] eq $node->[0];
1703     ## TODO: the element whose inner_html is set is neither td nor th, then $node = the element
1704    
1705     ## Step 4..13
1706     my $new_mode = {
1707     select => 'in select',
1708     td => 'in cell',
1709     th => 'in cell',
1710     tr => 'in row',
1711     tbody => 'in table body',
1712     thead => 'in table head',
1713     tfoot => 'in table foot',
1714     caption => 'in caption',
1715     colgroup => 'in column group',
1716     table => 'in table',
1717     head => 'in body', # not in head!
1718     body => 'in body',
1719     frameset => 'in frameset',
1720     }->{$node->[1]};
1721     $insertion_mode = $new_mode and return if defined $new_mode;
1722    
1723     ## Step 14
1724     if ($node->[1] eq 'html') {
1725     unless (defined $head_element) {
1726     $insertion_mode = 'before head';
1727     } else {
1728     $insertion_mode = 'after head';
1729     }
1730     return;
1731     }
1732    
1733     ## Step 15
1734     $insertion_mode = 'in body' and return if $last;
1735    
1736     ## Step 16
1737     $i--;
1738     $node = $open_elements->[$i];
1739    
1740     ## Step 17
1741     redo S3;
1742     } # S3
1743     }; # $reset_insertion_mode
1744    
1745     my $style_start_tag = sub {
1746     my $style_el; !!!create-element ($style_el, 'style');
1747     ## $insertion_mode eq 'in head' and ... (always true)
1748     (($insertion_mode eq 'in head' and defined $head_element)
1749     ? $head_element : $open_elements->[-1]->[0])
1750     ->append_child ($style_el);
1751     $self->{content_model_flag} = 'CDATA';
1752    
1753     my $text = '';
1754     !!!next-token;
1755     while ($token->{type} eq 'character') {
1756     $text .= $token->{data};
1757     !!!next-token;
1758     } # stop if non-character token or tokenizer stops tokenising
1759     if (length $text) {
1760     $style_el->manakai_append_text ($text);
1761     }
1762    
1763     $self->{content_model_flag} = 'PCDATA';
1764    
1765     if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
1766     ## Ignore the token
1767     } else {
1768     !!!parse-error;
1769     ## ISSUE: And ignore?
1770     }
1771     !!!next-token;
1772     }; # $style_start_tag
1773    
## Handles a "script" start tag (the current $token).
##
## The script element is created from the token's attributes, its text is
## gathered from the following character tokens while the tokenizer is in
## the CDATA content model, and only then is the element attached to the
## tree (to the head element when in 'in head' mode with a known head
## element, otherwise to the current node).  The token that ended the
## text run is always consumed; anything other than the matching end tag
## is reported as a parse error.
my $script_start_tag = sub {
  my $script_node;
  !!!create-element ($script_node, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';

  ## Accumulate the raw script text.
  my $content = '';
  !!!next-token;
  until ($token->{type} ne 'character') {
    $content = $content . $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  $script_node->manakai_append_text ($content) if length $content;

  $self->{content_model_flag} = 'PCDATA';

  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    !!!parse-error;
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  } # otherwise: the matching end tag, which is simply ignored

  ## TODO: inner_html mode then mark as "already executed" and skip
  {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    my $parent_node;
    if ($insertion_mode eq 'in head' and defined $head_element) {
      $parent_node = $head_element;
    } else {
      $parent_node = $open_elements->[-1]->[0];
    }
    $parent_node->append_child ($script_node);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }

  !!!next-token;
}; # $script_start_tag
1817    
## Handles an end tag whose name is a formatting element name.
##
## Argument: the tag name of the end tag being processed.
##
## The closure repeatedly (via |redo FET|) looks up the most recent entry
## of that name in @$active_formatting_elements (stopping at a '#marker'
## entry), and either simply pops the open-element stack down to it, or,
## when a "furthest block" exists below it, re-parents the intervening
## nodes and replaces the formatting element by a clone inside the
## furthest block.  Every exit path consumes the current token.
## NOTE(review): this looks like the HTML5 steps for misnested formatting
## end tags; confirm against the spec revision this file targets.
##
## Entries of both lists are array references [node, tag name].
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error;
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        ## A scoping element was passed; keep scanning only to find out
        ## whether the element is on the stack at all.
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error;
      ## Remove exactly the entry found in Step 1.  It is not necessarily
      ## the last entry of the list, so |pop| could drop an unrelated one.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $open_elements->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error;
    }

    ## Step 2
    ## Scanning from the current node upwards and overwriting on every
    ## match leaves the matching node closest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      ## No block below the formatting element: plain close.
      splice @$open_elements, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $open_elements->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE(review): when the formatting element is the first entry this
    ## index is -1, i.e. the last entry of the list; confirm intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $open_elements->[$node_i_in_open];

      ## Step 2
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        ## Not an active formatting element: drop it from the stack and
        ## look at the next node up.
        splice @$open_elements, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $open_elements->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first; it is modified while moving the nodes.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        ## The bookmark was seen at a higher index; shift it down by one.
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#$open_elements) {
      if ($open_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$open_elements, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($open_elements->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## NOTE(review): a length of 1 replaces any entry that follows the
    ## furthest block instead of inserting before it; confirm intended.
    splice @$open_elements, $i + 1, 1, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2000    
## Insertion callback: appends the given node as the last child of the
## current node (the node of the last entry of @$open_elements).
my $insert_to_current = sub {
  my $child = shift;
  return $open_elements->[-1]->[0]->append_child ($child);
}; # $insert_to_current
2004    
## Insertion callback used when the current node may be part of a table.
##
## If the current node is not one of table/tbody/tfoot/thead/tr the node
## is simply appended to it.  Otherwise the node is placed relative to the
## last "table" entry of the open-element stack: before that table in its
## parent when the parent is an element node (node_type == 1), or else at
## the end of the element preceding the table in the stack.  Without any
## table in the stack the first stack entry is used as the parent.
my $insert_to_foster = sub {
  my $child = shift;

  my $current_is_table_part = {
    table => 1, tbody => 1, tfoot => 1,
    thead => 1, tr => 1,
  }->{$open_elements->[-1]->[1]};
  unless ($current_is_table_part) {
    return $open_elements->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  my $idx = $#$open_elements;
  while ($idx >= 0) {
    my $entry = $open_elements->[$idx];
    if ($entry->[1] eq 'table') {
      my $table_parent = $entry->[0]->parent_node;
      if (defined $table_parent and $table_parent->node_type == 1) {
        $foster_parent_element = $table_parent;
        $next_sibling = $entry->[0];
      } else {
        $foster_parent_element = $open_elements->[$idx - 1]->[0];
      }
      last;
    }
    $idx--;
  }
  unless (defined $foster_parent_element) {
    $foster_parent_element = $open_elements->[0]->[0];
  }
  ## An undefined $next_sibling is passed through unchanged.
  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2035    
## Processes the current $token according to the "in body" rules.
##
## Argument: $insert - a code reference that is called with a newly
## created node to insert it into the tree (see the "as if in head" and
## textarea/noembed/noframes branches).
## NOTE(review): presumably $insert_to_current or $insert_to_foster;
## confirm against the callers.
##
## Only 'start tag' and 'end tag' tokens are handled here; for any other
## token type the closure returns without doing anything.  Each branch
## either consumes the token (|!!!next-token|) or replaces it by a
## synthesized token to be reprocessed (|!!!back-token| followed by an
## assignment to $token) before returning.
##
## The "has a ... element in scope" loops walk the open-element stack
## from the current node upwards and stop at the first of table, caption,
## td, th, button, marquee, object or html.
my $in_body = sub {
  my $insert = shift;
  if ($token->{type} eq 'start tag') {
    if ($token->{tag_name} eq 'script') {
      $script_start_tag->();
      return;
    } elsif ($token->{tag_name} eq 'style') {
      $style_start_tag->();
      return;
    } elsif ({
              base => 1, link => 1, meta => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error ($token->{tag_name}.' in body');
      ## NOTE: This is an "as if in head" code clone
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});
      if (defined $head_element) {
        $head_element->append_child ($el);
      } else {
        $insert->($el);
      }

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'title') {
      !!!parse-error ('title in body');
      ## NOTE: There is an "as if in head" code clone
      my $title_el;
      !!!create-element ($title_el, 'title', $token->{attributes});
      (defined $head_element ? $head_element : $open_elements->[-1]->[0])
          ->append_child ($title_el);
      $self->{content_model_flag} = 'RCDATA';

      my $text = '';
      !!!next-token;
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $title_el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'title') {
        ## Ignore the token
      } else {
        !!!parse-error;
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'body') {
      !!!parse-error;

      if (@$open_elements == 1 or
          $open_elements->[1]->[1] ne 'body') {
        ## Ignore the token
      } else {
        ## Copy only those attributes the existing body element lacks.
        my $body_el = $open_elements->[1]->[0];
        for my $attr_name (keys %{$token->{attributes}}) {
          unless ($body_el->has_attribute_ns (undef, $attr_name)) {
            $body_el->set_attribute_ns
                (undef, [undef, $attr_name],
                 $token->{attributes}->{$attr_name}->{value});
          }
        }
      }
      !!!next-token;
      return;
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, p => 1, ul => 1,
              pre => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          ## Close the open p first; the start tag is reprocessed later.
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      if ($token->{tag_name} eq 'pre') {
        !!!next-token;
        if ($token->{type} eq 'character') {
          ## Drop a single newline directly after the pre start tag.
          $token->{data} =~ s/^\x0A//;
          unless (length $token->{data}) {
            !!!next-token;
          }
        }
      } else {
        !!!next-token;
      }
      return;
    } elsif ($token->{tag_name} eq 'form') {
      if (defined $form_element) {
        !!!parse-error;
        ## Ignore the token
        ## The token has to be consumed here; otherwise the caller would
        ## be handed the very same start tag again.
        !!!next-token;
        return;
      } else {
        ## has a p element in scope
        INSCOPE: for (reverse @$open_elements) {
          if ($_->[1] eq 'p') {
            !!!back-token;
            $token = {type => 'end tag', tag_name => 'p'};
            return;
          } elsif ({
                    table => 1, caption => 1, td => 1, th => 1,
                    button => 1, marquee => 1, object => 1, html => 1,
                   }->{$_->[1]}) {
            last INSCOPE;
          }
        } # INSCOPE

        !!!insert-element-t ($token->{tag_name}, $token->{attributes});
        $form_element = $open_elements->[-1]->[0];
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'li') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## Step 1
      my $i = -1;
      my $node = $open_elements->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'li') {
          ## Pop the open li together with everything above it.
          splice @$open_elements, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        $i--;
        $node = $open_elements->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## Step 1
      my $i = -1;
      my $node = $open_elements->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
          splice @$open_elements, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        $i--;
        $node = $open_elements->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'plaintext') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'PLAINTEXT';

      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if (defined $i) {
        ## A heading is still open: report it and pop down to it.
        !!!parse-error;
        splice @$open_elements, $i;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'a') {
      AFE: for my $i (reverse 0..$#$active_formatting_elements) {
        my $node = $active_formatting_elements->[$i];
        if ($node->[1] eq 'a') {
          !!!parse-error ('a in a');

          ## Act as if an "a" end tag had been seen; the start tag is
          ## pushed back so that it becomes the current token again.
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'a'};
          $formatting_end_tag->($token->{tag_name});

          ## Remove the old element from both lists if it is still there.
          AFE2: for (reverse 0..$#$active_formatting_elements) {
            if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
              splice @$active_formatting_elements, $_, 1;
              last AFE2;
            }
          } # AFE2
          OE: for (reverse 0..$#$open_elements) {
            if ($open_elements->[$_]->[0] eq $node->[0]) {
              splice @$open_elements, $_, 1;
              last OE;
            }
          } # OE
          last AFE;
        } elsif ($node->[0] eq '#marker') {
          last AFE;
        }
      } # AFE

      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $open_elements->[-1];

      !!!next-token;
      return;
    } elsif ({
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $open_elements->[-1];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'button') {
      ## has a button element in scope
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq 'button') {
          !!!parse-error;
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'button'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'marquee' or
             $token->{tag_name} eq 'object') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'xmp') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'CDATA';

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'table') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $insertion_mode = 'in table';

      !!!next-token;
      return;
    } elsif ({
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
              image => 1,
             }->{$token->{tag_name}}) {
      if ($token->{tag_name} eq 'image') {
        !!!parse-error;
        $token->{tag_name} = 'img';
      }

      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      ## These elements take no content; pop them right away.
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'hr') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'input') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      ## TODO: associate with $form_element if defined
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'isindex') {
      !!!parse-error;

      if (defined $form_element) {
        ## Ignore the token
        !!!next-token;
        return;
      } else {
        ## Replace the isindex tag by an equivalent token sequence: the
        ## first token becomes current, the others are pushed back.
        my $at = $token->{attributes};
        $at->{name} = {name => 'name', value => 'isindex'};
        my @tokens = (
          {type => 'start tag', tag_name => 'form'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'start tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'label'},
          {type => 'character',
           data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
          ## TODO: make this configurable
          {type => 'start tag', tag_name => 'input', attributes => $at},
          #{type => 'character', data => ''}, # SHOULD
          {type => 'end tag', tag_name => 'label'},
          {type => 'end tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'end tag', tag_name => 'form'},
        );
        $token = shift @tokens;
        !!!back-token (@tokens);
        return;
      }
    } elsif ({
              textarea => 1,
              noembed => 1,
              noframes => 1,
              noscript => 0, ## TODO: 1 if scripting is enabled
             }->{$token->{tag_name}}) {
      ## Remember the name; $token is replaced while reading the text.
      my $tag_name = $token->{tag_name};
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});

      if ($token->{tag_name} eq 'textarea') {
        ## TODO: form_element if defined
        $self->{content_model_flag} = 'RCDATA';
      } else {
        $self->{content_model_flag} = 'CDATA';
      }

      $insert->($el);

      my $text = '';
      !!!next-token;
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $tag_name) {
        ## Ignore the token
      } else {
        !!!parse-error;
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'select') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $insertion_mode = 'in select';
      !!!next-token;
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: An issue on HTML5 new elements in the spec.
    } else {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    }
  } elsif ($token->{type} eq 'end tag') {
    if ($token->{tag_name} eq 'body') {
      if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($open_elements->[-1]->[1] ne 'body') {
          !!!parse-error;
        }
        $insertion_mode = 'after body';
        !!!next-token;
        return;
      } else {
        !!!parse-error;
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'html') {
      if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($open_elements->[-1]->[1] ne 'body') {
          !!!parse-error;
        }
        $insertion_mode = 'after body';
        ## reprocess
        return;
      } else {
        !!!parse-error;
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, pre => 1, ul => 1,
              form => 1,
              p => 1,
              dd => 1, dt => 1, li => 1,
              button => 1, marquee => 1, object => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq $token->{tag_name}) {
          ## generate implied end tags
          ## (an element with the same name as the end tag is excluded)
          if ({
               dd => ($token->{tag_name} ne 'dd'),
               dt => ($token->{tag_name} ne 'dt'),
               li => ($token->{tag_name} ne 'li'),
               p => ($token->{tag_name} ne 'p'),
               td => 1, th => 1, tr => 1,
              }->{$open_elements->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $open_elements->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE unless $token->{tag_name} eq 'p';
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error;
      }

      splice @$open_elements, $i if defined $i;
      undef $form_element if $token->{tag_name} eq 'form';
      $clear_up_to_marker->()
          if {
            button => 1, marquee => 1, object => 1,
          }->{$token->{tag_name}};
      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$open_elements->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $open_elements->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error;
      }

      splice @$open_elements, $i if defined $i;
      !!!next-token;
      return;
    } elsif ({
              a => 1,
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $formatting_end_tag->($token->{tag_name});
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, hr => 1, iframe => 1, image => 1,
              img => 1, input => 1, isindex=> 1, noembed => 1,
              noframes => 1, param => 1, select => 1, spacer => 1,
              table => 1, textarea => 1, wbr => 1,
              noscript => 0, ## TODO: if scripting is enabled
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: Issue on HTML5 new elements in spec

    } else {
      ## Step 1
      my $node_i = -1;
      my $node = $open_elements->[$node_i];

      ## Step 2
      S2: {
        if ($node->[1] eq $token->{tag_name}) {
          ## Step 1
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$open_elements->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $open_elements->[-1]->[1]}; # MUST
            return;
          }

          ## Step 2
          if ($token->{tag_name} ne $open_elements->[-1]->[1]) {
            !!!parse-error;
          }

          ## Step 3
          splice @$open_elements, $node_i;
          ## The end tag has been handled; consume it so that it is not
          ## applied a second time to the shortened stack.
          !!!next-token;
          last S2;
        } else {
          ## Step 3
          if (not $formatting_category->{$node->[1]} and
              #not $phrasing_category->{$node->[1]} and
              ($special_category->{$node->[1]} or
               $scoping_category->{$node->[1]})) {
            !!!parse-error;
            ## Ignore the token
            !!!next-token;
            last S2;
          }
        }

        ## Step 4
        $node_i--;
        $node = $open_elements->[$node_i];

        ## Step 5;
        redo S2;
      } # S2
    }
  }
}; # $in_body
2764    
2765     B: {
2766     if ($phase eq 'initial') {
2767     if ($token->{type} eq 'DOCTYPE') {
2768     if ($token->{error}) {
2769     ## ISSUE: Spec currently left this case undefined.
2770     !!!parse-error ('bogus DOCTYPE');
2771     }
2772     my $doctype = $self->{document}->create_document_type_definition
2773     ($token->{name});
2774     $self->{document}->append_child ($doctype);
2775     $phase = 'root element';
2776     !!!next-token;
2777     redo B;
2778     } elsif ({
2779     comment => 1,
2780     'start tag' => 1,
2781     'end tag' => 1,
2782     'end-of-file' => 1,
2783     }->{$token->{type}}) {
2784     ## ISSUE: Spec currently left this case undefined.
2785     !!!parse-error ('missing DOCTYPE');
2786     $phase = 'root element';
2787     ## reprocess
2788     redo B;
2789     } elsif ($token->{type} eq 'character') {
2790     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2791     $self->{document}->manakai_append_text ($1);
2792     ## ISSUE: DOM3 Core does not allow Document > Text
2793     unless (length $token->{data}) {
2794     ## Stay in the phase
2795     !!!next-token;
2796     redo B;
2797     }
2798     }
2799     ## ISSUE: Spec currently left this case undefined.
2800     !!!parse-error ('missing DOCTYPE');
2801     $phase = 'root element';
2802     ## reprocess
2803     redo B;
2804     } else {
2805     die "$0: $token->{type}: Unknown token";
2806     }
2807     } elsif ($phase eq 'root element') {
2808     if ($token->{type} eq 'DOCTYPE') {
2809     !!!parse-error;
2810     ## Ignore the token
2811     ## Stay in the phase
2812     !!!next-token;
2813     redo B;
2814     } elsif ($token->{type} eq 'comment') {
2815     my $comment = $self->{document}->create_comment ($token->{data});
2816     $self->{document}->append_child ($comment);
2817     ## Stay in the phase
2818     !!!next-token;
2819     redo B;
2820     } elsif ($token->{type} eq 'character') {
2821     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2822     $self->{document}->manakai_append_text ($1);
2823     ## ISSUE: DOM3 Core does not allow Document > Text
2824     unless (length $token->{data}) {
2825     ## Stay in the phase
2826     !!!next-token;
2827     redo B;
2828     }
2829     }
2830     #
2831     } elsif ({
2832     'start tag' => 1,
2833     'end tag' => 1,
2834     'end-of-file' => 1,
2835     }->{$token->{type}}) {
2836     ## ISSUE: There is an issue in the spec
2837     #
2838     } else {
2839     die "$0: $token->{type}: Unknown token";
2840     }
2841     my $root_element; !!!create-element ($root_element, 'html');
2842     $self->{document}->append_child ($root_element);
2843     $open_elements = [[$root_element, 'html']];
2844     $phase = 'main';
2845     ## reprocess
2846     redo B;
2847     } elsif ($phase eq 'main') {
2848     if ($token->{type} eq 'DOCTYPE') {
2849     !!!parse-error;
2850     ## Ignore the token
2851     ## Stay in the phase
2852     !!!next-token;
2853     redo B;
2854     } elsif ($token->{type} eq 'start tag' and
2855     $token->{tag_name} eq 'html') {
2856     ## TODO: unless it is the first start tag token, parse-error
2857     my $top_el = $open_elements->[0]->[0];
2858     for my $attr_name (keys %{$token->{attributes}}) {
2859     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2860     $top_el->set_attribute_ns
2861     (undef, [undef, $attr_name],
2862     $token->{attributes}->{$attr_name}->{value});
2863     }
2864     }
2865     !!!next-token;
2866     redo B;
2867     } elsif ($token->{type} eq 'end-of-file') {
2868     ## Generate implied end tags
2869     if ({
2870     dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2871     }->{$open_elements->[-1]->[1]}) {
2872     !!!back-token;
2873     $token = {type => 'end tag', tag_name => $open_elements->[-1]->[1]};
2874     redo B;
2875     }
2876    
2877     if (@$open_elements > 2 or
2878     (@$open_elements == 2 and $open_elements->[1]->[1] ne 'body')) {
2879     !!!parse-error;
2880     } else {
2881     ## TODO: inner_html parser and @$open_elements > 1 and $open_elements->[1] ne 'body', then parse-error
2882     }
2883    
2884     ## Stop parsing
2885     last B;
2886    
2887     ## ISSUE: There is an issue in the spec.
2888     } else {
2889     if ($insertion_mode eq 'before head') {
2890     if ($token->{type} eq 'character') {
2891     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2892     $open_elements->[-1]->[0]->manakai_append_text ($1);
2893     unless (length $token->{data}) {
2894     !!!next-token;
2895     redo B;
2896     }
2897     }
2898     ## As if <head>
2899     !!!create-element ($head_element, 'head');
2900     $open_elements->[-1]->[0]->append_child ($head_element);
2901     push @$open_elements, [$head_element, 'head'];
2902     $insertion_mode = 'in head';
2903     ## reprocess
2904     redo B;
2905     } elsif ($token->{type} eq 'comment') {
2906     my $comment = $self->{document}->create_comment ($token->{data});
2907     $open_elements->[-1]->[0]->append_child ($comment);
2908     !!!next-token;
2909     redo B;
2910     } elsif ($token->{type} eq 'start tag') {
2911     my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
2912     !!!create-element ($head_element, 'head', $attr);
2913     $open_elements->[-1]->[0]->append_child ($head_element);
2914     push @$open_elements, [$head_element, 'head'];
2915     $insertion_mode = 'in head';
2916     if ($token->{tag_name} eq 'head') {
2917     !!!next-token;
2918     #} elsif ({
2919     # base => 1, link => 1, meta => 1,
2920     # script => 1, style => 1, title => 1,
2921     # }->{$token->{tag_name}}) {
2922     # ## reprocess
2923     } else {
2924     ## reprocess
2925     }
2926     redo B;
2927     } elsif ($token->{type} eq 'end tag') {
2928     if ($token->{tag_name} eq 'html') {
2929     ## As if <head>
2930     !!!create-element ($head_element, 'head');
2931     $open_elements->[-1]->[0]->append_child ($head_element);
2932     push @$open_elements, [$head_element, 'head'];
2933     $insertion_mode = 'in head';
2934     ## reprocess
2935     redo B;
2936     } else {
2937     !!!parse-error;
2938     ## Ignore the token
2939     !!!next-token;
2940     redo B;
2941     }
2942     } else {
2943     die "$0: $token->{type}: Unknown type";
2944     }
2945     } elsif ($insertion_mode eq 'in head') {
2946     if ($token->{type} eq 'character') {
2947     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2948     $open_elements->[-1]->[0]->manakai_append_text ($1);
2949     unless (length $token->{data}) {
2950     !!!next-token;
2951     redo B;
2952     }
2953     }
2954    
2955     #
2956     } elsif ($token->{type} eq 'comment') {
2957     my $comment = $self->{document}->create_comment ($token->{data});
2958     $open_elements->[-1]->[0]->append_child ($comment);
2959     !!!next-token;
2960     redo B;
2961     } elsif ($token->{type} eq 'start tag') {
2962     if ($token->{tag_name} eq 'title') {
2963     ## NOTE: There is an "as if in head" code clone
2964     my $title_el;
2965     !!!create-element ($title_el, 'title', $token->{attributes});
2966     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
2967     ->append_child ($title_el);
2968     $self->{content_model_flag} = 'RCDATA';
2969    
2970     my $text = '';
2971     !!!next-token;
2972     while ($token->{type} eq 'character') {
2973     $text .= $token->{data};
2974     !!!next-token;
2975     }
2976     if (length $text) {
2977     $title_el->manakai_append_text ($text);
2978     }
2979    
2980     $self->{content_model_flag} = 'PCDATA';
2981    
2982     if ($token->{type} eq 'end tag' and
2983     $token->{tag_name} eq 'title') {
2984     ## Ignore the token
2985     } else {
2986     !!!parse-error;
2987     ## ISSUE: And ignore?
2988     }
2989     !!!next-token;
2990     redo B;
2991     } elsif ($token->{tag_name} eq 'style') {
2992     $style_start_tag->();
2993     redo B;
2994     } elsif ($token->{tag_name} eq 'script') {
2995     $script_start_tag->();
2996     redo B;
2997     } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
2998     ## NOTE: There are "as if in head" code clones
2999     my $el;
3000     !!!create-element ($el, $token->{tag_name}, $token->{attributes});
3001     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
3002     ->append_child ($el);
3003    
3004     !!!next-token;
3005     redo B;
3006     } elsif ($token->{tag_name} eq 'head') {
3007     !!!parse-error;
3008     ## Ignore the token
3009     !!!next-token;
3010     redo B;
3011     } else {
3012     #
3013     }
3014     } elsif ($token->{type} eq 'end tag') {
3015     if ($token->{tag_name} eq 'head') {
3016     if ($open_elements->[-1]->[1] eq 'head') {
3017     pop @$open_elements;
3018     } else {
3019     !!!parse-error;
3020     }
3021     $insertion_mode = 'after head';
3022     !!!next-token;
3023     redo B;
3024     } elsif ($token->{tag_name} eq 'html') {
3025     #
3026     } else {
3027     !!!parse-error;
3028     ## Ignore the token
3029     !!!next-token;
3030     redo B;
3031     }
3032     } else {
3033     #
3034     }
3035    
3036     if ($open_elements->[-1]->[1] eq 'head') {
3037     ## As if </head>
3038     pop @$open_elements;
3039     }
3040     $insertion_mode = 'after head';
3041     ## reprocess
3042     redo B;
3043    
3044     ## ISSUE: An issue in the spec.
3045     } elsif ($insertion_mode eq 'after head') {
3046     if ($token->{type} eq 'character') {
3047     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3048     $open_elements->[-1]->[0]->manakai_append_text ($1);
3049     unless (length $token->{data}) {
3050     !!!next-token;
3051     redo B;
3052     }
3053     }
3054    
3055     #
3056     } elsif ($token->{type} eq 'comment') {
3057     my $comment = $self->{document}->create_comment ($token->{data});
3058     $open_elements->[-1]->[0]->append_child ($comment);
3059     !!!next-token;
3060     redo B;
3061     } elsif ($token->{type} eq 'start tag') {
3062     if ($token->{tag_name} eq 'body') {
3063     !!!insert-element ('body', $token->{attributes});
3064     $insertion_mode = 'in body';
3065     !!!next-token;
3066     redo B;
3067     } elsif ($token->{tag_name} eq 'frameset') {
3068     !!!insert-element ('frameset', $token->{attributes});
3069     $insertion_mode = 'in frameset';
3070     !!!next-token;
3071     redo B;
3072     } elsif ({
3073     base => 1, link => 1, meta => 1,
3074     script=> 1, style => 1, title => 1,
3075     }->{$token->{tag_name}}) {
3076     !!!parse-error;
3077     $insertion_mode = 'in head';
3078     ## reprocess
3079     redo B;
3080     } else {
3081     #
3082     }
3083     } else {
3084     #
3085     }
3086    
3087     ## As if <body>
3088     !!!insert-element ('body');
3089     $insertion_mode = 'in body';
3090     ## reprocess
3091     redo B;
3092     } elsif ($insertion_mode eq 'in body') {
3093     if ($token->{type} eq 'character') {
3094     ## NOTE: There is a code clone of "character in body".
3095     $reconstruct_active_formatting_elements->($insert_to_current);
3096    
3097     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3098    
3099     !!!next-token;
3100     redo B;
3101     } elsif ($token->{type} eq 'comment') {
3102     ## NOTE: There is a code clone of "comment in body".
3103     my $comment = $self->{document}->create_comment ($token->{data});
3104     $open_elements->[-1]->[0]->append_child ($comment);
3105     !!!next-token;
3106     redo B;
3107     } else {
3108     $in_body->($insert_to_current);
3109     redo B;
3110     }
3111     } elsif ($insertion_mode eq 'in table') {
3112     if ($token->{type} eq 'character') {
3113     ## NOTE: There are "character in table" code clones.
3114     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3115     $open_elements->[-1]->[0]->manakai_append_text ($1);
3116    
3117     unless (length $token->{data}) {
3118     !!!next-token;
3119     redo B;
3120     }
3121     }
3122    
3123     ## As if in body, but insert into foster parent element
3124     ## ISSUE: Spec says that "whenever a node would be inserted
3125     ## into the current node" while characters might not
3126     ## result in a new Text node.
3127     $reconstruct_active_formatting_elements->($insert_to_foster);
3128    
3129     if ({
3130     table => 1, tbody => 1, tfoot => 1,
3131     thead => 1, tr => 1,
3132     }->{$open_elements->[-1]->[1]}) {
3133     # MUST
3134     my $foster_parent_element;
3135     my $next_sibling;
3136     my $prev_sibling;
3137     OE: for (reverse 0..$#$open_elements) {
3138     if ($open_elements->[$_]->[1] eq 'table') {
3139     my $parent = $open_elements->[$_]->[0]->parent_node;
3140     if (defined $parent and $parent->node_type == 1) {
3141     $foster_parent_element = $parent;
3142     $next_sibling = $open_elements->[$_]->[0];
3143     $prev_sibling = $next_sibling->previous_sibling;
3144     } else {
3145     $foster_parent_element = $open_elements->[$_ - 1]->[0];
3146     $prev_sibling = $foster_parent_element->last_child;
3147     }
3148     last OE;
3149     }
3150     } # OE
3151     $foster_parent_element = $open_elements->[0]->[0] and
3152     $prev_sibling = $foster_parent_element->last_child
3153     unless defined $foster_parent_element;
3154     if (defined $prev_sibling and
3155     $prev_sibling->node_type == 3) {
3156     $prev_sibling->manakai_append_text ($token->{data});
3157     } else {
3158     $foster_parent_element->insert_before
3159     ($self->{document}->create_text_node ($token->{data}),
3160     $next_sibling);
3161     }
3162     } else {
3163     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3164     }
3165    
3166     !!!next-token;
3167     redo B;
3168     } elsif ($token->{type} eq 'comment') {
3169     my $comment = $self->{document}->create_comment ($token->{data});
3170     $open_elements->[-1]->[0]->append_child ($comment);
3171     !!!next-token;
3172     redo B;
3173     } elsif ($token->{type} eq 'start tag') {
3174     if ({
3175     caption => 1,
3176     colgroup => 1,
3177     tbody => 1, tfoot => 1, thead => 1,
3178     }->{$token->{tag_name}}) {
3179     ## Clear back to table context
3180     while ($open_elements->[-1]->[1] ne 'table' and
3181     $open_elements->[-1]->[1] ne 'html') {
3182     !!!parse-error;
3183     pop @$open_elements;
3184     }
3185    
3186     push @$active_formatting_elements, ['#marker', '']
3187     if $token->{tag_name} eq 'caption';
3188    
3189     !!!insert-element ($token->{tag_name}, $token->{attributes});
3190     $insertion_mode = {
3191     caption => 'in caption',
3192     colgroup => 'in column group',
3193     tbody => 'in table body',
3194     tfoot => 'in table body',
3195     thead => 'in table body',
3196     }->{$token->{tag_name}};
3197     !!!next-token;
3198     redo B;
3199     } elsif ({
3200     col => 1,
3201     td => 1, th => 1, tr => 1,
3202     }->{$token->{tag_name}}) {
3203     ## Clear back to table context
3204     while ($open_elements->[-1]->[1] ne 'table' and
3205     $open_elements->[-1]->[1] ne 'html') {
3206     !!!parse-error;
3207     pop @$open_elements;
3208     }
3209    
3210     !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
3211     $insertion_mode = $token->{tag_name} eq 'col'
3212     ? 'in column group' : 'in table body';
3213     ## reprocess
3214     redo B;
3215     } elsif ($token->{tag_name} eq 'table') {
3216     ## NOTE: There are code clones for this "table in table"
3217     !!!parse-error;
3218    
3219     ## As if </table>
3220     ## have a table element in table scope
3221     my $i;
3222     INSCOPE: for (reverse 0..$#$open_elements) {
3223     my $node = $open_elements->[$_];
3224     if ($node->[1] eq 'table') {
3225     $i = $_;
3226     last INSCOPE;
3227     } elsif ({
3228     table => 1, html => 1,
3229     }->{$node->[1]}) {
3230     last INSCOPE;
3231     }
3232     } # INSCOPE
3233     unless (defined $i) {
3234     !!!parse-error;
3235     ## Ignore tokens </table><table>
3236     !!!next-token;
3237     redo B;
3238     }
3239    
3240     ## generate implied end tags
3241     if ({
3242     dd => 1, dt => 1, li => 1, p => 1,
3243     td => 1, th => 1, tr => 1,
3244     }->{$open_elements->[-1]->[1]}) {
3245     !!!back-token; # <table>
3246     $token = {type => 'end tag', tag_name => 'table'};
3247     !!!back-token;
3248     $token = {type => 'end tag',
3249     tag_name => $open_elements->[-1]->[1]}; # MUST
3250     redo B;
3251     }
3252    
3253     if ($open_elements->[-1]->[1] ne 'table') {
3254     !!!parse-error;
3255     }
3256    
3257     splice @$open_elements, $i;
3258    
3259     $reset_insertion_mode->();
3260    
3261     ## reprocess
3262     redo B;
3263     } else {
3264     #
3265     }
3266     } elsif ($token->{type} eq 'end tag') {
3267     if ($token->{tag_name} eq 'table') {
3268     ## have a table element in table scope
3269     my $i;
3270     INSCOPE: for (reverse 0..$#$open_elements) {
3271     my $node = $open_elements->[$_];
3272     if ($node->[1] eq $token->{tag_name}) {
3273     $i = $_;
3274     last INSCOPE;
3275     } elsif ({
3276     table => 1, html => 1,
3277     }->{$node->[1]}) {
3278     last INSCOPE;
3279     }
3280     } # INSCOPE
3281     unless (defined $i) {
3282     !!!parse-error;
3283     ## Ignore the token
3284     !!!next-token;
3285     redo B;
3286     }
3287    
3288     ## generate implied end tags
3289     if ({
3290     dd => 1, dt => 1, li => 1, p => 1,
3291     td => 1, th => 1, tr => 1,
3292     }->{$open_elements->[-1]->[1]}) {
3293     !!!back-token;
3294     $token = {type => 'end tag',
3295     tag_name => $open_elements->[-1]->[1]}; # MUST
3296     redo B;
3297     }
3298    
3299     if ($open_elements->[-1]->[1] ne 'table') {
3300     !!!parse-error;
3301     }
3302    
3303     splice @$open_elements, $i;
3304    
3305     $reset_insertion_mode->();
3306    
3307     !!!next-token;
3308     redo B;
3309     } elsif ({
3310     body => 1, caption => 1, col => 1, colgroup => 1,
3311     html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
3312     thead => 1, tr => 1,
3313     }->{$token->{tag_name}}) {
3314     !!!parse-error;
3315     ## Ignore the token
3316     !!!next-token;
3317     redo B;
3318     } else {
3319     #
3320     }
3321     } else {
3322     #
3323     }
3324    
3325     !!!parse-error;
3326     $in_body->($insert_to_foster);
3327     redo B;
3328     } elsif ($insertion_mode eq 'in caption') {
3329     if ($token->{type} eq 'character') {
3330     ## NOTE: This is a code clone of "character in body".
3331     $reconstruct_active_formatting_elements->($insert_to_current);
3332    
3333     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3334    
3335     !!!next-token;
3336     redo B;
3337     } elsif ($token->{type} eq 'comment') {
3338     ## NOTE: This is a code clone of "comment in body".
3339     my $comment = $self->{document}->create_comment ($token->{data});
3340     $open_elements->[-1]->[0]->append_child ($comment);
3341     !!!next-token;
3342     redo B;
3343     } elsif ($token->{type} eq 'start tag') {
3344     if ({
3345     caption => 1, col => 1, colgroup => 1, tbody => 1,
3346     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3347     }->{$token->{tag_name}}) {
3348     !!!parse-error;
3349    
3350     ## As if </caption>
3351     ## have a caption element in table scope
3352     my $i;
3353     INSCOPE: for (reverse 0..$#$open_elements) {
3354     my $node = $open_elements->[$_];
3355     if ($node->[1] eq 'caption') {
3356     $i = $_;
3357     last INSCOPE;
3358     } elsif ({
3359     table => 1, html => 1,
3360     }->{$node->[1]}) {
3361     last INSCOPE;
3362     }
3363     } # INSCOPE
3364     unless (defined $i) {
3365     !!!parse-error;
3366     ## Ignore the token
3367     !!!next-token;
3368     redo B;
3369     }
3370    
3371     ## generate implied end tags
3372     if ({
3373     dd => 1, dt => 1, li => 1, p => 1,
3374     td => 1, th => 1, tr => 1,
3375     }->{$open_elements->[-1]->[1]}) {
3376     !!!back-token; # <?>
3377     $token = {type => 'end tag', tag_name => 'caption'};
3378     !!!back-token;
3379     $token = {type => 'end tag',
3380     tag_name => $open_elements->[-1]->[1]}; # MUST
3381     redo B;
3382     }
3383    
3384     if ($open_elements->[-1]->[1] ne 'caption') {
3385     !!!parse-error;
3386     }
3387    
3388     splice @$open_elements, $i;
3389    
3390     $clear_up_to_marker->();
3391    
3392     $insertion_mode = 'in table';
3393    
3394     ## reprocess
3395     redo B;
3396     } else {
3397     #
3398     }
3399     } elsif ($token->{type} eq 'end tag') {
3400     if ($token->{tag_name} eq 'caption') {
3401     ## have a caption element in table scope
3402     my $i;
3403     INSCOPE: for (reverse 0..$#$open_elements) {
3404     my $node = $open_elements->[$_];
3405     if ($node->[1] eq $token->{tag_name}) {
3406     $i = $_;
3407     last INSCOPE;
3408     } elsif ({
3409     table => 1, html => 1,
3410     }->{$node->[1]}) {
3411     last INSCOPE;
3412     }
3413     } # INSCOPE
3414     unless (defined $i) {
3415     !!!parse-error;
3416     ## Ignore the token
3417     !!!next-token;
3418     redo B;
3419     }
3420    
3421     ## generate implied end tags
3422     if ({
3423     dd => 1, dt => 1, li => 1, p => 1,
3424     td => 1, th => 1, tr => 1,
3425     }->{$open_elements->[-1]->[1]}) {
3426     !!!back-token;
3427     $token = {type => 'end tag',
3428     tag_name => $open_elements->[-1]->[1]}; # MUST
3429     redo B;
3430     }
3431    
3432     if ($open_elements->[-1]->[1] ne 'caption') {
3433     !!!parse-error;
3434     }
3435    
3436     splice @$open_elements, $i;
3437    
3438     $clear_up_to_marker->();
3439    
3440     $insertion_mode = 'in table';
3441    
3442     !!!next-token;
3443     redo B;
3444     } elsif ($token->{tag_name} eq 'table') {
3445     !!!parse-error;
3446    
3447     ## As if </caption>
3448     ## have a caption element in table scope
3449     my $i;
3450     INSCOPE: for (reverse 0..$#$open_elements) {
3451     my $node = $open_elements->[$_];
3452     if ($node->[1] eq 'caption') {
3453     $i = $_;
3454     last INSCOPE;
3455     } elsif ({
3456     table => 1, html => 1,
3457     }->{$node->[1]}) {
3458     last INSCOPE;
3459     }
3460     } # INSCOPE
3461     unless (defined $i) {
3462     !!!parse-error;
3463     ## Ignore the token
3464     !!!next-token;
3465     redo B;
3466     }
3467    
3468     ## generate implied end tags
3469     if ({
3470     dd => 1, dt => 1, li => 1, p => 1,
3471     td => 1, th => 1, tr => 1,
3472     }->{$open_elements->[-1]->[1]}) {
3473     !!!back-token; # </table>
3474     $token = {type => 'end tag', tag_name => 'caption'};
3475     !!!back-token;
3476     $token = {type => 'end tag',
3477     tag_name => $open_elements->[-1]->[1]}; # MUST
3478     redo B;
3479     }
3480    
3481     if ($open_elements->[-1]->[1] ne 'caption') {
3482     !!!parse-error;
3483     }
3484    
3485     splice @$open_elements, $i;
3486    
3487     $clear_up_to_marker->();
3488    
3489     $insertion_mode = 'in table';
3490    
3491     ## reprocess
3492     redo B;
3493     } elsif ({
3494     body => 1, col => 1, colgroup => 1,
3495     html => 1, tbody => 1, td => 1, tfoot => 1,
3496     th => 1, thead => 1, tr => 1,
3497     }->{$token->{tag_name}}) {
3498     !!!parse-error;
3499     ## Ignore the token (it must be consumed before |redo B|, or the same end tag is reprocessed forever)
3500     !!!next-token; redo B;
3501     } else {
3502     #
3503     }
3504     } else {
3505     #
3506     }
3507    
3508     $in_body->($insert_to_current);
3509     redo B;
3510     } elsif ($insertion_mode eq 'in column group') {
3511     if ($token->{type} eq 'character') {
3512     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3513     $open_elements->[-1]->[0]->manakai_append_text ($1);
3514     unless (length $token->{data}) {
3515     !!!next-token;
3516     redo B;
3517     }
3518     }
3519    
3520     #
3521     } elsif ($token->{type} eq 'comment') {
3522     my $comment = $self->{document}->create_comment ($token->{data});
3523     $open_elements->[-1]->[0]->append_child ($comment);
3524     !!!next-token;
3525     redo B;
3526     } elsif ($token->{type} eq 'start tag') {
3527     if ($token->{tag_name} eq 'col') {
3528     !!!insert-element ($token->{tag_name}, $token->{attributes});
3529     pop @$open_elements;
3530     !!!next-token;
3531     redo B;
3532     } else {
3533     #
3534     }
3535     } elsif ($token->{type} eq 'end tag') {
3536     if ($token->{tag_name} eq 'colgroup') {
3537     if ($open_elements->[-1]->[1] eq 'html') {
3538     !!!parse-error;
3539     ## Ignore the token
3540     !!!next-token;
3541     redo B;
3542     } else {
3543     pop @$open_elements; # colgroup
3544     $insertion_mode = 'in table';
3545     !!!next-token;
3546     redo B;
3547     }
3548     } elsif ($token->{tag_name} eq 'col') {
3549     !!!parse-error;
3550     ## Ignore the token
3551     !!!next-token;
3552     redo B;
3553     } else {
3554     #
3555     }
3556     } else {
3557     #
3558     }
3559    
3560     ## As if </colgroup>
3561     if ($open_elements->[-1]->[1] eq 'html') {
3562     !!!parse-error;
3563     ## Ignore the token
3564     !!!next-token;
3565     redo B;
3566     } else {
3567     pop @$open_elements; # colgroup
3568     $insertion_mode = 'in table';
3569     ## reprocess
3570     redo B;
3571     }
3572     } elsif ($insertion_mode eq 'in table body') {
3573     if ($token->{type} eq 'character') {
3574     ## NOTE: This is a "character in table" code clone.
3575     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3576     $open_elements->[-1]->[0]->manakai_append_text ($1);
3577    
3578     unless (length $token->{data}) {
3579     !!!next-token;
3580     redo B;
3581     }
3582     }
3583    
3584     ## As if in body, but insert into foster parent element
3585     ## ISSUE: Spec says that "whenever a node would be inserted
3586     ## into the current node" while characters might not
3587     ## result in a new Text node.
3588     $reconstruct_active_formatting_elements->($insert_to_foster);
3589    
3590     if ({
3591     table => 1, tbody => 1, tfoot => 1,
3592     thead => 1, tr => 1,
3593     }->{$open_elements->[-1]->[1]}) {
3594     # MUST
3595     my $foster_parent_element;
3596     my $next_sibling;
3597     my $prev_sibling;
3598     OE: for (reverse 0..$#$open_elements) {
3599     if ($open_elements->[$_]->[1] eq 'table') {
3600     my $parent = $open_elements->[$_]->[0]->parent_node;
3601     if (defined $parent and $parent->node_type == 1) {
3602     $foster_parent_element = $parent;
3603     $next_sibling = $open_elements->[$_]->[0];
3604     $prev_sibling = $next_sibling->previous_sibling;
3605     } else {
3606     $foster_parent_element = $open_elements->[$_ - 1]->[0];
3607     $prev_sibling = $foster_parent_element->last_child;
3608     }
3609     last OE;
3610     }
3611     } # OE
3612     $foster_parent_element = $open_elements->[0]->[0] and
3613     $prev_sibling = $foster_parent_element->last_child
3614     unless defined $foster_parent_element;
3615     if (defined $prev_sibling and
3616     $prev_sibling->node_type == 3) {
3617     $prev_sibling->manakai_append_text ($token->{data});
3618     } else {
3619     $foster_parent_element->insert_before
3620     ($self->{document}->create_text_node ($token->{data}),
3621     $next_sibling);
3622     }
3623     } else {
3624     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3625     }
3626    
3627     !!!next-token;
3628     redo B;
3629     } elsif ($token->{type} eq 'comment') {
3630     ## Copied from 'in table'
3631     my $comment = $self->{document}->create_comment ($token->{data});
3632     $open_elements->[-1]->[0]->append_child ($comment);
3633     !!!next-token;
3634     redo B;
3635     } elsif ($token->{type} eq 'start tag') {
3636     if ({
3637     tr => 1,
3638     th => 1, td => 1,
3639     }->{$token->{tag_name}}) {
3640     ## Clear back to table body context
3641     while (not {
3642     tbody => 1, tfoot => 1, thead => 1, html => 1,
3643     }->{$open_elements->[-1]->[1]}) {
3644     !!!parse-error;
3645     pop @$open_elements;
3646     }
3647    
3648     $insertion_mode = 'in row';
3649     if ($token->{tag_name} eq 'tr') {
3650     !!!insert-element ($token->{tag_name}, $token->{attributes});
3651     !!!next-token;
3652     } else {
3653     !!!insert-element ('tr');
3654     ## reprocess
3655     }
3656     redo B;
3657     } elsif ({
3658     caption => 1, col => 1, colgroup => 1,
3659     tbody => 1, tfoot => 1, thead => 1,
3660     }->{$token->{tag_name}}) {
3661     ## have an element in table scope
3662     my $i;
3663     INSCOPE: for (reverse 0..$#$open_elements) {
3664     my $node = $open_elements->[$_];
3665     if ({
3666     tbody => 1, thead => 1, tfoot => 1,
3667     }->{$node->[1]}) {
3668     $i = $_;
3669     last INSCOPE;
3670     } elsif ({
3671     table => 1, html => 1,
3672     }->{$node->[1]}) {
3673     last INSCOPE;
3674     }
3675     } # INSCOPE
3676     unless (defined $i) {
3677     !!!parse-error;
3678     ## Ignore the token
3679     !!!next-token;
3680     redo B;
3681     }
3682    
3683     ## Clear back to table body context
3684     while (not {
3685     tbody => 1, tfoot => 1, thead => 1, html => 1,
3686     }->{$open_elements->[-1]->[1]}) {
3687     !!!parse-error;
3688     pop @$open_elements;
3689     }
3690    
3691     ## As if </{current node}>
3692     ## have an element in table scope
3693     ## true by definition
3694    
3695     ## Clear back to table body context
3696     ## nop by definition
3697    
3698     pop @$open_elements;
3699     $insertion_mode = 'in table';
3700     ## reprocess
3701     redo B;
3702     } elsif ($token->{tag_name} eq 'table') {
3703     ## NOTE: This is a code clone of "table in table"
3704     !!!parse-error;
3705    
3706     ## As if </table>
3707     ## have a table element in table scope
3708     my $i;
3709     INSCOPE: for (reverse 0..$#$open_elements) {
3710     my $node = $open_elements->[$_];
3711     if ($node->[1] eq 'table') {
3712     $i = $_;
3713     last INSCOPE;
3714     } elsif ({
3715     table => 1, html => 1,
3716     }->{$node->[1]}) {
3717     last INSCOPE;
3718     }
3719     } # INSCOPE
3720     unless (defined $i) {
3721     !!!parse-error;
3722     ## Ignore tokens </table><table>
3723     !!!next-token;
3724     redo B;
3725     }
3726    
3727     ## generate implied end tags
3728     if ({
3729     dd => 1, dt => 1, li => 1, p => 1,
3730     td => 1, th => 1, tr => 1,
3731     }->{$open_elements->[-1]->[1]}) {
3732     !!!back-token; # <table>
3733     $token = {type => 'end tag', tag_name => 'table'};
3734     !!!back-token;
3735     $token = {type => 'end tag',
3736     tag_name => $open_elements->[-1]->[1]}; # MUST
3737     redo B;
3738     }
3739    
3740     if ($open_elements->[-1]->[1] ne 'table') {
3741     !!!parse-error;
3742     }
3743    
3744     splice @$open_elements, $i;
3745    
3746     $reset_insertion_mode->();
3747    
3748     ## reprocess
3749     redo B;
3750     } else {
3751     #
3752     }
3753     } elsif ($token->{type} eq 'end tag') {
3754     if ({
3755     tbody => 1, tfoot => 1, thead => 1,
3756     }->{$token->{tag_name}}) {
3757     ## have an element in table scope
3758     my $i;
3759     INSCOPE: for (reverse 0..$#$open_elements) {
3760     my $node = $open_elements->[$_];
3761     if ($node->[1] eq $token->{tag_name}) {
3762     $i = $_;
3763     last INSCOPE;
3764     } elsif ({
3765     table => 1, html => 1,
3766     }->{$node->[1]}) {
3767     last INSCOPE;
3768     }
3769     } # INSCOPE
3770     unless (defined $i) {
3771     !!!parse-error;
3772     ## Ignore the token
3773     !!!next-token;
3774     redo B;
3775     }
3776    
3777     ## Clear back to table body context
3778     while (not {
3779     tbody => 1, tfoot => 1, thead => 1, html => 1,
3780     }->{$open_elements->[-1]->[1]}) {
3781     !!!parse-error;
3782     pop @$open_elements;
3783     }
3784    
3785     pop @$open_elements;
3786     $insertion_mode = 'in table';
3787     !!!next-token;
3788     redo B;
3789     } elsif ($token->{tag_name} eq 'table') {
3790     ## have an element in table scope
3791     my $i;
3792     INSCOPE: for (reverse 0..$#$open_elements) {
3793     my $node = $open_elements->[$_];
3794     if ({
3795     tbody => 1, thead => 1, tfoot => 1,
3796     }->{$node->[1]}) {
3797     $i = $_;
3798     last INSCOPE;
3799     } elsif ({
3800     table => 1, html => 1,
3801     }->{$node->[1]}) {
3802     last INSCOPE;
3803     }
3804     } # INSCOPE
3805     unless (defined $i) {
3806     !!!parse-error;
3807     ## Ignore the token
3808     !!!next-token;
3809     redo B;
3810     }
3811    
3812     ## Clear back to table body context
3813     while (not {
3814     tbody => 1, tfoot => 1, thead => 1, html => 1,
3815     }->{$open_elements->[-1]->[1]}) {
3816     !!!parse-error;
3817     pop @$open_elements;
3818     }
3819    
3820     ## As if <{current node}>
3821     ## have an element in table scope
3822     ## true by definition
3823    
3824     ## Clear back to table body context
3825     ## nop by definition
3826    
3827     pop @$open_elements;
3828     $insertion_mode = 'in table';
3829     ## reprocess
3830     redo B;
3831     } elsif ({
3832     body => 1, caption => 1, col => 1, colgroup => 1,
3833     html => 1, td => 1, th => 1, tr => 1,
3834     }->{$token->{tag_name}}) {
3835     !!!parse-error;
3836     ## Ignore the token
3837     !!!next-token;
3838     redo B;
3839     } else {
3840     #
3841     }
3842     } else {
3843     #
3844     }
3845    
3846     ## As if in table
3847     !!!parse-error;
3848     $in_body->($insert_to_foster);
3849     redo B;
3850     } elsif ($insertion_mode eq 'in row') {
3851     if ($token->{type} eq 'character') {
3852     ## NOTE: This is a "character in table" code clone.
3853     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3854     $open_elements->[-1]->[0]->manakai_append_text ($1);
3855    
3856     unless (length $token->{data}) {
3857     !!!next-token;
3858     redo B;
3859     }
3860     }
3861    
3862     ## As if in body, but insert into foster parent element
3863     ## ISSUE: Spec says that "whenever a node would be inserted
3864     ## into the current node" while characters might not be
3865     ## result in a new Text node.
3866     $reconstruct_active_formatting_elements->($insert_to_foster);
3867    
3868     if ({
3869     table => 1, tbody => 1, tfoot => 1,
3870     thead => 1, tr => 1,
3871     }->{$open_elements->[-1]->[1]}) {
3872     # MUST
3873     my $foster_parent_element;
3874     my $next_sibling;
3875     my $prev_sibling;
3876     OE: for (reverse 0..$#$open_elements) {
3877     if ($open_elements->[$_]->[1] eq 'table') {
3878     my $parent = $open_elements->[$_]->[0]->parent_node;
3879     if (defined $parent and $parent->node_type == 1) {
3880     $foster_parent_element = $parent;
3881     $next_sibling = $open_elements->[$_]->[0];
3882     $prev_sibling = $next_sibling->previous_sibling;
3883     } else {
3884     $foster_parent_element = $open_elements->[$_ - 1]->[0];
3885     $prev_sibling = $foster_parent_element->last_child;
3886     }
3887     last OE;
3888     }
3889     } # OE
3890     $foster_parent_element = $open_elements->[0]->[0] and
3891     $prev_sibling = $foster_parent_element->last_child
3892     unless defined $foster_parent_element;
3893     if (defined $prev_sibling and
3894     $prev_sibling->node_type == 3) {
3895     $prev_sibling->manakai_append_text ($token->{data});
3896     } else {
3897     $foster_parent_element->insert_before
3898     ($self->{document}->create_text_node ($token->{data}),
3899     $next_sibling);
3900     }
3901     } else {
3902     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3903     }
3904    
3905     !!!next-token;
3906     redo B;
3907     } elsif ($token->{type} eq 'comment') {
3908     ## Copied from 'in table'
3909     my $comment = $self->{document}->create_comment ($token->{data});
3910     $open_elements->[-1]->[0]->append_child ($comment);
3911     !!!next-token;
3912     redo B;
3913     } elsif ($token->{type} eq 'start tag') {
3914     if ($token->{tag_name} eq 'th' or
3915     $token->{tag_name} eq 'td') {
3916     ## Clear back to table row context
3917     while (not {
3918     tr => 1, html => 1,
3919     }->{$open_elements->[-1]->[1]}) {
3920     !!!parse-error;
3921     pop @$open_elements;
3922     }
3923    
3924     !!!insert-element ($token->{tag_name}, $token->{attributes});
3925     $insertion_mode = 'in cell';
3926    
3927     push @$active_formatting_elements, ['#marker', ''];
3928    
3929     !!!next-token;
3930     redo B;
3931     } elsif ({
3932     caption => 1, col => 1, colgroup => 1,
3933     tbody => 1, tfoot => 1, thead => 1, tr => 1,
3934     }->{$token->{tag_name}}) {
3935     ## As if </tr>
3936     ## have an element in table scope
3937     my $i;
3938     INSCOPE: for (reverse 0..$#$open_elements) {
3939     my $node = $open_elements->[$_];
3940     if ($node->[1] eq 'tr') {
3941     $i = $_;
3942     last INSCOPE;
3943     } elsif ({
3944     table => 1, html => 1,
3945     }->{$node->[1]}) {
3946     last INSCOPE;
3947     }
3948     } # INSCOPE
3949     unless (defined $i) {
3950     !!!parse-error;
3951     ## Ignore the token
3952     !!!next-token;
3953     redo B;
3954     }
3955    
3956     ## Clear back to table row context
3957     while (not {
3958     tr => 1, html => 1,
3959     }->{$open_elements->[-1]->[1]}) {
3960     !!!parse-error;
3961     pop @$open_elements;
3962     }
3963    
3964     pop @$open_elements; # tr
3965     $insertion_mode = 'in table body';
3966     ## reprocess
3967     redo B;
3968     } elsif ($token->{tag_name} eq 'table') {
3969     ## NOTE: This is a code clone of "table in table"
3970     !!!parse-error;
3971    
3972     ## As if </table>
3973     ## have a table element in table scope
3974     my $i;
3975     INSCOPE: for (reverse 0..$#$open_elements) {
3976     my $node = $open_elements->[$_];
3977     if ($node->[1] eq 'table') {
3978     $i = $_;
3979     last INSCOPE;
3980     } elsif ({
3981     table => 1, html => 1,
3982     }->{$node->[1]}) {
3983     last INSCOPE;
3984     }
3985     } # INSCOPE
3986     unless (defined $i) {
3987     !!!parse-error;
3988     ## Ignore tokens </table><table>
3989     !!!next-token;
3990     redo B;
3991     }
3992    
3993     ## generate implied end tags
3994     if ({
3995     dd => 1, dt => 1, li => 1, p => 1,
3996     td => 1, th => 1, tr => 1,
3997     }->{$open_elements->[-1]->[1]}) {
3998     !!!back-token; # <table>
3999     $token = {type => 'end tag', tag_name => 'table'};
4000     !!!back-token;
4001     $token = {type => 'end tag',
4002     tag_name => $open_elements->[-1]->[1]}; # MUST
4003     redo B;
4004     }
4005    
4006     if ($open_elements->[-1]->[1] ne 'table') {
4007     !!!parse-error;
4008     }
4009    
4010     splice @$open_elements, $i;
4011    
4012     $reset_insertion_mode->();
4013    
4014     ## reprocess
4015     redo B;
4016     } else {
4017     #
4018     }
4019     } elsif ($token->{type} eq 'end tag') {
4020     if ($token->{tag_name} eq 'tr') {
4021     ## have an element in table scope
4022     my $i;
4023     INSCOPE: for (reverse 0..$#$open_elements) {
4024     my $node = $open_elements->[$_];
4025     if ($node->[1] eq $token->{tag_name}) {
4026     $i = $_;
4027     last INSCOPE;
4028     } elsif ({
4029     table => 1, html => 1,
4030     }->{$node->[1]}) {
4031     last INSCOPE;
4032     }
4033     } # INSCOPE
4034     unless (defined $i) {
4035     !!!parse-error;
4036     ## Ignore the token
4037     !!!next-token;
4038     redo B;
4039     }
4040    
4041     ## Clear back to table row context
4042     while (not {
4043     tr => 1, html => 1,
4044     }->{$open_elements->[-1]->[1]}) {
4045     !!!parse-error;
4046     pop @$open_elements;
4047     }
4048    
4049     pop @$open_elements; # tr
4050     $insertion_mode = 'in table body';
4051     !!!next-token;
4052     redo B;
4053     } elsif ($token->{tag_name} eq 'table') {
4054     ## As if </tr>
4055     ## have an element in table scope
4056     my $i;
4057     INSCOPE: for (reverse 0..$#$open_elements) {
4058     my $node = $open_elements->[$_];
4059     if ($node->[1] eq 'tr') {
4060     $i = $_;
4061     last INSCOPE;
4062     } elsif ({
4063     table => 1, html => 1,
4064     }->{$node->[1]}) {
4065     last INSCOPE;
4066     }
4067     } # INSCOPE
4068     unless (defined $i) {
4069     !!!parse-error;
4070     ## Ignore the token
4071     !!!next-token;
4072     redo B;
4073     }
4074    
4075     ## Clear back to table row context
4076     while (not {
4077     tr => 1, html => 1,
4078     }->{$open_elements->[-1]->[1]}) {
4079     !!!parse-error;
4080     pop @$open_elements;
4081     }
4082    
4083     pop @$open_elements; # tr
4084     $insertion_mode = 'in table body';
4085     ## reprocess
4086     redo B;
4087     } elsif ({
4088     tbody => 1, tfoot => 1, thead => 1,
4089     }->{$token->{tag_name}}) {
4090     ## have an element in table scope
4091     my $i;
4092     INSCOPE: for (reverse 0..$#$open_elements) {
4093     my $node = $open_elements->[$_];
4094     if ($node->[1] eq $token->{tag_name}) {
4095     $i = $_;
4096     last INSCOPE;
4097     } elsif ({
4098     table => 1, html => 1,
4099     }->{$node->[1]}) {
4100     last INSCOPE;
4101     }
4102     } # INSCOPE
4103     unless (defined $i) {
4104     !!!parse-error;
4105     ## Ignore the token
4106     !!!next-token;
4107     redo B;
4108     }
4109    
4110     ## As if </tr>
4111     ## have an element in table scope
4112     my $i;
4113     INSCOPE: for (reverse 0..$#$open_elements) {
4114     my $node = $open_elements->[$_];
4115     if ($node->[1] eq 'tr') {
4116     $i = $_;
4117     last INSCOPE;
4118     } elsif ({
4119     table => 1, html => 1,
4120     }->{$node->[1]}) {
4121     last INSCOPE;
4122     }
4123     } # INSCOPE
4124     unless (defined $i) {
4125     !!!parse-error;
4126     ## Ignore the token
4127     !!!next-token;
4128     redo B;
4129     }
4130    
4131     ## Clear back to table row context
4132     while (not {
4133     tr => 1, html => 1,
4134     }->{$open_elements->[-1]->[1]}) {
4135     !!!parse-error;
4136     pop @$open_elements;
4137     }
4138    
4139     pop @$open_elements; # tr
4140     $insertion_mode = 'in table body';
4141     ## reprocess
4142     redo B;
4143     } elsif ({
4144     body => 1, caption => 1, col => 1,
4145     colgroup => 1, html => 1, td => 1, th => 1,
4146     }->{$token->{tag_name}}) {
4147     !!!parse-error;
4148     ## Ignore the token
4149     !!!next-token;
4150     redo B;
4151     } else {
4152     #
4153     }
4154     } else {
4155     #
4156     }
4157    
4158     ## As if in table
4159     !!!parse-error;
4160     $in_body->($insert_to_foster);
4161     redo B;
4162     } elsif ($insertion_mode eq 'in cell') {
4163     if ($token->{type} eq 'character') {
4164     ## NOTE: This is a code clone of "character in body".
4165     $reconstruct_active_formatting_elements->($insert_to_current);
4166    
4167     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4168    
4169     !!!next-token;
4170     redo B;
4171     } elsif ($token->{type} eq 'comment') {
4172     ## NOTE: This is a code clone of "comment in body".
4173     my $comment = $self->{document}->create_comment ($token->{data});
4174     $open_elements->[-1]->[0]->append_child ($comment);
4175     !!!next-token;
4176     redo B;
4177     } elsif ($token->{type} eq 'start tag') {
4178     if ({
4179     caption => 1, col => 1, colgroup => 1,
4180     tbody => 1, td => 1, tfoot => 1, th => 1,
4181     thead => 1, tr => 1,
4182     }->{$token->{tag_name}}) {
4183     ## have an element in table scope
4184     my $tn;
4185     INSCOPE: for (reverse 0..$#$open_elements) {
4186     my $node = $open_elements->[$_];
4187     if ($node->[1] eq 'td' or $node->[1] eq 'th') {
4188     $tn = $node->[1];
4189     last INSCOPE;
4190     } elsif ({
4191     table => 1, html => 1,
4192     }->{$node->[1]}) {
4193     last INSCOPE;
4194     }
4195     } # INSCOPE
4196     unless (defined $tn) {
4197     !!!parse-error;
4198     ## Ignore the token
4199     !!!next-token;
4200     redo B;
4201     }
4202    
4203     ## Close the cell
4204     !!!back-token; # <?>
4205     $token = {type => 'end tag', tag_name => $tn};
4206     redo B;
4207     } else {
4208     #
4209     }
4210     } elsif ($token->{type} eq 'end tag') {
4211     if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4212     ## have an element in table scope
4213     my $i;
4214     INSCOPE: for (reverse 0..$#$open_elements) {
4215     my $node = $open_elements->[$_];
4216     if ($node->[1] eq $token->{tag_name}) {
4217     $i = $_;
4218     last INSCOPE;
4219     } elsif ({
4220     table => 1, html => 1,
4221     }->{$node->[1]}) {
4222     last INSCOPE;
4223     }
4224     } # INSCOPE
4225     unless (defined $i) {
4226     !!!parse-error;
4227     ## Ignore the token
4228     !!!next-token;
4229     redo B;
4230     }
4231    
4232     ## generate implied end tags
4233     if ({
4234     dd => 1, dt => 1, li => 1, p => 1,
4235     td => ($token->{tag_name} eq 'th'),
4236     th => ($token->{tag_name} eq 'td'),
4237     tr => 1,
4238     }->{$open_elements->[-1]->[1]}) {
4239     !!!back-token;
4240     $token = {type => 'end tag',
4241     tag_name => $open_elements->[-1]->[1]}; # MUST
4242     redo B;
4243     }
4244    
4245     if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
4246     !!!parse-error;
4247     }
4248    
4249     splice @$open_elements, $i;
4250    
4251     $clear_up_to_marker->();
4252    
4253     $insertion_mode = 'in row';
4254    
4255     !!!next-token;
4256     redo B;
4257     } elsif ({
4258     body => 1, caption => 1, col => 1,
4259     colgroup => 1, html => 1,
4260     }->{$token->{tag_name}}) {
4261     !!!parse-error;
4262     ## Ignore the token
4263     !!!next-token;
4264     redo B;
4265     } elsif ({
4266     table => 1, tbody => 1, tfoot => 1,
4267     thead => 1, tr => 1,
4268     }->{$token->{tag_name}}) {
4269     ## have an element in table scope
4270     my $i;
4271     my $tn;
4272     INSCOPE: for (reverse 0..$#$open_elements) {
4273     my $node = $open_elements->[$_];
4274     if ($node->[1] eq $token->{tag_name}) {
4275     $i = $_;
4276     last INSCOPE;
4277     } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4278     $tn = $node->[1];
4279     ## NOTE: There is exactly one |td| or |th| element
4280     ## in scope in the stack of open elements by definition.
4281     } elsif ({
4282     table => 1, html => 1,
4283     }->{$node->[1]}) {
4284     last INSCOPE;
4285     }
4286     } # INSCOPE
4287     unless (defined $i) {
4288     !!!parse-error;
4289     ## Ignore the token
4290     !!!next-token;
4291     redo B;
4292     }
4293    
4294     ## Close the cell
4295     !!!back-token; # </?>
4296     $token = {type => 'end tag', tag_name => $tn};
4297     redo B;
4298     } else {
4299     #
4300     }
4301     } else {
4302     #
4303     }
4304    
4305     $in_body->($insert_to_current);
4306     redo B;
4307     } elsif ($insertion_mode eq 'in select') {
4308     if ($token->{type} eq 'character') {
4309     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4310     !!!next-token;
4311     redo B;
4312     } elsif ($token->{type} eq 'comment') {
4313     my $comment = $self->{document}->create_comment ($token->{data});
4314     $open_elements->[-1]->[0]->append_child ($comment);
4315     !!!next-token;
4316     redo B;
4317     } elsif ($token->{type} eq 'start tag') {
4318     if ($token->{tag_name} eq 'option') {
4319     if ($open_elements->[-1]->[1] eq 'option') {
4320     ## As if </option>
4321     pop @$open_elements;
4322     }
4323    
4324     !!!insert-element ($token->{tag_name}, $token->{attributes});
4325     !!!next-token;
4326     redo B;
4327     } elsif ($token->{tag_name} eq 'optgroup') {
4328     if ($open_elements->[-1]->[1] eq 'option') {
4329     ## As if </option>
4330     pop @$open_elements;
4331     }
4332    
4333     if ($open_elements->[-1]->[1] eq 'optgroup') {
4334     ## As if </optgroup>
4335     pop @$open_elements;
4336     }
4337    
4338     !!!insert-element ($token->{tag_name}, $token->{attributes});
4339     !!!next-token;
4340     redo B;
4341     } elsif ($token->{tag_name} eq 'select') {
4342     !!!parse-error;
4343     ## As if </select> instead
4344     ## have an element in table scope
4345     my $i;
4346     INSCOPE: for (reverse 0..$#$open_elements) {
4347     my $node = $open_elements->[$_];
4348     if ($node->[1] eq $token->{tag_name}) {
4349     $i = $_;
4350     last INSCOPE;
4351     } elsif ({
4352     table => 1, html => 1,
4353     }->{$node->[1]}) {
4354     last INSCOPE;
4355     }
4356     } # INSCOPE
4357     unless (defined $i) {
4358     !!!parse-error;
4359     ## Ignore the token
4360     !!!next-token;
4361     redo B;
4362     }
4363    
4364     splice @$open_elements, $i;
4365    
4366     $reset_insertion_mode->();
4367    
4368     !!!next-token;
4369     redo B;
4370     } else {
4371     #
4372     }
4373     } elsif ($token->{type} eq 'end tag') {
4374     if ($token->{tag_name} eq 'optgroup') {
4375     if ($open_elements->[-1]->[1] eq 'option' and
4376     $open_elements->[-2]->[1] eq 'optgroup') {
4377     ## As if </option>
4378     splice @$open_elements, -2;
4379     } elsif ($open_elements->[-1]->[1] eq 'optgroup') {
4380     pop @$open_elements;
4381     } else {
4382     !!!parse-error;
4383     ## Ignore the token
4384     }
4385     !!!next-token;
4386     redo B;
4387     } elsif ($token->{tag_name} eq 'option') {
4388     if ($open_elements->[-1]->[1] eq 'option') {
4389     pop @$open_elements;
4390     } else {
4391     !!!parse-error;
4392     ## Ignore the token
4393     }
4394     !!!next-token;
4395     redo B;
4396     } elsif ($token->{tag_name} eq 'select') {
4397     ## have an element in table scope
4398     my $i;
4399     INSCOPE: for (reverse 0..$#$open_elements) {
4400     my $node = $open_elements->[$_];
4401     if ($node->[1] eq $token->{tag_name}) {
4402     $i = $_;
4403     last INSCOPE;
4404     } elsif ({
4405     table => 1, html => 1,
4406     }->{$node->[1]}) {
4407     last INSCOPE;
4408     }
4409     } # INSCOPE
4410     unless (defined $i) {
4411     !!!parse-error;
4412     ## Ignore the token
4413     !!!next-token;
4414     redo B;
4415     }
4416    
4417     splice @$open_elements, $i;
4418    
4419     $reset_insertion_mode->();
4420    
4421     !!!next-token;
4422     redo B;
4423     } elsif ({
4424     caption => 1, table => 1, tbody => 1,
4425     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4426     }->{$token->{tag_name}}) {
4427     !!!parse-error;
4428    
4429     ## have an element in table scope
4430     my $i;
4431     INSCOPE: for (reverse 0..$#$open_elements) {
4432     my $node = $open_elements->[$_];
4433     if ($node->[1] eq $token->{tag_name}) {
4434     $i = $_;
4435     last INSCOPE;
4436     } elsif ({
4437     table => 1, html => 1,
4438     }->{$node->[1]}) {
4439     last INSCOPE;
4440     }
4441     } # INSCOPE
4442     unless (defined $i) {
4443     ## Ignore the token
4444     !!!next-token;
4445     redo B;
4446     }
4447    
4448     ## As if </select>
4449     ## have an element in table scope
4450     undef $i;
4451     INSCOPE: for (reverse 0..$#$open_elements) {
4452     my $node = $open_elements->[$_];
4453     if ($node->[1] eq 'select') {
4454     $i = $_;
4455     last INSCOPE;
4456     } elsif ({
4457     table => 1, html => 1,
4458     }->{$node->[1]}) {
4459     last INSCOPE;
4460     }
4461     } # INSCOPE
4462     unless (defined $i) {
4463     !!!parse-error;
4464     ## Ignore the </select> token
4465     !!!next-token; ## TODO: ok?
4466     redo B;
4467     }
4468    
4469     splice @$open_elements, $i;
4470    
4471     $reset_insertion_mode->();
4472    
4473     ## reprocess
4474     redo B;
4475     } else {
4476     #
4477     }
4478     } else {
4479     #
4480     }
4481    
4482     !!!parse-error;
4483     ## Ignore the token
4484     !!!next-token;
4485     redo B;
4486     } elsif ($insertion_mode eq 'after body') {
4487     if ($token->{type} eq 'character') {
4488     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4489     ## As if in body
4490     $reconstruct_active_formatting_elements->($insert_to_current);
4491    
4492     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4493    
4494     unless (length $token->{data}) {
4495     !!!next-token;
4496     redo B;
4497     }
4498     }
4499    
4500     #
4501     } elsif ($token->{type} eq 'comment') {
4502     my $comment = $self->{document}->create_comment ($token->{data});
4503     $open_elements->[0]->[0]->append_child ($comment);
4504     !!!next-token;
4505     redo B;
4506     } elsif ($token->{type} eq 'end tag') {
4507     if ($token->{tag_name} eq 'html') {
4508     ## TODO: if inner_html, parse-error, ignore the token; otherwise,
4509    
4510     $phase = 'trailing end';
4511     !!!next-token;
4512     redo B;
4513     } else {
4514     #
4515     }
4516     } else {
4517     #
4518     }
4519    
4520     !!!parse-error ('data after body');
4521     $insertion_mode = 'in body';
4522     ## reprocess
4523     redo B;
4524     } elsif ($insertion_mode eq 'in frameset') {
4525     if ($token->{type} eq 'character') {
4526     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4527     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4528    
4529     unless (length $token->{data}) {
4530     !!!next-token;
4531     redo B;
4532     }
4533     }
4534    
4535     #
4536     } elsif ($token->{type} eq 'comment') {
4537     my $comment = $self->{document}->create_comment ($token->{data});
4538     $open_elements->[-1]->[0]->append_child ($comment);
4539     !!!next-token;
4540     redo B;
4541     } elsif ($token->{type} eq 'start tag') {
4542     if ($token->{tag_name} eq 'frameset') {
4543     !!!insert-element ($token->{tag_name}, $token->{attributes});
4544     !!!next-token;
4545     redo B;
4546     } elsif ($token->{tag_name} eq 'frame') {
4547     !!!insert-element ($token->{tag_name}, $token->{attributes});
4548     pop @$open_elements;
4549     !!!next-token;
4550     redo B;
4551     } elsif ($token->{tag_name} eq 'noframes') {
4552     $in_body->($insert_to_current);
4553     redo B;
4554     } else {
4555     #
4556     }
4557     } elsif ($token->{type} eq 'end tag') {
4558     if ($token->{tag_name} eq 'frameset') {
4559     if ($open_elements->[-1]->[1] eq 'html' and
4560     @$open_elements == 1) {
4561     !!!parse-error;
4562     ## Ignore the token
4563     !!!next-token;
4564     } else {
4565     pop @$open_elements;
4566     !!!next-token;
4567     }
4568    
4569     ## if not inner_html and
4570     if ($open_elements->[-1]->[1] ne 'frameset') {
4571     $insertion_mode = 'after frameset';
4572     }
4573     redo B;
4574     } else {
4575     #
4576     }
4577     } else {
4578     #
4579     }
4580    
4581     !!!parse-error;
4582     ## Ignore the token
4583     !!!next-token;
4584     redo B;
4585     } elsif ($insertion_mode eq 'after frameset') {
4586     if ($token->{type} eq 'character') {
4587     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4588     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4589    
4590     unless (length $token->{data}) {
4591     !!!next-token;
4592     redo B;
4593     }
4594     }
4595    
4596     #
4597     } elsif ($token->{type} eq 'comment') {
4598     my $comment = $self->{document}->create_comment ($token->{data});
4599     $open_elements->[-1]->[0]->append_child ($comment);
4600     !!!next-token;
4601     redo B;
4602     } elsif ($token->{type} eq 'start tag') {
4603     if ($token->{tag_name} eq 'noframes') {
4604     $in_body->($insert_to_current);
4605     redo B;
4606     } else {
4607     #
4608     }
4609     } elsif ($token->{type} eq 'end tag') {
4610     if ($token->{tag_name} eq 'html') {
4611     $phase = 'trailing end';
4612     !!!next-token;
4613     redo B;
4614     } else {
4615     #
4616     }
4617     } else {
4618     #
4619     }
4620    
4621     !!!parse-error;
4622     ## Ignore the token
4623     !!!next-token;
4624     redo B;
4625    
4626     ## ISSUE: An issue in spec there
4627     } else {
4628     die "$0: $insertion_mode: Unknown insertion mode";
4629     }
4630     }
4631     } elsif ($phase eq 'trailing end') {
4632     ## states in the main stage is preserved yet # MUST
4633    
4634     if ($token->{type} eq 'DOCTYPE') {
4635     !!!parse-error;
4636     ## Ignore the token
4637     !!!next-token;
4638     redo B;
4639     } elsif ($token->{type} eq 'comment') {
4640     my $comment = $self->{document}->create_comment ($token->{data});
4641     $self->{document}->append_child ($comment);
4642     !!!next-token;
4643     redo B;
4644     } elsif ($token->{type} eq 'character') {
4645     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4646     my $data = $1;
4647     ## As if in the main phase.
4648     ## NOTE: The insertion mode in the main phase
4649     ## just before the phase has been changed to the trailing
4650     ## end phase is either "after body" or "after frameset".
4651     $reconstruct_active_formatting_elements->($insert_to_current)
4652     if $phase eq 'main';
4653    
4654     $open_elements->[-1]->[0]->manakai_append_text ($data);
4655    
4656     unless (length $token->{data}) {
4657     !!!next-token;
4658     redo B;
4659     }
4660     }
4661    
4662     !!!parse-error;
4663     $phase = 'main';
4664     ## reprocess
4665     redo B;
4666     } elsif ($token->{type} eq 'start tag' or
4667     $token->{type} eq 'end tag') {
4668     !!!parse-error;
4669     $phase = 'main';
4670     ## reprocess
4671     redo B;
4672     } elsif ($token->{type} eq 'end-of-file') {
4673     ## Stop parsing
4674     last B;
4675     } else {
4676     die "$0: $token->{type}: Unknown token";
4677     }
4678     }
4679     } # B
4680    
4681     ## Stop parsing # MUST
4682    
4683     ## TODO: script stuffs
4684     } # _construct_tree
4685    
## get_inner_html ($class, $node, $on_error)
##
## Serializes the descendants of |$node| (but not |$node| itself) as
## an HTML fragment string.
##
## Arguments:
##   $class    - The invocant (this is called as a class method).  It is
##               not used by the implementation.
##   $node     - The node whose children are serialized.  It and its
##               descendants only need to provide the accessors used
##               below (|node_type|, |child_nodes|, |parent_node|, ...).
##   $on_error - Optional code reference.  It is invoked with the
##               offending node for every descendant whose node type
##               is not handled here (e.g. processing instructions).
##
## Returns a REFERENCE to the serialized string.
##
## NOTE: The |($$$)| prototype has no effect on method calls; it is
## kept only so that the declaration stays as it was for callers.
sub get_inner_html ($$$) {
  my ($class, $node, $on_error) = @_;

  ## Elements whose text content is emitted without escaping.
  my $cdata_element = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
  };

  ## Elements that are serialized as a start tag only (no content,
  ## no end tag).
  my $void_element = {
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  };

  ## Escaping shared by attribute values and ordinary text.  |&| has
  ## to be replaced first so that the entities generated by the later
  ## substitutions are not escaped again.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## If |$node| itself or any of its ancestors is an XHTML-namespace
  ## CDATA-like element, text is emitted raw from the very beginning.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      ## |namespace_uri| might be undefined (null namespace).
      my $ns = $parent->namespace_uri;
      if (defined $ns and
          $ns eq 'http://www.w3.org/1999/xhtml' and
          $cdata_element->{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work queue, processed front to back, which yields a
  ## document-order traversal.  Besides node objects it can contain
  ## plain strings: the marker 'cdata-out' (leave raw-text mode) and
  ## literal end tags to be appended to the output.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      next C if $void_element->{$tag_name};

      ## The marker is queued first so that it ends up AFTER the end
      ## tag queued below.  It is only queued by the outermost
      ## CDATA-like element; nested ones leave the mode untouched.
      if (not $in_cdata and $cdata_element->{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text or CDATASection
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        $s .= $escape->($child->data);
      }
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # DocumentType
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## Expand the entity reference in place.  The children have to
      ## go to the FRONT of the queue; appending them to the end would
      ## emit them after all following siblings and after the end tags
      ## of the enclosing elements.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
4782    
4783     1;
4784 wakaba 1.2 # $Date: 2007/05/01 10:36:06 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24