/[suikacvs]/markup/html/whatpm/What/HTML.pm.src
Suika

Contents of /markup/html/whatpm/What/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (hide annotations) (download) (as text)
Mon Apr 30 12:06:12 2007 UTC (17 years, 7 months ago) by wakaba
Branch: MAIN
Changes since 1.4: +315 -8 lines
File MIME type: application/x-wais-source
++ whatpm/What/ChangeLog	30 Apr 2007 12:05:44 -0000
	* mkhtmlparser.pl, Makefile: References to the |HTML-consume-entity.src|
	are removed.

	* HTML.pm.src: Tokenizer's handling on named entities are rewritten.

	* HTML-consume-entity.src: Removed.

2007-04-30  Wakaba  <wakaba@suika.fam.cx>

package What::HTML;
use strict;

## The module version is derived from the CVS |Revision| keyword below:
## every run of digits in the keyword string becomes one component, the
## first printed as-is and each following one zero-padded to two digits.
our $VERSION = do {
  my @component = (q$Revision: 1.4 $ =~ /\d+/g);
  my $format = "%d." . ("%02d" x $#component);
  sprintf $format, @component;
};
4 wakaba 1.1
5     ## This is a very, very early version of an HTML parser.
6    
## Start tags that may be written with a trailing "/" before ">".
## The tokenizer consults this table when it sees "/" inside a tag and
## reports a parse error unless the tag is a start tag listed here and
## the "/" is immediately followed by ">".
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
20    
## Named entity table: maps an entity name (case-sensitive, without the
## leading "&" and trailing ";") to the single character it stands for.
## The table below is written as "name code-point" pairs, the code point
## being given in hexadecimal.
my $entity_char = {};
{
  my @name_and_code = qw(
    AElig 00C6  Aacute 00C1  Acirc 00C2  Agrave 00C0
    Alpha 0391  Aring 00C5  Atilde 00C3  Auml 00C4
    Beta 0392  Ccedil 00C7  Chi 03A7  Dagger 2021
    Delta 0394  ETH 00D0  Eacute 00C9  Ecirc 00CA
    Egrave 00C8  Epsilon 0395  Eta 0397  Euml 00CB
    Gamma 0393  Iacute 00CD  Icirc 00CE  Igrave 00CC
    Iota 0399  Iuml 00CF  Kappa 039A  Lambda 039B
    Mu 039C  Ntilde 00D1  Nu 039D  OElig 0152
    Oacute 00D3  Ocirc 00D4  Ograve 00D2  Omega 03A9
    Omicron 039F  Oslash 00D8  Otilde 00D5  Ouml 00D6
    Phi 03A6  Pi 03A0  Prime 2033  Psi 03A8
    Rho 03A1  Scaron 0160  Sigma 03A3  THORN 00DE
    Tau 03A4  Theta 0398  Uacute 00DA  Ucirc 00DB
    Ugrave 00D9  Upsilon 03A5  Uuml 00DC  Xi 039E
    Yacute 00DD  Yuml 0178  Zeta 0396  aacute 00E1
    acirc 00E2  acute 00B4  aelig 00E6  agrave 00E0
    alefsym 2135  alpha 03B1  amp 0026  AMP 0026
    and 2227  ang 2220  apos 0027  aring 00E5
    asymp 2248  atilde 00E3  auml 00E4  bdquo 201E
    beta 03B2  brvbar 00A6  bull 2022  cap 2229
    ccedil 00E7  cedil 00B8  cent 00A2  chi 03C7
    circ 02C6  clubs 2663  cong 2245  copy 00A9
    COPY 00A9  crarr 21B5  cup 222A  curren 00A4
    dArr 21D3  dagger 2020  darr 2193  deg 00B0
    delta 03B4  diams 2666  divide 00F7  eacute 00E9
    ecirc 00EA  egrave 00E8  empty 2205  emsp 2003
    ensp 2002  epsilon 03B5  equiv 2261  eta 03B7
    eth 00F0  euml 00EB  euro 20AC  exist 2203
    fnof 0192  forall 2200  frac12 00BD  frac14 00BC
    frac34 00BE  frasl 2044  gamma 03B3  ge 2265
    gt 003E  GT 003E  hArr 21D4  harr 2194
    hearts 2665  hellip 2026  iacute 00ED  icirc 00EE
    iexcl 00A1  igrave 00EC  image 2111  infin 221E
    int 222B  iota 03B9  iquest 00BF  isin 2208
    iuml 00EF  kappa 03BA  lArr 21D0  lambda 03BB
    lang 2329  laquo 00AB  larr 2190  lceil 2308
    ldquo 201C  le 2264  lfloor 230A  lowast 2217
    loz 25CA  lrm 200E  lsaquo 2039  lsquo 2018
    lt 003C  LT 003C  macr 00AF  mdash 2014
    micro 00B5  middot 00B7  minus 2212  mu 03BC
    nabla 2207  nbsp 00A0  ndash 2013  ne 2260
    ni 220B  not 00AC  notin 2209  nsub 2284
    ntilde 00F1  nu 03BD  oacute 00F3  ocirc 00F4
    oelig 0153  ograve 00F2  oline 203E  omega 03C9
    omicron 03BF  oplus 2295  or 2228  ordf 00AA
    ordm 00BA  oslash 00F8  otilde 00F5  otimes 2297
    ouml 00F6  para 00B6  part 2202  permil 2030
    perp 22A5  phi 03C6  pi 03C0  piv 03D6
    plusmn 00B1  pound 00A3  prime 2032  prod 220F
    prop 221D  psi 03C8  quot 0022  QUOT 0022
    rArr 21D2  radic 221A  rang 232A  raquo 00BB
    rarr 2192  rceil 2309  rdquo 201D  real 211C
    reg 00AE  REG 00AE  rfloor 230B  rho 03C1
    rlm 200F  rsaquo 203A  rsquo 2019  sbquo 201A
    scaron 0161  sdot 22C5  sect 00A7  shy 00AD
    sigma 03C3  sigmaf 03C2  sim 223C  spades 2660
    sub 2282  sube 2286  sum 2211  sup 2283
    sup1 00B9  sup2 00B2  sup3 00B3  supe 2287
    szlig 00DF  tau 03C4  there4 2234  theta 03B8
    thetasym 03D1  thinsp 2009  thorn 00FE  tilde 02DC
    times 00D7  trade 2122  uArr 21D1  uacute 00FA
    uarr 2191  ucirc 00FB  ugrave 00F9  uml 00A8
    upsih 03D2  upsilon 03C5  uuml 00FC  weierp 2118
    xi 03BE  yacute 00FD  yen 00A5  yuml 00FF
    zeta 03B6  zwj 200D  zwnj 200C
  );
  ## Consume the list two words at a time: entity name, then code point.
  while (@name_and_code) {
    my ($name, $code) = splice @name_and_code, 0, 2;
    $entity_char->{$name} = chr hex $code;
  }
}
282    
## Element names belonging to the "special" category.
## NOTE(review): not referenced in the visible part of this file;
## presumably consulted by the tree construction code -- confirm.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category.
## NOTE(review): not referenced in the visible part of this file;
## presumably consulted by the tree construction code -- confirm.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Element names belonging to the "formatting" category.
## The key for the |strike| element was previously misspelled "strile",
## so |strike| was never recognised as a formatting element.
## NOTE(review): not referenced in the visible part of this file;
## presumably consulted by the tree construction code -- confirm.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
303     # $phrasing_category: all other elements
304    
## Constructor.  Returns a new, empty parser object blessed into the
## invoking class, with two default callbacks installed:
##   set_next_input_character - sets |next_input_character| to -1, the
##       value the tokenizer treats as end-of-file, so a parser without
##       a real input source sees an empty input.
##   parse_error - does nothing.
sub new ($) {
  my ($class) = @_;
  my $self = bless {}, $class;
  $self->{set_next_input_character} = sub { $self->{next_input_character} = -1 };
  $self->{parse_error} = sub { };
  return $self;
} # new
316    
317     ## Implementations MUST act as if they used the state machine in the spec
318    
## Resets the tokenizer: "data" state, PCDATA content model, nothing
## under construction, empty |char| and |token| lists.  The first input
## character is fetched through the next-input-character macro, which
## is expanded by the preprocessor (its expansion is not visible here).
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## Initial state and content model flag.
  $self->{state} = 'data';
  $self->{content_model_flag} = 'PCDATA';

  ## No token (start tag, end tag, comment, or DOCTYPE) and no attribute
  ## is being built, and no start tag has been emitted yet.
  @$self{qw(current_token current_attribute
            last_emitted_start_tag_name last_attribute_value_state)} = ();

  $self->{char} = [];
  ## Presumably sets $self->{next_input_character}.
  !!!next-input-character;
  $self->{token} = [];
} # _initialize_tokenizer
332    
333     ## A token has:
334     ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
335     ## 'character', or 'end-of-file'
336     ## ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))
337     ## ISSUE: the spec needs s/tagname/tag name/
338     ## ->{error} == 1 or 0 (DOCTYPE)
339     ## ->{attributes} isa HASH (start tag, end tag)
340     ## ->{data} (comment, character)
341    
342     ## Macros
343     ## Macros MUST be preceded by three EXCLAMATION MARKs.
344     ## emit ($token)
345     ## Emits the specified token.
346    
347     ## Emitted token MUST immediately be handled by the tree construction state.
348    
349     ## Before each step, UA MAY check to see if either one of the scripts in
350     ## "list of scripts that will execute as soon as possible" or the first
351     ## script in the "list of scripts that will execute asynchronously",
352     ## has completed loading. If one has, then it MUST be executed
353     ## and removed from the list.
354    
355     sub _get_next_token ($) {
356     my $self = shift;
357     if (@{$self->{token}}) {
358     return shift @{$self->{token}};
359     }
360    
361     A: {
362     if ($self->{state} eq 'data') {
363     if ($self->{next_input_character} == 0x0026) { # &
364     if ($self->{content_model_flag} eq 'PCDATA' or
365     $self->{content_model_flag} eq 'RCDATA') {
366     $self->{state} = 'entity data';
367     !!!next-input-character;
368     redo A;
369     } else {
370     #
371     }
372     } elsif ($self->{next_input_character} == 0x003C) { # <
373     if ($self->{content_model_flag} ne 'PLAINTEXT') {
374     $self->{state} = 'tag open';
375     !!!next-input-character;
376     redo A;
377     } else {
378     #
379     }
380     } elsif ($self->{next_input_character} == -1) {
381     !!!emit ({type => 'end-of-file'});
382     last A; ## TODO: ok?
383     }
384     # Anything else
385     my $token = {type => 'character',
386     data => chr $self->{next_input_character}};
387     ## Stay in the data state
388     !!!next-input-character;
389    
390     !!!emit ($token);
391    
392     redo A;
393     } elsif ($self->{state} eq 'entity data') {
394     ## (cannot happen in CDATA state)
395    
396     my $token = $self->_tokenize_attempt_to_consume_an_entity;
397    
398     $self->{state} = 'data';
399     # next-input-character is already done
400    
401     unless (defined $token) {
402     !!!emit ({type => 'character', data => '&'});
403     } else {
404     !!!emit ($token);
405     }
406    
407     redo A;
408     } elsif ($self->{state} eq 'tag open') {
409     if ($self->{content_model_flag} eq 'RCDATA' or
410     $self->{content_model_flag} eq 'CDATA') {
411     if ($self->{next_input_character} == 0x002F) { # /
412     !!!next-input-character;
413     $self->{state} = 'close tag open';
414     redo A;
415     } else {
416     ## reconsume
417     $self->{state} = 'data';
418    
419     !!!emit (type => 'character', data => {'/'});
420    
421     redo A;
422     }
423     } elsif ($self->{content_model_flag} eq 'PCDATA') {
424     if ($self->{next_input_character} == 0x0021) { # !
425     $self->{state} = 'markup declaration open';
426     !!!next-input-character;
427     redo A;
428     } elsif ($self->{next_input_character} == 0x002F) { # /
429     $self->{state} = 'close tag open';
430     !!!next-input-character;
431     redo A;
432     } elsif (0x0041 <= $self->{next_input_character} and
433     $self->{next_input_character} <= 0x005A) { # A..Z
434     $self->{current_token}
435     = {type => 'start tag',
436     tag_name => chr ($self->{next_input_character} + 0x0020)};
437     $self->{state} = 'tag name';
438     !!!next-input-character;
439     redo A;
440     } elsif (0x0061 <= $self->{next_input_character} and
441     $self->{next_input_character} <= 0x007A) { # a..z
442     $self->{current_token} = {type => 'start tag',
443     tag_name => chr ($self->{next_input_character})};
444     $self->{state} = 'tag name';
445     !!!next-input-character;
446     redo A;
447     } elsif ($self->{next_input_character} == 0x003E) { # >
448     !!!parse-error;
449     $self->{state} = 'data';
450     !!!next-input-character;
451    
452 wakaba 1.3 !!!emit ({type => 'character', data => '<>'});
453 wakaba 1.1
454     redo A;
455     } elsif ($self->{next_input_character} == 0x003F) { # ?
456     !!!parse-error;
457     $self->{state} = 'bogus comment';
458     ## $self->{next_input_character} is intentionally left as is
459     redo A;
460     } else {
461     !!!parse-error;
462     $self->{state} = 'data';
463     ## reconsume
464    
465     !!!emit ({type => 'character', data => '<'});
466    
467     redo A;
468     }
469     } else {
470     die "$0: $self->{content_model_flag}: Unknown content model flag";
471     }
472     } elsif ($self->{state} eq 'close tag open') {
473     if ($self->{content_model_flag} eq 'RCDATA' or
474     $self->{content_model_flag} eq 'CDATA') {
475     my @next_char;
476     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
477     push @next_char, $self->{next_input_character};
478     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
479     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
480     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
481     !!!next-input-character;
482     next TAGNAME;
483     } else {
484     !!!parse-error;
485     $self->{next_input_character} = shift @next_char; # reconsume
486     !!!back-next-input-character (@next_char);
487     $self->{state} = 'data';
488    
489     !!!emit ({type => 'character', data => '</'});
490    
491     redo A;
492     }
493     }
494 wakaba 1.2 push @next_char, $self->{next_input_character};
495 wakaba 1.1
496 wakaba 1.2 unless ($self->{next_input_character} == 0x0009 or # HT
497     $self->{next_input_character} == 0x000A or # LF
498     $self->{next_input_character} == 0x000B or # VT
499     $self->{next_input_character} == 0x000C or # FF
500     $self->{next_input_character} == 0x0020 or # SP
501     $self->{next_input_character} == 0x003E or # >
502     $self->{next_input_character} == 0x002F or # /
503     $self->{next_input_character} == 0x003C or # <
504 wakaba 1.1 $self->{next_input_character} == -1) {
505     !!!parse-error;
506     $self->{next_input_character} = shift @next_char; # reconsume
507     !!!back-next-input-character (@next_char);
508     $self->{state} = 'data';
509    
510     !!!emit ({type => 'character', data => '</'});
511    
512     redo A;
513     } else {
514     $self->{next_input_character} = shift @next_char;
515     !!!back-next-input-character (@next_char);
516     # and consume...
517     }
518     }
519    
520     if (0x0041 <= $self->{next_input_character} and
521     $self->{next_input_character} <= 0x005A) { # A..Z
522     $self->{current_token} = {type => 'end tag',
523     tag_name => chr ($self->{next_input_character} + 0x0020)};
524     $self->{state} = 'tag name';
525     !!!next-input-character;
526     redo A;
527     } elsif (0x0061 <= $self->{next_input_character} and
528     $self->{next_input_character} <= 0x007A) { # a..z
529     $self->{current_token} = {type => 'end tag',
530     tag_name => chr ($self->{next_input_character})};
531     $self->{state} = 'tag name';
532     !!!next-input-character;
533     redo A;
534     } elsif ($self->{next_input_character} == 0x003E) { # >
535     !!!parse-error;
536     $self->{state} = 'data';
537     !!!next-input-character;
538     redo A;
539     } elsif ($self->{next_input_character} == -1) {
540     !!!parse-error;
541     $self->{state} = 'data';
542     # reconsume
543    
544     !!!emit ({type => 'character', data => '</'});
545    
546     redo A;
547     } else {
548     !!!parse-error;
549     $self->{state} = 'bogus comment';
550     ## $self->{next_input_character} is intentionally left as is
551     redo A;
552     }
553     } elsif ($self->{state} eq 'tag name') {
554     if ($self->{next_input_character} == 0x0009 or # HT
555     $self->{next_input_character} == 0x000A or # LF
556     $self->{next_input_character} == 0x000B or # VT
557     $self->{next_input_character} == 0x000C or # FF
558     $self->{next_input_character} == 0x0020) { # SP
559     $self->{state} = 'before attribute name';
560     !!!next-input-character;
561     redo A;
562     } elsif ($self->{next_input_character} == 0x003E) { # >
563     if ($self->{current_token}->{type} eq 'start tag') {
564     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
565     } elsif ($self->{current_token}->{type} eq 'end tag') {
566     $self->{content_model_flag} = 'PCDATA'; # MUST
567 wakaba 1.2 if ($self->{current_token}->{attributes}) {
568 wakaba 1.1 !!!parse-error;
569     }
570     } else {
571     die "$0: $self->{current_token}->{type}: Unknown token type";
572     }
573     $self->{state} = 'data';
574     !!!next-input-character;
575    
576     !!!emit ($self->{current_token}); # start tag or end tag
577     undef $self->{current_token};
578    
579     redo A;
580     } elsif (0x0041 <= $self->{next_input_character} and
581     $self->{next_input_character} <= 0x005A) { # A..Z
582     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
583     # start tag or end tag
584     ## Stay in this state
585     !!!next-input-character;
586     redo A;
587     } elsif ($self->{next_input_character} == 0x003C or # <
588     $self->{next_input_character} == -1) {
589     !!!parse-error;
590     if ($self->{current_token}->{type} eq 'start tag') {
591     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
592     } elsif ($self->{current_token}->{type} eq 'end tag') {
593     $self->{content_model_flag} = 'PCDATA'; # MUST
594 wakaba 1.2 if ($self->{current_token}->{attributes}) {
595 wakaba 1.1 !!!parse-error;
596     }
597     } else {
598     die "$0: $self->{current_token}->{type}: Unknown token type";
599     }
600     $self->{state} = 'data';
601     # reconsume
602    
603     !!!emit ($self->{current_token}); # start tag or end tag
604     undef $self->{current_token};
605    
606     redo A;
607     } elsif ($self->{next_input_character} == 0x002F) { # /
608     !!!next-input-character;
609     if ($self->{next_input_character} == 0x003E and # >
610     $self->{current_token}->{type} eq 'start tag' and
611     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
612     # permitted slash
613     #
614     } else {
615     !!!parse-error;
616     }
617     $self->{state} = 'before attribute name';
618     # next-input-character is already done
619     redo A;
620     } else {
621     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
622     # start tag or end tag
623     ## Stay in the state
624     !!!next-input-character;
625     redo A;
626     }
627     } elsif ($self->{state} eq 'before attribute name') {
628     if ($self->{next_input_character} == 0x0009 or # HT
629     $self->{next_input_character} == 0x000A or # LF
630     $self->{next_input_character} == 0x000B or # VT
631     $self->{next_input_character} == 0x000C or # FF
632     $self->{next_input_character} == 0x0020) { # SP
633     ## Stay in the state
634     !!!next-input-character;
635     redo A;
636     } elsif ($self->{next_input_character} == 0x003E) { # >
637     if ($self->{current_token}->{type} eq 'start tag') {
638     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
639     } elsif ($self->{current_token}->{type} eq 'end tag') {
640     $self->{content_model_flag} = 'PCDATA'; # MUST
641 wakaba 1.2 if ($self->{current_token}->{attributes}) {
642 wakaba 1.1 !!!parse-error;
643     }
644     } else {
645     die "$0: $self->{current_token}->{type}: Unknown token type";
646     }
647     $self->{state} = 'data';
648     !!!next-input-character;
649    
650     !!!emit ($self->{current_token}); # start tag or end tag
651     undef $self->{current_token};
652    
653     redo A;
654     } elsif (0x0041 <= $self->{next_input_character} and
655     $self->{next_input_character} <= 0x005A) { # A..Z
656     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
657     value => ''};
658     $self->{state} = 'attribute name';
659     !!!next-input-character;
660     redo A;
661     } elsif ($self->{next_input_character} == 0x002F) { # /
662     !!!next-input-character;
663     if ($self->{next_input_character} == 0x003E and # >
664     $self->{current_token}->{type} eq 'start tag' and
665     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
666     # permitted slash
667     #
668     } else {
669     !!!parse-error;
670     }
671     ## Stay in the state
672     # next-input-character is already done
673     redo A;
674     } elsif ($self->{next_input_character} == 0x003C or # <
675     $self->{next_input_character} == -1) {
676     !!!parse-error;
677     if ($self->{current_token}->{type} eq 'start tag') {
678     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
679     } elsif ($self->{current_token}->{type} eq 'end tag') {
680     $self->{content_model_flag} = 'PCDATA'; # MUST
681 wakaba 1.2 if ($self->{current_token}->{attributes}) {
682 wakaba 1.1 !!!parse-error;
683     }
684     } else {
685     die "$0: $self->{current_token}->{type}: Unknown token type";
686     }
687     $self->{state} = 'data';
688     # reconsume
689    
690     !!!emit ($self->{current_token}); # start tag or end tag
691     undef $self->{current_token};
692    
693     redo A;
694     } else {
695     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
696     value => ''};
697     $self->{state} = 'attribute name';
698     !!!next-input-character;
699     redo A;
700     }
701     } elsif ($self->{state} eq 'attribute name') {
702     my $before_leave = sub {
703 wakaba 1.2 if (exists $self->{current_token}->{attributes} # start tag or end tag
704 wakaba 1.1 ->{$self->{current_attribute}->{name}}) { # MUST
705     !!!parse-error;
706     ## Discard $self->{current_attribute} # MUST
707     } else {
708 wakaba 1.2 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
709 wakaba 1.1 = $self->{current_attribute};
710     }
711     }; # $before_leave
712    
713     if ($self->{next_input_character} == 0x0009 or # HT
714     $self->{next_input_character} == 0x000A or # LF
715     $self->{next_input_character} == 0x000B or # VT
716     $self->{next_input_character} == 0x000C or # FF
717     $self->{next_input_character} == 0x0020) { # SP
718     $before_leave->();
719     $self->{state} = 'after attribute name';
720     !!!next-input-character;
721     redo A;
722     } elsif ($self->{next_input_character} == 0x003D) { # =
723     $before_leave->();
724     $self->{state} = 'before attribute value';
725     !!!next-input-character;
726     redo A;
727     } elsif ($self->{next_input_character} == 0x003E) { # >
728     $before_leave->();
729     if ($self->{current_token}->{type} eq 'start tag') {
730     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
731     } elsif ($self->{current_token}->{type} eq 'end tag') {
732     $self->{content_model_flag} = 'PCDATA'; # MUST
733 wakaba 1.2 if ($self->{current_token}->{attributes}) {
734 wakaba 1.1 !!!parse-error;
735     }
736     } else {
737     die "$0: $self->{current_token}->{type}: Unknown token type";
738     }
739     $self->{state} = 'data';
740     !!!next-input-character;
741    
742     !!!emit ($self->{current_token}); # start tag or end tag
743     undef $self->{current_token};
744    
745     redo A;
746     } elsif (0x0041 <= $self->{next_input_character} and
747     $self->{next_input_character} <= 0x005A) { # A..Z
748     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
749     ## Stay in the state
750     !!!next-input-character;
751     redo A;
752     } elsif ($self->{next_input_character} == 0x002F) { # /
753     $before_leave->();
754     !!!next-input-character;
755     if ($self->{next_input_character} == 0x003E and # >
756     $self->{current_token}->{type} eq 'start tag' and
757     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
758     # permitted slash
759     #
760     } else {
761     !!!parse-error;
762     }
763     $self->{state} = 'before attribute name';
764     # next-input-character is already done
765     redo A;
766     } elsif ($self->{next_input_character} == 0x003C or # <
767     $self->{next_input_character} == -1) {
768     !!!parse-error;
769     $before_leave->();
770     if ($self->{current_token}->{type} eq 'start tag') {
771     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
772     } elsif ($self->{current_token}->{type} eq 'end tag') {
773     $self->{content_model_flag} = 'PCDATA'; # MUST
774 wakaba 1.2 if ($self->{current_token}->{attributes}) {
775 wakaba 1.1 !!!parse-error;
776     }
777     } else {
778     die "$0: $self->{current_token}->{type}: Unknown token type";
779     }
780     $self->{state} = 'data';
781     # reconsume
782    
783     !!!emit ($self->{current_token}); # start tag or end tag
784     undef $self->{current_token};
785    
786     redo A;
787     } else {
788     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
789     ## Stay in the state
790     !!!next-input-character;
791     redo A;
792     }
793     } elsif ($self->{state} eq 'after attribute name') {
794     if ($self->{next_input_character} == 0x0009 or # HT
795     $self->{next_input_character} == 0x000A or # LF
796     $self->{next_input_character} == 0x000B or # VT
797     $self->{next_input_character} == 0x000C or # FF
798     $self->{next_input_character} == 0x0020) { # SP
799     ## Stay in the state
800     !!!next-input-character;
801     redo A;
802     } elsif ($self->{next_input_character} == 0x003D) { # =
803     $self->{state} = 'before attribute value';
804     !!!next-input-character;
805     redo A;
806     } elsif ($self->{next_input_character} == 0x003E) { # >
807     if ($self->{current_token}->{type} eq 'start tag') {
808     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
809     } elsif ($self->{current_token}->{type} eq 'end tag') {
810     $self->{content_model_flag} = 'PCDATA'; # MUST
811 wakaba 1.2 if ($self->{current_token}->{attributes}) {
812 wakaba 1.1 !!!parse-error;
813     }
814     } else {
815     die "$0: $self->{current_token}->{type}: Unknown token type";
816     }
817     $self->{state} = 'data';
818     !!!next-input-character;
819    
820     !!!emit ($self->{current_token}); # start tag or end tag
821     undef $self->{current_token};
822    
823     redo A;
824     } elsif (0x0041 <= $self->{next_input_character} and
825     $self->{next_input_character} <= 0x005A) { # A..Z
826     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
827     value => ''};
828     $self->{state} = 'attribute name';
829     !!!next-input-character;
830     redo A;
831     } elsif ($self->{next_input_character} == 0x002F) { # /
832     !!!next-input-character;
833     if ($self->{next_input_character} == 0x003E and # >
834     $self->{current_token}->{type} eq 'start tag' and
835     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
836     # permitted slash
837     #
838     } else {
839     !!!parse-error;
840     }
841     $self->{state} = 'before attribute name';
842     # next-input-character is already done
843     redo A;
844     } elsif ($self->{next_input_character} == 0x003C or # <
845     $self->{next_input_character} == -1) {
846     !!!parse-error;
847     if ($self->{current_token}->{type} eq 'start tag') {
848     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
849     } elsif ($self->{current_token}->{type} eq 'end tag') {
850     $self->{content_model_flag} = 'PCDATA'; # MUST
851 wakaba 1.2 if ($self->{current_token}->{attributes}) {
852 wakaba 1.1 !!!parse-error;
853     }
854     } else {
855     die "$0: $self->{current_token}->{type}: Unknown token type";
856     }
857     $self->{state} = 'data';
858     # reconsume
859    
860     !!!emit ($self->{current_token}); # start tag or end tag
861     undef $self->{current_token};
862    
863     redo A;
864     } else {
865     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
866     value => ''};
867     $self->{state} = 'attribute name';
868     !!!next-input-character;
869     redo A;
870     }
871     } elsif ($self->{state} eq 'before attribute value') {
872     if ($self->{next_input_character} == 0x0009 or # HT
873     $self->{next_input_character} == 0x000A or # LF
874     $self->{next_input_character} == 0x000B or # VT
875     $self->{next_input_character} == 0x000C or # FF
876     $self->{next_input_character} == 0x0020) { # SP
877     ## Stay in the state
878     !!!next-input-character;
879     redo A;
880     } elsif ($self->{next_input_character} == 0x0022) { # "
881     $self->{state} = 'attribute value (double-quoted)';
882     !!!next-input-character;
883     redo A;
884     } elsif ($self->{next_input_character} == 0x0026) { # &
885     $self->{state} = 'attribute value (unquoted)';
886     ## reconsume
887     redo A;
888     } elsif ($self->{next_input_character} == 0x0027) { # '
889     $self->{state} = 'attribute value (single-quoted)';
890     !!!next-input-character;
891     redo A;
892     } elsif ($self->{next_input_character} == 0x003E) { # >
893     if ($self->{current_token}->{type} eq 'start tag') {
894     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
895     } elsif ($self->{current_token}->{type} eq 'end tag') {
896     $self->{content_model_flag} = 'PCDATA'; # MUST
897 wakaba 1.2 if ($self->{current_token}->{attributes}) {
898 wakaba 1.1 !!!parse-error;
899     }
900     } else {
901     die "$0: $self->{current_token}->{type}: Unknown token type";
902     }
903     $self->{state} = 'data';
904     !!!next-input-character;
905    
906     !!!emit ($self->{current_token}); # start tag or end tag
907     undef $self->{current_token};
908    
909     redo A;
910     } elsif ($self->{next_input_character} == 0x003C or # <
911     $self->{next_input_character} == -1) {
912     !!!parse-error;
913     if ($self->{current_token}->{type} eq 'start tag') {
914     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
915     } elsif ($self->{current_token}->{type} eq 'end tag') {
916     $self->{content_model_flag} = 'PCDATA'; # MUST
917 wakaba 1.2 if ($self->{current_token}->{attributes}) {
918 wakaba 1.1 !!!parse-error;
919     }
920     } else {
921     die "$0: $self->{current_token}->{type}: Unknown token type";
922     }
923     $self->{state} = 'data';
924     ## reconsume
925    
926     !!!emit ($self->{current_token}); # start tag or end tag
927     undef $self->{current_token};
928    
929     redo A;
930     } else {
931     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
932     $self->{state} = 'attribute value (unquoted)';
933     !!!next-input-character;
934     redo A;
935     }
936     } elsif ($self->{state} eq 'attribute value (double-quoted)') {
937     if ($self->{next_input_character} == 0x0022) { # "
938     $self->{state} = 'before attribute name';
939     !!!next-input-character;
940     redo A;
941     } elsif ($self->{next_input_character} == 0x0026) { # &
942     $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
943     $self->{state} = 'entity in attribute value';
944     !!!next-input-character;
945     redo A;
946     } elsif ($self->{next_input_character} == -1) {
947     !!!parse-error;
948     if ($self->{current_token}->{type} eq 'start tag') {
949     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
950     } elsif ($self->{current_token}->{type} eq 'end tag') {
951     $self->{content_model_flag} = 'PCDATA'; # MUST
952 wakaba 1.2 if ($self->{current_token}->{attributes}) {
953 wakaba 1.1 !!!parse-error;
954     }
955     } else {
956     die "$0: $self->{current_token}->{type}: Unknown token type";
957     }
958     $self->{state} = 'data';
959     ## reconsume
960    
961     !!!emit ($self->{current_token}); # start tag or end tag
962     undef $self->{current_token};
963    
964     redo A;
965     } else {
966     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
967     ## Stay in the state
968     !!!next-input-character;
969     redo A;
970     }
971     } elsif ($self->{state} eq 'attribute value (single-quoted)') {
972     if ($self->{next_input_character} == 0x0027) { # '
973     $self->{state} = 'before attribute name';
974     !!!next-input-character;
975     redo A;
976     } elsif ($self->{next_input_character} == 0x0026) { # &
977     $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
978     $self->{state} = 'entity in attribute value';
979     !!!next-input-character;
980     redo A;
981     } elsif ($self->{next_input_character} == -1) {
982     !!!parse-error;
983     if ($self->{current_token}->{type} eq 'start tag') {
984     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
985     } elsif ($self->{current_token}->{type} eq 'end tag') {
986     $self->{content_model_flag} = 'PCDATA'; # MUST
987 wakaba 1.2 if ($self->{current_token}->{attributes}) {
988 wakaba 1.1 !!!parse-error;
989     }
990     } else {
991     die "$0: $self->{current_token}->{type}: Unknown token type";
992     }
993     $self->{state} = 'data';
994     ## reconsume
995    
996     !!!emit ($self->{current_token}); # start tag or end tag
997     undef $self->{current_token};
998    
999     redo A;
1000     } else {
1001     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1002     ## Stay in the state
1003     !!!next-input-character;
1004     redo A;
1005     }
1006     } elsif ($self->{state} eq 'attribute value (unquoted)') {
1007     if ($self->{next_input_character} == 0x0009 or # HT
1008     $self->{next_input_character} == 0x000A or # LF
1009     $self->{next_input_character} == 0x000B or # VT
1010     $self->{next_input_character} == 0x000C or # FF
1011     $self->{next_input_character} == 0x0020) { # SP
1012     $self->{state} = 'before attribute name';
1013     !!!next-input-character;
1014     redo A;
1015     } elsif ($self->{next_input_character} == 0x0026) { # &
1016     $self->{last_attribute_value_state} = 'attribute value (unquoted)';
1017     $self->{state} = 'entity in attribute value';
1018     !!!next-input-character;
1019     redo A;
1020     } elsif ($self->{next_input_character} == 0x003E) { # >
1021     if ($self->{current_token}->{type} eq 'start tag') {
1022     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1023     } elsif ($self->{current_token}->{type} eq 'end tag') {
1024     $self->{content_model_flag} = 'PCDATA'; # MUST
1025 wakaba 1.2 if ($self->{current_token}->{attributes}) {
1026 wakaba 1.1 !!!parse-error;
1027     }
1028     } else {
1029     die "$0: $self->{current_token}->{type}: Unknown token type";
1030     }
1031     $self->{state} = 'data';
1032     !!!next-input-character;
1033    
1034     !!!emit ($self->{current_token}); # start tag or end tag
1035     undef $self->{current_token};
1036    
1037     redo A;
1038     } elsif ($self->{next_input_character} == 0x003C or # <
1039     $self->{next_input_character} == -1) {
1040     !!!parse-error;
1041     if ($self->{current_token}->{type} eq 'start tag') {
1042     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1043     } elsif ($self->{current_token}->{type} eq 'end tag') {
1044     $self->{content_model_flag} = 'PCDATA'; # MUST
1045 wakaba 1.2 if ($self->{current_token}->{attributes}) {
1046 wakaba 1.1 !!!parse-error;
1047     }
1048     } else {
1049     die "$0: $self->{current_token}->{type}: Unknown token type";
1050     }
1051     $self->{state} = 'data';
1052     ## reconsume
1053    
1054     !!!emit ($self->{current_token}); # start tag or end tag
1055     undef $self->{current_token};
1056    
1057     redo A;
1058     } else {
1059     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1060     ## Stay in the state
1061     !!!next-input-character;
1062     redo A;
1063     }
1064     } elsif ($self->{state} eq 'entity in attribute value') {
1065     my $token = $self->_tokenize_attempt_to_consume_an_entity;
1066    
1067     unless (defined $token) {
1068     $self->{current_attribute}->{value} .= '&';
1069     } else {
1070     $self->{current_attribute}->{value} .= $token->{data};
1071     ## ISSUE: spec says "append the returned character token to the current attribute's value"
1072     }
1073    
1074     $self->{state} = $self->{last_attribute_value_state};
1075     # next-input-character is already done
1076     redo A;
1077     } elsif ($self->{state} eq 'bogus comment') {
1078     ## (only happen if PCDATA state)
1079    
1080     my $token = {type => 'comment', data => ''};
1081    
1082     BC: {
1083     if ($self->{next_input_character} == 0x003E) { # >
1084     $self->{state} = 'data';
1085     !!!next-input-character;
1086    
1087     !!!emit ($token);
1088    
1089     redo A;
1090     } elsif ($self->{next_input_character} == -1) {
1091     $self->{state} = 'data';
1092     ## reconsume
1093    
1094     !!!emit ($token);
1095    
1096     redo A;
1097     } else {
1098     $token->{data} .= chr ($self->{next_input_character});
1099     !!!next-input-character;
1100     redo BC;
1101     }
1102     } # BC
1103     } elsif ($self->{state} eq 'markup declaration open') {
1104     ## (only happen if PCDATA state)
1105    
1106     my @next_char;
1107     push @next_char, $self->{next_input_character};
1108    
1109     if ($self->{next_input_character} == 0x002D) { # -
1110     !!!next-input-character;
1111     push @next_char, $self->{next_input_character};
1112     if ($self->{next_input_character} == 0x002D) { # -
1113     $self->{current_token} = {type => 'comment', data => ''};
1114     $self->{state} = 'comment';
1115     !!!next-input-character;
1116     redo A;
1117     }
1118     } elsif ($self->{next_input_character} == 0x0044 or # D
1119     $self->{next_input_character} == 0x0064) { # d
1120     !!!next-input-character;
1121     push @next_char, $self->{next_input_character};
1122     if ($self->{next_input_character} == 0x004F or # O
1123     $self->{next_input_character} == 0x006F) { # o
1124     !!!next-input-character;
1125     push @next_char, $self->{next_input_character};
1126     if ($self->{next_input_character} == 0x0043 or # C
1127     $self->{next_input_character} == 0x0063) { # c
1128     !!!next-input-character;
1129     push @next_char, $self->{next_input_character};
1130     if ($self->{next_input_character} == 0x0054 or # T
1131     $self->{next_input_character} == 0x0074) { # t
1132     !!!next-input-character;
1133     push @next_char, $self->{next_input_character};
1134     if ($self->{next_input_character} == 0x0059 or # Y
1135     $self->{next_input_character} == 0x0079) { # y
1136     !!!next-input-character;
1137     push @next_char, $self->{next_input_character};
1138     if ($self->{next_input_character} == 0x0050 or # P
1139     $self->{next_input_character} == 0x0070) { # p
1140     !!!next-input-character;
1141     push @next_char, $self->{next_input_character};
1142     if ($self->{next_input_character} == 0x0045 or # E
1143     $self->{next_input_character} == 0x0065) { # e
1144     ## ISSUE: What a stupid code this is!
1145     $self->{state} = 'DOCTYPE';
1146     !!!next-input-character;
1147     redo A;
1148     }
1149     }
1150     }
1151     }
1152     }
1153     }
1154     }
1155    
1156     !!!parse-error;
1157     $self->{next_input_character} = shift @next_char;
1158     !!!back-next-input-character (@next_char);
1159     $self->{state} = 'bogus comment';
1160     redo A;
1161    
1162     ## ISSUE: typos in spec: chacacters, is is a parse error
1163     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1164     } elsif ($self->{state} eq 'comment') {
1165     if ($self->{next_input_character} == 0x002D) { # -
1166     $self->{state} = 'comment dash';
1167     !!!next-input-character;
1168     redo A;
1169     } elsif ($self->{next_input_character} == -1) {
1170     !!!parse-error;
1171     $self->{state} = 'data';
1172     ## reconsume
1173    
1174     !!!emit ($self->{current_token}); # comment
1175     undef $self->{current_token};
1176    
1177     redo A;
1178     } else {
1179     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1180     ## Stay in the state
1181     !!!next-input-character;
1182     redo A;
1183     }
1184     } elsif ($self->{state} eq 'comment dash') {
1185     if ($self->{next_input_character} == 0x002D) { # -
1186     $self->{state} = 'comment end';
1187     !!!next-input-character;
1188     redo A;
1189     } elsif ($self->{next_input_character} == -1) {
1190     !!!parse-error;
1191     $self->{state} = 'data';
1192     ## reconsume
1193    
1194     !!!emit ($self->{current_token}); # comment
1195     undef $self->{current_token};
1196    
1197     redo A;
1198     } else {
1199     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1200     $self->{state} = 'comment';
1201     !!!next-input-character;
1202     redo A;
1203     }
1204     } elsif ($self->{state} eq 'comment end') {
1205     if ($self->{next_input_character} == 0x003E) { # >
1206     $self->{state} = 'data';
1207     !!!next-input-character;
1208    
1209     !!!emit ($self->{current_token}); # comment
1210     undef $self->{current_token};
1211    
1212     redo A;
1213     } elsif ($self->{next_input_character} == 0x002D) { # -
1214     !!!parse-error;
1215     $self->{current_token}->{data} .= '-'; # comment
1216     ## Stay in the state
1217     !!!next-input-character;
1218     redo A;
1219     } elsif ($self->{next_input_character} == -1) {
1220     !!!parse-error;
1221     $self->{state} = 'data';
1222     ## reconsume
1223    
1224     !!!emit ($self->{current_token}); # comment
1225     undef $self->{current_token};
1226    
1227     redo A;
1228     } else {
1229     !!!parse-error;
1230     $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1231     $self->{state} = 'comment';
1232     !!!next-input-character;
1233     redo A;
1234     }
1235     } elsif ($self->{state} eq 'DOCTYPE') {
1236     if ($self->{next_input_character} == 0x0009 or # HT
1237     $self->{next_input_character} == 0x000A or # LF
1238     $self->{next_input_character} == 0x000B or # VT
1239     $self->{next_input_character} == 0x000C or # FF
1240     $self->{next_input_character} == 0x0020) { # SP
1241     $self->{state} = 'before DOCTYPE name';
1242     !!!next-input-character;
1243     redo A;
1244     } else {
1245     !!!parse-error;
1246     $self->{state} = 'before DOCTYPE name';
1247     ## reconsume
1248     redo A;
1249     }
1250     } elsif ($self->{state} eq 'before DOCTYPE name') {
1251     if ($self->{next_input_character} == 0x0009 or # HT
1252     $self->{next_input_character} == 0x000A or # LF
1253     $self->{next_input_character} == 0x000B or # VT
1254     $self->{next_input_character} == 0x000C or # FF
1255     $self->{next_input_character} == 0x0020) { # SP
1256     ## Stay in the state
1257     !!!next-input-character;
1258     redo A;
1259     } elsif (0x0061 <= $self->{next_input_character} and
1260     $self->{next_input_character} <= 0x007A) { # a..z
1261     $self->{current_token} = {type => 'DOCTYPE',
1262     name => chr ($self->{next_input_character} - 0x0020),
1263     error => 1};
1264     $self->{state} = 'DOCTYPE name';
1265     !!!next-input-character;
1266     redo A;
1267     } elsif ($self->{next_input_character} == 0x003E) { # >
1268     !!!parse-error;
1269     $self->{state} = 'data';
1270     !!!next-input-character;
1271    
1272     !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1273    
1274     redo A;
1275     } elsif ($self->{next_input_character} == -1) {
1276     !!!parse-error;
1277     $self->{state} = 'data';
1278     ## reconsume
1279    
1280     !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1281    
1282     redo A;
1283     } else {
1284     $self->{current_token} = {type => 'DOCTYPE',
1285     name => chr ($self->{next_input_character}),
1286     error => 1};
1287     $self->{state} = 'DOCTYPE name';
1288     !!!next-input-character;
1289     redo A;
1290     }
1291     } elsif ($self->{state} eq 'DOCTYPE name') {
1292     if ($self->{next_input_character} == 0x0009 or # HT
1293     $self->{next_input_character} == 0x000A or # LF
1294     $self->{next_input_character} == 0x000B or # VT
1295     $self->{next_input_character} == 0x000C or # FF
1296     $self->{next_input_character} == 0x0020) { # SP
1297     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1298     $self->{state} = 'after DOCTYPE name';
1299     !!!next-input-character;
1300     redo A;
1301     } elsif ($self->{next_input_character} == 0x003E) { # >
1302     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1303     $self->{state} = 'data';
1304     !!!next-input-character;
1305    
1306     !!!emit ($self->{current_token}); # DOCTYPE
1307     undef $self->{current_token};
1308    
1309     redo A;
1310     } elsif (0x0061 <= $self->{next_input_character} and
1311     $self->{next_input_character} <= 0x007A) { # a..z
1312     $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1313     #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1314     ## Stay in the state
1315     !!!next-input-character;
1316     redo A;
1317     } elsif ($self->{next_input_character} == -1) {
1318     !!!parse-error;
1319     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1320     $self->{state} = 'data';
1321     ## reconsume
1322    
1323     !!!emit ($self->{current_token});
1324     undef $self->{current_token};
1325    
1326     redo A;
1327     } else {
1328 wakaba 1.3 $self->{current_token}->{name}
1329     .= chr ($self->{next_input_character}); # DOCTYPE
1330 wakaba 1.1 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1331     ## Stay in the state
1332     !!!next-input-character;
1333     redo A;
1334     }
1335     } elsif ($self->{state} eq 'after DOCTYPE name') {
1336     if ($self->{next_input_character} == 0x0009 or # HT
1337     $self->{next_input_character} == 0x000A or # LF
1338     $self->{next_input_character} == 0x000B or # VT
1339     $self->{next_input_character} == 0x000C or # FF
1340     $self->{next_input_character} == 0x0020) { # SP
1341     ## Stay in the state
1342     !!!next-input-character;
1343     redo A;
1344     } elsif ($self->{next_input_character} == 0x003E) { # >
1345     $self->{state} = 'data';
1346     !!!next-input-character;
1347    
1348     !!!emit ($self->{current_token}); # DOCTYPE
1349     undef $self->{current_token};
1350    
1351     redo A;
1352     } elsif ($self->{next_input_character} == -1) {
1353     !!!parse-error;
1354     $self->{state} = 'data';
1355     ## reconsume
1356    
1357     !!!emit ($self->{current_token}); # DOCTYPE
1358     undef $self->{current_token};
1359    
1360     redo A;
1361     } else {
1362     !!!parse-error;
1363     $self->{current_token}->{error} = 1; # DOCTYPE
1364     $self->{state} = 'bogus DOCTYPE';
1365     !!!next-input-character;
1366     redo A;
1367     }
1368     } elsif ($self->{state} eq 'bogus DOCTYPE') {
1369     if ($self->{next_input_character} == 0x003E) { # >
1370     $self->{state} = 'data';
1371     !!!next-input-character;
1372    
1373     !!!emit ($self->{current_token}); # DOCTYPE
1374     undef $self->{current_token};
1375    
1376     redo A;
1377     } elsif ($self->{next_input_character} == -1) {
1378     !!!parse-error;
1379     $self->{state} = 'data';
1380     ## reconsume
1381    
1382     !!!emit ($self->{current_token}); # DOCTYPE
1383     undef $self->{current_token};
1384    
1385     redo A;
1386     } else {
1387     ## Stay in the state
1388     !!!next-input-character;
1389     redo A;
1390     }
1391     } else {
1392     die "$0: $self->{state}: Unknown state";
1393     }
1394     } # A
1395    
1396     die "$0: _get_next_token: unexpected case";
1397     } # _get_next_token
1398    
## Attempt to consume an entity (character reference).
##
## On entry the "&" has already been consumed by the caller and
## |$self->{next_input_character}| is the character following it.
##
## Returns a |{type => 'character', data => $string}| token if a
## numeric (|&#123;|, |&#x7B;|) or named (|&amp;|) entity was recognized,
## or |undef| if nothing could be recognized; in the latter case the
## caller is expected to treat the "&" as a literal character.
##
## A parse error is reported when the reference is not terminated by
## ";" and when no entity can be recognized at all.
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  if ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      ## Hexadecimal numeric entity
      my $x_char = $self->{next_input_character};
      !!!next-input-character;

      my $num; # stays undefined until at least one digit has been seen
      HEX: while (1) {
        my $c = $self->{next_input_character};
        my $digit;
        if (0x0030 <= $c and $c <= 0x0039) { # 0..9
          $digit = $c - 0x0030;
        } elsif (0x0061 <= $c and $c <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $digit = $c - 0x0061 + 10;
        } elsif (0x0041 <= $c and $c <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $digit = $c - 0x0041 + 10;
        } else {
          last HEX;
        }
        $num = ($num || 0) * 0x10 + $digit;
        !!!next-input-character;
      } # HEX

      unless (defined $num) { # no hexadecimal digit
        !!!parse-error;
        ## Un-consume everything after the "&".  The character that
        ## follows the "x" has already been read, so it has to be
        ## handed back together with the "x" (in reading order) before
        ## "#" is restored as the next input character; otherwise that
        ## character would silently be dropped from the input.
        !!!back-next-input-character ($x_char, $self->{next_input_character});
        $self->{next_input_character} = 0x0023; # #
        return undef;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error;
      }

      ## TODO: check the definition for |a valid Unicode character|.
      if ($num > 1114111 or $num == 0) {
        $num = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      }

      return {type => 'character', data => chr $num};
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      ## Decimal numeric entity
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error;
      }

      ## TODO: check the definition for |a valid Unicode character|.
      if ($code > 1114111 or $code == 0) {
        $code = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      }

      return {type => 'character', data => chr $code};
    } else {
      ## "&#" followed by neither "x"/"X" nor a digit: hand the current
      ## character back and restore "#" as the next input character.
      !!!parse-error;
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    ## Named entity
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## |$value| is the replacement text of the longest entity name
    ## matched so far, followed by any name characters consumed after
    ## that match (or simply the consumed characters if nothing has
    ## matched yet).
    my $value = $entity_name;
    my $match;
    ## True iff the longest match ends exactly at the last consumed
    ## character, i.e. nothing has been read past the entity name.
    my $match_is_last = 0;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x005A) or
            (0x0061 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x007A) or
            (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039))) {
      $entity_name .= chr $self->{next_input_character};
      if (defined $entity_char->{$entity_name}) {
        $value = $entity_char->{$entity_name};
        $match = 1;
        $match_is_last = 1;
      } else {
        $value .= chr $self->{next_input_character};
        $match_is_last = 0;
      }
      !!!next-input-character;
    }

    if ($match) {
      ## The ";" belongs to the reference only if it immediately follows
      ## the matched name.  For input such as |&ampfoo;| the matched
      ## name is |amp|, which is followed by "f": that is a parse error
      ## and the ";" is ordinary text, so it must not be consumed here.
      if ($match_is_last and
          $self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error;
      }

      return {type => 'character', data => $value};
    } else {
      !!!parse-error;
      ## NOTE: No characters are consumed in the spec.
      ## NOTE(review): the consumed characters are handed back as a
      ## character *token*, not as input characters; confirm that this
      ## is also appropriate when called from the |entity in attribute
      ## value| state.
      !!!back-token ({type => 'character', data => $value});
      return undef;
    }
  } else {
    ## no characters are consumed
    !!!parse-error;
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
1529    
## Prepare for tree construction: load the NanoDOM implementation,
## create a fresh document object, store it in |$self->{document}| and
## switch its strict error checking off.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  require What::NanoDOM;
  my $doc = What::NanoDOM::Document->new;
  $self->{document} = $doc;
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->strict_error_checking (0);
} # _initialize_tree_constructor
1538    
## Finish tree construction: switch strict error checking of
## |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};
  ## TODO: Turn mutation events on
  $doc->strict_error_checking (1);
} # _terminate_tree_constructor
1544    
1545     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1546    
1547     sub _construct_tree ($) {
1548     my ($self) = @_;
1549    
1550     ## The points at which an interactive UA makes the
1551     ## $self->{document} available to the user, or begins accepting
1552     ## user input, are not defined.
1553    
1554     ## Append a character: collect it and all subsequent consecutive
1555     ## characters and insert one Text node whose data is concatenation
1556     ## of all those characters. # MUST
1557    
1558     my $token;
1559     !!!next-token;
1560    
1561     my $phase = 'initial'; # MUST
1562    
1563     my $open_elements = [];
1564     my $active_formatting_elements = [];
1565     my $head_element;
1566     my $form_element;
1567     my $insertion_mode = 'before head';
1568    
## Reconstruct the active formatting elements.
##
## Starting from the end of |$active_formatting_elements|, find the
## trailing run of entries that are neither a |#marker| nor present in
## |$open_elements|.  For each entry of that run, in list order, the
## element is cloned with |clone_node (0)|, the clone is appended to the
## current node (the last open element) and pushed onto |$open_elements|,
## and the list entry is replaced by the entry for the clone.
##
## Entries are array references |[$node, $tag_name]|; a marker has the
## string |#marker| in place of the node.
my $reconstruct_active_formatting_elements = sub { # MUST
  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3
  ## NOTE: |$i| is a negative (counted from the end) index throughout
  ## this algorithm; |-1| designates the last entry.
  my $i = -1;
  my $entry = $active_formatting_elements->[$i];

  ## Step 2
  return if $entry->[0] eq '#marker';
  for (@$open_elements) {
    if ($entry->[0] eq $_->[0]) {
      return;
    }
  }

  ## Step 4
  S4: {
    ## No entry before |$entry|: start creating elements from here.
    last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5
    $i--;
    $entry = $active_formatting_elements->[$i];

    ## Step 6
    if ($entry->[0] eq '#marker') {
      #
    } else {
      my $in_open_elements;
      OE: for (@$open_elements) {
        if ($entry->[0] eq $_->[0]) {
          $in_open_elements = 1;
          last OE;
        }
      }
      if ($in_open_elements) {
        #
      } else {
        redo S4;
      }
    }

    ## Step 7
    $i++;
    $entry = $active_formatting_elements->[$i];
  } # S4

  S7: {
    ## Step 8
    my $clone = $entry->[0]->clone_node (0);

    ## Step 9
    $open_elements->[-1]->[0]->append_child ($clone);
    push @$open_elements, [$clone, $entry->[1]];

    ## Step 10
    $active_formatting_elements->[$i] = $open_elements->[-1];

    ## Stop once the last entry has been processed.  Since |$i| is a
    ## negative index, the last entry is |-1|; comparing it with
    ## |$#$active_formatting_elements| would never be true and the loop
    ## would wrap around to index 0 and clone elements a second time.
    unless ($i == -1) {
      ## Step 7'
      $i++;
      $entry = $active_formatting_elements->[$i];

      redo S7;
    }
  } # S7
}; # $reconstruct_active_formatting_elements
1636    
## Clear the list of active formatting elements up to the last marker:
## remove the last |#marker| entry together with every entry after it.
## The list is left untouched if it contains no marker.
my $clear_up_to_marker = sub {
  my $idx = $#$active_formatting_elements;
  $idx-- while $idx >= 0 and
      not $active_formatting_elements->[$idx]->[0] eq '#marker';
  splice @$active_formatting_elements, $idx if $idx >= 0;
  return;
}; # $clear_up_to_marker
1645    
## Reset the insertion mode appropriately.
##
## Walks |$open_elements| from the last entry toward the first one and
## sets |$insertion_mode| according to the first element type that has
## a mode of its own.  An |html| element selects |before head| or
## |after head| depending on whether |$head_element| is defined; if the
## first entry is reached without a decision, the mode becomes
## |in body|.
my $reset_insertion_mode = sub {
  ## Step 4..13: element type -> insertion mode
  my %mode_for = (
    select => 'in select',
    td => 'in cell',
    th => 'in cell',
    tr => 'in row',
    tbody => 'in table body',
    thead => 'in table head',
    tfoot => 'in table foot',
    caption => 'in caption',
    colgroup => 'in column group',
    table => 'in table',
    head => 'in body', # not in head!
    body => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $last = 0;

  ## Step 2
  my $depth = -1;

  ## Step 3
  while (1) {
    my $node = $open_elements->[$depth];
    if ($open_elements->[0]->[0] eq $node->[0]) {
      $last = 1;
    }
    ## TODO: the element whose inner_html is set is neither td nor th, then $node = the element

    ## Step 4..13
    my $new_mode = $mode_for{$node->[1]};
    if (defined $new_mode) {
      $insertion_mode = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      $insertion_mode = defined $head_element ? 'after head' : 'before head';
      return;
    }

    ## Step 15
    if ($last) {
      $insertion_mode = 'in body';
      return;
    }

    ## Step 16, 17: go on with the previous entry
    $depth--;
  }
}; # $reset_insertion_mode
1698    
## Process a |style| start tag: create the element, append it to
## |$head_element| (when in the |in head| mode and a head element
## exists) or else to the current node, then collect all following
## character tokens with the content model flag set to CDATA and store
## their concatenation as the text of the element.  Anything other than
## a |style| end tag after the text is a parse error.
my $style_start_tag = sub {
  my $style_el; !!!create-element ($style_el, 'style');
  ## $insertion_mode eq 'in head' and ... (always true)
  if ($insertion_mode eq 'in head' and defined $head_element) {
    $head_element->append_child ($style_el);
  } else {
    $open_elements->[-1]->[0]->append_child ($style_el);
  }
  $self->{content_model_flag} = 'CDATA';

  my @chunk;
  !!!next-token;
  while ($token->{type} eq 'character') {
    push @chunk, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $text = join '', @chunk;
  $style_el->manakai_append_text ($text) if length $text;

  $self->{content_model_flag} = 'PCDATA';

  unless ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
    !!!parse-error;
    ## ISSUE: And ignore?
  }
  ## (A |style| end tag is simply ignored.)
  !!!next-token;
}; # $style_start_tag
1727    
## Process a |script| start tag: create the element, collect all
## following character tokens with the content model flag set to CDATA
## and store their concatenation as the text of the element.  Anything
## other than a |script| end tag after the text is a parse error.  The
## element is appended, only after its text has been collected, to
## |$head_element| (when in the |in head| mode and a head element
## exists) or else to the current node.
my $script_start_tag = sub {
  my $script_el; !!!create-element ($script_el, 'script');
  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';

  my @chunk;
  !!!next-token;
  while ($token->{type} eq 'character') {
    push @chunk, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $text = join '', @chunk;
  $script_el->manakai_append_text ($text) if length $text;

  $self->{content_model_flag} = 'PCDATA';

  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    !!!parse-error;
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }
  ## (A |script| end tag is simply ignored.)

  ## TODO: inner_html mode then mark as "already executed" and skip
  if (1) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    if ($insertion_mode eq 'in head' and defined $head_element) {
      $head_element->append_child ($script_el);
    } else {
      $open_elements->[-1]->[0]->append_child ($script_el);
    }

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }

  !!!next-token;
}; # $script_start_tag
1770    
my $formatting_end_tag = sub {
  ## Processes an end tag for a formatting element (|a|, |b|, |i|, ...),
  ## re-parenting misnested content as needed.
  ##
  ## Argument: the tag name of the end tag being processed.
  ##
  ## Entries of |$open_elements| and |$active_formatting_elements| are
  ## |[$element, $tag_name]| pairs (or |['#marker', '']| in the latter),
  ## so DOM methods must always be called on |->[0]|.
  ##
  ## Every path that returns has consumed the current token with
  ## |!!!next-token| first.
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last matching entry in the list of active formatting
    ## elements, not looking past the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error;
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error;
      ## Remove the formatting element itself.  It need not be the last
      ## entry of the list, so |pop| would drop the wrong one.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $open_elements->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error;
    }

    ## Step 2
    ## Scanning from the current node upward and overwriting on each hit
    ## leaves the qualifying node closest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      ## Pop everything up to and including the formatting element.
      splice @$open_elements, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $open_elements->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is remembered as "the entry just before it".  When
    ## the formatting element is the first entry there is no such entry;
    ## |undef| then means "at the beginning of the list".  (Index -1
    ## would wrongly select the last entry.)
    my $bookmark_prev_el = $formatting_element_i_in_active > 0
        ? $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0]
        : undef;

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $open_elements->[$node_i_in_open];

      ## Step 2
      ## A node that is not an active formatting element is dropped from
      ## the stack of open elements and the walk restarts one entry up.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @$open_elements, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $open_elements->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 4
      ## Done after the possible cloning above so that the bookmark
      ## refers to the entry that is actually in the list.
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Snapshot the child list first; it changes while children move.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list and insert the clone
    ## at the bookmark.  |$i| is the index of the entry preceding the
    ## bookmark, adjusted when the removal shifts it down.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        if (defined $i) {
          $i--;
          last AFE;
        }
      } elsif (defined $bookmark_prev_el and
               $active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, (defined $i ? $i + 1 : 0), 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack and insert the clone
    ## immediately after the furthest block.  The splice length is 0 so
    ## that the entry following the furthest block is not overwritten.
    undef $i;
    OE: for (reverse 0..$#$open_elements) {
      if ($open_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$open_elements, $_, 1;
        if (defined $i) {
          $i--;
          last OE;
        }
      } elsif ($open_elements->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    splice @$open_elements, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
1953    
## Private helpers of |$in_body|.

## Element types that terminate an "in scope" search of the stack of
## open elements.
my $in_body_scope_boundary = {
  table => 1, caption => 1, td => 1, th => 1,
  button => 1, marquee => 1, object => 1, html => 1,
};

## Element types whose end tag is implied.
my $in_body_implied_end_tag = {
  dd => 1, dt => 1, li => 1, p => 1,
  td => 1, th => 1, tr => 1,
};

## If a |p| element is in scope, pushes the current token back, makes
## |</p>| the current token and returns true; otherwise returns false.
my $in_body_end_p_in_scope = sub {
  for (reverse @$open_elements) {
    if ($_->[1] eq 'p') {
      !!!back-token;
      $token = {type => 'end tag', tag_name => 'p'};
      return 1;
    } elsif ($in_body_scope_boundary->{$_->[1]}) {
      return 0;
    }
  }
  return 0;
};

## "Generate implied end tags": if the current node has an implied end
## tag (and is not of type |$except|, when given), pushes the current
## token back, makes the end tag of the current node the current token
## and returns true; otherwise returns false.
my $in_body_imply_end_tag = sub {
  my $except = shift;
  my $current = $open_elements->[-1]->[1];
  return 0 unless $in_body_implied_end_tag->{$current};
  return 0 if defined $except and $current eq $except;
  !!!back-token;
  $token = {type => 'end tag', tag_name => $current}; # MUST
  return 1;
};

## Walks the stack of open elements from the current node upward.  When
## a node whose type is a key of |$targets| is found, pops the stack up
## to and including it.  The walk stops at a special or scoping element
## other than |address| and |div|.
my $in_body_close_list_item = sub {
  my $targets = shift;

  ## Step 1
  my $i = -1;
  while (-$i <= @$open_elements) { # never run off the top of the stack
    my $node = $open_elements->[$i];

    ## Step 2
    if ($targets->{$node->[1]}) {
      splice @$open_elements, $i;
      return;
    }

    ## Step 3
    if (not $formatting_category->{$node->[1]} and
        #not $phrasing_category->{$node->[1]} and
        ($special_category->{$node->[1]} or
         $scoping_category->{$node->[1]}) and
        $node->[1] ne 'address' and $node->[1] ne 'div') {
      return;
    }

    ## Step 4
    ## Move to the previous (shallower) entry.  Counting upward from -1
    ## would jump straight to the root element.
    $i--;
  }
  return;
};

## Processes the current token when it is a start tag or an end tag.
##
## Argument: |$insert|, a code reference that is called with a newly
## created element to insert it into the tree.
##
## A handled token is either consumed with |!!!next-token| or replaced
## by an implied token to be reprocessed by the caller.  Tokens of any
## other type are left untouched.
my $in_body = sub {
  my $insert = shift;
  if ($token->{type} eq 'start tag') {
    if ($token->{tag_name} eq 'script') {
      $script_start_tag->();
      return;
    } elsif ($token->{tag_name} eq 'style') {
      $style_start_tag->();
      return;
    } elsif ({
              base => 1, link => 1, meta => 1, title => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## NOTE: This is an "as if in head" code clone
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});
      if (defined $head_element) {
        $head_element->append_child ($el);
      } else {
        $insert->($el);
      }

      ## ISSUE: Issue on magical <base> in the spec

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'body') {
      !!!parse-error;

      if (@$open_elements == 1 or
          $open_elements->[1]->[1] ne 'body') {
        ## Ignore the token
      } else {
        ## Copy onto the existing |body| only the attributes it lacks.
        my $body_el = $open_elements->[1]->[0];
        for my $attr_name (keys %{$token->{attributes}}) {
          unless ($body_el->has_attribute_ns (undef, $attr_name)) {
            $body_el->set_attribute_ns
                (undef, [undef, $attr_name],
                 $token->{attributes}->{$attr_name}->{value});
          }
        }
      }
      !!!next-token;
      return;
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, p => 1, ul => 1,
              pre => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      return if $in_body_end_p_in_scope->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      if ($token->{tag_name} eq 'pre') {
        ## Drop a line feed that immediately follows the |pre| start tag.
        !!!next-token;
        if ($token->{type} eq 'character') {
          $token->{data} =~ s/^\x0A//;
          unless (length $token->{data}) {
            !!!next-token;
          }
        }
      } else {
        !!!next-token;
      }
      return;
    } elsif ($token->{tag_name} eq 'form') {
      if (defined $form_element) {
        !!!parse-error;
        ## Ignore the token.  It has to be consumed here; otherwise the
        ## same token would be processed again and again.
        !!!next-token;
        return;
      } else {
        ## has a p element in scope
        return if $in_body_end_p_in_scope->();

        !!!insert-element-t ($token->{tag_name}, $token->{attributes});
        $form_element = $open_elements->[-1]->[0];
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'li') {
      ## has a p element in scope
      return if $in_body_end_p_in_scope->();

      $in_body_close_list_item->({li => 1});

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
      ## has a p element in scope
      return if $in_body_end_p_in_scope->();

      $in_body_close_list_item->({dt => 1, dd => 1});

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'plaintext') {
      ## has a p element in scope
      return if $in_body_end_p_in_scope->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'PLAINTEXT';

      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      return if $in_body_end_p_in_scope->();

      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          $i = $_;
          last INSCOPE;
        } elsif ($in_body_scope_boundary->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## A heading already in scope is closed, with everything inside it.
      if (defined $i) {
        !!!parse-error;
        splice @$open_elements, $i;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'a') {
      AFE: for my $i (reverse 0..$#$active_formatting_elements) {
        my $node = $active_formatting_elements->[$i];
        if ($node->[1] eq 'a') {
          !!!parse-error;

          ## Act as if |</a>| had been seen.
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'a'};
          $formatting_end_tag->($token->{tag_name});

          ## Remove that element from both lists if the end tag
          ## processing has not already done so.  Only the single entry
          ## is removed, and it is looked up afresh since the indices
          ## may have changed.
          AFE2: for (reverse 0..$#$active_formatting_elements) {
            if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
              splice @$active_formatting_elements, $_, 1;
              last AFE2;
            }
          } # AFE2
          OE: for (reverse 0..$#$open_elements) {
            if ($open_elements->[$_]->[0] eq $node->[0]) {
              splice @$open_elements, $_, 1;
              last OE;
            }
          } # OE
          last AFE;
        } elsif ($node->[0] eq '#marker') {
          last AFE;
        }
      } # AFE

      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $open_elements->[-1];

      !!!next-token;
      return;
    } elsif ({
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $open_elements->[-1];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'button') {
      ## has a button element in scope
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq 'button') {
          !!!parse-error;
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'button'};
          return;
        } elsif ($in_body_scope_boundary->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'marquee' or
             $token->{tag_name} eq 'object') {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'xmp') {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'CDATA';

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'table') {
      ## has a p element in scope
      return if $in_body_end_p_in_scope->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $insertion_mode = 'in table';

      !!!next-token;
      return;
    } elsif ({
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
              image => 1,
             }->{$token->{tag_name}}) {
      if ($token->{tag_name} eq 'image') {
        !!!parse-error;
        $token->{tag_name} = 'img';
      }

      $reconstruct_active_formatting_elements->();

      ## The element is inserted and immediately popped again.
      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'hr') {
      ## has a p element in scope
      return if $in_body_end_p_in_scope->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'input') {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      ## TODO: associate with $form_element if defined
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'isindex') {
      !!!parse-error;

      if (defined $form_element) {
        ## Ignore the token
        !!!next-token;
        return;
      } else {
        ## Replace the token by an equivalent sequence of tokens: the
        ## first becomes the current token, the rest are pushed back.
        my $at = $token->{attributes};
        $at->{name} = {name => 'name', value => 'isindex'};
        my @tokens = (
          {type => 'start tag', tag_name => 'form'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'start tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'label'},
          {type => 'character',
           data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
          ## TODO: make this configurable
          {type => 'start tag', tag_name => 'input', attributes => $at},
          #{type => 'character', data => ''}, # SHOULD
          {type => 'end tag', tag_name => 'label'},
          {type => 'end tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'end tag', tag_name => 'form'},
        );
        $token = shift @tokens;
        !!!back-token (@tokens);
        return;
      }
    } elsif ({
              textarea => 1,
              noembed => 1,
              noframes => 1,
              noscript => 0, ## TODO: 1 if scripting is enabled
             }->{$token->{tag_name}}) {
      my $tag_name = $token->{tag_name};
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});

      if ($token->{tag_name} eq 'textarea') {
        ## TODO: form_element if defined
        $self->{content_model_flag} = 'RCDATA';
      } else {
        $self->{content_model_flag} = 'CDATA';
      }

      $insert->($el);

      ## Collect the following character tokens as the element's text.
      my $text = '';
      !!!next-token;
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $tag_name) {
        ## Ignore the token
      } else {
        !!!parse-error;
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'select') {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $insertion_mode = 'in select';
      !!!next-token;
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: An issue on HTML5 new elements in the spec.
    } else {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    }
  } elsif ($token->{type} eq 'end tag') {
    if ($token->{tag_name} eq 'body') {
      if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($open_elements->[-1]->[1] ne 'body') {
          !!!parse-error;
        }
        $insertion_mode = 'after body';
        !!!next-token;
        return;
      } else {
        !!!parse-error;
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'html') {
      if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($open_elements->[-1]->[1] ne 'body') {
          !!!parse-error;
        }
        $insertion_mode = 'after body';
        ## reprocess
        return;
      } else {
        !!!parse-error;
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, pre => 1, ul => 1,
              form => 1,
              p => 1,
              dd => 1, dt => 1, li => 1,
              button => 1, marquee => 1, object => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq $token->{tag_name}) {
          ## generate implied end tags, except for the element type
          ## that is being closed
          return if $in_body_imply_end_tag->($token->{tag_name});
          $i = $_;
          last INSCOPE unless $token->{tag_name} eq 'p';
        } elsif ($in_body_scope_boundary->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error;
      }

      splice @$open_elements, $i if defined $i;
      undef $form_element if $token->{tag_name} eq 'form';
      $clear_up_to_marker->()
          if {
            button => 1, marquee => 1, object => 1,
          }->{$token->{tag_name}};
      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          ## generate implied end tags
          return if $in_body_imply_end_tag->();
          $i = $_;
          last INSCOPE;
        } elsif ($in_body_scope_boundary->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error;
      }

      splice @$open_elements, $i if defined $i;
      !!!next-token;
      return;
    } elsif ({
              a => 1,
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $formatting_end_tag->($token->{tag_name});
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, hr => 1, iframe => 1, image => 1,
              img => 1, input => 1, isindex=> 1, noembed => 1,
              noframes => 1, param => 1, select => 1, spacer => 1,
              table => 1, textarea => 1, wbr => 1,
              noscript => 0, ## TODO: if scripting is enabled
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: Issue on HTML5 new elements in spec

    } else {
      ## Step 1
      my $node_i = -1;
      my $node = $open_elements->[$node_i];

      ## Step 2
      S2: {
        if ($node->[1] eq $token->{tag_name}) {
          ## Step 1
          ## generate implied end tags
          return if $in_body_imply_end_tag->();

          ## Step 2
          if ($token->{tag_name} ne $open_elements->[-1]->[1]) {
            !!!parse-error;
          }

          ## Step 3
          splice @$open_elements, $node_i;

          ## The end tag has now been handled and must be consumed;
          ## otherwise the same token would be processed again.
          !!!next-token;
          last S2;
        } else {
          ## Step 3
          if (not $formatting_category->{$node->[1]} and
              #not $phrasing_category->{$node->[1]} and
              ($special_category->{$node->[1]} or
               $scoping_category->{$node->[1]})) {
            !!!parse-error;
            ## Ignore the token
            !!!next-token;
            last S2;
          }
        }

        ## Step 4
        $node_i--;
        $node = $open_elements->[$node_i];

        ## Step 5;
        redo S2;
      } # S2
    }
  }
}; # $in_body
2649    
2650     B: {
2651     if ($phase eq 'initial') {
2652     if ($token->{type} eq 'DOCTYPE') {
2653     if ($token->{error}) {
2654     ## ISSUE: Spec currently left this case undefined.
2655     }
2656     my $doctype = $self->{document}->create_document_type_definition
2657     ($token->{name});
2658     $self->{document}->append_child ($doctype);
2659     $phase = 'root element';
2660     !!!next-token;
2661     redo B;
2662     } elsif ({
2663     comment => 1,
2664     'start tag' => 1,
2665     'end tag' => 1,
2666     'end-of-file' => 1,
2667     }->{$token->{type}}) {
2668     ## ISSUE: Spec currently left this case undefined.
2669     $phase = 'root element';
2670     ## reprocess
2671     redo B;
2672     } elsif ($token->{type} eq 'character') {
2673     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2674     $self->{document}->manakai_append_text ($1);
2675     ## ISSUE: DOM3 Core does not allow Document > Text
2676     unless (length $token->{data}) {
2677     ## Stay in the phase
2678     !!!next-token;
2679     redo B;
2680     }
2681     }
2682     ## ISSUE: Spec currently left this case undefined.
2683     $phase = 'root element';
2684     ## reprocess
2685     redo B;
2686     } else {
2687     die "$0: $token->{type}: Unknown token";
2688     }
2689     } elsif ($phase eq 'root element') {
2690     if ($token->{type} eq 'DOCTYPE') {
2691     !!!parse-error;
2692     ## Ignore the token
2693     ## Stay in the phase
2694     !!!next-token;
2695     redo B;
2696     } elsif ($token->{type} eq 'comment') {
2697     my $comment = $self->{document}->create_comment ($token->{data});
2698     $self->{document}->append_child ($comment);
2699     ## Stay in the phase
2700     !!!next-token;
2701     redo B;
2702     } elsif ($token->{type} eq 'character') {
2703     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2704     $self->{document}->manakai_append_text ($1);
2705     ## ISSUE: DOM3 Core does not allow Document > Text
2706     unless (length $token->{data}) {
2707     ## Stay in the phase
2708     !!!next-token;
2709     redo B;
2710     }
2711     }
2712     #
2713     } elsif ({
2714     'start tag' => 1,
2715     'end tag' => 1,
2716     'end-of-file' => 1,
2717     }->{$token->{type}}) {
2718     ## ISSUE: There is an issue in the spec
2719     #
2720     } else {
2721     die "$0: $token->{type}: Unknown token";
2722     }
2723     my $root_element; !!!create-element ($root_element, 'html');
2724     $self->{document}->append_child ($root_element);
2725     $open_elements = [[$root_element, 'html']];
2726     $phase = 'main';
2727     ## reprocess
2728     redo B;
2729     } elsif ($phase eq 'main') {
2730     if ($token->{type} eq 'DOCTYPE') {
2731     !!!parse-error;
2732     ## Ignore the token
2733     ## Stay in the phase
2734     !!!next-token;
2735     redo B;
2736     } elsif ($token->{type} eq 'start tag' and
2737     $token->{tag_name} eq 'html') {
2738     ## TODO: unless it is the first start tag token, parse-error
2739     my $top_el = $open_elements->[0]->[0]; ## element of the first stack entry
2740     for my $attr_name (keys %{$token->{attributes}}) {
2741     unless ($top_el->has_attribute_ns (undef, $attr_name)) { ## add only attributes not already set
2742     $top_el->set_attribute_ns (undef, [undef, $attr_name],
2743     $token->{attributes}->{$attr_name}->{value}); ## value of this attribute, not of a "value" key
2744     }
2745     }
2746     !!!next-token;
2747     redo B;
2748     } elsif ($token->{type} eq 'end-of-file') {
2749     ## Generate implied end tags
2750     if ({
2751     dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2752     }->{$open_elements->[-1]->[1]}) {
2753     !!!back-token;
2754     $token = {type => 'end tag', tag_name => $open_elements->[-1]->[1]};
2755     redo B;
2756     }
2757    
2758     if (@$open_elements > 2 or
2759     (@$open_elements == 2 and $open_elements->[1]->[1] ne 'body')) {
2760     !!!parse-error;
2761     } else {
2762     ## TODO: inner_html parser and @$open_elements > 1 and $open_elements->[1] ne 'body', then parse-error
2763     }
2764    
2765     ## Stop parsing
2766     last B;
2767    
2768     ## ISSUE: There is an issue in the spec.
2769     } else {
2770     if ($insertion_mode eq 'before head') {
2771     if ($token->{type} eq 'character') {
2772     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2773     $open_elements->[-1]->[0]->manakai_append_text ($1);
2774     unless (length $token->{data}) {
2775     !!!next-token;
2776     redo B;
2777     }
2778     }
2779     ## As if <head>
2780     !!!create-element ($head_element, 'head');
2781     $open_elements->[-1]->[0]->append_child ($head_element);
2782     push @$open_elements, [$head_element, 'head'];
2783     $insertion_mode = 'in head';
2784     ## reprocess
2785     redo B;
2786     } elsif ($token->{type} eq 'comment') {
2787     my $comment = $self->{document}->create_comment ($token->{data});
2788     $open_elements->[-1]->[0]->append_child ($comment);
2789     !!!next-token;
2790     redo B;
2791     } elsif ($token->{type} eq 'start tag') {
2792     my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
2793     !!!create-element ($head_element, 'head', $attr);
2794     $open_elements->[-1]->[0]->append_child ($head_element);
2795     push @$open_elements, [$head_element, 'head'];
2796     $insertion_mode = 'in head';
2797     if ($token->{tag_name} eq 'head') {
2798     !!!next-token;
2799     #} elsif ({
2800     # base => 1, link => 1, meta => 1,
2801     # script => 1, style => 1, title => 1,
2802     # }->{$token->{tag_name}}) {
2803     # ## reprocess
2804     } else {
2805     ## reprocess
2806     }
2807     redo B;
2808     } elsif ($token->{type} eq 'end tag') {
2809     if ($token->{tag_name} eq 'html') {
2810     ## As if <head>
2811     !!!create-element ($head_element, 'head');
2812     $open_elements->[-1]->[0]->append_child ($head_element);
2813     push @$open_elements, [$head_element, 'head'];
2814     $insertion_mode = 'in head';
2815     ## reprocess
2816     redo B;
2817     } else {
!!!parse-error;
## Ignore the token.  The next token has to be fetched before
## redoing the loop; otherwise the same end tag is dispatched to
## this very branch again and the parser never terminates.
!!!next-token;
redo B;
2821     }
2822     } else {
2823     die "$0: $token->{type}: Unknown type";
2824     }
2825     } elsif ($insertion_mode eq 'in head') {
2826     if ($token->{type} eq 'character') {
2827     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2828     $open_elements->[-1]->[0]->manakai_append_text ($1);
2829     unless (length $token->{data}) {
2830     !!!next-token;
2831     redo B;
2832     }
2833     }
2834    
2835     #
2836     } elsif ($token->{type} eq 'comment') {
2837     my $comment = $self->{document}->create_comment ($token->{data});
2838     $open_elements->[-1]->[0]->append_child ($comment);
2839     !!!next-token;
2840     redo B;
2841     } elsif ($token->{type} eq 'start tag') {
2842     if ($token->{tag_name} eq 'title') {
2843     my $title_el; !!!create-element ($title_el, 'title');
2844     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
2845     ->append_child ($title_el);
2846     $self->{content_model_flag} = 'RCDATA';
2847    
2848     my $text = '';
2849     !!!next-token;
2850     while ($token->{type} eq 'character') {
2851     $text .= $token->{data};
2852     !!!next-token;
2853     }
2854     if (length $text) {
2855     $title_el->manakai_append_text ($text);
2856     }
2857    
2858     $self->{content_model_flag} = 'PCDATA';
2859    
2860     if ($token->{type} eq 'end tag' and
2861     $token->{tag_name} eq 'title') {
2862     ## Ignore the token
2863     } else {
2864     !!!parse-error;
2865     ## ISSUE: And ignore?
2866     }
2867     !!!next-token;
2868     redo B;
2869     } elsif ($token->{tag_name} eq 'style') {
2870     $style_start_tag->();
2871     redo B;
2872     } elsif ($token->{tag_name} eq 'script') {
2873     $script_start_tag->();
2874     redo B;
2875     } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
2876     ## NOTE: There are "as if in head" code clones
2877     my $el;
2878     !!!create-element ($el, $token->{tag_name}, $token->{attributes});
2879     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
2880     ->append_child ($el);
2881    
2882     ## ISSUE: Issue on magical <base> in the spec
2883    
2884     !!!next-token;
2885     redo B;
2886     } elsif ($token->{tag_name} eq 'head') {
2887     !!!parse-error;
2888     ## Ignore the token
2889     !!!next-token;
2890     redo B;
2891     } else {
2892     #
2893     }
2894     } elsif ($token->{type} eq 'end tag') {
2895     if ($token->{tag_name} eq 'head') {
2896     if ($open_elements->[-1]->[1] eq 'head') {
2897     pop @$open_elements;
2898     } else {
2899     !!!parse-error;
2900     }
2901     $insertion_mode = 'after head';
2902     !!!next-token;
2903     redo B;
2904     } elsif ($token->{tag_name} eq 'html') {
2905     #
2906     } else {
2907     !!!parse-error;
2908     ## Ignore the token
2909     !!!next-token;
2910     redo B;
2911     }
2912     } else {
2913     #
2914     }
2915    
2916     if ($open_elements->[-1]->[1] eq 'head') {
2917     ## As if </head>
2918     pop @$open_elements;
2919     }
2920     $insertion_mode = 'after head';
2921     ## reprocess
2922     redo B;
2923    
2924     ## ISSUE: An issue in the spec.
2925     } elsif ($insertion_mode eq 'after head') {
2926     if ($token->{type} eq 'character') {
2927     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2928     $open_elements->[-1]->[0]->manakai_append_text ($1);
2929     unless (length $token->{data}) {
2930     !!!next-token;
2931     redo B;
2932     }
2933     }
2934    
2935     #
2936     } elsif ($token->{type} eq 'comment') {
2937     my $comment = $self->{document}->create_comment ($token->{data});
2938     $open_elements->[-1]->[0]->append_child ($comment);
2939     !!!next-token;
2940     redo B;
2941     } elsif ($token->{type} eq 'start tag') {
2942     if ($token->{tag_name} eq 'body') {
2943     !!!insert-element ('body', $token->{attributes});
2944     $insertion_mode = 'in body';
2945     !!!next-token;
2946     redo B;
2947     } elsif ($token->{tag_name} eq 'frameset') {
2948     !!!insert-element ('frameset', $token->{attributes});
2949     $insertion_mode = 'in frameset';
2950     !!!next-token;
2951     redo B;
2952     } elsif ({
2953     base => 1, link => 1, meta => 1,
2954     script=> 1, style => 1, title => 1,
2955     }->{$token->{tag_name}}) {
2956     !!!parse-error;
2957     $insertion_mode = 'in head';
2958     ## reprocess
2959     redo B;
2960     } else {
2961     #
2962     }
2963     } else {
2964     #
2965     }
2966    
2967     ## As if <body>
2968     !!!insert-element ('body');
2969     $insertion_mode = 'in body';
2970     ## reprocess
2971     redo B;
2972     } elsif ($insertion_mode eq 'in body') {
2973     if ($token->{type} eq 'character') {
2974     ## NOTE: There is a code clone of "character in body".
2975     $reconstruct_active_formatting_elements->();
2976    
2977     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
2978    
2979     !!!next-token;
2980     redo B;
2981     } elsif ($token->{type} eq 'comment') {
2982     ## NOTE: There is a code clone of "comment in body".
2983     my $comment = $self->{document}->create_comment ($token->{data});
2984     $open_elements->[-1]->[0]->append_child ($comment);
2985     !!!next-token;
2986     redo B;
2987     } else {
2988     $in_body->(sub {
2989     $open_elements->[-1]->[0]->append_child (shift);
2990     });
2991     redo B;
2992     }
2993     } elsif ($insertion_mode eq 'in table') {
2994     if ($token->{type} eq 'character') {
2995     $reconstruct_active_formatting_elements->();
2996    
2997     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
2998    
2999     !!!next-token;
3000     redo B;
3001     } elsif ($token->{type} eq 'comment') {
3002     my $comment = $self->{document}->create_comment ($token->{data});
3003     $open_elements->[-1]->[0]->append_child ($comment);
3004     !!!next-token;
3005     redo B;
3006     } elsif ($token->{type} eq 'start tag') {
3007     if ({
3008     caption => 1,
3009     colgroup => 1,
3010     tbody => 1, tfoot => 1, thead => 1,
3011     }->{$token->{tag_name}}) {
3012     ## Clear back to table context
3013     while ($open_elements->[-1]->[1] ne 'table' and
3014     $open_elements->[-1]->[1] ne 'html') {
3015     !!!parse-error;
3016     pop @$open_elements;
3017     }
3018    
3019     push @$active_formatting_elements, ['#marker', '']
3020     if $token->{tag_name} eq 'caption';
3021    
3022     !!!insert-element ($token->{tag_name}, $token->{attributes});
3023     $insertion_mode = {
3024     caption => 'in caption',
3025     colgroup => 'in column group',
3026     tbody => 'in table body',
3027     tfoot => 'in table body',
3028     thead => 'in table body',
3029     }->{$token->{tag_name}};
3030     !!!next-token;
3031     redo B;
3032     } elsif ({
3033     col => 1,
3034     td => 1, th => 1, tr => 1,
3035     }->{$token->{tag_name}}) {
3036     ## Clear back to table context
3037     while ($open_elements->[-1]->[1] ne 'table' and
3038     $open_elements->[-1]->[1] ne 'html') {
3039     !!!parse-error;
3040     pop @$open_elements;
3041     }
3042    
3043     !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
3044     $insertion_mode = $token->{tag_name} eq 'col'
3045     ? 'in column group' : 'in table body';
3046     ## reprocess
3047     redo B;
3048     } elsif ($token->{tag_name} eq 'table') {
3049     ## NOTE: There are code clones for this "table in table"
3050     !!!parse-error;
3051    
3052     ## As if </table>
3053     ## have a table element in table scope
3054     my $i;
3055     INSCOPE: for (reverse 0..$#$open_elements) {
3056     my $node = $open_elements->[$_];
3057     if ($node->[1] eq 'table') {
3058     $i = $_;
3059     last INSCOPE;
3060     } elsif ({
3061     table => 1, html => 1,
3062     }->{$node->[1]}) {
3063     last INSCOPE;
3064     }
3065     } # INSCOPE
3066     unless (defined $i) {
3067     !!!parse-error;
3068     ## Ignore tokens </table><table>
3069     !!!next-token;
3070     redo B;
3071     }
3072    
3073     ## generate implied end tags
3074     if ({
3075     dd => 1, dt => 1, li => 1, p => 1,
3076     td => 1, th => 1, tr => 1,
3077     }->{$open_elements->[-1]->[1]}) {
3078     !!!back-token; # <table>
3079     $token = {type => 'end tag', tag_name => 'table'};
3080     !!!back-token;
3081     $token = {type => 'end tag',
3082     tag_name => $open_elements->[-1]->[1]}; # MUST
3083     redo B;
3084     }
3085    
3086     if ($open_elements->[-1]->[1] ne 'table') {
3087     !!!parse-error;
3088     }
3089    
3090     splice @$open_elements, $i;
3091    
3092     $reset_insertion_mode->();
3093    
3094     ## reprocess
3095     redo B;
3096     } else {
3097     #
3098     }
3099     } elsif ($token->{type} eq 'end tag') {
3100     if ($token->{tag_name} eq 'table') {
3101     ## have a table element in table scope
3102     my $i;
3103     INSCOPE: for (reverse 0..$#$open_elements) {
3104     my $node = $open_elements->[$_];
3105     if ($node->[1] eq $token->{tag_name}) {
3106     $i = $_;
3107     last INSCOPE;
3108     } elsif ({
3109     table => 1, html => 1,
3110     }->{$node->[1]}) {
3111     last INSCOPE;
3112     }
3113     } # INSCOPE
3114     unless (defined $i) {
3115     !!!parse-error;
3116     ## Ignore the token
3117     !!!next-token;
3118     redo B;
3119     }
3120    
3121     ## generate implied end tags
3122     if ({
3123     dd => 1, dt => 1, li => 1, p => 1,
3124     td => 1, th => 1, tr => 1,
3125     }->{$open_elements->[-1]->[1]}) {
3126     !!!back-token;
3127     $token = {type => 'end tag',
3128     tag_name => $open_elements->[-1]->[1]}; # MUST
3129     redo B;
3130     }
3131    
3132     if ($open_elements->[-1]->[1] ne 'table') {
3133     !!!parse-error;
3134     }
3135    
3136     splice @$open_elements, $i;
3137    
3138     $reset_insertion_mode->();
3139    
3140     !!!next-token;
3141     redo B;
3142     } elsif ({
3143     body => 1, caption => 1, col => 1, colgroup => 1,
3144     html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
3145     thead => 1, tr => 1,
3146     }->{$token->{tag_name}}) {
3147     !!!parse-error;
3148     ## Ignore the token
3149     !!!next-token;
3150     redo B;
3151     } else {
3152     #
3153     }
3154     } else {
3155     #
3156     }
3157    
3158     ## NOTE: There are code clones of "misc in table".
3159     !!!parse-error;
3160     $in_body->(sub {
3161     my $child = shift;
3162     if ({
3163     table => 1, tbody => 1, tfoot => 1,
3164     thead => 1, tr => 1,
3165     }->{$open_elements->[-1]->[1]}) {
3166     # MUST
3167     my $foster_parent_element;
3168     my $next_sibling;
3169     OE: for (reverse 0..$#$open_elements) {
3170     if ($open_elements->[$_]->[1] eq 'table') {
3171     my $parent = $open_elements->[$_]->[0]->parent_node;
3172     if (defined $parent and $parent->node_type == 1) {
3173     $foster_parent_element = $parent;
3174     $next_sibling = $open_elements->[$_]->[0];
3175     } else {
3176     $foster_parent_element
3177     = $open_elements->[$_ - 1]->[0];
3178     }
3179     last OE;
3180     }
3181     } # OE
3182     $foster_parent_element = $open_elements->[0]->[0]
3183     unless defined $foster_parent_element;
3184     $foster_parent_element->insert_before
3185     ($child, $next_sibling);
3186     } else {
3187     $open_elements->[-1]->[0]->append_child ($child);
3188     }
3189     });
3190     redo B;
3191     } elsif ($insertion_mode eq 'in caption') {
3192     if ($token->{type} eq 'start tag') {
3193     if ({
3194     caption => 1, col => 1, colgroup => 1, tbody => 1,
3195     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3196     }->{$token->{tag_name}}) {
3197     !!!parse-error;
3198    
3199     ## As if </caption>
3200     ## have a table element in table scope
3201     my $i;
3202     INSCOPE: for (reverse 0..$#$open_elements) {
3203     my $node = $open_elements->[$_];
3204     if ($node->[1] eq 'caption') {
3205     $i = $_;
3206     last INSCOPE;
3207     } elsif ({
3208     table => 1, html => 1,
3209     }->{$node->[1]}) {
3210     last INSCOPE;
3211     }
3212     } # INSCOPE
3213     unless (defined $i) {
3214     !!!parse-error;
3215     ## Ignore the token
3216     !!!next-token;
3217     redo B;
3218     }
3219    
3220     ## generate implied end tags
3221     if ({
3222     dd => 1, dt => 1, li => 1, p => 1,
3223     td => 1, th => 1, tr => 1,
3224     }->{$open_elements->[-1]->[1]}) {
3225     !!!back-token; # <?>
3226     $token = {type => 'end tag', tag_name => 'caption'};
3227     !!!back-token;
3228     $token = {type => 'end tag',
3229     tag_name => $open_elements->[-1]->[1]}; # MUST
3230     redo B;
3231     }
3232    
3233     if ($open_elements->[-1]->[1] ne 'caption') {
3234     !!!parse-error;
3235     }
3236    
3237     splice @$open_elements, $i;
3238    
3239     $clear_up_to_marker->();
3240    
3241     $insertion_mode = 'in table';
3242    
3243     ## reprocess
3244     redo B;
3245     } else {
3246     #
3247     }
3248     } elsif ($token->{type} eq 'end tag') {
3249     if ($token->{tag_name} eq 'caption') {
3250     ## have a table element in table scope
3251     my $i;
3252     INSCOPE: for (reverse 0..$#$open_elements) {
3253     my $node = $open_elements->[$_];
3254     if ($node->[1] eq $token->{tag_name}) {
3255     $i = $_;
3256     last INSCOPE;
3257     } elsif ({
3258     table => 1, html => 1,
3259     }->{$node->[1]}) {
3260     last INSCOPE;
3261     }
3262     } # INSCOPE
3263     unless (defined $i) {
3264     !!!parse-error;
3265     ## Ignore the token
3266     !!!next-token;
3267     redo B;
3268     }
3269    
3270     ## generate implied end tags
3271     if ({
3272     dd => 1, dt => 1, li => 1, p => 1,
3273     td => 1, th => 1, tr => 1,
3274     }->{$open_elements->[-1]->[1]}) {
3275     !!!back-token;
3276     $token = {type => 'end tag',
3277     tag_name => $open_elements->[-1]->[1]}; # MUST
3278     redo B;
3279     }
3280    
3281     if ($open_elements->[-1]->[1] ne 'caption') {
3282     !!!parse-error;
3283     }
3284    
3285     splice @$open_elements, $i;
3286    
3287     $clear_up_to_marker->();
3288    
3289     $insertion_mode = 'in table';
3290    
3291     !!!next-token;
3292     redo B;
3293     } elsif ($token->{tag_name} eq 'table') {
3294     !!!parse-error;
3295    
3296     ## As if </caption>
3297     ## have a table element in table scope
3298     my $i;
3299     INSCOPE: for (reverse 0..$#$open_elements) {
3300     my $node = $open_elements->[$_];
3301     if ($node->[1] eq 'caption') {
3302     $i = $_;
3303     last INSCOPE;
3304     } elsif ({
3305     table => 1, html => 1,
3306     }->{$node->[1]}) {
3307     last INSCOPE;
3308     }
3309     } # INSCOPE
3310     unless (defined $i) {
3311     !!!parse-error;
3312     ## Ignore the token
3313     !!!next-token;
3314     redo B;
3315     }
3316    
3317     ## generate implied end tags
3318     if ({
3319     dd => 1, dt => 1, li => 1, p => 1,
3320     td => 1, th => 1, tr => 1,
3321     }->{$open_elements->[-1]->[1]}) {
3322     !!!back-token; # </table>
3323     $token = {type => 'end tag', tag_name => 'caption'};
3324     !!!back-token;
3325     $token = {type => 'end tag',
3326     tag_name => $open_elements->[-1]->[1]}; # MUST
3327     redo B;
3328     }
3329    
3330     if ($open_elements->[-1]->[1] ne 'caption') {
3331     !!!parse-error;
3332     }
3333    
3334     splice @$open_elements, $i;
3335    
3336     $clear_up_to_marker->();
3337    
3338     $insertion_mode = 'in table';
3339    
3340     ## reprocess
3341     redo B;
3342     } elsif ({
3343     body => 1, col => 1, colgroup => 1,
3344     html => 1, tbody => 1, td => 1, tfoot => 1,
3345     th => 1, thead => 1, tr => 1,
3346     }->{$token->{tag_name}}) {
!!!parse-error;
## Ignore the token.  The next token has to be fetched before
## redoing the loop; otherwise the same end tag is dispatched to
## this very branch again and the parser never terminates.
!!!next-token;
redo B;
3350     } else {
3351     #
3352     }
3353     } else {
3354     #
3355     }
3356    
3357     $in_body->(sub {
3358     $open_elements->[-1]->[0]->append_child (shift);
3359     });
3360     redo B;
3361     } elsif ($insertion_mode eq 'in column group') {
3362     if ($token->{type} eq 'character') {
3363     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3364     $open_elements->[-1]->[0]->manakai_append_text ($1);
3365     unless (length $token->{data}) {
3366     !!!next-token;
3367     redo B;
3368     }
3369     }
3370    
3371     #
3372     } elsif ($token->{type} eq 'comment') {
3373     my $comment = $self->{document}->create_comment ($token->{data});
3374     $open_elements->[-1]->[0]->append_child ($comment);
3375     !!!next-token;
3376     redo B;
3377     } elsif ($token->{type} eq 'start tag') {
3378     if ($token->{tag_name} eq 'col') {
3379     !!!insert-element ($token->{tag_name}, $token->{attributes});
3380     pop @$open_elements;
3381     !!!next-token;
3382     redo B;
3383     } else {
3384     #
3385     }
3386     } elsif ($token->{type} eq 'end tag') {
3387     if ($token->{tag_name} eq 'colgroup') {
3388     if ($open_elements->[-1]->[1] eq 'html') {
3389     !!!parse-error;
3390     ## Ignore the token
3391     !!!next-token;
3392     redo B;
3393     } else {
3394     pop @$open_elements; # colgroup
3395     $insertion_mode = 'in table';
3396     !!!next-token;
3397     redo B;
3398     }
3399     } elsif ($token->{tag_name} eq 'col') {
3400     !!!parse-error;
3401     ## Ignore the token
3402     !!!next-token;
3403     redo B;
3404     } else {
3405     #
3406     }
3407     } else {
3408     #
3409     }
3410    
3411     ## As if </colgroup>
3412     if ($open_elements->[-1]->[1] eq 'html') {
3413     !!!parse-error;
3414     ## Ignore the token
3415     !!!next-token;
3416     redo B;
3417     } else {
3418     pop @$open_elements; # colgroup
3419     $insertion_mode = 'in table';
3420     ## reprocess
3421     redo B;
3422     }
3423     } elsif ($insertion_mode eq 'in table body') {
3424     if ($token->{type} eq 'character') {
3425     ## Copied from 'in table'
3426     $reconstruct_active_formatting_elements->();
3427    
3428     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3429    
3430     !!!next-token;
3431     redo B;
3432     } elsif ($token->{type} eq 'comment') {
3433     ## Copied from 'in table'
3434     my $comment = $self->{document}->create_comment ($token->{data});
3435     $open_elements->[-1]->[0]->append_child ($comment);
3436     !!!next-token;
3437     redo B;
3438     } elsif ($token->{type} eq 'start tag') {
3439     if ({
3440     tr => 1,
3441     th => 1, td => 1,
3442     }->{$token->{tag_name}}) {
3443     ## Clear back to table body context
3444     while (not {
3445     tbody => 1, tfoot => 1, thead => 1, html => 1,
3446     }->{$open_elements->[-1]->[1]}) {
3447     !!!parse-error;
3448     pop @$open_elements;
3449     }
3450    
3451     $insertion_mode = 'in row';
3452     if ($token->{tag_name} eq 'tr') {
3453     !!!insert-element ($token->{tag_name}, $token->{attributes});
3454     !!!next-token;
3455     } else {
3456     !!!insert-element ('tr');
3457     ## reprocess
3458     }
3459     redo B;
3460     } elsif ({
3461     caption => 1, col => 1, colgroup => 1,
3462     tbody => 1, tfoot => 1, thead => 1,
3463     }->{$token->{tag_name}}) {
3464     ## have an element in table scope
3465     my $i;
3466     INSCOPE: for (reverse 0..$#$open_elements) {
3467     my $node = $open_elements->[$_];
3468     if ({
3469     tbody => 1, thead => 1, tfoot => 1,
3470     }->{$node->[1]}) {
3471     $i = $_;
3472     last INSCOPE;
3473     } elsif ({
3474     table => 1, html => 1,
3475     }->{$node->[1]}) {
3476     last INSCOPE;
3477     }
3478     } # INSCOPE
3479     unless (defined $i) {
3480     !!!parse-error;
3481     ## Ignore the token
3482     !!!next-token;
3483     redo B;
3484     }
3485    
3486     ## Clear back to table body context
3487     while (not {
3488     tbody => 1, tfoot => 1, thead => 1, html => 1,
3489     }->{$open_elements->[-1]->[1]}) {
3490     !!!parse-error;
3491     pop @$open_elements;
3492     }
3493    
3494     ## As if <{current node}>
3495     ## have an element in table scope
3496     ## true by definition
3497    
3498     ## Clear back to table body context
3499     ## nop by definition
3500    
3501     pop @$open_elements;
3502     $insertion_mode = 'in table';
3503     ## reprocess
3504     redo B;
3505     } elsif ($token->{tag_name} eq 'table') {
3506     ## NOTE: This is a code clone of "table in table"
3507     !!!parse-error;
3508    
3509     ## As if </table>
3510     ## have a table element in table scope
3511     my $i;
3512     INSCOPE: for (reverse 0..$#$open_elements) {
3513     my $node = $open_elements->[$_];
3514     if ($node->[1] eq 'table') {
3515     $i = $_;
3516     last INSCOPE;
3517     } elsif ({
3518     table => 1, html => 1,
3519     }->{$node->[1]}) {
3520     last INSCOPE;
3521     }
3522     } # INSCOPE
3523     unless (defined $i) {
3524     !!!parse-error;
3525     ## Ignore tokens </table><table>
3526     !!!next-token;
3527     redo B;
3528     }
3529    
3530     ## generate implied end tags
3531     if ({
3532     dd => 1, dt => 1, li => 1, p => 1,
3533     td => 1, th => 1, tr => 1,
3534     }->{$open_elements->[-1]->[1]}) {
3535     !!!back-token; # <table>
3536     $token = {type => 'end tag', tag_name => 'table'};
3537     !!!back-token;
3538     $token = {type => 'end tag',
3539     tag_name => $open_elements->[-1]->[1]}; # MUST
3540     redo B;
3541     }
3542    
3543     if ($open_elements->[-1]->[1] ne 'table') {
3544     !!!parse-error;
3545     }
3546    
3547     splice @$open_elements, $i;
3548    
3549     $reset_insertion_mode->();
3550    
3551     ## reprocess
3552     redo B;
3553     } else {
3554     #
3555     }
3556     } elsif ($token->{type} eq 'end tag') {
3557     if ({
3558     tbody => 1, tfoot => 1, thead => 1,
3559     }->{$token->{tag_name}}) {
3560     ## have an element in table scope
3561     my $i;
3562     INSCOPE: for (reverse 0..$#$open_elements) {
3563     my $node = $open_elements->[$_];
3564     if ($node->[1] eq $token->{tag_name}) {
3565     $i = $_;
3566     last INSCOPE;
3567     } elsif ({
3568     table => 1, html => 1,
3569     }->{$node->[1]}) {
3570     last INSCOPE;
3571     }
3572     } # INSCOPE
3573     unless (defined $i) {
3574     !!!parse-error;
3575     ## Ignore the token
3576     !!!next-token;
3577     redo B;
3578     }
3579    
3580     ## Clear back to table body context
3581     while (not {
3582     tbody => 1, tfoot => 1, thead => 1, html => 1,
3583     }->{$open_elements->[-1]->[1]}) {
3584     !!!parse-error;
3585     pop @$open_elements;
3586     }
3587    
3588     pop @$open_elements;
3589     $insertion_mode = 'in table';
3590     !!!next-token;
3591     redo B;
3592     } elsif ($token->{tag_name} eq 'table') {
3593     ## have an element in table scope
3594     my $i;
3595     INSCOPE: for (reverse 0..$#$open_elements) {
3596     my $node = $open_elements->[$_];
3597     if ({
3598     tbody => 1, thead => 1, tfoot => 1,
3599     }->{$node->[1]}) {
3600     $i = $_;
3601     last INSCOPE;
3602     } elsif ({
3603     table => 1, html => 1,
3604     }->{$node->[1]}) {
3605     last INSCOPE;
3606     }
3607     } # INSCOPE
3608     unless (defined $i) {
3609     !!!parse-error;
3610     ## Ignore the token
3611     !!!next-token;
3612     redo B;
3613     }
3614    
3615     ## Clear back to table body context
3616     while (not {
3617     tbody => 1, tfoot => 1, thead => 1, html => 1,
3618     }->{$open_elements->[-1]->[1]}) {
3619     !!!parse-error;
3620     pop @$open_elements;
3621     }
3622    
3623     ## As if <{current node}>
3624     ## have an element in table scope
3625     ## true by definition
3626    
3627     ## Clear back to table body context
3628     ## nop by definition
3629    
3630     pop @$open_elements;
3631     $insertion_mode = 'in table';
3632     ## reprocess
3633     redo B;
3634     } elsif ({
3635     body => 1, caption => 1, col => 1, colgroup => 1,
3636     html => 1, td => 1, th => 1, tr => 1,
3637     }->{$token->{tag_name}}) {
3638     !!!parse-error;
3639     ## Ignore the token
3640     !!!next-token;
3641     redo B;
3642     } else {
3643     #
3644     }
3645     } else {
3646     #
3647     }
3648    
3649     ## As if in table
3650     ## NOTE: This is a code clone of "misc in table".
3651     !!!parse-error;
3652     $in_body->(sub {
3653     my $child = shift;
3654     if ({
3655     table => 1, tbody => 1, tfoot => 1,
3656     thead => 1, tr => 1,
3657     }->{$open_elements->[-1]->[1]}) {
3658     # MUST
3659     my $foster_parent_element;
3660     my $next_sibling;
3661     OE: for (reverse 0..$#$open_elements) {
3662     if ($open_elements->[$_]->[1] eq 'table') {
3663     my $parent = $open_elements->[$_]->[0]->parent_node;
3664     if (defined $parent and $parent->node_type == 1) {
3665     $foster_parent_element = $parent;
3666     $next_sibling = $open_elements->[$_]->[0];
3667     } else {
3668     $foster_parent_element
3669     = $open_elements->[$_ - 1]->[0];
3670     }
3671     last OE;
3672     }
3673     } # OE
3674     $foster_parent_element = $open_elements->[0]->[0]
3675     unless defined $foster_parent_element;
3676     $foster_parent_element->insert_before
3677     ($child, $next_sibling);
3678     } else {
3679     $open_elements->[-1]->[0]->append_child ($child);
3680     }
3681     });
3682     redo B;
3683     } elsif ($insertion_mode eq 'in row') {
3684     if ($token->{type} eq 'character') {
3685     ## Copied from 'in table'
3686     $reconstruct_active_formatting_elements->();
3687    
3688     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3689    
3690     !!!next-token;
3691     redo B;
3692     } elsif ($token->{type} eq 'comment') {
3693     ## Copied from 'in table'
3694     my $comment = $self->{document}->create_comment ($token->{data});
3695     $open_elements->[-1]->[0]->append_child ($comment);
3696     !!!next-token;
3697     redo B;
3698     } elsif ($token->{type} eq 'start tag') {
3699     if ($token->{tag_name} eq 'th' or
3700     $token->{tag_name} eq 'td') {
## Clear back to table row context: pop the stack of open elements
## until the current node is |tr| or |html|, raising a parse error
## for each popped element.  The stop set has to be tr/html (as in
## the other row-context loops of this insertion mode); stopping at
## th/td instead popped the |tr| itself and everything above it up
## to |html| before the cell was inserted.
while (not {
  tr => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
  !!!parse-error;
  pop @$open_elements;
}
3708    
3709     !!!insert-element ($token->{tag_name}, $token->{attributes});
3710     $insertion_mode = 'in cell';
3711    
3712     push @$active_formatting_elements, ['#marker', ''];
3713    
3714     !!!next-token;
3715     redo B;
3716     } elsif ({
3717     caption => 1, col => 1, colgroup => 1,
3718     tbody => 1, tfoot => 1, thead => 1, tr => 1,
3719     }->{$token->{tag_name}}) {
3720     ## As if </tr>
3721     ## have an element in table scope
3722     my $i;
3723     INSCOPE: for (reverse 0..$#$open_elements) {
3724     my $node = $open_elements->[$_];
3725     if ($node->[1] eq 'tr') {
3726     $i = $_;
3727     last INSCOPE;
3728     } elsif ({
3729     table => 1, html => 1,
3730     }->{$node->[1]}) {
3731     last INSCOPE;
3732     }
3733     } # INSCOPE
3734     unless (defined $i) {
3735     !!!parse-error;
3736     ## Ignore the token
3737     !!!next-token;
3738     redo B;
3739     }
3740    
3741     ## Clear back to table row context
3742     while (not {
3743     tr => 1, html => 1,
3744     }->{$open_elements->[-1]->[1]}) {
3745     !!!parse-error;
3746     pop @$open_elements;
3747     }
3748    
3749     pop @$open_elements; # tr
3750     $insertion_mode = 'in table body';
3751     ## reprocess
3752     redo B;
3753     } elsif ($token->{tag_name} eq 'table') {
3754     ## NOTE: This is a code clone of "table in table"
3755     !!!parse-error;
3756    
3757     ## As if </table>
3758     ## have a table element in table scope
3759     my $i;
3760     INSCOPE: for (reverse 0..$#$open_elements) {
3761     my $node = $open_elements->[$_];
3762     if ($node->[1] eq 'table') {
3763     $i = $_;
3764     last INSCOPE;
3765     } elsif ({
3766     table => 1, html => 1,
3767     }->{$node->[1]}) {
3768     last INSCOPE;
3769     }
3770     } # INSCOPE
3771     unless (defined $i) {
3772     !!!parse-error;
3773     ## Ignore tokens </table><table>
3774     !!!next-token;
3775     redo B;
3776     }
3777    
3778     ## generate implied end tags
3779     if ({
3780     dd => 1, dt => 1, li => 1, p => 1,
3781     td => 1, th => 1, tr => 1,
3782     }->{$open_elements->[-1]->[1]}) {
3783     !!!back-token; # <table>
3784     $token = {type => 'end tag', tag_name => 'table'};
3785     !!!back-token;
3786     $token = {type => 'end tag',
3787     tag_name => $open_elements->[-1]->[1]}; # MUST
3788     redo B;
3789     }
3790    
3791     if ($open_elements->[-1]->[1] ne 'table') {
3792     !!!parse-error;
3793     }
3794    
3795     splice @$open_elements, $i;
3796    
3797     $reset_insertion_mode->();
3798    
3799     ## reprocess
3800     redo B;
3801     } else {
3802     #
3803     }
3804     } elsif ($token->{type} eq 'end tag') {
3805     if ($token->{tag_name} eq 'tr') {
3806     ## have an element in table scope
3807     my $i;
3808     INSCOPE: for (reverse 0..$#$open_elements) {
3809     my $node = $open_elements->[$_];
3810     if ($node->[1] eq $token->{tag_name}) {
3811     $i = $_;
3812     last INSCOPE;
3813     } elsif ({
3814     table => 1, html => 1,
3815     }->{$node->[1]}) {
3816     last INSCOPE;
3817     }
3818     } # INSCOPE
3819     unless (defined $i) {
3820     !!!parse-error;
3821     ## Ignore the token
3822     !!!next-token;
3823     redo B;
3824     }
3825    
3826     ## Clear back to table row context
3827     while (not {
3828     tr => 1, html => 1,
3829     }->{$open_elements->[-1]->[1]}) {
3830     !!!parse-error;
3831     pop @$open_elements;
3832     }
3833    
3834     pop @$open_elements; # tr
3835     $insertion_mode = 'in table body';
3836     !!!next-token;
3837     redo B;
3838     } elsif ($token->{tag_name} eq 'table') {
3839     ## As if </tr>
3840     ## have an element in table scope
3841     my $i;
3842     INSCOPE: for (reverse 0..$#$open_elements) {
3843     my $node = $open_elements->[$_];
3844     if ($node->[1] eq 'tr') {
3845     $i = $_;
3846     last INSCOPE;
3847     } elsif ({
3848     table => 1, html => 1,
3849     }->{$node->[1]}) {
3850     last INSCOPE;
3851     }
3852     } # INSCOPE
3853     unless (defined $i) {
3854     !!!parse-error;
3855     ## Ignore the token
3856     !!!next-token;
3857     redo B;
3858     }
3859    
3860     ## Clear back to table row context
3861     while (not {
3862     tr => 1, html => 1,
3863     }->{$open_elements->[-1]->[1]}) {
3864     !!!parse-error;
3865     pop @$open_elements;
3866     }
3867    
3868     pop @$open_elements; # tr
3869     $insertion_mode = 'in table body';
3870     ## reprocess
3871     redo B;
3872     } elsif ({
3873     tbody => 1, tfoot => 1, thead => 1,
3874     }->{$token->{tag_name}}) {
3875     ## have an element in table scope
3876     my $i;
3877     INSCOPE: for (reverse 0..$#$open_elements) {
3878     my $node = $open_elements->[$_];
3879     if ($node->[1] eq $token->{tag_name}) {
3880     $i = $_;
3881     last INSCOPE;
3882     } elsif ({
3883     table => 1, html => 1,
3884     }->{$node->[1]}) {
3885     last INSCOPE;
3886     }
3887     } # INSCOPE
3888     unless (defined $i) {
3889     !!!parse-error;
3890     ## Ignore the token
3891     !!!next-token;
3892     redo B;
3893     }
3894    
3895     ## As if </tr>
3896     ## have an element in table scope
3897     my $i;
3898     INSCOPE: for (reverse 0..$#$open_elements) {
3899     my $node = $open_elements->[$_];
3900     if ($node->[1] eq 'tr') {
3901     $i = $_;
3902     last INSCOPE;
3903     } elsif ({
3904     table => 1, html => 1,
3905     }->{$node->[1]}) {
3906     last INSCOPE;
3907     }
3908     } # INSCOPE
3909     unless (defined $i) {
3910     !!!parse-error;
3911     ## Ignore the token
3912     !!!next-token;
3913     redo B;
3914     }
3915    
3916     ## Clear back to table row context
3917     while (not {
3918     tr => 1, html => 1,
3919     }->{$open_elements->[-1]->[1]}) {
3920     !!!parse-error;
3921     pop @$open_elements;
3922     }
3923    
3924     pop @$open_elements; # tr
3925     $insertion_mode = 'in table body';
3926     ## reprocess
3927     redo B;
3928     } elsif ({
3929     body => 1, caption => 1, col => 1,
3930     colgroup => 1, html => 1, td => 1, th => 1,
3931     }->{$token->{tag_name}}) {
3932     !!!parse-error;
3933     ## Ignore the token
3934     !!!next-token;
3935     redo B;
3936     } else {
3937     #
3938     }
3939     } else {
3940     #
3941     }
3942    
3943     ## As if in table
3944     ## NOTE: This is a code clone of "misc in table".
3945     !!!parse-error;
3946     $in_body->(sub {
3947     my $child = shift;
3948     if ({
3949     table => 1, tbody => 1, tfoot => 1,
3950     thead => 1, tr => 1,
3951     }->{$open_elements->[-1]->[1]}) {
3952     # MUST
3953     my $foster_parent_element;
3954     my $next_sibling;
3955     OE: for (reverse 0..$#$open_elements) {
3956     if ($open_elements->[$_]->[1] eq 'table') {
3957     my $parent = $open_elements->[$_]->[0]->parent_node;
3958     if (defined $parent and $parent->node_type == 1) {
3959     $foster_parent_element = $parent;
3960     $next_sibling = $open_elements->[$_]->[0];
3961     } else {
3962     $foster_parent_element
3963     = $open_elements->[$_ - 1]->[0];
3964     }
3965     last OE;
3966     }
3967     } # OE
3968     $foster_parent_element = $open_elements->[0]->[0]
3969     unless defined $foster_parent_element;
3970     $foster_parent_element->insert_before
3971     ($child, $next_sibling);
3972     } else {
3973     $open_elements->[-1]->[0]->append_child ($child);
3974     }
3975     });
3976     redo B;
3977     } elsif ($insertion_mode eq 'in cell') {
3978     if ($token->{type} eq 'character') {
3979     ## NOTE: This is a code clone of "character in body".
3980     $reconstruct_active_formatting_elements->();
3981    
3982     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3983    
3984     !!!next-token;
3985     redo B;
3986     } elsif ($token->{type} eq 'comment') {
3987     ## NOTE: This is a code clone of "comment in body".
3988     my $comment = $self->{document}->create_comment ($token->{data});
3989     $open_elements->[-1]->[0]->append_child ($comment);
3990     !!!next-token;
3991     redo B;
3992     } elsif ($token->{type} eq 'start tag') {
3993     if ({
3994     caption => 1, col => 1, colgroup => 1,
3995     tbody => 1, td => 1, tfoot => 1, th => 1,
3996     thead => 1, tr => 1,
3997     }->{$token->{tag_name}}) {
3998     ## have an element in table scope
3999     my $tn;
4000     INSCOPE: for (reverse 0..$#$open_elements) {
4001     my $node = $open_elements->[$_];
4002     if ($node->[1] eq 'td' or $node->[1] eq 'th') {
4003     $tn = $node->[1];
4004     last INSCOPE;
4005     } elsif ({
4006     table => 1, html => 1,
4007     }->{$node->[1]}) {
4008     last INSCOPE;
4009     }
4010     } # INSCOPE
4011     unless (defined $tn) {
4012     !!!parse-error;
4013     ## Ignore the token
4014     !!!next-token;
4015     redo B;
4016     }
4017    
4018     ## Close the cell
4019     !!!back-token; # <?>
4020     $token = {type => 'end tag', tag_name => $tn};
4021     redo B;
4022     } else {
4023     #
4024     }
4025     } elsif ($token->{type} eq 'end tag') {
4026     if ($token->{type} eq 'td' or $token->{type} eq 'th') {
4027     ## have an element in table scope
4028     my $i;
4029     INSCOPE: for (reverse 0..$#$open_elements) {
4030     my $node = $open_elements->[$_];
4031     if ($node->[1] eq $token->{tag_name}) {
4032     $i = $_;
4033     last INSCOPE;
4034     } elsif ({
4035     table => 1, html => 1,
4036     }->{$node->[1]}) {
4037     last INSCOPE;
4038     }
4039     } # INSCOPE
4040     unless (defined $i) {
4041     !!!parse-error;
4042     ## Ignore the token
4043     !!!next-token;
4044     redo B;
4045     }
4046    
4047     ## generate implied end tags
4048     if ({
4049     dd => 1, dt => 1, li => 1, p => 1,
4050     td => ($token->{tag_name} eq 'th'),
4051     th => ($token->{tag_name} eq 'td'),
4052     tr => 1,
4053     }->{$open_elements->[-1]->[1]}) {
4054     !!!back-token;
4055     $token = {type => 'end tag',
4056     tag_name => $open_elements->[-1]->[1]}; # MUST
4057     redo B;
4058     }
4059    
4060     if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
4061     !!!parse-error;
4062     }
4063    
4064     splice @$open_elements, $i;
4065    
4066     $clear_up_to_marker->();
4067    
4068     $insertion_mode = 'in row';
4069    
4070     !!!next-token;
4071     redo B;
4072     } elsif ({
4073     body => 1, caption => 1, col => 1,
4074     colgroup => 1, html => 1,
4075     }->{$token->{tag_name}}) {
4076     !!!parse-error;
4077     ## Ignore the token
4078     !!!next-token;
4079     redo B;
4080     } elsif ({
4081     table => 1, tbody => 1, tfoot => 1,
4082     thead => 1, tr => 1,
4083     }->{$token->{tag_name}}) {
4084     ## have an element in table scope
4085     my $i;
4086     my $tn;
4087     INSCOPE: for (reverse 0..$#$open_elements) {
4088     my $node = $open_elements->[$_];
4089     if ($node->[1] eq $token->{tag_name}) {
4090     $i = $_;
4091     $tn = $node->[1];
4092     last INSCOPE;
4093     } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4094     $tn = $node->[1];
4095     ## NOTE: There is exactly one |td| or |th| element
4096     ## in scope in the stack of open elements by definition.
4097     } elsif ({
4098     table => 1, html => 1,
4099     }->{$node->[1]}) {
4100     last INSCOPE;
4101     }
4102     } # INSCOPE
4103     unless (defined $i) {
4104     !!!parse-error;
4105     ## Ignore the token
4106     !!!next-token;
4107     redo B;
4108     }
4109    
4110     ## Close the cell
4111     !!!back-token; # </?>
4112     $token = {type => 'end tag', tag_name => $tn};
4113     redo B;
4114     } else {
4115     #
4116     }
4117     } else {
4118     #
4119     }
4120    
4121     $in_body->(sub {
4122     $open_elements->[-1]->[0]->append_child (shift);
4123     });
4124     redo B;
4125     } elsif ($insertion_mode eq 'in select') {
4126     if ($token->{type} eq 'character') {
4127     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4128     !!!next-token;
4129     redo B;
4130     } elsif ($token->{type} eq 'comment') {
4131     my $comment = $self->{document}->create_comment ($token->{data});
4132     $open_elements->[-1]->[0]->append_child ($comment);
4133     !!!next-token;
4134     redo B;
4135     } elsif ($token->{type} eq 'start tag') {
4136     if ($token->{tag_name} eq 'option') {
4137     if ($open_elements->[-1]->[1] eq 'option') {
4138     ## As if </option>
4139     pop @$open_elements;
4140     }
4141    
4142     !!!insert-element ($token->{tag_name}, $token->{attributes});
4143     !!!next-token;
4144     redo B;
4145     } elsif ($token->{tag_name} eq 'optgroup') {
4146     if ($open_elements->[-1]->[1] eq 'option') {
4147     ## As if </option>
4148     pop @$open_elements;
4149     }
4150    
4151     if ($open_elements->[-1]->[1] eq 'optgroup') {
4152     ## As if </optgroup>
4153     pop @$open_elements;
4154     }
4155    
4156     !!!insert-element ($token->{tag_name}, $token->{attributes});
4157     !!!next-token;
4158     redo B;
4159     } elsif ($token->{tag_name} eq 'select') {
4160     !!!parse-error;
4161     ## As if </select> instead
4162     ## have an element in table scope
4163     my $i;
4164     INSCOPE: for (reverse 0..$#$open_elements) {
4165     my $node = $open_elements->[$_];
4166     if ($node->[1] eq $token->{tag_name}) {
4167     $i = $_;
4168     last INSCOPE;
4169     } elsif ({
4170     table => 1, html => 1,
4171     }->{$node->[1]}) {
4172     last INSCOPE;
4173     }
4174     } # INSCOPE
4175     unless (defined $i) {
4176     !!!parse-error;
4177     ## Ignore the token
4178     !!!next-token;
4179     redo B;
4180     }
4181    
4182     splice @$open_elements, $i;
4183    
4184     $reset_insertion_mode->();
4185    
4186     !!!next-token;
4187     redo B;
4188     } else {
4189     #
4190     }
4191     } elsif ($token->{type} eq 'end tag') {
4192     if ($token->{tag_name} eq 'optgroup') {
4193     if ($open_elements->[-1]->[1] eq 'option' and
4194     $open_elements->[-2]->[1] eq 'optgroup') {
4195     ## As if </option>
4196     splice @$open_elements, -2;
4197     } elsif ($open_elements->[-1]->[1] eq 'optgroup') {
4198     pop @$open_elements;
4199     } else {
4200     !!!parse-error;
4201     ## Ignore the token
4202     }
4203     !!!next-token;
4204     redo B;
4205     } elsif ($token->{tag_name} eq 'option') {
4206     if ($open_elements->[-1]->[1] eq 'option') {
4207     pop @$open_elements;
4208     } else {
4209     !!!parse-error;
4210     ## Ignore the token
4211     }
4212     !!!next-token;
4213     redo B;
4214     } elsif ($token->{tag_name} eq 'select') {
4215     ## have an element in table scope
4216     my $i;
4217     INSCOPE: for (reverse 0..$#$open_elements) {
4218     my $node = $open_elements->[$_];
4219     if ($node->[1] eq $token->{tag_name}) {
4220     $i = $_;
4221     last INSCOPE;
4222     } elsif ({
4223     table => 1, html => 1,
4224     }->{$node->[1]}) {
4225     last INSCOPE;
4226     }
4227     } # INSCOPE
4228     unless (defined $i) {
4229     !!!parse-error;
4230     ## Ignore the token
4231     !!!next-token;
4232     redo B;
4233     }
4234    
4235     splice @$open_elements, $i;
4236    
4237     $reset_insertion_mode->();
4238    
4239     !!!next-token;
4240     redo B;
4241     } elsif ({
4242     caption => 1, table => 1, tbody => 1,
4243     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4244     }->{$token->{tag_name}}) {
4245     !!!parse-error;
4246    
4247     ## have an element in table scope
4248     my $i;
4249     INSCOPE: for (reverse 0..$#$open_elements) {
4250     my $node = $open_elements->[$_];
4251     if ($node->[1] eq $token->{tag_name}) {
4252     $i = $_;
4253     last INSCOPE;
4254     } elsif ({
4255     table => 1, html => 1,
4256     }->{$node->[1]}) {
4257     last INSCOPE;
4258     }
4259     } # INSCOPE
4260     unless (defined $i) {
4261     ## Ignore the token
4262     !!!next-token;
4263     redo B;
4264     }
4265    
4266     ## As if </select>
4267     ## have an element in table scope
4268     undef $i;
4269     INSCOPE: for (reverse 0..$#$open_elements) {
4270     my $node = $open_elements->[$_];
4271     if ($node->[1] eq 'select') {
4272     $i = $_;
4273     last INSCOPE;
4274     } elsif ({
4275     table => 1, html => 1,
4276     }->{$node->[1]}) {
4277     last INSCOPE;
4278     }
4279     } # INSCOPE
4280     unless (defined $i) {
4281     !!!parse-error;
4282     ## Ignore the </select> token
4283     !!!next-token; ## TODO: ok?
4284     redo B;
4285     }
4286    
4287     splice @$open_elements, $i;
4288    
4289     $reset_insertion_mode->();
4290    
4291     ## reprocess
4292     redo B;
4293     } else {
4294     #
4295     }
4296     } else {
4297     #
4298     }
4299    
4300     !!!parse-error;
4301     ## Ignore the token
4302     redo B;
4303     } elsif ($insertion_mode eq 'after body') {
4304     if ($token->{type} eq 'character') {
4305     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4306     ## As if in body
4307     $reconstruct_active_formatting_elements->();
4308    
4309     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4310    
4311     unless (length $token->{data}) {
4312     !!!next-token;
4313     redo B;
4314     }
4315     }
4316    
4317     #
4318     } elsif ($token->{type} eq 'comment') {
4319     my $comment = $self->{document}->create_comment ($token->{data});
4320     $open_elements->[0]->[0]->append_child ($comment);
4321     !!!next-token;
4322     redo B;
4323     } elsif ($token->{type} eq 'end tag') {
4324     if ($token->{type} eq 'html') {
4325     ## TODO: if inner_html, parse-error, ignore the token; otherwise,
4326    
4327     $phase = 'trailing end';
4328     !!!next-token;
4329     redo B;
4330     } else {
4331     #
4332     }
4333     } else {
4334     #
4335     }
4336    
4337     !!!parse-error;
4338     $insertion_mode = 'in body';
4339     ## reprocess
4340     redo B;
4341     } elsif ($insertion_mode eq 'in frameset') {
4342     if ($token->{type} eq 'character') {
4343     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4344     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4345    
4346     unless (length $token->{data}) {
4347     !!!next-token;
4348     redo B;
4349     }
4350     }
4351    
4352     #
4353     } elsif ($token->{type} eq 'comment') {
4354     my $comment = $self->{document}->create_comment ($token->{data});
4355     $open_elements->[-1]->[0]->append_child ($comment);
4356     !!!next-token;
4357     redo B;
4358     } elsif ($token->{type} eq 'start tag') {
4359     if ($token->{tag_name} eq 'frameset') {
4360     !!!insert-element ($token->{tag_name}, $token->{attributes});
4361     !!!next-token;
4362     redo B;
4363     } elsif ($token->{tag_name} eq 'frame') {
4364     !!!insert-element ($token->{tag_name}, $token->{attributes});
4365     pop @$open_elements;
4366     !!!next-token;
4367     redo B;
4368     } elsif ($token->{tag_name} eq 'noframes') {
4369     $in_body->(sub {
4370     $open_elements->[-1]->[0]->append_child (shift);
4371     });
4372     redo B;
4373     } else {
4374     #
4375     }
4376     } elsif ($token->{type} eq 'end tag') {
4377     if ($token->{tag_name} eq 'frameset') {
4378     if ($open_elements->[-1]->[1] eq 'html' and
4379     @$open_elements == 1) {
4380     !!!parse-error;
4381     ## Ignore the token
4382     !!!next-token;
4383     } else {
4384     pop @$open_elements;
4385     !!!next-token;
4386     }
4387    
4388     ## if not inner_html and
4389     if ($open_elements->[-1]->[1] ne 'frameset') {
4390     $insertion_mode = 'after frameset';
4391     }
4392     redo B;
4393     } else {
4394     #
4395     }
4396     } else {
4397     #
4398     }
4399    
4400     !!!parse-error;
4401     ## Ignore the token
4402     !!!next-token;
4403     redo B;
4404     } elsif ($insertion_mode eq 'after frameset') {
4405     if ($token->{type} eq 'character') {
4406     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4407     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4408    
4409     unless (length $token->{data}) {
4410     !!!next-token;
4411     redo B;
4412     }
4413     }
4414    
4415     #
4416     } elsif ($token->{type} eq 'comment') {
4417     my $comment = $self->{document}->create_comment ($token->{data});
4418     $open_elements->[-1]->[0]->append_child ($comment);
4419     !!!next-token;
4420     redo B;
4421     } elsif ($token->{type} eq 'start tag') {
4422     if ($token->{tag_name} eq 'noframes') {
4423     $in_body->(sub {
4424     $open_elements->[-1]->[0]->append_child (shift);
4425     });
4426     redo B;
4427     } else {
4428     #
4429     }
4430     } elsif ($token->{type} eq 'end tag') {
4431     if ($token->{tag_name} eq 'html') {
4432     $phase = 'trailing end';
4433     !!!next-token;
4434     redo B;
4435     } else {
4436     #
4437     }
4438     } else {
4439     #
4440     }
4441    
4442     !!!parse-error;
4443     ## Ignore the token
4444     !!!next-token;
4445     redo B;
4446    
4447     ## ISSUE: An issue in spec there
4448     } else {
4449     die "$0: $insertion_mode: Unknown insertion mode";
4450     }
4451     }
4452     } elsif ($phase eq 'trailing end') {
4453     ## states in the main stage is preserved yet # MUST
4454    
4455     if ($token->{type} eq 'DOCTYPE') {
4456     !!!parse-error;
4457     ## Ignore the token
4458     !!!next-token;
4459     redo B;
4460     } elsif ($token->{type} eq 'comment') {
4461     my $comment = $self->{document}->create_comment ($token->{data});
4462     $self->{document}->append_child ($comment);
4463     !!!next-token;
4464     redo B;
4465     } elsif ($token->{type} eq 'character') {
4466     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4467     ## As if in the main phase.
4468     ## NOTE: The insertion mode in the main phase
4469     ## just before the phase has been changed to the trailing
4470     ## end phase is either "after body" or "after frameset".
4471     $reconstruct_active_formatting_elements->()
4472     if $phase eq 'main';
4473    
4474     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4475    
4476     unless (length $token->{data}) {
4477     !!!next-token;
4478     redo B;
4479     }
4480     }
4481    
4482     !!!parse-error;
4483     $phase = 'main';
4484     ## reprocess
4485     redo B;
4486     } elsif ($token->{type} eq 'start tag' or
4487     $token->{type} eq 'end tag') {
4488     !!!parse-error;
4489     $phase = 'main';
4490     ## reprocess
4491     redo B;
4492     } elsif ($token->{type} eq 'end-of-file') {
4493     ## Stop parsing
4494     last B;
4495     } else {
4496     die "$0: $token->{type}: Unknown token";
4497     }
4498     }
4499     } # B
4500    
4501     ## Stop parsing # MUST
4502    
4503     ## TODO: script stuffs
4504     } # _construct_tree
4505    
## Serialize the child nodes of |$node| as an HTML string.
##
## Arguments:
##   $class    - Invocant; not used by the serialization itself.
##   $node     - Node whose child nodes are serialized.  The node itself
##               and its ancestors are only inspected to decide whether
##               text has to be written without escaping.
##   $on_error - Code reference; called with each node whose node type
##               is none of 1, 3, 4, 5, 8, or 10.
##
## Returns a reference to the resulting string.
sub inner_html ($$$) {
  my ($class, $node, $on_error) = @_;

  ## Elements whose text content is written verbatim (no escaping).
  my $cdata_element = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
  };

  ## Elements serialized as a start tag only: neither their children
  ## nor an end tag is written.
  my $void_element = {
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  };

  ## Escape a copy of the string; |&| has to be replaced first so that
  ## the other replacements are not escaped twice.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## Text is left unescaped if |$node| or one of its ancestors is an
  ## XHTML-namespace element listed in |$cdata_element|.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      my $ns = $parent->namespace_uri;
      if (defined $ns and $ns eq 'http://www.w3.org/1999/xhtml' and
          $cdata_element->{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work queue processed from the front.  It holds node
  ## objects and two kinds of plain strings: end tags to be appended to
  ## the output as is, and the marker |cdata-out| that switches the
  ## escaping of text back on.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      next C if $void_element->{$tag_name};

      ## The marker is queued first so that it ends up just behind the
      ## end tag of this element.
      if (not $in_cdata and $cdata_element->{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) {
      $s .= $in_cdata ? $child->data : $escape->($child->data);
    } elsif ($nt == 8) {
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) {
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children replace the reference at the front of the queue.
      ## Appending them to the end of the queue would serialize them
      ## after all following nodes and after the end tags of the
      ## enclosing elements.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child);
    }
  } # C

  ## Step 3
  return \$s;
} # inner_html
4601    
4602 wakaba 1.1 1;
4603 wakaba 1.5 # $Date: 2007/04/30 11:45:24 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24