/[suikacvs]/markup/html/whatpm/What/HTML.pm.src
Suika

Contents of /markup/html/whatpm/What/HTML.pm.src

Parent Directory | Revision Log


Revision 1.6 - (hide annotations) (download) (as text)
Mon Apr 30 14:12:02 2007 UTC (17 years, 7 months ago) by wakaba
Branch: MAIN
Changes since 1.5: +27 -10 lines
File MIME type: application/x-wais-source
++ whatpm/What/ChangeLog	30 Apr 2007 14:11:13 -0000
	* HTML.pm.src: Some typos are fixed.

2007-04-30  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/ChangeLog	30 Apr 2007 14:11:55 -0000
	* .cvsignore: |tree-construction| is added.

	* HTML-tree.t: New test.

	* Makefile: Rules for tree constructor tests are added.

2007-04-30  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package What::HTML;
2     use strict;
3 wakaba 1.6 our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5     ## This is a very, very early version of an HTML parser.
6    
## Start tags whose name appears here may carry a "/" immediately before
## the closing ">" (the "permitted slash") without the tokenizer raising a
## parse error.  Keys are lowercase tag names; every value is 1.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
20    
## Named character reference table: entity name => replacement character.
## Entity names are case-sensitive (e.g. both |amp| and |AMP| are listed).
## Each word in the list below is NAME=HEX, where HEX is the hexadecimal
## code point of the replacement character.
## NOTE(review): presumably consulted by the entity-consuming part of the
## tokenizer, which is not visible in this part of the file -- confirm.
my $entity_char = {
  map {
    my ($entity_name, $code_point) = split /=/, $_, 2;
    ($entity_name => chr (hex ($code_point)));
  } qw(
    AElig=00C6 Aacute=00C1 Acirc=00C2 Agrave=00C0 Alpha=0391 Aring=00C5
    Atilde=00C3 Auml=00C4
    Beta=0392
    Ccedil=00C7 Chi=03A7
    Dagger=2021 Delta=0394
    ETH=00D0 Eacute=00C9 Ecirc=00CA Egrave=00C8 Epsilon=0395 Eta=0397
    Euml=00CB
    Gamma=0393
    Iacute=00CD Icirc=00CE Igrave=00CC Iota=0399 Iuml=00CF
    Kappa=039A
    Lambda=039B
    Mu=039C
    Ntilde=00D1 Nu=039D
    OElig=0152 Oacute=00D3 Ocirc=00D4 Ograve=00D2 Omega=03A9 Omicron=039F
    Oslash=00D8 Otilde=00D5 Ouml=00D6
    Phi=03A6 Pi=03A0 Prime=2033 Psi=03A8
    Rho=03A1
    Scaron=0160 Sigma=03A3
    THORN=00DE Tau=03A4 Theta=0398
    Uacute=00DA Ucirc=00DB Ugrave=00D9 Upsilon=03A5 Uuml=00DC
    Xi=039E
    Yacute=00DD Yuml=0178
    Zeta=0396
    aacute=00E1 acirc=00E2 acute=00B4 aelig=00E6 agrave=00E0 alefsym=2135
    alpha=03B1 amp=0026 AMP=0026 and=2227 ang=2220 apos=0027 aring=00E5
    asymp=2248 atilde=00E3 auml=00E4
    bdquo=201E beta=03B2 brvbar=00A6 bull=2022
    cap=2229 ccedil=00E7 cedil=00B8 cent=00A2 chi=03C7 circ=02C6
    clubs=2663 cong=2245 copy=00A9 COPY=00A9 crarr=21B5 cup=222A
    curren=00A4
    dArr=21D3 dagger=2020 darr=2193 deg=00B0 delta=03B4 diams=2666
    divide=00F7
    eacute=00E9 ecirc=00EA egrave=00E8 empty=2205 emsp=2003 ensp=2002
    epsilon=03B5 equiv=2261 eta=03B7 eth=00F0 euml=00EB euro=20AC
    exist=2203
    fnof=0192 forall=2200 frac12=00BD frac14=00BC frac34=00BE frasl=2044
    gamma=03B3 ge=2265 gt=003E GT=003E
    hArr=21D4 harr=2194 hearts=2665 hellip=2026
    iacute=00ED icirc=00EE iexcl=00A1 igrave=00EC image=2111 infin=221E
    int=222B iota=03B9 iquest=00BF isin=2208 iuml=00EF
    kappa=03BA
    lArr=21D0 lambda=03BB lang=2329 laquo=00AB larr=2190 lceil=2308
    ldquo=201C le=2264 lfloor=230A lowast=2217 loz=25CA lrm=200E
    lsaquo=2039 lsquo=2018 lt=003C LT=003C
    macr=00AF mdash=2014 micro=00B5 middot=00B7 minus=2212 mu=03BC
    nabla=2207 nbsp=00A0 ndash=2013 ne=2260 ni=220B not=00AC notin=2209
    nsub=2284 ntilde=00F1 nu=03BD
    oacute=00F3 ocirc=00F4 oelig=0153 ograve=00F2 oline=203E omega=03C9
    omicron=03BF oplus=2295 or=2228 ordf=00AA ordm=00BA oslash=00F8
    otilde=00F5 otimes=2297 ouml=00F6
    para=00B6 part=2202 permil=2030 perp=22A5 phi=03C6 pi=03C0 piv=03D6
    plusmn=00B1 pound=00A3 prime=2032 prod=220F prop=221D psi=03C8
    quot=0022 QUOT=0022
    rArr=21D2 radic=221A rang=232A raquo=00BB rarr=2192 rceil=2309
    rdquo=201D real=211C reg=00AE REG=00AE rfloor=230B rho=03C1 rlm=200F
    rsaquo=203A rsquo=2019
    sbquo=201A scaron=0161 sdot=22C5 sect=00A7 shy=00AD sigma=03C3
    sigmaf=03C2 sim=223C spades=2660 sub=2282 sube=2286 sum=2211
    sup=2283 sup1=00B9 sup2=00B2 sup3=00B3 supe=2287 szlig=00DF
    tau=03C4 there4=2234 theta=03B8 thetasym=03D1 thinsp=2009 thorn=00FE
    tilde=02DC times=00D7 trade=2122
    uArr=21D1 uacute=00FA uarr=2191 ucirc=00FB ugrave=00F9 uml=00A8
    upsih=03D2 upsilon=03C5 uuml=00FC
    weierp=2118
    xi=03BE
    yacute=00FD yen=00A5 yuml=00FF
    zeta=03B6 zwj=200D zwnj=200C
  )
};
282    
## Set of element names belonging to the "special" category.  Keys are
## lowercase tag names; every value is 1.
## NOTE(review): the code that consults this table is not visible in this
## part of the file -- presumably the tree construction stage.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea tfoot
    thead title tr ul wbr
  )
};
## Set of element names belonging to the "scoping" category.  Keys are
## lowercase tag names; every value is 1.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Set of element names belonging to the "formatting" category.  Keys are
## lowercase tag names; every value is 1.
## The |strike| key was previously misspelled as |strile|, so the
## <strike> element was never recognized as a formatting element.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
303     # $phrasing_category: all other elements
304    
## Constructor.  Returns a new, empty parser object blessed into the
## invoking class.
##
## Two code references are installed on the object with default
## implementations:
##   ->{set_next_input_character}
##       The default stores -1 in ->{next_input_character}; -1 is the
##       value the tokenizer tests for to detect the end of the input.
##       NOTE(review): presumably invoked by the |!!!next-input-character|
##       macro and meant to be replaced by the caller with a real input
##       reader -- confirm against the macro definition.
##   ->{parse_error}
##       The default does nothing.
##       NOTE(review): presumably invoked by the |!!!parse-error| macro.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The default hook has to refer back to the parser object while being
  ## stored inside that same object.  Capturing |$self| directly would
  ## create a reference cycle and the parser would never be freed, so the
  ## closure captures a weakened copy instead.
  require Scalar::Util;
  my $parser = $self;
  Scalar::Util::weaken ($parser);
  $self->{set_next_input_character} = sub {
    $parser->{next_input_character} = -1;
  };

  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
316    
317     ## Implementations MUST act as if they used the state machine in the spec.
318    
## Resets the tokenizer part of the parser object to its starting
## condition: the |data| state with the |PCDATA| content model flag, no
## token or attribute under construction, empty |char| and |token| lists,
## and the first input character loaded via the |!!!next-input-character|
## macro (expanded by the preprocessor that turns this .src file into
## Perl).
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## The spec says the tokenizer MUST start like this.
  @$self{qw(state content_model_flag)} = ('data', 'PCDATA');

  ## |current_token| holds a start tag, end tag, comment, or DOCTYPE
  ## token while it is being built.
  $self->{$_} = undef for qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  );

  $self->{char} = [];
  ## Sets $self->{next_input_character}.
  !!!next-input-character;
  $self->{token} = [];
} # _initialize_tokenizer
332    
333     ## A token has:
334     ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
335     ## 'character', or 'end-of-file'
336     ## ->{name} (DOCTYPE), ->{tag_name} (start tag, end tag)
337     ## ISSUE: the spec needs s/tagname/tag name/
338     ## ->{error} == 1 or 0 (DOCTYPE)
339     ## ->{attributes} isa HASH (start tag, end tag)
340     ## ->{data} (comment, character)
341    
342     ## Macros
343     ## Macros MUST be preceded by three EXCLAMATION MARKs.
344     ## emit ($token)
345     ## Emits the specified token.
346    
347     ## Emitted token MUST immediately be handled by the tree construction state.
348    
349     ## Before each step, UA MAY check to see if either one of the scripts in
350     ## "list of scripts that will execute as soon as possible" or the first
351     ## script in the "list of scripts that will execute asynchronously",
352     ## has completed loading. If one has, then it MUST be executed
353     ## and removed from the list.
354    
355     sub _get_next_token ($) {
356     my $self = shift;
357     if (@{$self->{token}}) {
358     return shift @{$self->{token}};
359     }
360    
361     A: {
362     if ($self->{state} eq 'data') {
363     if ($self->{next_input_character} == 0x0026) { # &
364     if ($self->{content_model_flag} eq 'PCDATA' or
365     $self->{content_model_flag} eq 'RCDATA') {
366     $self->{state} = 'entity data';
367     !!!next-input-character;
368     redo A;
369     } else {
370     #
371     }
372     } elsif ($self->{next_input_character} == 0x003C) { # <
373     if ($self->{content_model_flag} ne 'PLAINTEXT') {
374     $self->{state} = 'tag open';
375     !!!next-input-character;
376     redo A;
377     } else {
378     #
379     }
380     } elsif ($self->{next_input_character} == -1) {
381     !!!emit ({type => 'end-of-file'});
382     last A; ## TODO: ok?
383     }
384     # Anything else
385     my $token = {type => 'character',
386     data => chr $self->{next_input_character}};
387     ## Stay in the data state
388     !!!next-input-character;
389    
390     !!!emit ($token);
391    
392     redo A;
393     } elsif ($self->{state} eq 'entity data') {
394     ## (cannot happen in CDATA state)
395    
396     my $token = $self->_tokenize_attempt_to_consume_an_entity;
397    
398     $self->{state} = 'data';
399     # next-input-character is already done
400    
401     unless (defined $token) {
402     !!!emit ({type => 'character', data => '&'});
403     } else {
404     !!!emit ($token);
405     }
406    
407     redo A;
408     } elsif ($self->{state} eq 'tag open') {
409     if ($self->{content_model_flag} eq 'RCDATA' or
410     $self->{content_model_flag} eq 'CDATA') {
411     if ($self->{next_input_character} == 0x002F) { # /
412     !!!next-input-character;
413     $self->{state} = 'close tag open';
414     redo A;
415     } else {
416     ## reconsume
417     $self->{state} = 'data';
418    
419     !!!emit (type => 'character', data => {'/'});
420    
421     redo A;
422     }
423     } elsif ($self->{content_model_flag} eq 'PCDATA') {
424     if ($self->{next_input_character} == 0x0021) { # !
425     $self->{state} = 'markup declaration open';
426     !!!next-input-character;
427     redo A;
428     } elsif ($self->{next_input_character} == 0x002F) { # /
429     $self->{state} = 'close tag open';
430     !!!next-input-character;
431     redo A;
432     } elsif (0x0041 <= $self->{next_input_character} and
433     $self->{next_input_character} <= 0x005A) { # A..Z
434     $self->{current_token}
435     = {type => 'start tag',
436     tag_name => chr ($self->{next_input_character} + 0x0020)};
437     $self->{state} = 'tag name';
438     !!!next-input-character;
439     redo A;
440     } elsif (0x0061 <= $self->{next_input_character} and
441     $self->{next_input_character} <= 0x007A) { # a..z
442     $self->{current_token} = {type => 'start tag',
443     tag_name => chr ($self->{next_input_character})};
444     $self->{state} = 'tag name';
445     !!!next-input-character;
446     redo A;
447     } elsif ($self->{next_input_character} == 0x003E) { # >
448     !!!parse-error;
449     $self->{state} = 'data';
450     !!!next-input-character;
451    
452 wakaba 1.3 !!!emit ({type => 'character', data => '<>'});
453 wakaba 1.1
454     redo A;
455     } elsif ($self->{next_input_character} == 0x003F) { # ?
456     !!!parse-error;
457     $self->{state} = 'bogus comment';
458     ## $self->{next_input_character} is intentionally left as is
459     redo A;
460     } else {
461     !!!parse-error;
462     $self->{state} = 'data';
463     ## reconsume
464    
465     !!!emit ({type => 'character', data => '<'});
466    
467     redo A;
468     }
469     } else {
470     die "$0: $self->{content_model_flag}: Unknown content model flag";
471     }
472     } elsif ($self->{state} eq 'close tag open') {
473     if ($self->{content_model_flag} eq 'RCDATA' or
474     $self->{content_model_flag} eq 'CDATA') {
475     my @next_char;
476     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
477     push @next_char, $self->{next_input_character};
478     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
479     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
480     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
481     !!!next-input-character;
482     next TAGNAME;
483     } else {
484     !!!parse-error;
485     $self->{next_input_character} = shift @next_char; # reconsume
486     !!!back-next-input-character (@next_char);
487     $self->{state} = 'data';
488    
489     !!!emit ({type => 'character', data => '</'});
490    
491     redo A;
492     }
493     }
494 wakaba 1.2 push @next_char, $self->{next_input_character};
495 wakaba 1.1
496 wakaba 1.2 unless ($self->{next_input_character} == 0x0009 or # HT
497     $self->{next_input_character} == 0x000A or # LF
498     $self->{next_input_character} == 0x000B or # VT
499     $self->{next_input_character} == 0x000C or # FF
500     $self->{next_input_character} == 0x0020 or # SP
501     $self->{next_input_character} == 0x003E or # >
502     $self->{next_input_character} == 0x002F or # /
503     $self->{next_input_character} == 0x003C or # <
504 wakaba 1.1 $self->{next_input_character} == -1) {
505     !!!parse-error;
506     $self->{next_input_character} = shift @next_char; # reconsume
507     !!!back-next-input-character (@next_char);
508     $self->{state} = 'data';
509    
510     !!!emit ({type => 'character', data => '</'});
511    
512     redo A;
513     } else {
514     $self->{next_input_character} = shift @next_char;
515     !!!back-next-input-character (@next_char);
516     # and consume...
517     }
518     }
519    
520     if (0x0041 <= $self->{next_input_character} and
521     $self->{next_input_character} <= 0x005A) { # A..Z
522     $self->{current_token} = {type => 'end tag',
523     tag_name => chr ($self->{next_input_character} + 0x0020)};
524     $self->{state} = 'tag name';
525     !!!next-input-character;
526     redo A;
527     } elsif (0x0061 <= $self->{next_input_character} and
528     $self->{next_input_character} <= 0x007A) { # a..z
529     $self->{current_token} = {type => 'end tag',
530     tag_name => chr ($self->{next_input_character})};
531     $self->{state} = 'tag name';
532     !!!next-input-character;
533     redo A;
534     } elsif ($self->{next_input_character} == 0x003E) { # >
535     !!!parse-error;
536     $self->{state} = 'data';
537     !!!next-input-character;
538     redo A;
539     } elsif ($self->{next_input_character} == -1) {
540     !!!parse-error;
541     $self->{state} = 'data';
542     # reconsume
543    
544     !!!emit ({type => 'character', data => '</'});
545    
546     redo A;
547     } else {
548     !!!parse-error;
549     $self->{state} = 'bogus comment';
550     ## $self->{next_input_character} is intentionally left as is
551     redo A;
552     }
553     } elsif ($self->{state} eq 'tag name') {
554     if ($self->{next_input_character} == 0x0009 or # HT
555     $self->{next_input_character} == 0x000A or # LF
556     $self->{next_input_character} == 0x000B or # VT
557     $self->{next_input_character} == 0x000C or # FF
558     $self->{next_input_character} == 0x0020) { # SP
559     $self->{state} = 'before attribute name';
560     !!!next-input-character;
561     redo A;
562     } elsif ($self->{next_input_character} == 0x003E) { # >
563     if ($self->{current_token}->{type} eq 'start tag') {
564     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
565     } elsif ($self->{current_token}->{type} eq 'end tag') {
566     $self->{content_model_flag} = 'PCDATA'; # MUST
567 wakaba 1.2 if ($self->{current_token}->{attributes}) {
568 wakaba 1.1 !!!parse-error;
569     }
570     } else {
571     die "$0: $self->{current_token}->{type}: Unknown token type";
572     }
573     $self->{state} = 'data';
574     !!!next-input-character;
575    
576     !!!emit ($self->{current_token}); # start tag or end tag
577     undef $self->{current_token};
578    
579     redo A;
580     } elsif (0x0041 <= $self->{next_input_character} and
581     $self->{next_input_character} <= 0x005A) { # A..Z
582     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
583     # start tag or end tag
584     ## Stay in this state
585     !!!next-input-character;
586     redo A;
587     } elsif ($self->{next_input_character} == 0x003C or # <
588     $self->{next_input_character} == -1) {
589     !!!parse-error;
590     if ($self->{current_token}->{type} eq 'start tag') {
591     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
592     } elsif ($self->{current_token}->{type} eq 'end tag') {
593     $self->{content_model_flag} = 'PCDATA'; # MUST
594 wakaba 1.2 if ($self->{current_token}->{attributes}) {
595 wakaba 1.1 !!!parse-error;
596     }
597     } else {
598     die "$0: $self->{current_token}->{type}: Unknown token type";
599     }
600     $self->{state} = 'data';
601     # reconsume
602    
603     !!!emit ($self->{current_token}); # start tag or end tag
604     undef $self->{current_token};
605    
606     redo A;
607     } elsif ($self->{next_input_character} == 0x002F) { # /
608     !!!next-input-character;
609     if ($self->{next_input_character} == 0x003E and # >
610     $self->{current_token}->{type} eq 'start tag' and
611     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
612     # permitted slash
613     #
614     } else {
615     !!!parse-error;
616     }
617     $self->{state} = 'before attribute name';
618     # next-input-character is already done
619     redo A;
620     } else {
621     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
622     # start tag or end tag
623     ## Stay in the state
624     !!!next-input-character;
625     redo A;
626     }
627     } elsif ($self->{state} eq 'before attribute name') {
628     if ($self->{next_input_character} == 0x0009 or # HT
629     $self->{next_input_character} == 0x000A or # LF
630     $self->{next_input_character} == 0x000B or # VT
631     $self->{next_input_character} == 0x000C or # FF
632     $self->{next_input_character} == 0x0020) { # SP
633     ## Stay in the state
634     !!!next-input-character;
635     redo A;
636     } elsif ($self->{next_input_character} == 0x003E) { # >
637     if ($self->{current_token}->{type} eq 'start tag') {
638     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
639     } elsif ($self->{current_token}->{type} eq 'end tag') {
640     $self->{content_model_flag} = 'PCDATA'; # MUST
641 wakaba 1.2 if ($self->{current_token}->{attributes}) {
642 wakaba 1.1 !!!parse-error;
643     }
644     } else {
645     die "$0: $self->{current_token}->{type}: Unknown token type";
646     }
647     $self->{state} = 'data';
648     !!!next-input-character;
649    
650     !!!emit ($self->{current_token}); # start tag or end tag
651     undef $self->{current_token};
652    
653     redo A;
654     } elsif (0x0041 <= $self->{next_input_character} and
655     $self->{next_input_character} <= 0x005A) { # A..Z
656     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
657     value => ''};
658     $self->{state} = 'attribute name';
659     !!!next-input-character;
660     redo A;
661     } elsif ($self->{next_input_character} == 0x002F) { # /
662     !!!next-input-character;
663     if ($self->{next_input_character} == 0x003E and # >
664     $self->{current_token}->{type} eq 'start tag' and
665     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
666     # permitted slash
667     #
668     } else {
669     !!!parse-error;
670     }
671     ## Stay in the state
672     # next-input-character is already done
673     redo A;
674     } elsif ($self->{next_input_character} == 0x003C or # <
675     $self->{next_input_character} == -1) {
676     !!!parse-error;
677     if ($self->{current_token}->{type} eq 'start tag') {
678     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
679     } elsif ($self->{current_token}->{type} eq 'end tag') {
680     $self->{content_model_flag} = 'PCDATA'; # MUST
681 wakaba 1.2 if ($self->{current_token}->{attributes}) {
682 wakaba 1.1 !!!parse-error;
683     }
684     } else {
685     die "$0: $self->{current_token}->{type}: Unknown token type";
686     }
687     $self->{state} = 'data';
688     # reconsume
689    
690     !!!emit ($self->{current_token}); # start tag or end tag
691     undef $self->{current_token};
692    
693     redo A;
694     } else {
695     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
696     value => ''};
697     $self->{state} = 'attribute name';
698     !!!next-input-character;
699     redo A;
700     }
701     } elsif ($self->{state} eq 'attribute name') {
702     my $before_leave = sub {
703 wakaba 1.2 if (exists $self->{current_token}->{attributes} # start tag or end tag
704 wakaba 1.1 ->{$self->{current_attribute}->{name}}) { # MUST
705     !!!parse-error;
706     ## Discard $self->{current_attribute} # MUST
707     } else {
708 wakaba 1.2 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
709 wakaba 1.1 = $self->{current_attribute};
710     }
711     }; # $before_leave
712    
713     if ($self->{next_input_character} == 0x0009 or # HT
714     $self->{next_input_character} == 0x000A or # LF
715     $self->{next_input_character} == 0x000B or # VT
716     $self->{next_input_character} == 0x000C or # FF
717     $self->{next_input_character} == 0x0020) { # SP
718     $before_leave->();
719     $self->{state} = 'after attribute name';
720     !!!next-input-character;
721     redo A;
722     } elsif ($self->{next_input_character} == 0x003D) { # =
723     $before_leave->();
724     $self->{state} = 'before attribute value';
725     !!!next-input-character;
726     redo A;
727     } elsif ($self->{next_input_character} == 0x003E) { # >
728     $before_leave->();
729     if ($self->{current_token}->{type} eq 'start tag') {
730     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
731     } elsif ($self->{current_token}->{type} eq 'end tag') {
732     $self->{content_model_flag} = 'PCDATA'; # MUST
733 wakaba 1.2 if ($self->{current_token}->{attributes}) {
734 wakaba 1.1 !!!parse-error;
735     }
736     } else {
737     die "$0: $self->{current_token}->{type}: Unknown token type";
738     }
739     $self->{state} = 'data';
740     !!!next-input-character;
741    
742     !!!emit ($self->{current_token}); # start tag or end tag
743     undef $self->{current_token};
744    
745     redo A;
746     } elsif (0x0041 <= $self->{next_input_character} and
747     $self->{next_input_character} <= 0x005A) { # A..Z
748     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
749     ## Stay in the state
750     !!!next-input-character;
751     redo A;
752     } elsif ($self->{next_input_character} == 0x002F) { # /
753     $before_leave->();
754     !!!next-input-character;
755     if ($self->{next_input_character} == 0x003E and # >
756     $self->{current_token}->{type} eq 'start tag' and
757     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
758     # permitted slash
759     #
760     } else {
761     !!!parse-error;
762     }
763     $self->{state} = 'before attribute name';
764     # next-input-character is already done
765     redo A;
766     } elsif ($self->{next_input_character} == 0x003C or # <
767     $self->{next_input_character} == -1) {
768     !!!parse-error;
769     $before_leave->();
770     if ($self->{current_token}->{type} eq 'start tag') {
771     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
772     } elsif ($self->{current_token}->{type} eq 'end tag') {
773     $self->{content_model_flag} = 'PCDATA'; # MUST
774 wakaba 1.2 if ($self->{current_token}->{attributes}) {
775 wakaba 1.1 !!!parse-error;
776     }
777     } else {
778     die "$0: $self->{current_token}->{type}: Unknown token type";
779     }
780     $self->{state} = 'data';
781     # reconsume
782    
783     !!!emit ($self->{current_token}); # start tag or end tag
784     undef $self->{current_token};
785    
786     redo A;
787     } else {
788     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
789     ## Stay in the state
790     !!!next-input-character;
791     redo A;
792     }
793     } elsif ($self->{state} eq 'after attribute name') {
794     if ($self->{next_input_character} == 0x0009 or # HT
795     $self->{next_input_character} == 0x000A or # LF
796     $self->{next_input_character} == 0x000B or # VT
797     $self->{next_input_character} == 0x000C or # FF
798     $self->{next_input_character} == 0x0020) { # SP
799     ## Stay in the state
800     !!!next-input-character;
801     redo A;
802     } elsif ($self->{next_input_character} == 0x003D) { # =
803     $self->{state} = 'before attribute value';
804     !!!next-input-character;
805     redo A;
806     } elsif ($self->{next_input_character} == 0x003E) { # >
807     if ($self->{current_token}->{type} eq 'start tag') {
808     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
809     } elsif ($self->{current_token}->{type} eq 'end tag') {
810     $self->{content_model_flag} = 'PCDATA'; # MUST
811 wakaba 1.2 if ($self->{current_token}->{attributes}) {
812 wakaba 1.1 !!!parse-error;
813     }
814     } else {
815     die "$0: $self->{current_token}->{type}: Unknown token type";
816     }
817     $self->{state} = 'data';
818     !!!next-input-character;
819    
820     !!!emit ($self->{current_token}); # start tag or end tag
821     undef $self->{current_token};
822    
823     redo A;
824     } elsif (0x0041 <= $self->{next_input_character} and
825     $self->{next_input_character} <= 0x005A) { # A..Z
826     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
827     value => ''};
828     $self->{state} = 'attribute name';
829     !!!next-input-character;
830     redo A;
831     } elsif ($self->{next_input_character} == 0x002F) { # /
832     !!!next-input-character;
833     if ($self->{next_input_character} == 0x003E and # >
834     $self->{current_token}->{type} eq 'start tag' and
835     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
836     # permitted slash
837     #
838     } else {
839     !!!parse-error;
840     }
841     $self->{state} = 'before attribute name';
842     # next-input-character is already done
843     redo A;
844     } elsif ($self->{next_input_character} == 0x003C or # <
845     $self->{next_input_character} == -1) {
846     !!!parse-error;
847     if ($self->{current_token}->{type} eq 'start tag') {
848     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
849     } elsif ($self->{current_token}->{type} eq 'end tag') {
850     $self->{content_model_flag} = 'PCDATA'; # MUST
851 wakaba 1.2 if ($self->{current_token}->{attributes}) {
852 wakaba 1.1 !!!parse-error;
853     }
854     } else {
855     die "$0: $self->{current_token}->{type}: Unknown token type";
856     }
857     $self->{state} = 'data';
858     # reconsume
859    
860     !!!emit ($self->{current_token}); # start tag or end tag
861     undef $self->{current_token};
862    
863     redo A;
864     } else {
865     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
866     value => ''};
867     $self->{state} = 'attribute name';
868     !!!next-input-character;
869     redo A;
870     }
871     } elsif ($self->{state} eq 'before attribute value') {
872     if ($self->{next_input_character} == 0x0009 or # HT
873     $self->{next_input_character} == 0x000A or # LF
874     $self->{next_input_character} == 0x000B or # VT
875     $self->{next_input_character} == 0x000C or # FF
876     $self->{next_input_character} == 0x0020) { # SP
877     ## Stay in the state
878     !!!next-input-character;
879     redo A;
880     } elsif ($self->{next_input_character} == 0x0022) { # "
881     $self->{state} = 'attribute value (double-quoted)';
882     !!!next-input-character;
883     redo A;
884     } elsif ($self->{next_input_character} == 0x0026) { # &
885     $self->{state} = 'attribute value (unquoted)';
886     ## reconsume
887     redo A;
888     } elsif ($self->{next_input_character} == 0x0027) { # '
889     $self->{state} = 'attribute value (single-quoted)';
890     !!!next-input-character;
891     redo A;
892     } elsif ($self->{next_input_character} == 0x003E) { # >
893     if ($self->{current_token}->{type} eq 'start tag') {
894     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
895     } elsif ($self->{current_token}->{type} eq 'end tag') {
896     $self->{content_model_flag} = 'PCDATA'; # MUST
897 wakaba 1.2 if ($self->{current_token}->{attributes}) {
898 wakaba 1.1 !!!parse-error;
899     }
900     } else {
901     die "$0: $self->{current_token}->{type}: Unknown token type";
902     }
903     $self->{state} = 'data';
904     !!!next-input-character;
905    
906     !!!emit ($self->{current_token}); # start tag or end tag
907     undef $self->{current_token};
908    
909     redo A;
910     } elsif ($self->{next_input_character} == 0x003C or # <
911     $self->{next_input_character} == -1) {
912     !!!parse-error;
913     if ($self->{current_token}->{type} eq 'start tag') {
914     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
915     } elsif ($self->{current_token}->{type} eq 'end tag') {
916     $self->{content_model_flag} = 'PCDATA'; # MUST
917 wakaba 1.2 if ($self->{current_token}->{attributes}) {
918 wakaba 1.1 !!!parse-error;
919     }
920     } else {
921     die "$0: $self->{current_token}->{type}: Unknown token type";
922     }
923     $self->{state} = 'data';
924     ## reconsume
925    
926     !!!emit ($self->{current_token}); # start tag or end tag
927     undef $self->{current_token};
928    
929     redo A;
930     } else {
931     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
932     $self->{state} = 'attribute value (unquoted)';
933     !!!next-input-character;
934     redo A;
935     }
936     } elsif ($self->{state} eq 'attribute value (double-quoted)') {
937     if ($self->{next_input_character} == 0x0022) { # "
938     $self->{state} = 'before attribute name';
939     !!!next-input-character;
940     redo A;
941     } elsif ($self->{next_input_character} == 0x0026) { # &
942     $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
943     $self->{state} = 'entity in attribute value';
944     !!!next-input-character;
945     redo A;
946     } elsif ($self->{next_input_character} == -1) {
947     !!!parse-error;
948     if ($self->{current_token}->{type} eq 'start tag') {
949     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
950     } elsif ($self->{current_token}->{type} eq 'end tag') {
951     $self->{content_model_flag} = 'PCDATA'; # MUST
952 wakaba 1.2 if ($self->{current_token}->{attributes}) {
953 wakaba 1.1 !!!parse-error;
954     }
955     } else {
956     die "$0: $self->{current_token}->{type}: Unknown token type";
957     }
958     $self->{state} = 'data';
959     ## reconsume
960    
961     !!!emit ($self->{current_token}); # start tag or end tag
962     undef $self->{current_token};
963    
964     redo A;
965     } else {
966     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
967     ## Stay in the state
968     !!!next-input-character;
969     redo A;
970     }
971     } elsif ($self->{state} eq 'attribute value (single-quoted)') {
972     if ($self->{next_input_character} == 0x0027) { # '
973     $self->{state} = 'before attribute name';
974     !!!next-input-character;
975     redo A;
976     } elsif ($self->{next_input_character} == 0x0026) { # &
977     $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
978     $self->{state} = 'entity in attribute value';
979     !!!next-input-character;
980     redo A;
981     } elsif ($self->{next_input_character} == -1) {
982     !!!parse-error;
983     if ($self->{current_token}->{type} eq 'start tag') {
984     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
985     } elsif ($self->{current_token}->{type} eq 'end tag') {
986     $self->{content_model_flag} = 'PCDATA'; # MUST
987 wakaba 1.2 if ($self->{current_token}->{attributes}) {
988 wakaba 1.1 !!!parse-error;
989     }
990     } else {
991     die "$0: $self->{current_token}->{type}: Unknown token type";
992     }
993     $self->{state} = 'data';
994     ## reconsume
995    
996     !!!emit ($self->{current_token}); # start tag or end tag
997     undef $self->{current_token};
998    
999     redo A;
1000     } else {
1001     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1002     ## Stay in the state
1003     !!!next-input-character;
1004     redo A;
1005     }
1006     } elsif ($self->{state} eq 'attribute value (unquoted)') {
1007     if ($self->{next_input_character} == 0x0009 or # HT
1008     $self->{next_input_character} == 0x000A or # LF
1009     $self->{next_input_character} == 0x000B or # VT
1010     $self->{next_input_character} == 0x000C or # FF
1011     $self->{next_input_character} == 0x0020) { # SP
1012     $self->{state} = 'before attribute name';
1013     !!!next-input-character;
1014     redo A;
1015     } elsif ($self->{next_input_character} == 0x0026) { # &
1016     $self->{last_attribute_value_state} = 'attribute value (unquoted)';
1017     $self->{state} = 'entity in attribute value';
1018     !!!next-input-character;
1019     redo A;
1020     } elsif ($self->{next_input_character} == 0x003E) { # >
1021     if ($self->{current_token}->{type} eq 'start tag') {
1022     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1023     } elsif ($self->{current_token}->{type} eq 'end tag') {
1024     $self->{content_model_flag} = 'PCDATA'; # MUST
1025 wakaba 1.2 if ($self->{current_token}->{attributes}) {
1026 wakaba 1.1 !!!parse-error;
1027     }
1028     } else {
1029     die "$0: $self->{current_token}->{type}: Unknown token type";
1030     }
1031     $self->{state} = 'data';
1032     !!!next-input-character;
1033    
1034     !!!emit ($self->{current_token}); # start tag or end tag
1035     undef $self->{current_token};
1036    
1037     redo A;
1038     } elsif ($self->{next_input_character} == 0x003C or # <
1039     $self->{next_input_character} == -1) {
1040     !!!parse-error;
1041     if ($self->{current_token}->{type} eq 'start tag') {
1042     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1043     } elsif ($self->{current_token}->{type} eq 'end tag') {
1044     $self->{content_model_flag} = 'PCDATA'; # MUST
1045 wakaba 1.2 if ($self->{current_token}->{attributes}) {
1046 wakaba 1.1 !!!parse-error;
1047     }
1048     } else {
1049     die "$0: $self->{current_token}->{type}: Unknown token type";
1050     }
1051     $self->{state} = 'data';
1052     ## reconsume
1053    
1054     !!!emit ($self->{current_token}); # start tag or end tag
1055     undef $self->{current_token};
1056    
1057     redo A;
1058     } else {
1059     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1060     ## Stay in the state
1061     !!!next-input-character;
1062     redo A;
1063     }
1064     } elsif ($self->{state} eq 'entity in attribute value') {
1065     my $token = $self->_tokenize_attempt_to_consume_an_entity;
1066    
1067     unless (defined $token) {
1068     $self->{current_attribute}->{value} .= '&';
1069     } else {
1070     $self->{current_attribute}->{value} .= $token->{data};
1071     ## ISSUE: spec says "append the returned character token to the current attribute's value"
1072     }
1073    
1074     $self->{state} = $self->{last_attribute_value_state};
1075     # next-input-character is already done
1076     redo A;
1077     } elsif ($self->{state} eq 'bogus comment') {
1078     ## (only happen if PCDATA state)
1079    
1080     my $token = {type => 'comment', data => ''};
1081    
1082     BC: {
1083     if ($self->{next_input_character} == 0x003E) { # >
1084     $self->{state} = 'data';
1085     !!!next-input-character;
1086    
1087     !!!emit ($token);
1088    
1089     redo A;
1090     } elsif ($self->{next_input_character} == -1) {
1091     $self->{state} = 'data';
1092     ## reconsume
1093    
1094     !!!emit ($token);
1095    
1096     redo A;
1097     } else {
1098     $token->{data} .= chr ($self->{next_input_character});
1099     !!!next-input-character;
1100     redo BC;
1101     }
1102     } # BC
1103     } elsif ($self->{state} eq 'markup declaration open') {
1104     ## (only happen if PCDATA state)
1105    
1106     my @next_char;
1107     push @next_char, $self->{next_input_character};
1108    
1109     if ($self->{next_input_character} == 0x002D) { # -
1110     !!!next-input-character;
1111     push @next_char, $self->{next_input_character};
1112     if ($self->{next_input_character} == 0x002D) { # -
1113     $self->{current_token} = {type => 'comment', data => ''};
1114     $self->{state} = 'comment';
1115     !!!next-input-character;
1116     redo A;
1117     }
1118     } elsif ($self->{next_input_character} == 0x0044 or # D
1119     $self->{next_input_character} == 0x0064) { # d
1120     !!!next-input-character;
1121     push @next_char, $self->{next_input_character};
1122     if ($self->{next_input_character} == 0x004F or # O
1123     $self->{next_input_character} == 0x006F) { # o
1124     !!!next-input-character;
1125     push @next_char, $self->{next_input_character};
1126     if ($self->{next_input_character} == 0x0043 or # C
1127     $self->{next_input_character} == 0x0063) { # c
1128     !!!next-input-character;
1129     push @next_char, $self->{next_input_character};
1130     if ($self->{next_input_character} == 0x0054 or # T
1131     $self->{next_input_character} == 0x0074) { # t
1132     !!!next-input-character;
1133     push @next_char, $self->{next_input_character};
1134     if ($self->{next_input_character} == 0x0059 or # Y
1135     $self->{next_input_character} == 0x0079) { # y
1136     !!!next-input-character;
1137     push @next_char, $self->{next_input_character};
1138     if ($self->{next_input_character} == 0x0050 or # P
1139     $self->{next_input_character} == 0x0070) { # p
1140     !!!next-input-character;
1141     push @next_char, $self->{next_input_character};
1142     if ($self->{next_input_character} == 0x0045 or # E
1143     $self->{next_input_character} == 0x0065) { # e
1144     ## ISSUE: What a stupid code this is!
1145     $self->{state} = 'DOCTYPE';
1146     !!!next-input-character;
1147     redo A;
1148     }
1149     }
1150     }
1151     }
1152     }
1153     }
1154     }
1155    
1156     !!!parse-error;
1157     $self->{next_input_character} = shift @next_char;
1158     !!!back-next-input-character (@next_char);
1159     $self->{state} = 'bogus comment';
1160     redo A;
1161    
1162     ## ISSUE: typos in spec: chacacters, is is a parse error
1163     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1164     } elsif ($self->{state} eq 'comment') {
1165     if ($self->{next_input_character} == 0x002D) { # -
1166     $self->{state} = 'comment dash';
1167     !!!next-input-character;
1168     redo A;
1169     } elsif ($self->{next_input_character} == -1) {
1170     !!!parse-error;
1171     $self->{state} = 'data';
1172     ## reconsume
1173    
1174     !!!emit ($self->{current_token}); # comment
1175     undef $self->{current_token};
1176    
1177     redo A;
1178     } else {
1179     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1180     ## Stay in the state
1181     !!!next-input-character;
1182     redo A;
1183     }
1184     } elsif ($self->{state} eq 'comment dash') {
1185     if ($self->{next_input_character} == 0x002D) { # -
1186     $self->{state} = 'comment end';
1187     !!!next-input-character;
1188     redo A;
1189     } elsif ($self->{next_input_character} == -1) {
1190     !!!parse-error;
1191     $self->{state} = 'data';
1192     ## reconsume
1193    
1194     !!!emit ($self->{current_token}); # comment
1195     undef $self->{current_token};
1196    
1197     redo A;
1198     } else {
1199     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1200     $self->{state} = 'comment';
1201     !!!next-input-character;
1202     redo A;
1203     }
1204     } elsif ($self->{state} eq 'comment end') {
1205     if ($self->{next_input_character} == 0x003E) { # >
1206     $self->{state} = 'data';
1207     !!!next-input-character;
1208    
1209     !!!emit ($self->{current_token}); # comment
1210     undef $self->{current_token};
1211    
1212     redo A;
1213     } elsif ($self->{next_input_character} == 0x002D) { # -
1214     !!!parse-error;
1215     $self->{current_token}->{data} .= '-'; # comment
1216     ## Stay in the state
1217     !!!next-input-character;
1218     redo A;
1219     } elsif ($self->{next_input_character} == -1) {
1220     !!!parse-error;
1221     $self->{state} = 'data';
1222     ## reconsume
1223    
1224     !!!emit ($self->{current_token}); # comment
1225     undef $self->{current_token};
1226    
1227     redo A;
1228     } else {
1229     !!!parse-error;
1230     $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1231     $self->{state} = 'comment';
1232     !!!next-input-character;
1233     redo A;
1234     }
1235     } elsif ($self->{state} eq 'DOCTYPE') {
1236     if ($self->{next_input_character} == 0x0009 or # HT
1237     $self->{next_input_character} == 0x000A or # LF
1238     $self->{next_input_character} == 0x000B or # VT
1239     $self->{next_input_character} == 0x000C or # FF
1240     $self->{next_input_character} == 0x0020) { # SP
1241     $self->{state} = 'before DOCTYPE name';
1242     !!!next-input-character;
1243     redo A;
1244     } else {
1245     !!!parse-error;
1246     $self->{state} = 'before DOCTYPE name';
1247     ## reconsume
1248     redo A;
1249     }
1250     } elsif ($self->{state} eq 'before DOCTYPE name') {
1251     if ($self->{next_input_character} == 0x0009 or # HT
1252     $self->{next_input_character} == 0x000A or # LF
1253     $self->{next_input_character} == 0x000B or # VT
1254     $self->{next_input_character} == 0x000C or # FF
1255     $self->{next_input_character} == 0x0020) { # SP
1256     ## Stay in the state
1257     !!!next-input-character;
1258     redo A;
1259     } elsif (0x0061 <= $self->{next_input_character} and
1260     $self->{next_input_character} <= 0x007A) { # a..z
1261     $self->{current_token} = {type => 'DOCTYPE',
1262     name => chr ($self->{next_input_character} - 0x0020),
1263     error => 1};
1264     $self->{state} = 'DOCTYPE name';
1265     !!!next-input-character;
1266     redo A;
1267     } elsif ($self->{next_input_character} == 0x003E) { # >
1268     !!!parse-error;
1269     $self->{state} = 'data';
1270     !!!next-input-character;
1271    
1272     !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1273    
1274     redo A;
1275     } elsif ($self->{next_input_character} == -1) {
1276     !!!parse-error;
1277     $self->{state} = 'data';
1278     ## reconsume
1279    
1280     !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1281    
1282     redo A;
1283     } else {
1284     $self->{current_token} = {type => 'DOCTYPE',
1285     name => chr ($self->{next_input_character}),
1286     error => 1};
1287     $self->{state} = 'DOCTYPE name';
1288     !!!next-input-character;
1289     redo A;
1290     }
1291     } elsif ($self->{state} eq 'DOCTYPE name') {
1292     if ($self->{next_input_character} == 0x0009 or # HT
1293     $self->{next_input_character} == 0x000A or # LF
1294     $self->{next_input_character} == 0x000B or # VT
1295     $self->{next_input_character} == 0x000C or # FF
1296     $self->{next_input_character} == 0x0020) { # SP
1297     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1298     $self->{state} = 'after DOCTYPE name';
1299     !!!next-input-character;
1300     redo A;
1301     } elsif ($self->{next_input_character} == 0x003E) { # >
1302     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1303     $self->{state} = 'data';
1304     !!!next-input-character;
1305    
1306     !!!emit ($self->{current_token}); # DOCTYPE
1307     undef $self->{current_token};
1308    
1309     redo A;
1310     } elsif (0x0061 <= $self->{next_input_character} and
1311     $self->{next_input_character} <= 0x007A) { # a..z
1312     $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1313     #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1314     ## Stay in the state
1315     !!!next-input-character;
1316     redo A;
1317     } elsif ($self->{next_input_character} == -1) {
1318     !!!parse-error;
1319     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1320     $self->{state} = 'data';
1321     ## reconsume
1322    
1323     !!!emit ($self->{current_token});
1324     undef $self->{current_token};
1325    
1326     redo A;
1327     } else {
1328 wakaba 1.3 $self->{current_token}->{name}
1329     .= chr ($self->{next_input_character}); # DOCTYPE
1330 wakaba 1.1 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1331     ## Stay in the state
1332     !!!next-input-character;
1333     redo A;
1334     }
1335     } elsif ($self->{state} eq 'after DOCTYPE name') {
1336     if ($self->{next_input_character} == 0x0009 or # HT
1337     $self->{next_input_character} == 0x000A or # LF
1338     $self->{next_input_character} == 0x000B or # VT
1339     $self->{next_input_character} == 0x000C or # FF
1340     $self->{next_input_character} == 0x0020) { # SP
1341     ## Stay in the state
1342     !!!next-input-character;
1343     redo A;
1344     } elsif ($self->{next_input_character} == 0x003E) { # >
1345     $self->{state} = 'data';
1346     !!!next-input-character;
1347    
1348     !!!emit ($self->{current_token}); # DOCTYPE
1349     undef $self->{current_token};
1350    
1351     redo A;
1352     } elsif ($self->{next_input_character} == -1) {
1353     !!!parse-error;
1354     $self->{state} = 'data';
1355     ## reconsume
1356    
1357     !!!emit ($self->{current_token}); # DOCTYPE
1358     undef $self->{current_token};
1359    
1360     redo A;
1361     } else {
1362     !!!parse-error;
1363     $self->{current_token}->{error} = 1; # DOCTYPE
1364     $self->{state} = 'bogus DOCTYPE';
1365     !!!next-input-character;
1366     redo A;
1367     }
1368     } elsif ($self->{state} eq 'bogus DOCTYPE') {
1369     if ($self->{next_input_character} == 0x003E) { # >
1370     $self->{state} = 'data';
1371     !!!next-input-character;
1372    
1373     !!!emit ($self->{current_token}); # DOCTYPE
1374     undef $self->{current_token};
1375    
1376     redo A;
1377     } elsif ($self->{next_input_character} == -1) {
1378     !!!parse-error;
1379     $self->{state} = 'data';
1380     ## reconsume
1381    
1382     !!!emit ($self->{current_token}); # DOCTYPE
1383     undef $self->{current_token};
1384    
1385     redo A;
1386     } else {
1387     ## Stay in the state
1388     !!!next-input-character;
1389     redo A;
1390     }
1391     } else {
1392     die "$0: $self->{state}: Unknown state";
1393     }
1394     } # A
1395    
1396     die "$0: _get_next_token: unexpected case";
1397     } # _get_next_token
1398    
## Attempt to consume an entity (character reference).
##
## On entry, |$self->{next_input_character}| is the code point of the
## character that follows the "&".  Three forms are tried:
##
##   - "#x" / "#X" followed by hexadecimal digits,
##   - "#" followed by decimal digits,
##   - a name made of ASCII letters and digits, looked up in the
##     file-level |$entity_char| table.
##
## Returns a hash reference |{type => 'character', data => $string}|
## when a reference is recognized, or |undef| when it is not.  A numeric
## reference whose value is zero or greater than 1114111 (U+10FFFF)
## yields U+FFFD.  A missing ";" terminator is reported as a parse
## error, but the reference is still accepted.
##
## NOTE: The |!!!...| lines are macros expanded by the preprocessor for
## this |.src| file; their definitions are not part of this subroutine.
## Judging from their use here, |!!!next-input-character| advances
## |$self->{next_input_character}| and |!!!back-next-input-character|
## queues code points to be read again, in the order given -- confirm
## against the macro definitions.
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  if ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    my $num;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      X: {
        ## On the first pass |$x_char| is the "x" or "X" itself.  Each
        ## |redo X| runs these two statements again, which steps past
        ## the digit that has just been accumulated into |$num|.
        my $x_char = $self->{next_input_character};
        !!!next-input-character;
        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0030;
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0060 + 9;
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0040 + 9;
          redo X;
        } elsif (not defined $num) { # no hexadecimal digit
          !!!parse-error;
          ## Nothing is to be consumed: restore "#" as the next input
          ## character and hand back both the "x" / "X" and the
          ## character that followed it.  The latter has to be pushed
          ## back before |next_input_character| is overwritten, or it
          ## would be lost (the decimal branch below does the same).
          !!!back-next-input-character ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error;
        }

        ## TODO: check the definition for |a valid Unicode character|.
        if ($num > 1114111 or $num == 0) {
          $num = 0xFFFD; # REPLACEMENT CHARACTER
          ## ISSUE: Why this is not an error?
        }

        return {type => 'character', data => chr $num};
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error;
      }

      ## TODO: check the definition for |a valid Unicode character|.
      if ($code > 1114111 or $code == 0) {
        $code = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      }

      return {type => 'character', data => chr $code};
    } else {
      ## "#" followed by neither "x" / "X" nor a digit: hand back the
      ## character after "#" and make "#" the next input character.
      !!!parse-error;
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## |$value| is the text to be returned.  It starts as the raw name
    ## and is replaced by the entity's character whenever the name
    ## collected so far is a key of |$entity_char|.  Characters read
    ## after a match are appended to |$value| literally.
    my $value = $entity_name;
    my $match;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x005A) or
            (0x0061 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x007A) or
            (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039))) {
      $entity_name .= chr $self->{next_input_character};
      if (defined $entity_char->{$entity_name}) {
        $value = $entity_char->{$entity_name};
        $match = 1;
      } else {
        $value .= chr $self->{next_input_character};
      }
      !!!next-input-character;
    }

    if ($match) {
      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error;
      }

      return {type => 'character', data => $value};
    } else {
      !!!parse-error;
      ## NOTE: No characters are consumed in the spec.
      ## The characters read so far are handed to |!!!back-token| as a
      ## character token.
      !!!back-token ({type => 'character', data => $value});
      return undef;
    }
  } else {
    ## no characters are consumed
    !!!parse-error;
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
1529    
## Prepare for the tree construction stage: load |What::NanoDOM| on
## demand, create a new document object, store it in
## |$self->{document}|, and call |strict_error_checking (0)| on it.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  require What::NanoDOM;

  my $doc = What::NanoDOM::Document->new;
  $self->{document} = $doc;

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->strict_error_checking (0);
} # _initialize_tree_constructor
1538    
## Finish the tree construction stage by calling
## |strict_error_checking (1)| on the document stored in
## |$self->{document}|.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;

  my $doc = $self->{document};

  ## TODO: Turn mutation events on
  $doc->strict_error_checking (1);
} # _terminate_tree_constructor
1544    
1545     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1546    
1547     sub _construct_tree ($) {
1548     my ($self) = @_;
1549    
1550     ## When an interactive UA render the $self->{document} available
1551     ## to the user, or when it begin accepting user input, are
1552     ## not defined.
1553    
1554     ## Append a character: collect it and all subsequent consecutive
1555     ## characters and insert one Text node whose data is concatenation
1556     ## of all those characters. # MUST
1557    
1558     my $token;
1559     !!!next-token;
1560    
1561     my $phase = 'initial'; # MUST
1562    
1563     my $open_elements = [];
1564     my $active_formatting_elements = [];
1565     my $head_element;
1566     my $form_element;
1567     my $insertion_mode = 'before head';
1568    
1569     my $reconstruct_active_formatting_elements = sub { # MUST
1570     ## Step 1
1571     return unless @$active_formatting_elements;
1572    
1573     ## Step 3
1574     my $i = -1;
1575     my $entry = $active_formatting_elements->[$i];
1576    
1577     ## Step 2
1578     return if $entry->[0] eq '#marker';
1579     for (@$open_elements) {
1580     if ($entry->[0] eq $_->[0]) {
1581     return;
1582     }
1583     }
1584    
1585     ## Step 4
1586     S4: {
1587     last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
1588    
1589     ## Step 5
1590     $i--;
1591     $entry = $active_formatting_elements->[$i];
1592    
1593     ## Step 6
1594     if ($entry->[0] eq '#marker') {
1595     #
1596     } else {
1597     my $in_open_elements;
1598     OE: for (@$open_elements) {
1599     if ($entry->[0] eq $_->[0]) {
1600     $in_open_elements = 1;
1601     last OE;
1602     }
1603     }
1604     if ($in_open_elements) {
1605     #
1606     } else {
1607     redo S4;
1608     }
1609     }
1610    
1611     ## Step 7
1612     $i++;
1613     $entry = $active_formatting_elements->[$i];
1614     } # S4
1615    
1616     S7: {
1617     ## Step 8
1618     my $clone = $entry->[0]->clone_node (0);
1619    
1620     ## Step 9
1621     $open_elements->[-1]->[0]->append_child ($clone);
1622     push @$open_elements, [$clone, $entry->[1]];
1623    
1624     ## Step 10
1625     $active_formatting_elements->[$i] = $open_elements->[-1];
1626    
1627     unless ($i == $#$active_formatting_elements) {
1628     ## Step 7'
1629     $i++;
1630     $entry = $active_formatting_elements->[$i];
1631    
1632     redo S7;
1633     }
1634     } # S7
1635     }; # $reconstruct_active_formatting_elements
1636    
1637     my $clear_up_to_marker = sub {
1638     for (reverse 0..$#$active_formatting_elements) {
1639     if ($active_formatting_elements->[$_]->[0] eq '#marker') {
1640     splice @$active_formatting_elements, $_;
1641     return;
1642     }
1643     }
1644     }; # $clear_up_to_marker
1645    
1646     my $reset_insertion_mode = sub {
1647     ## Step 1
1648     my $last;
1649    
1650     ## Step 2
1651     my $i = -1;
1652     my $node = $open_elements->[$i];
1653    
1654     ## Step 3
1655     S3: {
1656     $last = 1 if $open_elements->[0]->[0] eq $node->[0];
1657     ## TODO: the element whose inner_html is set is neither td nor th, then $node = the element
1658    
1659     ## Step 4..13
1660     my $new_mode = {
1661     select => 'in select',
1662     td => 'in cell',
1663     th => 'in cell',
1664     tr => 'in row',
1665     tbody => 'in table body',
1666     thead => 'in table head',
1667     tfoot => 'in table foot',
1668     caption => 'in caption',
1669     colgroup => 'in column group',
1670     table => 'in table',
1671     head => 'in body', # not in head!
1672     body => 'in body',
1673     frameset => 'in frameset',
1674     }->{$node->[1]};
1675     $insertion_mode = $new_mode and return if defined $new_mode;
1676    
1677     ## Step 14
1678     if ($node->[1] eq 'html') {
1679     unless (defined $head_element) {
1680     $insertion_mode = 'before head';
1681     } else {
1682     $insertion_mode = 'after head';
1683     }
1684     return;
1685     }
1686    
1687     ## Step 15
1688     $insertion_mode = 'in body' and return if $last;
1689    
1690     ## Step 16
1691     $i--;
1692     $node = $open_elements->[$i];
1693    
1694     ## Step 17
1695     redo S3;
1696     } # S3
1697     }; # $reset_insertion_mode
1698    
1699     my $style_start_tag = sub {
1700     my $style_el; !!!create-element ($style_el, 'style');
1701     ## $insertion_mode eq 'in head' and ... (always true)
1702     (($insertion_mode eq 'in head' and defined $head_element)
1703     ? $head_element : $open_elements->[-1]->[0])
1704     ->append_child ($style_el);
1705     $self->{content_model_flag} = 'CDATA';
1706    
1707     my $text = '';
1708     !!!next-token;
1709     while ($token->{type} eq 'character') {
1710     $text .= $token->{data};
1711     !!!next-token;
1712     } # stop if non-character token or tokenizer stops tokenising
1713     if (length $text) {
1714     $style_el->manakai_append_text ($text);
1715     }
1716    
1717     $self->{content_model_flag} = 'PCDATA';
1718    
1719     if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
1720     ## Ignore the token
1721     } else {
1722     !!!parse-error;
1723     ## ISSUE: And ignore?
1724     }
1725     !!!next-token;
1726     }; # $style_start_tag
1727    
## Handles a |script| start tag.  Expects the current |$token| to be
## that start tag.  Creates the |script| element, collects the
## following character tokens as its text while the content model flag
## is |CDATA|, then appends the element either to the |head| element
## (when the insertion mode is "in head" and a |head| element exists)
## or to the current node.  On return |$token| is the token following
## the one that terminated the script text.
my $script_start_tag = sub {
  ## Create the element with the attributes of the start tag token so
  ## that |src|, |type|, etc. are preserved (the other element creation
  ## sites in this file pass |$token->{attributes}| the same way).
  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';

  ## Concatenate consecutive character tokens into one text string.
  my $text = '';
  !!!next-token;
  while ($token->{type} eq 'character') {
    $text .= $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  if (length $text) {
    $script_el->manakai_append_text ($text);
  }

  $self->{content_model_flag} = 'PCDATA';

  if ($token->{type} eq 'end tag' and
      $token->{tag_name} eq 'script') {
    ## Ignore the token
  } else {
    !!!parse-error;
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  ## TODO: inner_html mode then mark as "already executed" and skip
  if (1) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    (($insertion_mode eq 'in head' and defined $head_element)
     ? $head_element : $open_elements->[-1]->[0])->append_child ($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }

  !!!next-token;
}; # $script_start_tag
1770    
## Handles an end tag whose tag name is that of a formatting element
## (|a|, |b|, |i|, ...).  The "Step N" comments below follow the
## numbering used by the original author; presumably they correspond to
## the HTML5 "adoption agency" steps -- confirm against the spec
## revision in use.
##
## Parameter: |$tag_name| - the tag name of the end tag being processed.
##
## Entries of |$open_elements| and |$active_formatting_elements| are
## array references of the form |[$element, $tag_name]|; a marker in
## the active formatting list is |['#marker', '']|.
##
## The sub either fetches the next token and returns, or restructures
## the tree and the two lists and then restarts from Step 1 (|redo FET|)
## until one of the returning cases applies.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last entry with this tag name after the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error;
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error;
      ## The formatting element is in the active list but not in the
      ## stack of open elements: remove exactly that entry.  (It is not
      ## necessarily the last entry of the list, so |pop| is wrong.)
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $open_elements->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error;
    }

    ## Step 2
    ## Scanning from the current node towards the formatting element,
    ## the last special/scoping, non-formatting node seen (i.e. the one
    ## nearest to the formatting element) becomes the furthest block.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      ## No furthest block: pop up to and including the formatting
      ## element and drop it from the active formatting list.
      splice @$open_elements, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $open_elements->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE(review): if the formatting element is the first entry of
    ## the active list, the index below is -1 and refers to the last
    ## entry -- confirm whether that case needs special handling.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $open_elements->[$node_i_in_open];

      ## Step 2
      ## If |$node| is not in the active formatting list, remove it
      ## from the stack of open elements and go back to Step 7.1.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @$open_elements, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      ## Replace |$node| by a shallow clone in both lists if it already
      ## has children.
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $open_elements->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first; it is modified while being moved.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the active list and insert
    ## the clone just after the bookmark.  |$i| is decremented when the
    ## removal happens below the bookmark position found so far.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack of open elements
    ## and insert the clone immediately after the furthest block.  The
    ## clone is inserted (length 0); it must not overwrite whatever
    ## entry currently follows the furthest block.
    undef $i;
    OE: for (reverse 0..$#$open_elements) {
      if ($open_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$open_elements, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($open_elements->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    splice @$open_elements, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
1953    
## Processes the current |$token| when it is a start tag or an end tag,
## dispatching on its tag name.  Tokens of any other type are left
## untouched by this sub.
##
## Parameter: |$insert| - a code reference that is called with a newly
## created element in order to insert it into the tree.
##
## Every branch either fetches the next token (|!!!next-token|) or
## replaces |$token| by a synthesized token that is to be reprocessed
## (after |!!!back-token|, which presumably pushes the current token
## back for later -- confirm against the macro definition), and then
## returns.
my $in_body = sub {
  my $insert = shift;
  if ($token->{type} eq 'start tag') {
    if ($token->{tag_name} eq 'script') {
      $script_start_tag->();
      return;
    } elsif ($token->{tag_name} eq 'style') {
      $style_start_tag->();
      return;
    } elsif ({
              base => 1, link => 1, meta => 1, title => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## NOTE: This is an "as if in head" code clone
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});
      if (defined $head_element) {
        $head_element->append_child ($el);
      } else {
        $insert->($el);
      }

      ## ISSUE: Issue on magical <base> in the spec

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'body') {
      !!!parse-error;

      if (@$open_elements == 1 or
          $open_elements->[1]->[1] ne 'body') {
        ## Ignore the token
      } else {
        ## Copy only those attributes that the existing |body| element
        ## does not have yet.
        my $body_el = $open_elements->[1]->[0];
        for my $attr_name (keys %{$token->{attributes}}) {
          unless ($body_el->has_attribute_ns (undef, $attr_name)) {
            $body_el->set_attribute_ns
                (undef, [undef, $attr_name],
                 $token->{attributes}->{$attr_name}->{value});
          }
        }
      }
      !!!next-token;
      return;
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, p => 1, ul => 1,
              pre => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      if ($token->{tag_name} eq 'pre') {
        ## Drop a single newline immediately following <pre>.
        !!!next-token;
        if ($token->{type} eq 'character') {
          $token->{data} =~ s/^\x0A//;
          unless (length $token->{data}) {
            !!!next-token;
          }
        }
      } else {
        !!!next-token;
      }
      return;
    } elsif ($token->{tag_name} eq 'form') {
      if (defined $form_element) {
        !!!parse-error;
        ## Ignore the token
        ## The ignored token has to be consumed here; otherwise the
        ## same start tag would be seen again by the caller.
        !!!next-token;
        return;
      } else {
        ## has a p element in scope
        INSCOPE: for (reverse @$open_elements) {
          if ($_->[1] eq 'p') {
            !!!back-token;
            $token = {type => 'end tag', tag_name => 'p'};
            return;
          } elsif ({
                    table => 1, caption => 1, td => 1, th => 1,
                    button => 1, marquee => 1, object => 1, html => 1,
                   }->{$_->[1]}) {
            last INSCOPE;
          }
        } # INSCOPE

        !!!insert-element-t ($token->{tag_name}, $token->{attributes});
        $form_element = $open_elements->[-1]->[0];
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'li') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## Step 1
      ## |$i| is a negative index: -1 is the current node, -2 the entry
      ## before it in the stack of open elements, and so on.
      my $i = -1;
      my $node = $open_elements->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'li') {
          splice @$open_elements, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        ## Move to the previous entry in the stack (towards |html|).
        $i--;
        $node = $open_elements->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## Step 1
      ## |$i| is a negative index, as in the |li| case above.
      my $i = -1;
      my $node = $open_elements->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
          splice @$open_elements, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        ## Move to the previous entry in the stack (towards |html|).
        $i--;
        $node = $open_elements->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'plaintext') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'PLAINTEXT';

      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if (defined $i) {
        ## A heading is still open: pop up to and including it.
        !!!parse-error;
        splice @$open_elements, $i;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'a') {
      AFE: for my $i (reverse 0..$#$active_formatting_elements) {
        my $node = $active_formatting_elements->[$i];
        if ($node->[1] eq 'a') {
          !!!parse-error;

          ## Act as if </a> had been seen.
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'a'};
          $formatting_end_tag->($token->{tag_name});

          ## Remove the old |a| element from both lists, but only that
          ## element and only if the end tag processing above has not
          ## already removed it.  The lists may have been changed by
          ## the call, so the element is looked up again.
          AFE2: for (reverse 0..$#$active_formatting_elements) {
            if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
              splice @$active_formatting_elements, $_, 1;
              last AFE2;
            }
          } # AFE2
          OE: for (reverse 0..$#$open_elements) {
            if ($open_elements->[$_]->[0] eq $node->[0]) {
              splice @$open_elements, $_, 1;
              last OE;
            }
          } # OE
          last AFE;
        } elsif ($node->[0] eq '#marker') {
          last AFE;
        }
      } # AFE

      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $open_elements->[-1];

      !!!next-token;
      return;
    } elsif ({
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $open_elements->[-1];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'button') {
      ## has a button element in scope
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq 'button') {
          !!!parse-error;
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'button'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'marquee' or
             $token->{tag_name} eq 'object') {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'xmp') {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'CDATA';

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'table') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $insertion_mode = 'in table';

      !!!next-token;
      return;
    } elsif ({
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
              image => 1,
             }->{$token->{tag_name}}) {
      if ($token->{tag_name} eq 'image') {
        !!!parse-error;
        $token->{tag_name} = 'img';
      }

      $reconstruct_active_formatting_elements->();

      ## Insert the element and pop it at once; it has no content.
      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'hr') {
      ## has a p element in scope
      INSCOPE: for (reverse @$open_elements) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'input') {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      ## TODO: associate with $form_element if defined
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'isindex') {
      !!!parse-error;

      if (defined $form_element) {
        ## Ignore the token
        !!!next-token;
        return;
      } else {
        ## Replace <isindex> by an equivalent sequence of tokens.  The
        ## first one becomes the current token; the rest are handed to
        ## |!!!back-token|.
        my $at = $token->{attributes};
        $at->{name} = {name => 'name', value => 'isindex'};
        my @tokens = (
          {type => 'start tag', tag_name => 'form'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'start tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'label'},
          {type => 'character',
           data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
          ## TODO: make this configurable
          {type => 'start tag', tag_name => 'input', attributes => $at},
          #{type => 'character', data => ''}, # SHOULD
          {type => 'end tag', tag_name => 'label'},
          {type => 'end tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'end tag', tag_name => 'form'},
        );
        $token = shift @tokens;
        !!!back-token (@tokens);
        return;
      }
    } elsif ({
              textarea => 1,
              noembed => 1,
              noframes => 1,
              noscript => 0, ## TODO: 1 if scripting is enabled
             }->{$token->{tag_name}}) {
      my $tag_name = $token->{tag_name};
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});

      if ($token->{tag_name} eq 'textarea') {
        ## TODO: form_element if defined
        $self->{content_model_flag} = 'RCDATA';
      } else {
        $self->{content_model_flag} = 'CDATA';
      }

      $insert->($el);

      ## Concatenate consecutive character tokens into one text string.
      my $text = '';
      !!!next-token;
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $tag_name) {
        ## Ignore the token
      } else {
        !!!parse-error;
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'select') {
      ## NOTE: The test is on |tag_name|; the token type is always
      ## "start tag" in this arm.
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $insertion_mode = 'in select';
      !!!next-token;
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: An issue on HTML5 new elements in the spec.
    } else {
      $reconstruct_active_formatting_elements->();

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    }
  } elsif ($token->{type} eq 'end tag') {
    if ($token->{tag_name} eq 'body') {
      if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($open_elements->[-1]->[1] ne 'body') {
          !!!parse-error;
        }
        $insertion_mode = 'after body';
        !!!next-token;
        return;
      } else {
        !!!parse-error;
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'html') {
      if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($open_elements->[-1]->[1] ne 'body') {
          !!!parse-error;
        }
        $insertion_mode = 'after body';
        ## reprocess
        return;
      } else {
        !!!parse-error;
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, pre => 1, ul => 1,
              form => 1,
              p => 1,
              dd => 1, dt => 1, li => 1,
              button => 1, marquee => 1, object => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq $token->{tag_name}) {
          ## generate implied end tags
          ## (an element with the same tag name as the token is not
          ## closed implicitly here)
          if ({
               dd => ($token->{tag_name} ne 'dd'),
               dt => ($token->{tag_name} ne 'dt'),
               li => ($token->{tag_name} ne 'li'),
               p => ($token->{tag_name} ne 'p'),
               td => 1, th => 1, tr => 1,
              }->{$open_elements->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $open_elements->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE unless $token->{tag_name} eq 'p';
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error;
      }

      splice @$open_elements, $i if defined $i;
      undef $form_element if $token->{tag_name} eq 'form';
      $clear_up_to_marker->()
          if {
            button => 1, marquee => 1, object => 1,
          }->{$token->{tag_name}};
      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$open_elements->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $open_elements->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error;
      }

      splice @$open_elements, $i if defined $i;
      !!!next-token;
      return;
    } elsif ({
              a => 1,
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $formatting_end_tag->($token->{tag_name});
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, hr => 1, iframe => 1, image => 1,
              img => 1, input => 1, isindex=> 1, noembed => 1,
              noframes => 1, param => 1, select => 1, spacer => 1,
              table => 1, textarea => 1, wbr => 1,
              noscript => 0, ## TODO: if scripting is enabled
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: Issue on HTML5 new elements in spec

    } else {
      ## Step 1
      ## |$node_i| is a negative index into the stack of open elements,
      ## starting at the current node.
      my $node_i = -1;
      my $node = $open_elements->[$node_i];

      ## Step 2
      S2: {
        if ($node->[1] eq $token->{tag_name}) {
          ## Step 1
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$open_elements->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $open_elements->[-1]->[1]}; # MUST
            return;
          }

          ## Step 2
          if ($token->{tag_name} ne $open_elements->[-1]->[1]) {
            !!!parse-error;
          }

          ## Step 3
          splice @$open_elements, $node_i;
          ## The end tag has been handled; consume it, as every other
          ## completing branch of this sub does.
          !!!next-token;
          last S2;
        } else {
          ## Step 3
          if (not $formatting_category->{$node->[1]} and
              #not $phrasing_category->{$node->[1]} and
              ($special_category->{$node->[1]} or
               $scoping_category->{$node->[1]})) {
            !!!parse-error;
            ## Ignore the token
            !!!next-token;
            last S2;
          }
        }

        ## Step 4
        $node_i--;
        $node = $open_elements->[$node_i];

        ## Step 5;
        redo S2;
      } # S2
    }
  }
}; # $in_body
2649    
2650     B: {
2651     if ($phase eq 'initial') {
2652     if ($token->{type} eq 'DOCTYPE') {
2653     if ($token->{error}) {
2654     ## ISSUE: Spec currently left this case undefined.
2655 wakaba 1.6 !!!parse-error ('missing DOCTYPE');
2656 wakaba 1.2 }
2657     my $doctype = $self->{document}->create_document_type_definition
2658     ($token->{name});
2659     $self->{document}->append_child ($doctype);
2660     $phase = 'root element';
2661     !!!next-token;
2662     redo B;
2663     } elsif ({
2664     comment => 1,
2665     'start tag' => 1,
2666     'end tag' => 1,
2667     'end-of-file' => 1,
2668     }->{$token->{type}}) {
2669     ## ISSUE: Spec currently left this case undefined.
2670 wakaba 1.6 !!!parse-error ('missing DOCTYPE');
2671 wakaba 1.2 $phase = 'root element';
2672     ## reprocess
2673     redo B;
2674     } elsif ($token->{type} eq 'character') {
2675     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2676     $self->{document}->manakai_append_text ($1);
2677     ## ISSUE: DOM3 Core does not allow Document > Text
2678     unless (length $token->{data}) {
2679     ## Stay in the phase
2680     !!!next-token;
2681     redo B;
2682     }
2683     }
2684     ## ISSUE: Spec currently left this case undefined.
2685 wakaba 1.6 !!!parse-error ('missing DOCTYPE');
2686 wakaba 1.2 $phase = 'root element';
2687     ## reprocess
2688     redo B;
2689     } else {
2690     die "$0: $token->{type}: Unknown token";
2691     }
2692     } elsif ($phase eq 'root element') {
2693     if ($token->{type} eq 'DOCTYPE') {
2694     !!!parse-error;
2695     ## Ignore the token
2696     ## Stay in the phase
2697     !!!next-token;
2698     redo B;
2699     } elsif ($token->{type} eq 'comment') {
2700     my $comment = $self->{document}->create_comment ($token->{data});
2701     $self->{document}->append_child ($comment);
2702     ## Stay in the phase
2703     !!!next-token;
2704     redo B;
2705     } elsif ($token->{type} eq 'character') {
2706     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2707     $self->{document}->manakai_append_text ($1);
2708     ## ISSUE: DOM3 Core does not allow Document > Text
2709     unless (length $token->{data}) {
2710     ## Stay in the phase
2711     !!!next-token;
2712     redo B;
2713     }
2714     }
2715     #
2716     } elsif ({
2717     'start tag' => 1,
2718     'end tag' => 1,
2719     'end-of-file' => 1,
2720     }->{$token->{type}}) {
2721     ## ISSUE: There is an issue in the spec
2722     #
2723     } else {
2724     die "$0: $token->{type}: Unknown token";
2725     }
2726     my $root_element; !!!create-element ($root_element, 'html');
2727     $self->{document}->append_child ($root_element);
2728     $open_elements = [[$root_element, 'html']];
2729     $phase = 'main';
2730     ## reprocess
2731     redo B;
2732     } elsif ($phase eq 'main') {
2733     if ($token->{type} eq 'DOCTYPE') {
2734     !!!parse-error;
2735     ## Ignore the token
2736     ## Stay in the phase
2737     !!!next-token;
2738     redo B;
2739     } elsif ($token->{type} eq 'start tag' and
2740     $token->{tag_name} eq 'html') {
2741     ## TODO: unless it is the first start tag token, parse-error
2742     my $top_el = $open_elements->[0]->[0];
2743     for my $attr_name (keys %{$token->{attributes}}) {
2744     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2745     $top_el->set_attribute_ns (undef, [undef, $attr_name],
2746     $token->{attributes}->{value});
2747     }
2748     }
2749     !!!next-token;
2750     redo B;
2751     } elsif ($token->{type} eq 'end-of-file') {
2752     ## Generate implied end tags
2753     if ({
2754     dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2755     }->{$open_elements->[-1]->[1]}) {
2756     !!!back-token;
2757     $token = {type => 'end tag', tag_name => $open_elements->[-1]->[1]};
2758     redo B;
2759     }
2760    
2761     if (@$open_elements > 2 or
2762     (@$open_elements == 2 and $open_elements->[1]->[1] ne 'body')) {
2763     !!!parse-error;
2764     } else {
2765     ## TODO: inner_html parser and @$open_elements > 1 and $open_elements->[1] ne 'body', then parse-error
2766     }
2767    
2768     ## Stop parsing
2769     last B;
2770    
2771     ## ISSUE: There is an issue in the spec.
2772     } else {
2773     if ($insertion_mode eq 'before head') {
2774     if ($token->{type} eq 'character') {
2775     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2776     $open_elements->[-1]->[0]->manakai_append_text ($1);
2777     unless (length $token->{data}) {
2778     !!!next-token;
2779     redo B;
2780     }
2781     }
2782     ## As if <head>
2783     !!!create-element ($head_element, 'head');
2784     $open_elements->[-1]->[0]->append_child ($head_element);
2785     push @$open_elements, [$head_element, 'head'];
2786     $insertion_mode = 'in head';
2787     ## reprocess
2788     redo B;
2789     } elsif ($token->{type} eq 'comment') {
2790     my $comment = $self->{document}->create_comment ($token->{data});
2791     $open_elements->[-1]->[0]->append_child ($comment);
2792     !!!next-token;
2793     redo B;
2794     } elsif ($token->{type} eq 'start tag') {
2795     my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
2796     !!!create-element ($head_element, 'head', $attr);
2797     $open_elements->[-1]->[0]->append_child ($head_element);
2798     push @$open_elements, [$head_element, 'head'];
2799     $insertion_mode = 'in head';
2800     if ($token->{tag_name} eq 'head') {
2801     !!!next-token;
2802     #} elsif ({
2803     # base => 1, link => 1, meta => 1,
2804     # script => 1, style => 1, title => 1,
2805     # }->{$token->{tag_name}}) {
2806     # ## reprocess
2807     } else {
2808     ## reprocess
2809     }
2810     redo B;
2811     } elsif ($token->{type} eq 'end tag') {
2812     if ($token->{tag_name} eq 'html') {
2813     ## As if <head>
2814     !!!create-element ($head_element, 'head');
2815     $open_elements->[-1]->[0]->append_child ($head_element);
2816     push @$open_elements, [$head_element, 'head'];
2817     $insertion_mode = 'in head';
2818     ## reprocess
2819     redo B;
2820     } else {
2821     !!!parse-error;
2822     ## Ignore the token
2823 wakaba 1.6 !!!next-token;
2824 wakaba 1.2 redo B;
2825     }
2826     } else {
2827     die "$0: $token->{type}: Unknown type";
2828     }
2829     } elsif ($insertion_mode eq 'in head') {
2830     if ($token->{type} eq 'character') {
2831     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2832     $open_elements->[-1]->[0]->manakai_append_text ($1);
2833     unless (length $token->{data}) {
2834     !!!next-token;
2835     redo B;
2836     }
2837     }
2838    
2839     #
2840     } elsif ($token->{type} eq 'comment') {
2841     my $comment = $self->{document}->create_comment ($token->{data});
2842     $open_elements->[-1]->[0]->append_child ($comment);
2843     !!!next-token;
2844     redo B;
2845     } elsif ($token->{type} eq 'start tag') {
2846     if ($token->{tag_name} eq 'title') {
2847     my $title_el; !!!create-element ($title_el, 'title');
2848     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
2849     ->append_child ($title_el);
2850     $self->{content_model_flag} = 'RCDATA';
2851    
2852     my $text = '';
2853     !!!next-token;
2854     while ($token->{type} eq 'character') {
2855     $text .= $token->{data};
2856     !!!next-token;
2857     }
2858     if (length $text) {
2859     $title_el->manakai_append_text ($text);
2860     }
2861    
2862     $self->{content_model_flag} = 'PCDATA';
2863    
2864     if ($token->{type} eq 'end tag' and
2865     $token->{tag_name} eq 'title') {
2866     ## Ignore the token
2867     } else {
2868     !!!parse-error;
2869     ## ISSUE: And ignore?
2870     }
2871     !!!next-token;
2872     redo B;
2873     } elsif ($token->{tag_name} eq 'style') {
2874     $style_start_tag->();
2875     redo B;
2876     } elsif ($token->{tag_name} eq 'script') {
2877     $script_start_tag->();
2878     redo B;
2879     } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
2880     ## NOTE: There are "as if in head" code clones
2881     my $el;
2882     !!!create-element ($el, $token->{tag_name}, $token->{attributes});
2883     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
2884     ->append_child ($el);
2885    
2886     ## ISSUE: Issue on magical <base> in the spec
2887    
2888     !!!next-token;
2889     redo B;
2890     } elsif ($token->{tag_name} eq 'head') {
2891     !!!parse-error;
2892     ## Ignore the token
2893     !!!next-token;
2894     redo B;
2895     } else {
2896     #
2897     }
2898     } elsif ($token->{type} eq 'end tag') {
2899     if ($token->{tag_name} eq 'head') {
2900     if ($open_elements->[-1]->[1] eq 'head') {
2901     pop @$open_elements;
2902     } else {
2903     !!!parse-error;
2904     }
2905     $insertion_mode = 'after head';
2906     !!!next-token;
2907     redo B;
2908     } elsif ($token->{tag_name} eq 'html') {
2909     #
2910     } else {
2911     !!!parse-error;
2912     ## Ignore the token
2913     !!!next-token;
2914     redo B;
2915     }
2916     } else {
2917     #
2918     }
2919    
2920     if ($open_elements->[-1]->[1] eq 'head') {
2921     ## As if </head>
2922     pop @$open_elements;
2923     }
2924     $insertion_mode = 'after head';
2925     ## reprocess
2926     redo B;
2927    
2928     ## ISSUE: An issue in the spec.
2929     } elsif ($insertion_mode eq 'after head') {
2930     if ($token->{type} eq 'character') {
2931     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2932     $open_elements->[-1]->[0]->manakai_append_text ($1);
2933     unless (length $token->{data}) {
2934     !!!next-token;
2935     redo B;
2936     }
2937     }
2938    
2939     #
2940     } elsif ($token->{type} eq 'comment') {
2941     my $comment = $self->{document}->create_comment ($token->{data});
2942     $open_elements->[-1]->[0]->append_child ($comment);
2943     !!!next-token;
2944     redo B;
2945     } elsif ($token->{type} eq 'start tag') {
2946     if ($token->{tag_name} eq 'body') {
2947     !!!insert-element ('body', $token->{attributes});
2948     $insertion_mode = 'in body';
2949     !!!next-token;
2950     redo B;
2951     } elsif ($token->{tag_name} eq 'frameset') {
2952     !!!insert-element ('frameset', $token->{attributes});
2953     $insertion_mode = 'in frameset';
2954     !!!next-token;
2955     redo B;
2956     } elsif ({
2957     base => 1, link => 1, meta => 1,
2958     script=> 1, style => 1, title => 1,
2959     }->{$token->{tag_name}}) {
2960     !!!parse-error;
2961     $insertion_mode = 'in head';
2962     ## reprocess
2963     redo B;
2964     } else {
2965     #
2966     }
2967     } else {
2968     #
2969     }
2970    
2971     ## As if <body>
2972     !!!insert-element ('body');
2973     $insertion_mode = 'in body';
2974     ## reprocess
2975     redo B;
2976     } elsif ($insertion_mode eq 'in body') {
2977     if ($token->{type} eq 'character') {
2978     ## NOTE: There is a code clone of "character in body".
2979     $reconstruct_active_formatting_elements->();
2980    
2981     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
2982    
2983     !!!next-token;
2984     redo B;
2985     } elsif ($token->{type} eq 'comment') {
2986     ## NOTE: There is a code clone of "comment in body".
2987     my $comment = $self->{document}->create_comment ($token->{data});
2988     $open_elements->[-1]->[0]->append_child ($comment);
2989     !!!next-token;
2990     redo B;
2991     } else {
$in_body->(sub {
  ## Insertion callback for |$in_body|: the node passed as the only
  ## argument becomes the last child of the current node (top of the
  ## stack of open elements).
  my ($new_node) = @_;
  my $current_node = $open_elements->[-1]->[0];
  return $current_node->append_child ($new_node);
});
2995     redo B;
2996     }
2997     } elsif ($insertion_mode eq 'in table') {
2998     if ($token->{type} eq 'character') {
2999     $reconstruct_active_formatting_elements->();
3000    
3001     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3002    
3003     !!!next-token;
3004     redo B;
3005     } elsif ($token->{type} eq 'comment') {
3006     my $comment = $self->{document}->create_comment ($token->{data});
3007     $open_elements->[-1]->[0]->append_child ($comment);
3008     !!!next-token;
3009     redo B;
3010     } elsif ($token->{type} eq 'start tag') {
3011     if ({
3012     caption => 1,
3013     colgroup => 1,
3014     tbody => 1, tfoot => 1, thead => 1,
3015     }->{$token->{tag_name}}) {
3016     ## Clear back to table context
3017     while ($open_elements->[-1]->[1] ne 'table' and
3018     $open_elements->[-1]->[1] ne 'html') {
3019     !!!parse-error;
3020     pop @$open_elements;
3021     }
3022    
3023     push @$active_formatting_elements, ['#marker', '']
3024     if $token->{tag_name} eq 'caption';
3025    
3026     !!!insert-element ($token->{tag_name}, $token->{attributes});
3027     $insertion_mode = {
3028     caption => 'in caption',
3029     colgroup => 'in column group',
3030     tbody => 'in table body',
3031     tfoot => 'in table body',
3032     thead => 'in table body',
3033     }->{$token->{tag_name}};
3034     !!!next-token;
3035     redo B;
3036     } elsif ({
3037     col => 1,
3038     td => 1, th => 1, tr => 1,
3039     }->{$token->{tag_name}}) {
3040     ## Clear back to table context
3041     while ($open_elements->[-1]->[1] ne 'table' and
3042     $open_elements->[-1]->[1] ne 'html') {
3043     !!!parse-error;
3044     pop @$open_elements;
3045     }
3046    
3047     !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
3048     $insertion_mode = $token->{tag_name} eq 'col'
3049     ? 'in column group' : 'in table body';
3050     ## reprocess
3051     redo B;
3052     } elsif ($token->{tag_name} eq 'table') {
3053     ## NOTE: There are code clones for this "table in table"
3054     !!!parse-error;
3055    
3056     ## As if </table>
3057     ## have a table element in table scope
3058     my $i;
3059     INSCOPE: for (reverse 0..$#$open_elements) {
3060     my $node = $open_elements->[$_];
3061     if ($node->[1] eq 'table') {
3062     $i = $_;
3063     last INSCOPE;
3064     } elsif ({
3065     table => 1, html => 1,
3066     }->{$node->[1]}) {
3067     last INSCOPE;
3068     }
3069     } # INSCOPE
3070     unless (defined $i) {
3071     !!!parse-error;
3072     ## Ignore tokens </table><table>
3073     !!!next-token;
3074     redo B;
3075     }
3076    
3077     ## generate implied end tags
3078     if ({
3079     dd => 1, dt => 1, li => 1, p => 1,
3080     td => 1, th => 1, tr => 1,
3081     }->{$open_elements->[-1]->[1]}) {
3082     !!!back-token; # <table>
3083     $token = {type => 'end tag', tag_name => 'table'};
3084     !!!back-token;
3085     $token = {type => 'end tag',
3086     tag_name => $open_elements->[-1]->[1]}; # MUST
3087     redo B;
3088     }
3089    
3090     if ($open_elements->[-1]->[1] ne 'table') {
3091     !!!parse-error;
3092     }
3093    
3094     splice @$open_elements, $i;
3095    
3096     $reset_insertion_mode->();
3097    
3098     ## reprocess
3099     redo B;
3100     } else {
3101     #
3102     }
3103     } elsif ($token->{type} eq 'end tag') {
3104     if ($token->{tag_name} eq 'table') {
3105     ## have a table element in table scope
3106     my $i;
3107     INSCOPE: for (reverse 0..$#$open_elements) {
3108     my $node = $open_elements->[$_];
3109     if ($node->[1] eq $token->{tag_name}) {
3110     $i = $_;
3111     last INSCOPE;
3112     } elsif ({
3113     table => 1, html => 1,
3114     }->{$node->[1]}) {
3115     last INSCOPE;
3116     }
3117     } # INSCOPE
3118     unless (defined $i) {
3119     !!!parse-error;
3120     ## Ignore the token
3121     !!!next-token;
3122     redo B;
3123     }
3124    
3125     ## generate implied end tags
3126     if ({
3127     dd => 1, dt => 1, li => 1, p => 1,
3128     td => 1, th => 1, tr => 1,
3129     }->{$open_elements->[-1]->[1]}) {
3130     !!!back-token;
3131     $token = {type => 'end tag',
3132     tag_name => $open_elements->[-1]->[1]}; # MUST
3133     redo B;
3134     }
3135    
3136     if ($open_elements->[-1]->[1] ne 'table') {
3137     !!!parse-error;
3138     }
3139    
3140     splice @$open_elements, $i;
3141    
3142     $reset_insertion_mode->();
3143    
3144     !!!next-token;
3145     redo B;
3146     } elsif ({
3147     body => 1, caption => 1, col => 1, colgroup => 1,
3148     html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
3149     thead => 1, tr => 1,
3150     }->{$token->{tag_name}}) {
3151     !!!parse-error;
3152     ## Ignore the token
3153     !!!next-token;
3154     redo B;
3155     } else {
3156     #
3157     }
3158     } else {
3159     #
3160     }
3161    
3162     ## NOTE: There are code clones of "misc in table".
3163     !!!parse-error;
$in_body->(sub {
  ## Insertion callback for |$in_body| ("misc in table"): receives the
  ## node to insert as its only argument.
  my ($new_node) = @_;

  ## Unless the current node is a table-structure element, the node is
  ## simply appended to the current node.
  my %needs_fostering = map { $_ => 1 } qw(table tbody tfoot thead tr);
  unless ($needs_fostering{$open_elements->[-1]->[1]}) {
    return $open_elements->[-1]->[0]->append_child ($new_node);
  }

  ## Otherwise the node is foster parented.  Walk the stack of open
  ## elements from the top and stop at the nearest |table| entry.
  my $foster_parent;
  my $reference_node;
  for my $idx (reverse 0 .. $#$open_elements) {
    my $entry = $open_elements->[$idx];
    next unless $entry->[1] eq 'table';

    my $table_parent = $entry->[0]->parent_node;
    if (defined $table_parent and $table_parent->node_type == 1) {
      ## The table has an element parent: insert into that parent,
      ## immediately before the table itself.
      $foster_parent = $table_parent;
      $reference_node = $entry->[0];
    } else {
      ## Otherwise use the element just below the table in the stack.
      $foster_parent = $open_elements->[$idx - 1]->[0];
    }
    last;
  }

  ## No usable foster parent found: fall back to the bottom-most entry
  ## of the stack.
  unless (defined $foster_parent) {
    $foster_parent = $open_elements->[0]->[0];
  }

  return $foster_parent->insert_before ($new_node, $reference_node);
});
3194     redo B;
3195     } elsif ($insertion_mode eq 'in caption') {
3196 wakaba 1.6 if ($token->{type} eq 'character') {
3197     ## NOTE: This is a code clone of "character in body".
3198     $reconstruct_active_formatting_elements->();
3199    
3200     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3201    
3202     !!!next-token;
3203     redo B;
3204     } elsif ($token->{type} eq 'comment') {
3205     ## NOTE: This is a code clone of "comment in body".
3206     my $comment = $self->{document}->create_comment ($token->{data});
3207     $open_elements->[-1]->[0]->append_child ($comment);
3208     !!!next-token;
3209     redo B;
3210     } elsif ($token->{type} eq 'start tag') {
3211 wakaba 1.2 if ({
3212     caption => 1, col => 1, colgroup => 1, tbody => 1,
3213     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3214     }->{$token->{tag_name}}) {
3215     !!!parse-error;
3216    
3217     ## As if </caption>
3218     ## have a table element in table scope
3219     my $i;
3220     INSCOPE: for (reverse 0..$#$open_elements) {
3221     my $node = $open_elements->[$_];
3222     if ($node->[1] eq 'caption') {
3223     $i = $_;
3224     last INSCOPE;
3225     } elsif ({
3226     table => 1, html => 1,
3227     }->{$node->[1]}) {
3228     last INSCOPE;
3229     }
3230     } # INSCOPE
3231     unless (defined $i) {
3232     !!!parse-error;
3233     ## Ignore the token
3234     !!!next-token;
3235     redo B;
3236     }
3237    
3238     ## generate implied end tags
3239     if ({
3240     dd => 1, dt => 1, li => 1, p => 1,
3241     td => 1, th => 1, tr => 1,
3242     }->{$open_elements->[-1]->[1]}) {
3243     !!!back-token; # <?>
3244     $token = {type => 'end tag', tag_name => 'caption'};
3245     !!!back-token;
3246     $token = {type => 'end tag',
3247     tag_name => $open_elements->[-1]->[1]}; # MUST
3248     redo B;
3249     }
3250    
3251     if ($open_elements->[-1]->[1] ne 'caption') {
3252     !!!parse-error;
3253     }
3254    
3255     splice @$open_elements, $i;
3256    
3257     $clear_up_to_marker->();
3258    
3259     $insertion_mode = 'in table';
3260    
3261     ## reprocess
3262     redo B;
3263     } else {
3264     #
3265     }
3266     } elsif ($token->{type} eq 'end tag') {
3267     if ($token->{tag_name} eq 'caption') {
3268     ## have a table element in table scope
3269     my $i;
3270     INSCOPE: for (reverse 0..$#$open_elements) {
3271     my $node = $open_elements->[$_];
3272     if ($node->[1] eq $token->{tag_name}) {
3273     $i = $_;
3274     last INSCOPE;
3275     } elsif ({
3276     table => 1, html => 1,
3277     }->{$node->[1]}) {
3278     last INSCOPE;
3279     }
3280     } # INSCOPE
3281     unless (defined $i) {
3282     !!!parse-error;
3283     ## Ignore the token
3284     !!!next-token;
3285     redo B;
3286     }
3287    
3288     ## generate implied end tags
3289     if ({
3290     dd => 1, dt => 1, li => 1, p => 1,
3291     td => 1, th => 1, tr => 1,
3292     }->{$open_elements->[-1]->[1]}) {
3293     !!!back-token;
3294     $token = {type => 'end tag',
3295     tag_name => $open_elements->[-1]->[1]}; # MUST
3296     redo B;
3297     }
3298    
3299     if ($open_elements->[-1]->[1] ne 'caption') {
3300     !!!parse-error;
3301     }
3302    
3303     splice @$open_elements, $i;
3304    
3305     $clear_up_to_marker->();
3306    
3307     $insertion_mode = 'in table';
3308    
3309     !!!next-token;
3310     redo B;
3311     } elsif ($token->{tag_name} eq 'table') {
3312     !!!parse-error;
3313    
3314     ## As if </caption>
3315     ## have a table element in table scope
3316     my $i;
3317     INSCOPE: for (reverse 0..$#$open_elements) {
3318     my $node = $open_elements->[$_];
3319     if ($node->[1] eq 'caption') {
3320     $i = $_;
3321     last INSCOPE;
3322     } elsif ({
3323     table => 1, html => 1,
3324     }->{$node->[1]}) {
3325     last INSCOPE;
3326     }
3327     } # INSCOPE
3328     unless (defined $i) {
3329     !!!parse-error;
3330     ## Ignore the token
3331     !!!next-token;
3332     redo B;
3333     }
3334    
3335     ## generate implied end tags
3336     if ({
3337     dd => 1, dt => 1, li => 1, p => 1,
3338     td => 1, th => 1, tr => 1,
3339     }->{$open_elements->[-1]->[1]}) {
3340     !!!back-token; # </table>
3341     $token = {type => 'end tag', tag_name => 'caption'};
3342     !!!back-token;
3343     $token = {type => 'end tag',
3344     tag_name => $open_elements->[-1]->[1]}; # MUST
3345     redo B;
3346     }
3347    
3348     if ($open_elements->[-1]->[1] ne 'caption') {
3349     !!!parse-error;
3350     }
3351    
3352     splice @$open_elements, $i;
3353    
3354     $clear_up_to_marker->();
3355    
3356     $insertion_mode = 'in table';
3357    
3358     ## reprocess
3359     redo B;
3360     } elsif ({
3361     body => 1, col => 1, colgroup => 1,
3362     html => 1, tbody => 1, td => 1, tfoot => 1,
3363     th => 1, thead => 1, tr => 1,
3364     }->{$token->{tag_name}}) {
!!!parse-error;
## Ignore the token.  It has to be consumed before looping: without
## fetching the next token, |redo B| would see the very same end tag
## again in the same insertion mode and the loop would never finish.
!!!next-token;
redo B;
3368     } else {
3369     #
3370     }
3371     } else {
3372     #
3373     }
3374    
$in_body->(sub {
  ## Insertion callback for |$in_body|: the node passed as the only
  ## argument becomes the last child of the current node (top of the
  ## stack of open elements).
  my ($new_node) = @_;
  my $current_node = $open_elements->[-1]->[0];
  return $current_node->append_child ($new_node);
});
3378     redo B;
3379     } elsif ($insertion_mode eq 'in column group') {
3380     if ($token->{type} eq 'character') {
3381     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3382     $open_elements->[-1]->[0]->manakai_append_text ($1);
3383     unless (length $token->{data}) {
3384     !!!next-token;
3385     redo B;
3386     }
3387     }
3388    
3389     #
3390     } elsif ($token->{type} eq 'comment') {
3391     my $comment = $self->{document}->create_comment ($token->{data});
3392     $open_elements->[-1]->[0]->append_child ($comment);
3393     !!!next-token;
3394     redo B;
3395     } elsif ($token->{type} eq 'start tag') {
3396     if ($token->{tag_name} eq 'col') {
3397     !!!insert-element ($token->{tag_name}, $token->{attributes});
3398     pop @$open_elements;
3399     !!!next-token;
3400     redo B;
3401     } else {
3402     #
3403     }
3404     } elsif ($token->{type} eq 'end tag') {
3405     if ($token->{tag_name} eq 'colgroup') {
3406     if ($open_elements->[-1]->[1] eq 'html') {
3407     !!!parse-error;
3408     ## Ignore the token
3409     !!!next-token;
3410     redo B;
3411     } else {
3412     pop @$open_elements; # colgroup
3413     $insertion_mode = 'in table';
3414     !!!next-token;
3415     redo B;
3416     }
3417     } elsif ($token->{tag_name} eq 'col') {
3418     !!!parse-error;
3419     ## Ignore the token
3420     !!!next-token;
3421     redo B;
3422     } else {
3423     #
3424     }
3425     } else {
3426     #
3427     }
3428    
3429     ## As if </colgroup>
3430     if ($open_elements->[-1]->[1] eq 'html') {
3431     !!!parse-error;
3432     ## Ignore the token
3433     !!!next-token;
3434     redo B;
3435     } else {
3436     pop @$open_elements; # colgroup
3437     $insertion_mode = 'in table';
3438     ## reprocess
3439     redo B;
3440     }
3441     } elsif ($insertion_mode eq 'in table body') {
3442     if ($token->{type} eq 'character') {
3443     ## Copied from 'in table'
3444     $reconstruct_active_formatting_elements->();
3445    
3446     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3447    
3448     !!!next-token;
3449     redo B;
3450     } elsif ($token->{type} eq 'comment') {
3451     ## Copied from 'in table'
3452     my $comment = $self->{document}->create_comment ($token->{data});
3453     $open_elements->[-1]->[0]->append_child ($comment);
3454     !!!next-token;
3455     redo B;
3456     } elsif ($token->{type} eq 'start tag') {
3457     if ({
3458     tr => 1,
3459     th => 1, td => 1,
3460     }->{$token->{tag_name}}) {
3461     ## Clear back to table body context
3462     while (not {
3463     tbody => 1, tfoot => 1, thead => 1, html => 1,
3464     }->{$open_elements->[-1]->[1]}) {
3465     !!!parse-error;
3466     pop @$open_elements;
3467     }
3468    
3469     $insertion_mode = 'in row';
3470     if ($token->{tag_name} eq 'tr') {
3471     !!!insert-element ($token->{tag_name}, $token->{attributes});
3472     !!!next-token;
3473     } else {
3474     !!!insert-element ('tr');
3475     ## reprocess
3476     }
3477     redo B;
3478     } elsif ({
3479     caption => 1, col => 1, colgroup => 1,
3480     tbody => 1, tfoot => 1, thead => 1,
3481     }->{$token->{tag_name}}) {
3482     ## have an element in table scope
3483     my $i;
3484     INSCOPE: for (reverse 0..$#$open_elements) {
3485     my $node = $open_elements->[$_];
3486     if ({
3487     tbody => 1, thead => 1, tfoot => 1,
3488     }->{$node->[1]}) {
3489     $i = $_;
3490     last INSCOPE;
3491     } elsif ({
3492     table => 1, html => 1,
3493     }->{$node->[1]}) {
3494     last INSCOPE;
3495     }
3496     } # INSCOPE
3497     unless (defined $i) {
3498     !!!parse-error;
3499     ## Ignore the token
3500     !!!next-token;
3501     redo B;
3502     }
3503    
3504     ## Clear back to table body context
3505     while (not {
3506     tbody => 1, tfoot => 1, thead => 1, html => 1,
3507     }->{$open_elements->[-1]->[1]}) {
3508     !!!parse-error;
3509     pop @$open_elements;
3510     }
3511    
3512     ## As if <{current node}>
3513     ## have an element in table scope
3514     ## true by definition
3515    
3516     ## Clear back to table body context
3517     ## nop by definition
3518    
3519     pop @$open_elements;
3520     $insertion_mode = 'in table';
3521     ## reprocess
3522     redo B;
3523     } elsif ($token->{tag_name} eq 'table') {
3524     ## NOTE: This is a code clone of "table in table"
3525     !!!parse-error;
3526    
3527     ## As if </table>
3528     ## have a table element in table scope
3529     my $i;
3530     INSCOPE: for (reverse 0..$#$open_elements) {
3531     my $node = $open_elements->[$_];
3532     if ($node->[1] eq 'table') {
3533     $i = $_;
3534     last INSCOPE;
3535     } elsif ({
3536     table => 1, html => 1,
3537     }->{$node->[1]}) {
3538     last INSCOPE;
3539     }
3540     } # INSCOPE
3541     unless (defined $i) {
3542     !!!parse-error;
3543     ## Ignore tokens </table><table>
3544     !!!next-token;
3545     redo B;
3546     }
3547    
3548     ## generate implied end tags
3549     if ({
3550     dd => 1, dt => 1, li => 1, p => 1,
3551     td => 1, th => 1, tr => 1,
3552     }->{$open_elements->[-1]->[1]}) {
3553     !!!back-token; # <table>
3554     $token = {type => 'end tag', tag_name => 'table'};
3555     !!!back-token;
3556     $token = {type => 'end tag',
3557     tag_name => $open_elements->[-1]->[1]}; # MUST
3558     redo B;
3559     }
3560    
3561     if ($open_elements->[-1]->[1] ne 'table') {
3562     !!!parse-error;
3563     }
3564    
3565     splice @$open_elements, $i;
3566    
3567     $reset_insertion_mode->();
3568    
3569     ## reprocess
3570     redo B;
3571     } else {
3572     #
3573     }
3574     } elsif ($token->{type} eq 'end tag') {
3575     if ({
3576     tbody => 1, tfoot => 1, thead => 1,
3577     }->{$token->{tag_name}}) {
3578     ## have an element in table scope
3579     my $i;
3580     INSCOPE: for (reverse 0..$#$open_elements) {
3581     my $node = $open_elements->[$_];
3582     if ($node->[1] eq $token->{tag_name}) {
3583     $i = $_;
3584     last INSCOPE;
3585     } elsif ({
3586     table => 1, html => 1,
3587     }->{$node->[1]}) {
3588     last INSCOPE;
3589     }
3590     } # INSCOPE
3591     unless (defined $i) {
3592     !!!parse-error;
3593     ## Ignore the token
3594     !!!next-token;
3595     redo B;
3596     }
3597    
3598     ## Clear back to table body context
3599     while (not {
3600     tbody => 1, tfoot => 1, thead => 1, html => 1,
3601     }->{$open_elements->[-1]->[1]}) {
3602     !!!parse-error;
3603     pop @$open_elements;
3604     }
3605    
3606     pop @$open_elements;
3607     $insertion_mode = 'in table';
3608     !!!next-token;
3609     redo B;
3610     } elsif ($token->{tag_name} eq 'table') {
3611     ## have an element in table scope
3612     my $i;
3613     INSCOPE: for (reverse 0..$#$open_elements) {
3614     my $node = $open_elements->[$_];
3615     if ({
3616     tbody => 1, thead => 1, tfoot => 1,
3617     }->{$node->[1]}) {
3618     $i = $_;
3619     last INSCOPE;
3620     } elsif ({
3621     table => 1, html => 1,
3622     }->{$node->[1]}) {
3623     last INSCOPE;
3624     }
3625     } # INSCOPE
3626     unless (defined $i) {
3627     !!!parse-error;
3628     ## Ignore the token
3629     !!!next-token;
3630     redo B;
3631     }
3632    
3633     ## Clear back to table body context
3634     while (not {
3635     tbody => 1, tfoot => 1, thead => 1, html => 1,
3636     }->{$open_elements->[-1]->[1]}) {
3637     !!!parse-error;
3638     pop @$open_elements;
3639     }
3640    
3641     ## As if <{current node}>
3642     ## have an element in table scope
3643     ## true by definition
3644    
3645     ## Clear back to table body context
3646     ## nop by definition
3647    
3648     pop @$open_elements;
3649     $insertion_mode = 'in table';
3650     ## reprocess
3651     redo B;
3652     } elsif ({
3653     body => 1, caption => 1, col => 1, colgroup => 1,
3654     html => 1, td => 1, th => 1, tr => 1,
3655     }->{$token->{tag_name}}) {
3656     !!!parse-error;
3657     ## Ignore the token
3658     !!!next-token;
3659     redo B;
3660     } else {
3661     #
3662     }
3663     } else {
3664     #
3665     }
3666    
3667     ## As if in table
3668     ## NOTE: This is a code clone of "misc in table".
3669     !!!parse-error;
$in_body->(sub {
  ## Insertion callback for |$in_body| (clone of "misc in table"):
  ## receives the node to insert as its only argument.
  my ($new_node) = @_;

  ## Unless the current node is a table-structure element, the node is
  ## simply appended to the current node.
  my %needs_fostering = map { $_ => 1 } qw(table tbody tfoot thead tr);
  unless ($needs_fostering{$open_elements->[-1]->[1]}) {
    return $open_elements->[-1]->[0]->append_child ($new_node);
  }

  ## Otherwise the node is foster parented.  Walk the stack of open
  ## elements from the top and stop at the nearest |table| entry.
  my $foster_parent;
  my $reference_node;
  for my $idx (reverse 0 .. $#$open_elements) {
    my $entry = $open_elements->[$idx];
    next unless $entry->[1] eq 'table';

    my $table_parent = $entry->[0]->parent_node;
    if (defined $table_parent and $table_parent->node_type == 1) {
      ## The table has an element parent: insert into that parent,
      ## immediately before the table itself.
      $foster_parent = $table_parent;
      $reference_node = $entry->[0];
    } else {
      ## Otherwise use the element just below the table in the stack.
      $foster_parent = $open_elements->[$idx - 1]->[0];
    }
    last;
  }

  ## No usable foster parent found: fall back to the bottom-most entry
  ## of the stack.
  unless (defined $foster_parent) {
    $foster_parent = $open_elements->[0]->[0];
  }

  return $foster_parent->insert_before ($new_node, $reference_node);
});
3700     redo B;
3701     } elsif ($insertion_mode eq 'in row') {
3702     if ($token->{type} eq 'character') {
3703     ## Copied from 'in table'
3704     $reconstruct_active_formatting_elements->();
3705    
3706     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3707    
3708     !!!next-token;
3709     redo B;
3710     } elsif ($token->{type} eq 'comment') {
3711     ## Copied from 'in table'
3712     my $comment = $self->{document}->create_comment ($token->{data});
3713     $open_elements->[-1]->[0]->append_child ($comment);
3714     !!!next-token;
3715     redo B;
3716     } elsif ($token->{type} eq 'start tag') {
3717     if ($token->{tag_name} eq 'th' or
3718     $token->{tag_name} eq 'td') {
3719     ## Clear back to table row context
3720     while (not {
3721 wakaba 1.6 tr => 1, html => 1,
3722 wakaba 1.2 }->{$open_elements->[-1]->[1]}) {
3723     !!!parse-error;
3724     pop @$open_elements;
3725     }
3726    
3727     !!!insert-element ($token->{tag_name}, $token->{attributes});
3728     $insertion_mode = 'in cell';
3729    
3730     push @$active_formatting_elements, ['#marker', ''];
3731    
3732     !!!next-token;
3733     redo B;
3734     } elsif ({
3735     caption => 1, col => 1, colgroup => 1,
3736     tbody => 1, tfoot => 1, thead => 1, tr => 1,
3737     }->{$token->{tag_name}}) {
3738     ## As if </tr>
3739     ## have an element in table scope
3740     my $i;
3741     INSCOPE: for (reverse 0..$#$open_elements) {
3742     my $node = $open_elements->[$_];
3743     if ($node->[1] eq 'tr') {
3744     $i = $_;
3745     last INSCOPE;
3746     } elsif ({
3747     table => 1, html => 1,
3748     }->{$node->[1]}) {
3749     last INSCOPE;
3750     }
3751     } # INSCOPE
3752     unless (defined $i) {
3753     !!!parse-error;
3754     ## Ignore the token
3755     !!!next-token;
3756     redo B;
3757     }
3758    
3759     ## Clear back to table row context
3760     while (not {
3761     tr => 1, html => 1,
3762     }->{$open_elements->[-1]->[1]}) {
3763     !!!parse-error;
3764     pop @$open_elements;
3765     }
3766    
3767     pop @$open_elements; # tr
3768     $insertion_mode = 'in table body';
3769     ## reprocess
3770     redo B;
3771     } elsif ($token->{tag_name} eq 'table') {
3772     ## NOTE: This is a code clone of "table in table"
3773     !!!parse-error;
3774    
3775     ## As if </table>
3776     ## have a table element in table scope
3777     my $i;
3778     INSCOPE: for (reverse 0..$#$open_elements) {
3779     my $node = $open_elements->[$_];
3780     if ($node->[1] eq 'table') {
3781     $i = $_;
3782     last INSCOPE;
3783     } elsif ({
3784     table => 1, html => 1,
3785     }->{$node->[1]}) {
3786     last INSCOPE;
3787     }
3788     } # INSCOPE
3789     unless (defined $i) {
3790     !!!parse-error;
3791     ## Ignore tokens </table><table>
3792     !!!next-token;
3793     redo B;
3794     }
3795    
3796     ## generate implied end tags
3797     if ({
3798     dd => 1, dt => 1, li => 1, p => 1,
3799     td => 1, th => 1, tr => 1,
3800     }->{$open_elements->[-1]->[1]}) {
3801     !!!back-token; # <table>
3802     $token = {type => 'end tag', tag_name => 'table'};
3803     !!!back-token;
3804     $token = {type => 'end tag',
3805     tag_name => $open_elements->[-1]->[1]}; # MUST
3806     redo B;
3807     }
3808    
3809     if ($open_elements->[-1]->[1] ne 'table') {
3810     !!!parse-error;
3811     }
3812    
3813     splice @$open_elements, $i;
3814    
3815     $reset_insertion_mode->();
3816    
3817     ## reprocess
3818     redo B;
3819     } else {
3820     #
3821     }
3822     } elsif ($token->{type} eq 'end tag') {
3823     if ($token->{tag_name} eq 'tr') {
3824     ## have an element in table scope
3825     my $i;
3826     INSCOPE: for (reverse 0..$#$open_elements) {
3827     my $node = $open_elements->[$_];
3828     if ($node->[1] eq $token->{tag_name}) {
3829     $i = $_;
3830     last INSCOPE;
3831     } elsif ({
3832     table => 1, html => 1,
3833     }->{$node->[1]}) {
3834     last INSCOPE;
3835     }
3836     } # INSCOPE
3837     unless (defined $i) {
3838     !!!parse-error;
3839     ## Ignore the token
3840     !!!next-token;
3841     redo B;
3842     }
3843    
3844     ## Clear back to table row context
3845     while (not {
3846     tr => 1, html => 1,
3847     }->{$open_elements->[-1]->[1]}) {
3848     !!!parse-error;
3849     pop @$open_elements;
3850     }
3851    
3852     pop @$open_elements; # tr
3853     $insertion_mode = 'in table body';
3854     !!!next-token;
3855     redo B;
3856     } elsif ($token->{tag_name} eq 'table') {
3857     ## As if </tr>
3858     ## have an element in table scope
3859     my $i;
3860     INSCOPE: for (reverse 0..$#$open_elements) {
3861     my $node = $open_elements->[$_];
3862     if ($node->[1] eq 'tr') {
3863     $i = $_;
3864     last INSCOPE;
3865     } elsif ({
3866     table => 1, html => 1,
3867     }->{$node->[1]}) {
3868     last INSCOPE;
3869     }
3870     } # INSCOPE
3871     unless (defined $i) {
3872     !!!parse-error;
3873     ## Ignore the token
3874     !!!next-token;
3875     redo B;
3876     }
3877    
3878     ## Clear back to table row context
3879     while (not {
3880     tr => 1, html => 1,
3881     }->{$open_elements->[-1]->[1]}) {
3882     !!!parse-error;
3883     pop @$open_elements;
3884     }
3885    
3886     pop @$open_elements; # tr
3887     $insertion_mode = 'in table body';
3888     ## reprocess
3889     redo B;
3890     } elsif ({
3891     tbody => 1, tfoot => 1, thead => 1,
3892     }->{$token->{tag_name}}) {
3893     ## have an element in table scope
3894     my $i;
3895     INSCOPE: for (reverse 0..$#$open_elements) {
3896     my $node = $open_elements->[$_];
3897     if ($node->[1] eq $token->{tag_name}) {
3898     $i = $_;
3899     last INSCOPE;
3900     } elsif ({
3901     table => 1, html => 1,
3902     }->{$node->[1]}) {
3903     last INSCOPE;
3904     }
3905     } # INSCOPE
3906     unless (defined $i) {
3907     !!!parse-error;
3908     ## Ignore the token
3909     !!!next-token;
3910     redo B;
3911     }
3912    
3913     ## As if </tr>
3914     ## have an element in table scope
3915     my $i;
3916     INSCOPE: for (reverse 0..$#$open_elements) {
3917     my $node = $open_elements->[$_];
3918     if ($node->[1] eq 'tr') {
3919     $i = $_;
3920     last INSCOPE;
3921     } elsif ({
3922     table => 1, html => 1,
3923     }->{$node->[1]}) {
3924     last INSCOPE;
3925     }
3926     } # INSCOPE
3927     unless (defined $i) {
3928     !!!parse-error;
3929     ## Ignore the token
3930     !!!next-token;
3931     redo B;
3932     }
3933    
3934     ## Clear back to table row context
3935     while (not {
3936     tr => 1, html => 1,
3937     }->{$open_elements->[-1]->[1]}) {
3938     !!!parse-error;
3939     pop @$open_elements;
3940     }
3941    
3942     pop @$open_elements; # tr
3943     $insertion_mode = 'in table body';
3944     ## reprocess
3945     redo B;
3946     } elsif ({
3947     body => 1, caption => 1, col => 1,
3948     colgroup => 1, html => 1, td => 1, th => 1,
3949     }->{$token->{tag_name}}) {
3950     !!!parse-error;
3951     ## Ignore the token
3952     !!!next-token;
3953     redo B;
3954     } else {
3955     #
3956     }
3957     } else {
3958     #
3959     }
3960    
3961     ## As if in table
3962     ## NOTE: This is a code clone of "misc in table".
3963     !!!parse-error;
3964     $in_body->(sub {
3965     my $child = shift;
3966     if ({
3967     table => 1, tbody => 1, tfoot => 1,
3968     thead => 1, tr => 1,
3969     }->{$open_elements->[-1]->[1]}) {
3970     # MUST
3971     my $foster_parent_element;
3972     my $next_sibling;
3973     OE: for (reverse 0..$#$open_elements) {
3974     if ($open_elements->[$_]->[1] eq 'table') {
3975     my $parent = $open_elements->[$_]->[0]->parent_node;
3976     if (defined $parent and $parent->node_type == 1) {
3977     $foster_parent_element = $parent;
3978     $next_sibling = $open_elements->[$_]->[0];
3979     } else {
3980     $foster_parent_element
3981     = $open_elements->[$_ - 1]->[0];
3982     }
3983     last OE;
3984     }
3985     } # OE
3986     $foster_parent_element = $open_elements->[0]->[0]
3987     unless defined $foster_parent_element;
3988     $foster_parent_element->insert_before
3989     ($child, $next_sibling);
3990     } else {
3991     $open_elements->[-1]->[0]->append_child ($child);
3992     }
3993     });
3994     redo B;
3995     } elsif ($insertion_mode eq 'in cell') {
3996     if ($token->{type} eq 'character') {
3997     ## NOTE: This is a code clone of "character in body".
3998     $reconstruct_active_formatting_elements->();
3999    
4000     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4001    
4002     !!!next-token;
4003     redo B;
4004     } elsif ($token->{type} eq 'comment') {
4005     ## NOTE: This is a code clone of "comment in body".
4006     my $comment = $self->{document}->create_comment ($token->{data});
4007     $open_elements->[-1]->[0]->append_child ($comment);
4008     !!!next-token;
4009     redo B;
4010     } elsif ($token->{type} eq 'start tag') {
4011     if ({
4012     caption => 1, col => 1, colgroup => 1,
4013     tbody => 1, td => 1, tfoot => 1, th => 1,
4014     thead => 1, tr => 1,
4015     }->{$token->{tag_name}}) {
4016     ## have an element in table scope
4017     my $tn;
4018     INSCOPE: for (reverse 0..$#$open_elements) {
4019     my $node = $open_elements->[$_];
4020     if ($node->[1] eq 'td' or $node->[1] eq 'th') {
4021     $tn = $node->[1];
4022     last INSCOPE;
4023     } elsif ({
4024     table => 1, html => 1,
4025     }->{$node->[1]}) {
4026     last INSCOPE;
4027     }
4028     } # INSCOPE
4029     unless (defined $tn) {
4030     !!!parse-error;
4031     ## Ignore the token
4032     !!!next-token;
4033     redo B;
4034     }
4035    
4036     ## Close the cell
4037     !!!back-token; # <?>
4038     $token = {type => 'end tag', tag_name => $tn};
4039     redo B;
4040     } else {
4041     #
4042     }
4043     } elsif ($token->{type} eq 'end tag') {
4044 wakaba 1.6 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4045 wakaba 1.2 ## have an element in table scope
4046     my $i;
4047     INSCOPE: for (reverse 0..$#$open_elements) {
4048     my $node = $open_elements->[$_];
4049     if ($node->[1] eq $token->{tag_name}) {
4050     $i = $_;
4051     last INSCOPE;
4052     } elsif ({
4053     table => 1, html => 1,
4054     }->{$node->[1]}) {
4055     last INSCOPE;
4056     }
4057     } # INSCOPE
4058     unless (defined $i) {
4059     !!!parse-error;
4060     ## Ignore the token
4061     !!!next-token;
4062     redo B;
4063     }
4064    
4065     ## generate implied end tags
4066     if ({
4067     dd => 1, dt => 1, li => 1, p => 1,
4068     td => ($token->{tag_name} eq 'th'),
4069     th => ($token->{tag_name} eq 'td'),
4070     tr => 1,
4071     }->{$open_elements->[-1]->[1]}) {
4072     !!!back-token;
4073     $token = {type => 'end tag',
4074     tag_name => $open_elements->[-1]->[1]}; # MUST
4075     redo B;
4076     }
4077    
4078     if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
4079     !!!parse-error;
4080     }
4081    
4082     splice @$open_elements, $i;
4083    
4084     $clear_up_to_marker->();
4085    
4086     $insertion_mode = 'in row';
4087    
4088     !!!next-token;
4089     redo B;
4090     } elsif ({
4091     body => 1, caption => 1, col => 1,
4092     colgroup => 1, html => 1,
4093     }->{$token->{tag_name}}) {
4094     !!!parse-error;
4095     ## Ignore the token
4096     !!!next-token;
4097     redo B;
4098     } elsif ({
4099     table => 1, tbody => 1, tfoot => 1,
4100     thead => 1, tr => 1,
4101     }->{$token->{tag_name}}) {
4102     ## have an element in table scope
4103     my $i;
4104     my $tn;
4105     INSCOPE: for (reverse 0..$#$open_elements) {
4106     my $node = $open_elements->[$_];
4107     if ($node->[1] eq $token->{tag_name}) {
4108     $i = $_;
4109     last INSCOPE;
4110     } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4111     $tn = $node->[1];
4112     ## NOTE: There is exactly one |td| or |th| element
4113     ## in scope in the stack of open elements by definition.
4114     } elsif ({
4115     table => 1, html => 1,
4116     }->{$node->[1]}) {
4117     last INSCOPE;
4118     }
4119     } # INSCOPE
4120     unless (defined $i) {
4121     !!!parse-error;
4122     ## Ignore the token
4123     !!!next-token;
4124     redo B;
4125     }
4126    
4127     ## Close the cell
4128     !!!back-token; # </?>
4129     $token = {type => 'end tag', tag_name => $tn};
4130     redo B;
4131     } else {
4132     #
4133     }
4134     } else {
4135     #
4136     }
4137    
4138     $in_body->(sub {
4139     $open_elements->[-1]->[0]->append_child (shift);
4140     });
4141     redo B;
4142     } elsif ($insertion_mode eq 'in select') {
4143     if ($token->{type} eq 'character') {
4144     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4145     !!!next-token;
4146     redo B;
4147     } elsif ($token->{type} eq 'comment') {
4148     my $comment = $self->{document}->create_comment ($token->{data});
4149     $open_elements->[-1]->[0]->append_child ($comment);
4150     !!!next-token;
4151     redo B;
4152     } elsif ($token->{type} eq 'start tag') {
4153     if ($token->{tag_name} eq 'option') {
4154     if ($open_elements->[-1]->[1] eq 'option') {
4155     ## As if </option>
4156     pop @$open_elements;
4157     }
4158    
4159     !!!insert-element ($token->{tag_name}, $token->{attributes});
4160     !!!next-token;
4161     redo B;
4162     } elsif ($token->{tag_name} eq 'optgroup') {
4163     if ($open_elements->[-1]->[1] eq 'option') {
4164     ## As if </option>
4165     pop @$open_elements;
4166     }
4167    
4168     if ($open_elements->[-1]->[1] eq 'optgroup') {
4169     ## As if </optgroup>
4170     pop @$open_elements;
4171     }
4172    
4173     !!!insert-element ($token->{tag_name}, $token->{attributes});
4174     !!!next-token;
4175     redo B;
4176     } elsif ($token->{tag_name} eq 'select') {
4177     !!!parse-error;
4178     ## As if </select> instead
4179     ## have an element in table scope
4180     my $i;
4181     INSCOPE: for (reverse 0..$#$open_elements) {
4182     my $node = $open_elements->[$_];
4183     if ($node->[1] eq $token->{tag_name}) {
4184     $i = $_;
4185     last INSCOPE;
4186     } elsif ({
4187     table => 1, html => 1,
4188     }->{$node->[1]}) {
4189     last INSCOPE;
4190     }
4191     } # INSCOPE
4192     unless (defined $i) {
4193     !!!parse-error;
4194     ## Ignore the token
4195     !!!next-token;
4196     redo B;
4197     }
4198    
4199     splice @$open_elements, $i;
4200    
4201     $reset_insertion_mode->();
4202    
4203     !!!next-token;
4204     redo B;
4205     } else {
4206     #
4207     }
4208     } elsif ($token->{type} eq 'end tag') {
4209     if ($token->{tag_name} eq 'optgroup') {
4210     if ($open_elements->[-1]->[1] eq 'option' and
4211     $open_elements->[-2]->[1] eq 'optgroup') {
4212     ## As if </option>
4213     splice @$open_elements, -2;
4214     } elsif ($open_elements->[-1]->[1] eq 'optgroup') {
4215     pop @$open_elements;
4216     } else {
4217     !!!parse-error;
4218     ## Ignore the token
4219     }
4220     !!!next-token;
4221     redo B;
4222     } elsif ($token->{tag_name} eq 'option') {
4223     if ($open_elements->[-1]->[1] eq 'option') {
4224     pop @$open_elements;
4225     } else {
4226     !!!parse-error;
4227     ## Ignore the token
4228     }
4229     !!!next-token;
4230     redo B;
4231     } elsif ($token->{tag_name} eq 'select') {
4232     ## have an element in table scope
4233     my $i;
4234     INSCOPE: for (reverse 0..$#$open_elements) {
4235     my $node = $open_elements->[$_];
4236     if ($node->[1] eq $token->{tag_name}) {
4237     $i = $_;
4238     last INSCOPE;
4239     } elsif ({
4240     table => 1, html => 1,
4241     }->{$node->[1]}) {
4242     last INSCOPE;
4243     }
4244     } # INSCOPE
4245     unless (defined $i) {
4246     !!!parse-error;
4247     ## Ignore the token
4248     !!!next-token;
4249     redo B;
4250     }
4251    
4252     splice @$open_elements, $i;
4253    
4254     $reset_insertion_mode->();
4255    
4256     !!!next-token;
4257     redo B;
4258     } elsif ({
4259     caption => 1, table => 1, tbody => 1,
4260     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4261     }->{$token->{tag_name}}) {
4262     !!!parse-error;
4263    
4264     ## have an element in table scope
4265     my $i;
4266     INSCOPE: for (reverse 0..$#$open_elements) {
4267     my $node = $open_elements->[$_];
4268     if ($node->[1] eq $token->{tag_name}) {
4269     $i = $_;
4270     last INSCOPE;
4271     } elsif ({
4272     table => 1, html => 1,
4273     }->{$node->[1]}) {
4274     last INSCOPE;
4275     }
4276     } # INSCOPE
4277     unless (defined $i) {
4278     ## Ignore the token
4279     !!!next-token;
4280     redo B;
4281     }
4282    
4283     ## As if </select>
4284     ## have an element in table scope
4285     undef $i;
4286     INSCOPE: for (reverse 0..$#$open_elements) {
4287     my $node = $open_elements->[$_];
4288     if ($node->[1] eq 'select') {
4289     $i = $_;
4290     last INSCOPE;
4291     } elsif ({
4292     table => 1, html => 1,
4293     }->{$node->[1]}) {
4294     last INSCOPE;
4295     }
4296     } # INSCOPE
4297     unless (defined $i) {
4298     !!!parse-error;
4299     ## Ignore the </select> token
4300     !!!next-token; ## TODO: ok?
4301     redo B;
4302     }
4303    
4304     splice @$open_elements, $i;
4305    
4306     $reset_insertion_mode->();
4307    
4308     ## reprocess
4309     redo B;
4310     } else {
4311     #
4312     }
4313     } else {
4314     #
4315     }
4316    
4317     !!!parse-error;
4318     ## Ignore the token
4319     redo B;
4320     } elsif ($insertion_mode eq 'after body') {
4321     if ($token->{type} eq 'character') {
4322     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4323     ## As if in body
4324     $reconstruct_active_formatting_elements->();
4325    
4326     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4327    
4328     unless (length $token->{data}) {
4329     !!!next-token;
4330     redo B;
4331     }
4332     }
4333    
4334     #
4335     } elsif ($token->{type} eq 'comment') {
4336     my $comment = $self->{document}->create_comment ($token->{data});
4337     $open_elements->[0]->[0]->append_child ($comment);
4338     !!!next-token;
4339     redo B;
4340     } elsif ($token->{type} eq 'end tag') {
4341 wakaba 1.6 if ($token->{tag_name} eq 'html') {
4342 wakaba 1.2 ## TODO: if inner_html, parse-error, ignore the token; otherwise,
4343    
4344     $phase = 'trailing end';
4345     !!!next-token;
4346     redo B;
4347     } else {
4348     #
4349     }
4350     } else {
4351     #
4352     }
4353    
4354     !!!parse-error;
4355     $insertion_mode = 'in body';
4356     ## reprocess
4357     redo B;
4358     } elsif ($insertion_mode eq 'in frameset') {
4359     if ($token->{type} eq 'character') {
4360     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4361     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4362    
4363     unless (length $token->{data}) {
4364     !!!next-token;
4365     redo B;
4366     }
4367     }
4368    
4369     #
4370     } elsif ($token->{type} eq 'comment') {
4371     my $comment = $self->{document}->create_comment ($token->{data});
4372     $open_elements->[-1]->[0]->append_child ($comment);
4373     !!!next-token;
4374     redo B;
4375     } elsif ($token->{type} eq 'start tag') {
4376     if ($token->{tag_name} eq 'frameset') {
4377     !!!insert-element ($token->{tag_name}, $token->{attributes});
4378     !!!next-token;
4379     redo B;
4380     } elsif ($token->{tag_name} eq 'frame') {
4381     !!!insert-element ($token->{tag_name}, $token->{attributes});
4382     pop @$open_elements;
4383     !!!next-token;
4384     redo B;
4385     } elsif ($token->{tag_name} eq 'noframes') {
4386     $in_body->(sub {
4387     $open_elements->[-1]->[0]->append_child (shift);
4388     });
4389     redo B;
4390     } else {
4391     #
4392     }
4393     } elsif ($token->{type} eq 'end tag') {
4394     if ($token->{tag_name} eq 'frameset') {
4395     if ($open_elements->[-1]->[1] eq 'html' and
4396     @$open_elements == 1) {
4397     !!!parse-error;
4398     ## Ignore the token
4399     !!!next-token;
4400     } else {
4401     pop @$open_elements;
4402     !!!next-token;
4403     }
4404    
4405     ## if not inner_html and
4406     if ($open_elements->[-1]->[1] ne 'frameset') {
4407     $insertion_mode = 'after frameset';
4408     }
4409     redo B;
4410     } else {
4411     #
4412     }
4413     } else {
4414     #
4415     }
4416    
4417     !!!parse-error;
4418     ## Ignore the token
4419     !!!next-token;
4420     redo B;
4421     } elsif ($insertion_mode eq 'after frameset') {
4422     if ($token->{type} eq 'character') {
4423     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4424     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4425    
4426     unless (length $token->{data}) {
4427     !!!next-token;
4428     redo B;
4429     }
4430     }
4431    
4432     #
4433     } elsif ($token->{type} eq 'comment') {
4434     my $comment = $self->{document}->create_comment ($token->{data});
4435     $open_elements->[-1]->[0]->append_child ($comment);
4436     !!!next-token;
4437     redo B;
4438     } elsif ($token->{type} eq 'start tag') {
4439     if ($token->{tag_name} eq 'noframes') {
4440     $in_body->(sub {
4441     $open_elements->[-1]->[0]->append_child (shift);
4442     });
4443     redo B;
4444     } else {
4445     #
4446     }
4447     } elsif ($token->{type} eq 'end tag') {
4448     if ($token->{tag_name} eq 'html') {
4449     $phase = 'trailing end';
4450     !!!next-token;
4451     redo B;
4452     } else {
4453     #
4454     }
4455     } else {
4456     #
4457     }
4458    
4459     !!!parse-error;
4460     ## Ignore the token
4461     !!!next-token;
4462     redo B;
4463    
4464     ## ISSUE: An issue in spec there
4465     } else {
4466     die "$0: $insertion_mode: Unknown insertion mode";
4467     }
4468     }
4469     } elsif ($phase eq 'trailing end') {
4470     ## states in the main stage is preserved yet # MUST
4471    
4472     if ($token->{type} eq 'DOCTYPE') {
4473     !!!parse-error;
4474     ## Ignore the token
4475     !!!next-token;
4476     redo B;
4477     } elsif ($token->{type} eq 'comment') {
4478     my $comment = $self->{document}->create_comment ($token->{data});
4479     $self->{document}->append_child ($comment);
4480     !!!next-token;
4481     redo B;
4482     } elsif ($token->{type} eq 'character') {
4483     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4484     ## As if in the main phase.
4485     ## NOTE: The insertion mode in the main phase
4486     ## just before the phase has been changed to the trailing
4487     ## end phase is either "after body" or "after frameset".
4488     $reconstruct_active_formatting_elements->()
4489     if $phase eq 'main';
4490    
4491     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4492    
4493     unless (length $token->{data}) {
4494     !!!next-token;
4495     redo B;
4496     }
4497     }
4498    
4499     !!!parse-error;
4500     $phase = 'main';
4501     ## reprocess
4502     redo B;
4503     } elsif ($token->{type} eq 'start tag' or
4504     $token->{type} eq 'end tag') {
4505     !!!parse-error;
4506     $phase = 'main';
4507     ## reprocess
4508     redo B;
4509     } elsif ($token->{type} eq 'end-of-file') {
4510     ## Stop parsing
4511     last B;
4512     } else {
4513     die "$0: $token->{type}: Unknown token";
4514     }
4515     }
4516     } # B
4517    
4518     ## Stop parsing # MUST
4519    
4520     ## TODO: script stuffs
4521     } # _construct_tree
4522    
## Serialize the children of |$node| into an HTML fragment string.
##
## Arguments:
##   $class    - invocant; not used by the body.
##   $node     - node whose children are serialized.  It and its
##               descendants are accessed through |node_type|,
##               |child_nodes|, |parent_node|, |namespace_uri|,
##               |local_name|, |tag_name|, |attributes|, |data|
##               and |name|.
##   $on_error - code reference; called with each descendant node
##               whose node type is not handled below.
##
## Returns a reference to the serialized string.
##
## The tree is walked iteratively.  |@node| is a work queue holding
## nodes still to be serialized, mixed with two kinds of plain-string
## markers: a literal end tag to emit, and |cdata-out|, which switches
## text escaping back on once a raw-text element has been closed.
sub inner_html ($$$) {
  my ($class, $node, $on_error) = @_;

  ## Elements whose text content is emitted verbatim (no escaping).
  my %cdata_element = map { $_ => 1 } qw(
    style script xmp iframe noembed noframes noscript
  );

  ## Elements serialized as a start tag only (no children, no end tag).
  my %void_element = map { $_ => 1 } qw(
    area base basefont bgsound br col embed frame hr
    img input link meta param spacer wbr
  );

  ## Step 1
  my $s = '';

  ## If |$node| itself or any of its ancestors is an XHTML-namespace
  ## raw-text element, text below it is never escaped.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      my $ns = $parent->namespace_uri;
      ## |defined| guard: elements in no namespace report undef here.
      if (defined $ns and
          $ns eq 'http://www.w3.org/1999/xhtml' and
          $cdata_element{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      ## Plain strings are markers, not nodes.
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="';
        my $attr_value = $attr->value;
        ## escape (|&| first, so the entities added below are kept)
        $attr_value =~ s/&/&amp;/g;
        $attr_value =~ s/</&lt;/g;
        $attr_value =~ s/>/&gt;/g;
        $attr_value =~ s/"/&quot;/g;
        $s .= $attr_value . '"';
      }
      $s .= '>';

      next C if $void_element{$tag_name};

      if (not $in_cdata and $cdata_element{$tag_name}) {
        ## Queued first so that it ends up right after the end tag
        ## that the |unshift| below puts in front of it.
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text, CDATASection
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        my $value = $child->data;
        $value =~ s/&/&amp;/g;
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        $s .= $value;
      }
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # DocumentType
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children replace the reference in place, so they go to
      ## the FRONT of the queue.  Appending them to the end would emit
      ## them after every following sibling and after the ancestors'
      ## end tags, and possibly under the wrong escaping mode.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child);
    }
  } # C

  ## Step 3
  return \$s;
} # inner_html
4618    
4619 wakaba 1.1 1;
4620 wakaba 1.6 # $Date: 2007/04/30 12:06:12 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24