/[suikacvs]/markup/html/whatpm/What/HTML.pm
Suika

Contents of /markup/html/whatpm/What/HTML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (hide annotations) (download)
Tue May 1 06:22:12 2007 UTC (17 years, 7 months ago) by wakaba
Branch: MAIN
Changes since 1.7: +300 -159 lines
++ whatpm/What/ChangeLog	1 May 2007 06:20:06 -0000
2007-05-01  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (last_child, previous_sibling): New attributes.
	(clone_node): Attribute nodes were not completely copied.

	* HTML.pm.src: Many bugs are fixed.

++ whatpm/t/ChangeLog	1 May 2007 06:21:52 -0000
2007-05-01  Wakaba  <wakaba@suika.fam.cx>

	* HTML-tree.t: New test file is added.  Sort key
	was incorrect.

	* HTML-tokenizer.t: New test file is added.

	* tokenizer-test-1.test, tree-test-1.dat: New tests.

1 wakaba 1.1 package What::HTML;
2     use strict;
3 wakaba 1.8 our $VERSION=do{my @r=(q$Revision: 1.6 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5     ## This is a very, very early version of an HTML parser.
6    
## Start tags that may legitimately end with a trailing "/" (as in
## |<br/>|).  The tokenizer consults this set when it meets "/>" and
## reports a parse error for any start tag whose name is not listed.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
20    
## Named character references: entity name => replacement character.
## Each word below is |name=HEX|, where HEX is the Unicode code point of
## the replacement character in hexadecimal.  Names are case-sensitive;
## the upper-case aliases (AMP, COPY, GT, LT, QUOT, REG) are separate
## entries mapping to the same character as their lower-case forms.
my $entity_char = {
  map {
    my ($entity_name, $code_point) = split /=/, $_, 2;
    ($entity_name => chr (hex $code_point));
  } qw(
    AElig=00C6 Aacute=00C1 Acirc=00C2 Agrave=00C0 Alpha=0391 Aring=00C5
    Atilde=00C3 Auml=00C4 Beta=0392 Ccedil=00C7 Chi=03A7 Dagger=2021
    Delta=0394 ETH=00D0 Eacute=00C9 Ecirc=00CA Egrave=00C8 Epsilon=0395
    Eta=0397 Euml=00CB Gamma=0393 Iacute=00CD Icirc=00CE Igrave=00CC
    Iota=0399 Iuml=00CF Kappa=039A Lambda=039B Mu=039C Ntilde=00D1
    Nu=039D OElig=0152 Oacute=00D3 Ocirc=00D4 Ograve=00D2 Omega=03A9
    Omicron=039F Oslash=00D8 Otilde=00D5 Ouml=00D6 Phi=03A6 Pi=03A0
    Prime=2033 Psi=03A8 Rho=03A1 Scaron=0160 Sigma=03A3 THORN=00DE
    Tau=03A4 Theta=0398 Uacute=00DA Ucirc=00DB Ugrave=00D9 Upsilon=03A5
    Uuml=00DC Xi=039E Yacute=00DD Yuml=0178 Zeta=0396 aacute=00E1
    acirc=00E2 acute=00B4 aelig=00E6 agrave=00E0 alefsym=2135 alpha=03B1
    amp=0026 AMP=0026 and=2227 ang=2220 apos=0027 aring=00E5
    asymp=2248 atilde=00E3 auml=00E4 bdquo=201E beta=03B2 brvbar=00A6
    bull=2022 cap=2229 ccedil=00E7 cedil=00B8 cent=00A2 chi=03C7
    circ=02C6 clubs=2663 cong=2245 copy=00A9 COPY=00A9 crarr=21B5
    cup=222A curren=00A4 dArr=21D3 dagger=2020 darr=2193 deg=00B0
    delta=03B4 diams=2666 divide=00F7 eacute=00E9 ecirc=00EA egrave=00E8
    empty=2205 emsp=2003 ensp=2002 epsilon=03B5 equiv=2261 eta=03B7
    eth=00F0 euml=00EB euro=20AC exist=2203 fnof=0192 forall=2200
    frac12=00BD frac14=00BC frac34=00BE frasl=2044 gamma=03B3 ge=2265
    gt=003E GT=003E hArr=21D4 harr=2194 hearts=2665 hellip=2026
    iacute=00ED icirc=00EE iexcl=00A1 igrave=00EC image=2111 infin=221E
    int=222B iota=03B9 iquest=00BF isin=2208 iuml=00EF kappa=03BA
    lArr=21D0 lambda=03BB lang=2329 laquo=00AB larr=2190 lceil=2308
    ldquo=201C le=2264 lfloor=230A lowast=2217 loz=25CA lrm=200E
    lsaquo=2039 lsquo=2018 lt=003C LT=003C macr=00AF mdash=2014
    micro=00B5 middot=00B7 minus=2212 mu=03BC nabla=2207 nbsp=00A0
    ndash=2013 ne=2260 ni=220B not=00AC notin=2209 nsub=2284
    ntilde=00F1 nu=03BD oacute=00F3 ocirc=00F4 oelig=0153 ograve=00F2
    oline=203E omega=03C9 omicron=03BF oplus=2295 or=2228 ordf=00AA
    ordm=00BA oslash=00F8 otilde=00F5 otimes=2297 ouml=00F6 para=00B6
    part=2202 permil=2030 perp=22A5 phi=03C6 pi=03C0 piv=03D6
    plusmn=00B1 pound=00A3 prime=2032 prod=220F prop=221D psi=03C8
    quot=0022 QUOT=0022 rArr=21D2 radic=221A rang=232A raquo=00BB
    rarr=2192 rceil=2309 rdquo=201D real=211C reg=00AE REG=00AE
    rfloor=230B rho=03C1 rlm=200F rsaquo=203A rsquo=2019 sbquo=201A
    scaron=0161 sdot=22C5 sect=00A7 shy=00AD sigma=03C3 sigmaf=03C2
    sim=223C spades=2660 sub=2282 sube=2286 sum=2211 sup=2283
    sup1=00B9 sup2=00B2 sup3=00B3 supe=2287 szlig=00DF tau=03C4
    there4=2234 theta=03B8 thetasym=03D1 thinsp=2009 thorn=00FE tilde=02DC
    times=00D7 trade=2122 uArr=21D1 uacute=00FA uarr=2191 ucirc=00FB
    ugrave=00F9 uml=00A8 upsih=03D2 upsilon=03C5 uuml=00FC weierp=2118
    xi=03BE yacute=00FD yen=00A5 yuml=00FF zeta=03B6 zwj=200D
    zwnj=200C
  )
};
282    
## Element names belonging to the "special" category.  Only membership
## matters; every listed name maps to a true value.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category.  Only membership
## matters; every listed name maps to a true value.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Element names belonging to the "formatting" category.  Only
## membership matters; every listed name maps to a true value.
##
## Note: the key used to be misspelled |strile|, so |strike| elements
## were never recognized as formatting elements.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
303     # $phrasing_category: all other elements
304    
## Constructor.  Returns a new, empty parser object blessed into the
## invocant class, with two default callbacks installed:
##
##   set_next_input_character - called as |$code->($self)| whenever the
##       tokenizer needs another character; the default has no input
##       and therefore always sets |next_input_character| to -1, the
##       value the tokenizer treats as end of input.
##   parse_error - called on every parse error; the default ignores it.
##
## Users may replace either callback after construction.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The default callback is stored inside the object it updates.
  ## Closing over a strong |$self| would form a reference cycle and the
  ## object would never be freed, so the closure holds a weak copy.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_input_character} = sub {
    $weak_self->{next_input_character} = -1;
  };

  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
316    
317     ## Implementations MUST act as if they used the state machine in the spec.
318    
## Resets the tokenizer to its initial condition and loads the first
## input character.  Called as |$self->_initialize_tokenizer|; the
## |set_next_input_character| callback must already be installed.
sub _initialize_tokenizer ($) {
  my ($self) = @_;

  ## Start in the data state with the PCDATA content model flag.
  @{$self}{qw(state content_model_flag)} = ('data', 'PCDATA');

  ## Nothing is under construction yet.  |current_token| later holds a
  ## start tag, end tag, comment, or DOCTYPE token.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);

  ## Queue of characters pushed back for reconsumption.
  $self->{char} = [];

  ## Fetch the first character into |next_input_character|.  The queue
  ## was emptied just above, so the character always comes from the
  ## callback.
  $self->{set_next_input_character}->($self);

  ## Queue of tokens waiting to be returned by the tokenizer.
  $self->{token} = [];
} # _initialize_tokenizer
338    
339     ## A token has:
340     ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
341     ## 'character', or 'end-of-file'
342     ## ->{name} (DOCTYPE); ->{tag_name} (start tag (tagname), end tag (tagname))
343     ## ISSUE: the spec need s/tagname/tag name/
344     ## ->{error} == 1 or 0 (DOCTYPE)
345     ## ->{attributes} isa HASH (start tag, end tag)
346     ## ->{data} (comment, character)
347    
348     ## Macros
349     ## Macros MUST be preceded by three EXCLAMATION MARKs.
350     ## emit ($token)
351     ## Emits the specified token.
352    
353     ## Emitted token MUST immediately be handled by the tree construction state.
354    
355     ## Before each step, UA MAY check to see if either one of the scripts in
356     ## "list of scripts that will execute as soon as possible" or the first
357     ## script in the "list of scripts that will execute asynchronously",
358     ## has completed loading. If one has, then it MUST be executed
359     ## and removed from the list.
360    
361     sub _get_next_token ($) {
362     my $self = shift;
363     if (@{$self->{token}}) {
364     return shift @{$self->{token}};
365     }
366    
367     A: {
368     if ($self->{state} eq 'data') {
369     if ($self->{next_input_character} == 0x0026) { # &
370     if ($self->{content_model_flag} eq 'PCDATA' or
371     $self->{content_model_flag} eq 'RCDATA') {
372     $self->{state} = 'entity data';
373    
374     if (@{$self->{char}}) {
375     $self->{next_input_character} = shift @{$self->{char}};
376     } else {
377     $self->{set_next_input_character}->($self);
378     }
379    
380     redo A;
381     } else {
382     #
383     }
384     } elsif ($self->{next_input_character} == 0x003C) { # <
385     if ($self->{content_model_flag} ne 'PLAINTEXT') {
386     $self->{state} = 'tag open';
387    
388     if (@{$self->{char}}) {
389     $self->{next_input_character} = shift @{$self->{char}};
390     } else {
391     $self->{set_next_input_character}->($self);
392     }
393    
394     redo A;
395     } else {
396     #
397     }
398     } elsif ($self->{next_input_character} == -1) {
399     return ({type => 'end-of-file'});
400     last A; ## TODO: ok?
401     }
402     # Anything else
403     my $token = {type => 'character',
404     data => chr $self->{next_input_character}};
405     ## Stay in the data state
406    
407     if (@{$self->{char}}) {
408     $self->{next_input_character} = shift @{$self->{char}};
409     } else {
410     $self->{set_next_input_character}->($self);
411     }
412    
413    
414     return ($token);
415    
416     redo A;
417     } elsif ($self->{state} eq 'entity data') {
418     ## (cannot happen in CDATA state)
419    
420     my $token = $self->_tokenize_attempt_to_consume_an_entity;
421    
422     $self->{state} = 'data';
423     # next-input-character is already done
424    
425     unless (defined $token) {
426     return ({type => 'character', data => '&'});
427     } else {
428     return ($token);
429     }
430    
431     redo A;
432     } elsif ($self->{state} eq 'tag open') {
433     if ($self->{content_model_flag} eq 'RCDATA' or
434     $self->{content_model_flag} eq 'CDATA') {
435     if ($self->{next_input_character} == 0x002F) { # /
436    
437     if (@{$self->{char}}) {
438     $self->{next_input_character} = shift @{$self->{char}};
439     } else {
440     $self->{set_next_input_character}->($self);
441     }
442    
443     $self->{state} = 'close tag open';
444     redo A;
445     } else {
446     ## reconsume
447     $self->{state} = 'data';
448    
449 wakaba 1.8 return ({type => 'character', data => '<'});
450 wakaba 1.1
451     redo A;
452     }
453     } elsif ($self->{content_model_flag} eq 'PCDATA') {
454     if ($self->{next_input_character} == 0x0021) { # !
455     $self->{state} = 'markup declaration open';
456    
457     if (@{$self->{char}}) {
458     $self->{next_input_character} = shift @{$self->{char}};
459     } else {
460     $self->{set_next_input_character}->($self);
461     }
462    
463     redo A;
464     } elsif ($self->{next_input_character} == 0x002F) { # /
465     $self->{state} = 'close tag open';
466    
467     if (@{$self->{char}}) {
468     $self->{next_input_character} = shift @{$self->{char}};
469     } else {
470     $self->{set_next_input_character}->($self);
471     }
472    
473     redo A;
474     } elsif (0x0041 <= $self->{next_input_character} and
475     $self->{next_input_character} <= 0x005A) { # A..Z
476     $self->{current_token}
477     = {type => 'start tag',
478     tag_name => chr ($self->{next_input_character} + 0x0020)};
479     $self->{state} = 'tag name';
480    
481     if (@{$self->{char}}) {
482     $self->{next_input_character} = shift @{$self->{char}};
483     } else {
484     $self->{set_next_input_character}->($self);
485     }
486    
487     redo A;
488     } elsif (0x0061 <= $self->{next_input_character} and
489     $self->{next_input_character} <= 0x007A) { # a..z
490     $self->{current_token} = {type => 'start tag',
491     tag_name => chr ($self->{next_input_character})};
492     $self->{state} = 'tag name';
493    
494     if (@{$self->{char}}) {
495     $self->{next_input_character} = shift @{$self->{char}};
496     } else {
497     $self->{set_next_input_character}->($self);
498     }
499    
500     redo A;
501     } elsif ($self->{next_input_character} == 0x003E) { # >
502     $self->{parse_error}->();
503     $self->{state} = 'data';
504    
505     if (@{$self->{char}}) {
506     $self->{next_input_character} = shift @{$self->{char}};
507     } else {
508     $self->{set_next_input_character}->($self);
509     }
510    
511    
512 wakaba 1.4 return ({type => 'character', data => '<>'});
513 wakaba 1.1
514     redo A;
515     } elsif ($self->{next_input_character} == 0x003F) { # ?
516     $self->{parse_error}->();
517     $self->{state} = 'bogus comment';
518     ## $self->{next_input_character} is intentionally left as is
519     redo A;
520     } else {
521     $self->{parse_error}->();
522     $self->{state} = 'data';
523     ## reconsume
524    
525     return ({type => 'character', data => '<'});
526    
527     redo A;
528     }
529     } else {
530     die "$0: $self->{content_model_flag}: Unknown content model flag";
531     }
532     } elsif ($self->{state} eq 'close tag open') {
533     if ($self->{content_model_flag} eq 'RCDATA' or
534     $self->{content_model_flag} eq 'CDATA') {
535     my @next_char;
536     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
537     push @next_char, $self->{next_input_character};
538     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
539     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
540     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
541    
542     if (@{$self->{char}}) {
543     $self->{next_input_character} = shift @{$self->{char}};
544     } else {
545     $self->{set_next_input_character}->($self);
546     }
547    
548     next TAGNAME;
549     } else {
550     $self->{parse_error}->();
551     $self->{next_input_character} = shift @next_char; # reconsume
552     unshift @{$self->{char}}, (@next_char);
553     $self->{state} = 'data';
554    
555     return ({type => 'character', data => '</'});
556    
557     redo A;
558     }
559     }
560 wakaba 1.3 push @next_char, $self->{next_input_character};
561 wakaba 1.1
562 wakaba 1.3 unless ($self->{next_input_character} == 0x0009 or # HT
563     $self->{next_input_character} == 0x000A or # LF
564     $self->{next_input_character} == 0x000B or # VT
565     $self->{next_input_character} == 0x000C or # FF
566     $self->{next_input_character} == 0x0020 or # SP
567     $self->{next_input_character} == 0x003E or # >
568     $self->{next_input_character} == 0x002F or # /
569     $self->{next_input_character} == 0x003C or # <
570 wakaba 1.1 $self->{next_input_character} == -1) {
571     $self->{parse_error}->();
572     $self->{next_input_character} = shift @next_char; # reconsume
573     unshift @{$self->{char}}, (@next_char);
574     $self->{state} = 'data';
575    
576     return ({type => 'character', data => '</'});
577    
578     redo A;
579     } else {
580     $self->{next_input_character} = shift @next_char;
581     unshift @{$self->{char}}, (@next_char);
582     # and consume...
583     }
584     }
585    
586     if (0x0041 <= $self->{next_input_character} and
587     $self->{next_input_character} <= 0x005A) { # A..Z
588     $self->{current_token} = {type => 'end tag',
589     tag_name => chr ($self->{next_input_character} + 0x0020)};
590     $self->{state} = 'tag name';
591    
592     if (@{$self->{char}}) {
593     $self->{next_input_character} = shift @{$self->{char}};
594     } else {
595     $self->{set_next_input_character}->($self);
596     }
597    
598     redo A;
599     } elsif (0x0061 <= $self->{next_input_character} and
600     $self->{next_input_character} <= 0x007A) { # a..z
601     $self->{current_token} = {type => 'end tag',
602     tag_name => chr ($self->{next_input_character})};
603     $self->{state} = 'tag name';
604    
605     if (@{$self->{char}}) {
606     $self->{next_input_character} = shift @{$self->{char}};
607     } else {
608     $self->{set_next_input_character}->($self);
609     }
610    
611     redo A;
612     } elsif ($self->{next_input_character} == 0x003E) { # >
613     $self->{parse_error}->();
614     $self->{state} = 'data';
615    
616     if (@{$self->{char}}) {
617     $self->{next_input_character} = shift @{$self->{char}};
618     } else {
619     $self->{set_next_input_character}->($self);
620     }
621    
622     redo A;
623     } elsif ($self->{next_input_character} == -1) {
624     $self->{parse_error}->();
625     $self->{state} = 'data';
626     # reconsume
627    
628     return ({type => 'character', data => '</'});
629    
630     redo A;
631     } else {
632     $self->{parse_error}->();
633     $self->{state} = 'bogus comment';
634     ## $self->{next_input_character} is intentionally left as is
635     redo A;
636     }
637     } elsif ($self->{state} eq 'tag name') {
638     if ($self->{next_input_character} == 0x0009 or # HT
639     $self->{next_input_character} == 0x000A or # LF
640     $self->{next_input_character} == 0x000B or # VT
641     $self->{next_input_character} == 0x000C or # FF
642     $self->{next_input_character} == 0x0020) { # SP
643     $self->{state} = 'before attribute name';
644    
645     if (@{$self->{char}}) {
646     $self->{next_input_character} = shift @{$self->{char}};
647     } else {
648     $self->{set_next_input_character}->($self);
649     }
650    
651     redo A;
652     } elsif ($self->{next_input_character} == 0x003E) { # >
653     if ($self->{current_token}->{type} eq 'start tag') {
654     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
655     } elsif ($self->{current_token}->{type} eq 'end tag') {
656     $self->{content_model_flag} = 'PCDATA'; # MUST
657 wakaba 1.3 if ($self->{current_token}->{attributes}) {
658 wakaba 1.1 $self->{parse_error}->();
659     }
660     } else {
661     die "$0: $self->{current_token}->{type}: Unknown token type";
662     }
663     $self->{state} = 'data';
664    
665     if (@{$self->{char}}) {
666     $self->{next_input_character} = shift @{$self->{char}};
667     } else {
668     $self->{set_next_input_character}->($self);
669     }
670    
671    
672     return ($self->{current_token}); # start tag or end tag
673     undef $self->{current_token};
674    
675     redo A;
676     } elsif (0x0041 <= $self->{next_input_character} and
677     $self->{next_input_character} <= 0x005A) { # A..Z
678     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
679     # start tag or end tag
680     ## Stay in this state
681    
682     if (@{$self->{char}}) {
683     $self->{next_input_character} = shift @{$self->{char}};
684     } else {
685     $self->{set_next_input_character}->($self);
686     }
687    
688     redo A;
689     } elsif ($self->{next_input_character} == 0x003C or # <
690     $self->{next_input_character} == -1) {
691     $self->{parse_error}->();
692     if ($self->{current_token}->{type} eq 'start tag') {
693     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
694     } elsif ($self->{current_token}->{type} eq 'end tag') {
695     $self->{content_model_flag} = 'PCDATA'; # MUST
696 wakaba 1.3 if ($self->{current_token}->{attributes}) {
697 wakaba 1.1 $self->{parse_error}->();
698     }
699     } else {
700     die "$0: $self->{current_token}->{type}: Unknown token type";
701     }
702     $self->{state} = 'data';
703     # reconsume
704    
705     return ($self->{current_token}); # start tag or end tag
706     undef $self->{current_token};
707    
708     redo A;
709     } elsif ($self->{next_input_character} == 0x002F) { # /
710    
711     if (@{$self->{char}}) {
712     $self->{next_input_character} = shift @{$self->{char}};
713     } else {
714     $self->{set_next_input_character}->($self);
715     }
716    
717     if ($self->{next_input_character} == 0x003E and # >
718     $self->{current_token}->{type} eq 'start tag' and
719     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
720     # permitted slash
721     #
722     } else {
723     $self->{parse_error}->();
724     }
725     $self->{state} = 'before attribute name';
726     # next-input-character is already done
727     redo A;
728     } else {
729     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
730     # start tag or end tag
731     ## Stay in the state
732    
733     if (@{$self->{char}}) {
734     $self->{next_input_character} = shift @{$self->{char}};
735     } else {
736     $self->{set_next_input_character}->($self);
737     }
738    
739     redo A;
740     }
741     } elsif ($self->{state} eq 'before attribute name') {
742     if ($self->{next_input_character} == 0x0009 or # HT
743     $self->{next_input_character} == 0x000A or # LF
744     $self->{next_input_character} == 0x000B or # VT
745     $self->{next_input_character} == 0x000C or # FF
746     $self->{next_input_character} == 0x0020) { # SP
747     ## Stay in the state
748    
749     if (@{$self->{char}}) {
750     $self->{next_input_character} = shift @{$self->{char}};
751     } else {
752     $self->{set_next_input_character}->($self);
753     }
754    
755     redo A;
756     } elsif ($self->{next_input_character} == 0x003E) { # >
757     if ($self->{current_token}->{type} eq 'start tag') {
758     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
759     } elsif ($self->{current_token}->{type} eq 'end tag') {
760     $self->{content_model_flag} = 'PCDATA'; # MUST
761 wakaba 1.3 if ($self->{current_token}->{attributes}) {
762 wakaba 1.1 $self->{parse_error}->();
763     }
764     } else {
765     die "$0: $self->{current_token}->{type}: Unknown token type";
766     }
767     $self->{state} = 'data';
768    
769     if (@{$self->{char}}) {
770     $self->{next_input_character} = shift @{$self->{char}};
771     } else {
772     $self->{set_next_input_character}->($self);
773     }
774    
775    
776     return ($self->{current_token}); # start tag or end tag
777     undef $self->{current_token};
778    
779     redo A;
780     } elsif (0x0041 <= $self->{next_input_character} and
781     $self->{next_input_character} <= 0x005A) { # A..Z
782     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
783     value => ''};
784     $self->{state} = 'attribute name';
785    
786     if (@{$self->{char}}) {
787     $self->{next_input_character} = shift @{$self->{char}};
788     } else {
789     $self->{set_next_input_character}->($self);
790     }
791    
792     redo A;
793     } elsif ($self->{next_input_character} == 0x002F) { # /
794    
795     if (@{$self->{char}}) {
796     $self->{next_input_character} = shift @{$self->{char}};
797     } else {
798     $self->{set_next_input_character}->($self);
799     }
800    
801     if ($self->{next_input_character} == 0x003E and # >
802     $self->{current_token}->{type} eq 'start tag' and
803     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
804     # permitted slash
805     #
806     } else {
807     $self->{parse_error}->();
808     }
809     ## Stay in the state
810     # next-input-character is already done
811     redo A;
812     } elsif ($self->{next_input_character} == 0x003C or # <
813     $self->{next_input_character} == -1) {
814     $self->{parse_error}->();
815     if ($self->{current_token}->{type} eq 'start tag') {
816     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
817     } elsif ($self->{current_token}->{type} eq 'end tag') {
818     $self->{content_model_flag} = 'PCDATA'; # MUST
819 wakaba 1.3 if ($self->{current_token}->{attributes}) {
820 wakaba 1.1 $self->{parse_error}->();
821     }
822     } else {
823     die "$0: $self->{current_token}->{type}: Unknown token type";
824     }
825     $self->{state} = 'data';
826     # reconsume
827    
828     return ($self->{current_token}); # start tag or end tag
829     undef $self->{current_token};
830    
831     redo A;
832     } else {
833     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
834     value => ''};
835     $self->{state} = 'attribute name';
836    
837     if (@{$self->{char}}) {
838     $self->{next_input_character} = shift @{$self->{char}};
839     } else {
840     $self->{set_next_input_character}->($self);
841     }
842    
843     redo A;
844     }
845     } elsif ($self->{state} eq 'attribute name') {
846     my $before_leave = sub {
847 wakaba 1.3 if (exists $self->{current_token}->{attributes} # start tag or end tag
848 wakaba 1.1 ->{$self->{current_attribute}->{name}}) { # MUST
849     $self->{parse_error}->();
850     ## Discard $self->{current_attribute} # MUST
851     } else {
852 wakaba 1.3 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
853 wakaba 1.1 = $self->{current_attribute};
854     }
855     }; # $before_leave
856    
857     if ($self->{next_input_character} == 0x0009 or # HT
858     $self->{next_input_character} == 0x000A or # LF
859     $self->{next_input_character} == 0x000B or # VT
860     $self->{next_input_character} == 0x000C or # FF
861     $self->{next_input_character} == 0x0020) { # SP
862     $before_leave->();
863     $self->{state} = 'after attribute name';
864    
865     if (@{$self->{char}}) {
866     $self->{next_input_character} = shift @{$self->{char}};
867     } else {
868     $self->{set_next_input_character}->($self);
869     }
870    
871     redo A;
872     } elsif ($self->{next_input_character} == 0x003D) { # =
873     $before_leave->();
874     $self->{state} = 'before attribute value';
875    
876     if (@{$self->{char}}) {
877     $self->{next_input_character} = shift @{$self->{char}};
878     } else {
879     $self->{set_next_input_character}->($self);
880     }
881    
882     redo A;
883     } elsif ($self->{next_input_character} == 0x003E) { # >
884     $before_leave->();
885     if ($self->{current_token}->{type} eq 'start tag') {
886     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
887     } elsif ($self->{current_token}->{type} eq 'end tag') {
888     $self->{content_model_flag} = 'PCDATA'; # MUST
889 wakaba 1.3 if ($self->{current_token}->{attributes}) {
890 wakaba 1.1 $self->{parse_error}->();
891     }
892     } else {
893     die "$0: $self->{current_token}->{type}: Unknown token type";
894     }
895     $self->{state} = 'data';
896    
897     if (@{$self->{char}}) {
898     $self->{next_input_character} = shift @{$self->{char}};
899     } else {
900     $self->{set_next_input_character}->($self);
901     }
902    
903    
904     return ($self->{current_token}); # start tag or end tag
905     undef $self->{current_token};
906    
907     redo A;
908     } elsif (0x0041 <= $self->{next_input_character} and
909     $self->{next_input_character} <= 0x005A) { # A..Z
910     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
911     ## Stay in the state
912    
913     if (@{$self->{char}}) {
914     $self->{next_input_character} = shift @{$self->{char}};
915     } else {
916     $self->{set_next_input_character}->($self);
917     }
918    
919     redo A;
920     } elsif ($self->{next_input_character} == 0x002F) { # /
921     $before_leave->();
922    
923     if (@{$self->{char}}) {
924     $self->{next_input_character} = shift @{$self->{char}};
925     } else {
926     $self->{set_next_input_character}->($self);
927     }
928    
929     if ($self->{next_input_character} == 0x003E and # >
930     $self->{current_token}->{type} eq 'start tag' and
931     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
932     # permitted slash
933     #
934     } else {
935     $self->{parse_error}->();
936     }
937     $self->{state} = 'before attribute name';
938     # next-input-character is already done
939     redo A;
940     } elsif ($self->{next_input_character} == 0x003C or # <
941     $self->{next_input_character} == -1) {
942     $self->{parse_error}->();
943     $before_leave->();
944     if ($self->{current_token}->{type} eq 'start tag') {
945     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
946     } elsif ($self->{current_token}->{type} eq 'end tag') {
947     $self->{content_model_flag} = 'PCDATA'; # MUST
948 wakaba 1.3 if ($self->{current_token}->{attributes}) {
949 wakaba 1.1 $self->{parse_error}->();
950     }
951     } else {
952     die "$0: $self->{current_token}->{type}: Unknown token type";
953     }
954     $self->{state} = 'data';
955     # reconsume
956    
957     return ($self->{current_token}); # start tag or end tag
958     undef $self->{current_token};
959    
960     redo A;
961     } else {
962     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
963     ## Stay in the state
964    
965     if (@{$self->{char}}) {
966     $self->{next_input_character} = shift @{$self->{char}};
967     } else {
968     $self->{set_next_input_character}->($self);
969     }
970    
971     redo A;
972     }
973     } elsif ($self->{state} eq 'after attribute name') {
974     if ($self->{next_input_character} == 0x0009 or # HT
975     $self->{next_input_character} == 0x000A or # LF
976     $self->{next_input_character} == 0x000B or # VT
977     $self->{next_input_character} == 0x000C or # FF
978     $self->{next_input_character} == 0x0020) { # SP
979     ## Stay in the state
980    
981     if (@{$self->{char}}) {
982     $self->{next_input_character} = shift @{$self->{char}};
983     } else {
984     $self->{set_next_input_character}->($self);
985     }
986    
987     redo A;
988     } elsif ($self->{next_input_character} == 0x003D) { # =
989     $self->{state} = 'before attribute value';
990    
991     if (@{$self->{char}}) {
992     $self->{next_input_character} = shift @{$self->{char}};
993     } else {
994     $self->{set_next_input_character}->($self);
995     }
996    
997     redo A;
998     } elsif ($self->{next_input_character} == 0x003E) { # >
999     if ($self->{current_token}->{type} eq 'start tag') {
1000     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1001     } elsif ($self->{current_token}->{type} eq 'end tag') {
1002     $self->{content_model_flag} = 'PCDATA'; # MUST
1003 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1004 wakaba 1.1 $self->{parse_error}->();
1005     }
1006     } else {
1007     die "$0: $self->{current_token}->{type}: Unknown token type";
1008     }
1009     $self->{state} = 'data';
1010    
1011     if (@{$self->{char}}) {
1012     $self->{next_input_character} = shift @{$self->{char}};
1013     } else {
1014     $self->{set_next_input_character}->($self);
1015     }
1016    
1017    
1018     return ($self->{current_token}); # start tag or end tag
1019     undef $self->{current_token};
1020    
1021     redo A;
1022     } elsif (0x0041 <= $self->{next_input_character} and
1023     $self->{next_input_character} <= 0x005A) { # A..Z
1024     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
1025     value => ''};
1026     $self->{state} = 'attribute name';
1027    
1028     if (@{$self->{char}}) {
1029     $self->{next_input_character} = shift @{$self->{char}};
1030     } else {
1031     $self->{set_next_input_character}->($self);
1032     }
1033    
1034     redo A;
1035     } elsif ($self->{next_input_character} == 0x002F) { # /
1036    
1037     if (@{$self->{char}}) {
1038     $self->{next_input_character} = shift @{$self->{char}};
1039     } else {
1040     $self->{set_next_input_character}->($self);
1041     }
1042    
1043     if ($self->{next_input_character} == 0x003E and # >
1044     $self->{current_token}->{type} eq 'start tag' and
1045     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1046     # permitted slash
1047     #
1048     } else {
1049     $self->{parse_error}->();
1050     }
1051     $self->{state} = 'before attribute name';
1052     # next-input-character is already done
1053     redo A;
1054     } elsif ($self->{next_input_character} == 0x003C or # <
1055     $self->{next_input_character} == -1) {
1056     $self->{parse_error}->();
1057     if ($self->{current_token}->{type} eq 'start tag') {
1058     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1059     } elsif ($self->{current_token}->{type} eq 'end tag') {
1060     $self->{content_model_flag} = 'PCDATA'; # MUST
1061 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1062 wakaba 1.1 $self->{parse_error}->();
1063     }
1064     } else {
1065     die "$0: $self->{current_token}->{type}: Unknown token type";
1066     }
1067     $self->{state} = 'data';
1068     # reconsume
1069    
1070     return ($self->{current_token}); # start tag or end tag
1071     undef $self->{current_token};
1072    
1073     redo A;
1074     } else {
1075     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
1076     value => ''};
1077     $self->{state} = 'attribute name';
1078    
1079     if (@{$self->{char}}) {
1080     $self->{next_input_character} = shift @{$self->{char}};
1081     } else {
1082     $self->{set_next_input_character}->($self);
1083     }
1084    
1085     redo A;
1086     }
1087     } elsif ($self->{state} eq 'before attribute value') {
1088     if ($self->{next_input_character} == 0x0009 or # HT
1089     $self->{next_input_character} == 0x000A or # LF
1090     $self->{next_input_character} == 0x000B or # VT
1091     $self->{next_input_character} == 0x000C or # FF
1092     $self->{next_input_character} == 0x0020) { # SP
1093     ## Stay in the state
1094    
1095     if (@{$self->{char}}) {
1096     $self->{next_input_character} = shift @{$self->{char}};
1097     } else {
1098     $self->{set_next_input_character}->($self);
1099     }
1100    
1101     redo A;
1102     } elsif ($self->{next_input_character} == 0x0022) { # "
1103     $self->{state} = 'attribute value (double-quoted)';
1104    
1105     if (@{$self->{char}}) {
1106     $self->{next_input_character} = shift @{$self->{char}};
1107     } else {
1108     $self->{set_next_input_character}->($self);
1109     }
1110    
1111     redo A;
1112     } elsif ($self->{next_input_character} == 0x0026) { # &
1113     $self->{state} = 'attribute value (unquoted)';
1114     ## reconsume
1115     redo A;
1116     } elsif ($self->{next_input_character} == 0x0027) { # '
1117     $self->{state} = 'attribute value (single-quoted)';
1118    
1119     if (@{$self->{char}}) {
1120     $self->{next_input_character} = shift @{$self->{char}};
1121     } else {
1122     $self->{set_next_input_character}->($self);
1123     }
1124    
1125     redo A;
1126     } elsif ($self->{next_input_character} == 0x003E) { # >
1127     if ($self->{current_token}->{type} eq 'start tag') {
1128     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1129     } elsif ($self->{current_token}->{type} eq 'end tag') {
1130     $self->{content_model_flag} = 'PCDATA'; # MUST
1131 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1132 wakaba 1.1 $self->{parse_error}->();
1133     }
1134     } else {
1135     die "$0: $self->{current_token}->{type}: Unknown token type";
1136     }
1137     $self->{state} = 'data';
1138    
1139     if (@{$self->{char}}) {
1140     $self->{next_input_character} = shift @{$self->{char}};
1141     } else {
1142     $self->{set_next_input_character}->($self);
1143     }
1144    
1145    
1146     return ($self->{current_token}); # start tag or end tag
1147     undef $self->{current_token};
1148    
1149     redo A;
1150     } elsif ($self->{next_input_character} == 0x003C or # <
1151     $self->{next_input_character} == -1) {
1152     $self->{parse_error}->();
1153     if ($self->{current_token}->{type} eq 'start tag') {
1154     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1155     } elsif ($self->{current_token}->{type} eq 'end tag') {
1156     $self->{content_model_flag} = 'PCDATA'; # MUST
1157 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1158 wakaba 1.1 $self->{parse_error}->();
1159     }
1160     } else {
1161     die "$0: $self->{current_token}->{type}: Unknown token type";
1162     }
1163     $self->{state} = 'data';
1164     ## reconsume
1165    
1166     return ($self->{current_token}); # start tag or end tag
1167     undef $self->{current_token};
1168    
1169     redo A;
1170     } else {
1171     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1172     $self->{state} = 'attribute value (unquoted)';
1173    
1174     if (@{$self->{char}}) {
1175     $self->{next_input_character} = shift @{$self->{char}};
1176     } else {
1177     $self->{set_next_input_character}->($self);
1178     }
1179    
1180     redo A;
1181     }
1182     } elsif ($self->{state} eq 'attribute value (double-quoted)') {
1183     if ($self->{next_input_character} == 0x0022) { # "
1184     $self->{state} = 'before attribute name';
1185    
1186     if (@{$self->{char}}) {
1187     $self->{next_input_character} = shift @{$self->{char}};
1188     } else {
1189     $self->{set_next_input_character}->($self);
1190     }
1191    
1192     redo A;
1193     } elsif ($self->{next_input_character} == 0x0026) { # &
1194     $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
1195     $self->{state} = 'entity in attribute value';
1196    
1197     if (@{$self->{char}}) {
1198     $self->{next_input_character} = shift @{$self->{char}};
1199     } else {
1200     $self->{set_next_input_character}->($self);
1201     }
1202    
1203     redo A;
1204     } elsif ($self->{next_input_character} == -1) {
1205     $self->{parse_error}->();
1206     if ($self->{current_token}->{type} eq 'start tag') {
1207     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1208     } elsif ($self->{current_token}->{type} eq 'end tag') {
1209     $self->{content_model_flag} = 'PCDATA'; # MUST
1210 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1211 wakaba 1.1 $self->{parse_error}->();
1212     }
1213     } else {
1214     die "$0: $self->{current_token}->{type}: Unknown token type";
1215     }
1216     $self->{state} = 'data';
1217     ## reconsume
1218    
1219     return ($self->{current_token}); # start tag or end tag
1220     undef $self->{current_token};
1221    
1222     redo A;
1223     } else {
1224     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1225     ## Stay in the state
1226    
1227     if (@{$self->{char}}) {
1228     $self->{next_input_character} = shift @{$self->{char}};
1229     } else {
1230     $self->{set_next_input_character}->($self);
1231     }
1232    
1233     redo A;
1234     }
1235     } elsif ($self->{state} eq 'attribute value (single-quoted)') {
1236     if ($self->{next_input_character} == 0x0027) { # '
1237     $self->{state} = 'before attribute name';
1238    
1239     if (@{$self->{char}}) {
1240     $self->{next_input_character} = shift @{$self->{char}};
1241     } else {
1242     $self->{set_next_input_character}->($self);
1243     }
1244    
1245     redo A;
1246     } elsif ($self->{next_input_character} == 0x0026) { # &
1247     $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
1248     $self->{state} = 'entity in attribute value';
1249    
1250     if (@{$self->{char}}) {
1251     $self->{next_input_character} = shift @{$self->{char}};
1252     } else {
1253     $self->{set_next_input_character}->($self);
1254     }
1255    
1256     redo A;
1257     } elsif ($self->{next_input_character} == -1) {
1258     $self->{parse_error}->();
1259     if ($self->{current_token}->{type} eq 'start tag') {
1260     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1261     } elsif ($self->{current_token}->{type} eq 'end tag') {
1262     $self->{content_model_flag} = 'PCDATA'; # MUST
1263 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1264 wakaba 1.1 $self->{parse_error}->();
1265     }
1266     } else {
1267     die "$0: $self->{current_token}->{type}: Unknown token type";
1268     }
1269     $self->{state} = 'data';
1270     ## reconsume
1271    
1272     return ($self->{current_token}); # start tag or end tag
1273     undef $self->{current_token};
1274    
1275     redo A;
1276     } else {
1277     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1278     ## Stay in the state
1279    
1280     if (@{$self->{char}}) {
1281     $self->{next_input_character} = shift @{$self->{char}};
1282     } else {
1283     $self->{set_next_input_character}->($self);
1284     }
1285    
1286     redo A;
1287     }
1288     } elsif ($self->{state} eq 'attribute value (unquoted)') {
1289     if ($self->{next_input_character} == 0x0009 or # HT
1290     $self->{next_input_character} == 0x000A or # LF
1291     $self->{next_input_character} == 0x000B or # VT
1292     $self->{next_input_character} == 0x000C or # FF
1293     $self->{next_input_character} == 0x0020) { # SP
1294     $self->{state} = 'before attribute name';
1295    
1296     if (@{$self->{char}}) {
1297     $self->{next_input_character} = shift @{$self->{char}};
1298     } else {
1299     $self->{set_next_input_character}->($self);
1300     }
1301    
1302     redo A;
1303     } elsif ($self->{next_input_character} == 0x0026) { # &
1304     $self->{last_attribute_value_state} = 'attribute value (unquoted)';
1305     $self->{state} = 'entity in attribute value';
1306    
1307     if (@{$self->{char}}) {
1308     $self->{next_input_character} = shift @{$self->{char}};
1309     } else {
1310     $self->{set_next_input_character}->($self);
1311     }
1312    
1313     redo A;
1314     } elsif ($self->{next_input_character} == 0x003E) { # >
1315     if ($self->{current_token}->{type} eq 'start tag') {
1316     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1317     } elsif ($self->{current_token}->{type} eq 'end tag') {
1318     $self->{content_model_flag} = 'PCDATA'; # MUST
1319 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1320 wakaba 1.1 $self->{parse_error}->();
1321     }
1322     } else {
1323     die "$0: $self->{current_token}->{type}: Unknown token type";
1324     }
1325     $self->{state} = 'data';
1326    
1327     if (@{$self->{char}}) {
1328     $self->{next_input_character} = shift @{$self->{char}};
1329     } else {
1330     $self->{set_next_input_character}->($self);
1331     }
1332    
1333    
1334     return ($self->{current_token}); # start tag or end tag
1335     undef $self->{current_token};
1336    
1337     redo A;
1338     } elsif ($self->{next_input_character} == 0x003C or # <
1339     $self->{next_input_character} == -1) {
1340     $self->{parse_error}->();
1341     if ($self->{current_token}->{type} eq 'start tag') {
1342     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1343     } elsif ($self->{current_token}->{type} eq 'end tag') {
1344     $self->{content_model_flag} = 'PCDATA'; # MUST
1345 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1346 wakaba 1.1 $self->{parse_error}->();
1347     }
1348     } else {
1349     die "$0: $self->{current_token}->{type}: Unknown token type";
1350     }
1351     $self->{state} = 'data';
1352     ## reconsume
1353    
1354     return ($self->{current_token}); # start tag or end tag
1355     undef $self->{current_token};
1356    
1357     redo A;
1358     } else {
1359     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1360     ## Stay in the state
1361    
1362     if (@{$self->{char}}) {
1363     $self->{next_input_character} = shift @{$self->{char}};
1364     } else {
1365     $self->{set_next_input_character}->($self);
1366     }
1367    
1368     redo A;
1369     }
1370     } elsif ($self->{state} eq 'entity in attribute value') {
1371     my $token = $self->_tokenize_attempt_to_consume_an_entity;
1372    
1373     unless (defined $token) {
1374     $self->{current_attribute}->{value} .= '&';
1375     } else {
1376     $self->{current_attribute}->{value} .= $token->{data};
1377     ## ISSUE: spec says "append the returned character token to the current attribute's value"
1378     }
1379    
1380     $self->{state} = $self->{last_attribute_value_state};
1381     # next-input-character is already done
1382     redo A;
1383     } elsif ($self->{state} eq 'bogus comment') {
1384     ## (only happen if PCDATA state)
1385    
1386     my $token = {type => 'comment', data => ''};
1387    
1388     BC: {
1389     if ($self->{next_input_character} == 0x003E) { # >
1390     $self->{state} = 'data';
1391    
1392     if (@{$self->{char}}) {
1393     $self->{next_input_character} = shift @{$self->{char}};
1394     } else {
1395     $self->{set_next_input_character}->($self);
1396     }
1397    
1398    
1399     return ($token);
1400    
1401     redo A;
1402     } elsif ($self->{next_input_character} == -1) {
1403     $self->{state} = 'data';
1404     ## reconsume
1405    
1406     return ($token);
1407    
1408     redo A;
1409     } else {
1410     $token->{data} .= chr ($self->{next_input_character});
1411    
1412     if (@{$self->{char}}) {
1413     $self->{next_input_character} = shift @{$self->{char}};
1414     } else {
1415     $self->{set_next_input_character}->($self);
1416     }
1417    
1418     redo BC;
1419     }
1420     } # BC
1421     } elsif ($self->{state} eq 'markup declaration open') {
1422     ## (only happen if PCDATA state)
1423    
1424     my @next_char;
1425     push @next_char, $self->{next_input_character};
1426    
1427     if ($self->{next_input_character} == 0x002D) { # -
1428    
1429     if (@{$self->{char}}) {
1430     $self->{next_input_character} = shift @{$self->{char}};
1431     } else {
1432     $self->{set_next_input_character}->($self);
1433     }
1434    
1435     push @next_char, $self->{next_input_character};
1436     if ($self->{next_input_character} == 0x002D) { # -
1437     $self->{current_token} = {type => 'comment', data => ''};
1438     $self->{state} = 'comment';
1439    
1440     if (@{$self->{char}}) {
1441     $self->{next_input_character} = shift @{$self->{char}};
1442     } else {
1443     $self->{set_next_input_character}->($self);
1444     }
1445    
1446     redo A;
1447     }
1448     } elsif ($self->{next_input_character} == 0x0044 or # D
1449     $self->{next_input_character} == 0x0064) { # d
1450    
1451     if (@{$self->{char}}) {
1452     $self->{next_input_character} = shift @{$self->{char}};
1453     } else {
1454     $self->{set_next_input_character}->($self);
1455     }
1456    
1457     push @next_char, $self->{next_input_character};
1458     if ($self->{next_input_character} == 0x004F or # O
1459     $self->{next_input_character} == 0x006F) { # o
1460    
1461     if (@{$self->{char}}) {
1462     $self->{next_input_character} = shift @{$self->{char}};
1463     } else {
1464     $self->{set_next_input_character}->($self);
1465     }
1466    
1467     push @next_char, $self->{next_input_character};
1468     if ($self->{next_input_character} == 0x0043 or # C
1469     $self->{next_input_character} == 0x0063) { # c
1470    
1471     if (@{$self->{char}}) {
1472     $self->{next_input_character} = shift @{$self->{char}};
1473     } else {
1474     $self->{set_next_input_character}->($self);
1475     }
1476    
1477     push @next_char, $self->{next_input_character};
1478     if ($self->{next_input_character} == 0x0054 or # T
1479     $self->{next_input_character} == 0x0074) { # t
1480    
1481     if (@{$self->{char}}) {
1482     $self->{next_input_character} = shift @{$self->{char}};
1483     } else {
1484     $self->{set_next_input_character}->($self);
1485     }
1486    
1487     push @next_char, $self->{next_input_character};
1488     if ($self->{next_input_character} == 0x0059 or # Y
1489     $self->{next_input_character} == 0x0079) { # y
1490    
1491     if (@{$self->{char}}) {
1492     $self->{next_input_character} = shift @{$self->{char}};
1493     } else {
1494     $self->{set_next_input_character}->($self);
1495     }
1496    
1497     push @next_char, $self->{next_input_character};
1498     if ($self->{next_input_character} == 0x0050 or # P
1499     $self->{next_input_character} == 0x0070) { # p
1500    
1501     if (@{$self->{char}}) {
1502     $self->{next_input_character} = shift @{$self->{char}};
1503     } else {
1504     $self->{set_next_input_character}->($self);
1505     }
1506    
1507     push @next_char, $self->{next_input_character};
1508     if ($self->{next_input_character} == 0x0045 or # E
1509     $self->{next_input_character} == 0x0065) { # e
1510     ## ISSUE: What a stupid code this is!
1511     $self->{state} = 'DOCTYPE';
1512    
1513     if (@{$self->{char}}) {
1514     $self->{next_input_character} = shift @{$self->{char}};
1515     } else {
1516     $self->{set_next_input_character}->($self);
1517     }
1518    
1519     redo A;
1520     }
1521     }
1522     }
1523     }
1524     }
1525     }
1526     }
1527    
1528     $self->{parse_error}->();
1529     $self->{next_input_character} = shift @next_char;
1530     unshift @{$self->{char}}, (@next_char);
1531     $self->{state} = 'bogus comment';
1532     redo A;
1533    
1534     ## ISSUE: typos in spec: chacacters, is is a parse error
1535     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1536     } elsif ($self->{state} eq 'comment') {
1537     if ($self->{next_input_character} == 0x002D) { # -
1538     $self->{state} = 'comment dash';
1539    
1540     if (@{$self->{char}}) {
1541     $self->{next_input_character} = shift @{$self->{char}};
1542     } else {
1543     $self->{set_next_input_character}->($self);
1544     }
1545    
1546     redo A;
1547     } elsif ($self->{next_input_character} == -1) {
1548     $self->{parse_error}->();
1549     $self->{state} = 'data';
1550     ## reconsume
1551    
1552     return ($self->{current_token}); # comment
1553     undef $self->{current_token};
1554    
1555     redo A;
1556     } else {
1557     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1558     ## Stay in the state
1559    
1560     if (@{$self->{char}}) {
1561     $self->{next_input_character} = shift @{$self->{char}};
1562     } else {
1563     $self->{set_next_input_character}->($self);
1564     }
1565    
1566     redo A;
1567     }
1568     } elsif ($self->{state} eq 'comment dash') {
1569     if ($self->{next_input_character} == 0x002D) { # -
1570     $self->{state} = 'comment end';
1571    
1572     if (@{$self->{char}}) {
1573     $self->{next_input_character} = shift @{$self->{char}};
1574     } else {
1575     $self->{set_next_input_character}->($self);
1576     }
1577    
1578     redo A;
1579     } elsif ($self->{next_input_character} == -1) {
1580     $self->{parse_error}->();
1581     $self->{state} = 'data';
1582     ## reconsume
1583    
1584     return ($self->{current_token}); # comment
1585     undef $self->{current_token};
1586    
1587     redo A;
1588     } else {
1589     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1590     $self->{state} = 'comment';
1591    
1592     if (@{$self->{char}}) {
1593     $self->{next_input_character} = shift @{$self->{char}};
1594     } else {
1595     $self->{set_next_input_character}->($self);
1596     }
1597    
1598     redo A;
1599     }
1600     } elsif ($self->{state} eq 'comment end') {
1601     if ($self->{next_input_character} == 0x003E) { # >
1602     $self->{state} = 'data';
1603    
1604     if (@{$self->{char}}) {
1605     $self->{next_input_character} = shift @{$self->{char}};
1606     } else {
1607     $self->{set_next_input_character}->($self);
1608     }
1609    
1610    
1611     return ($self->{current_token}); # comment
1612     undef $self->{current_token};
1613    
1614     redo A;
1615     } elsif ($self->{next_input_character} == 0x002D) { # -
1616     $self->{parse_error}->();
1617     $self->{current_token}->{data} .= '-'; # comment
1618     ## Stay in the state
1619    
1620     if (@{$self->{char}}) {
1621     $self->{next_input_character} = shift @{$self->{char}};
1622     } else {
1623     $self->{set_next_input_character}->($self);
1624     }
1625    
1626     redo A;
1627     } elsif ($self->{next_input_character} == -1) {
1628     $self->{parse_error}->();
1629     $self->{state} = 'data';
1630     ## reconsume
1631    
1632     return ($self->{current_token}); # comment
1633     undef $self->{current_token};
1634    
1635     redo A;
1636     } else {
1637     $self->{parse_error}->();
1638     $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1639     $self->{state} = 'comment';
1640    
1641     if (@{$self->{char}}) {
1642     $self->{next_input_character} = shift @{$self->{char}};
1643     } else {
1644     $self->{set_next_input_character}->($self);
1645     }
1646    
1647     redo A;
1648     }
1649     } elsif ($self->{state} eq 'DOCTYPE') {
1650     if ($self->{next_input_character} == 0x0009 or # HT
1651     $self->{next_input_character} == 0x000A or # LF
1652     $self->{next_input_character} == 0x000B or # VT
1653     $self->{next_input_character} == 0x000C or # FF
1654     $self->{next_input_character} == 0x0020) { # SP
1655     $self->{state} = 'before DOCTYPE name';
1656    
1657     if (@{$self->{char}}) {
1658     $self->{next_input_character} = shift @{$self->{char}};
1659     } else {
1660     $self->{set_next_input_character}->($self);
1661     }
1662    
1663     redo A;
1664     } else {
1665     $self->{parse_error}->();
1666     $self->{state} = 'before DOCTYPE name';
1667     ## reconsume
1668     redo A;
1669     }
1670     } elsif ($self->{state} eq 'before DOCTYPE name') {
1671     if ($self->{next_input_character} == 0x0009 or # HT
1672     $self->{next_input_character} == 0x000A or # LF
1673     $self->{next_input_character} == 0x000B or # VT
1674     $self->{next_input_character} == 0x000C or # FF
1675     $self->{next_input_character} == 0x0020) { # SP
1676     ## Stay in the state
1677    
1678     if (@{$self->{char}}) {
1679     $self->{next_input_character} = shift @{$self->{char}};
1680     } else {
1681     $self->{set_next_input_character}->($self);
1682     }
1683    
1684     redo A;
1685     } elsif (0x0061 <= $self->{next_input_character} and
1686     $self->{next_input_character} <= 0x007A) { # a..z
1687     $self->{current_token} = {type => 'DOCTYPE',
1688     name => chr ($self->{next_input_character} - 0x0020),
1689     error => 1};
1690     $self->{state} = 'DOCTYPE name';
1691    
1692     if (@{$self->{char}}) {
1693     $self->{next_input_character} = shift @{$self->{char}};
1694     } else {
1695     $self->{set_next_input_character}->($self);
1696     }
1697    
1698     redo A;
1699     } elsif ($self->{next_input_character} == 0x003E) { # >
1700     $self->{parse_error}->();
1701     $self->{state} = 'data';
1702    
1703     if (@{$self->{char}}) {
1704     $self->{next_input_character} = shift @{$self->{char}};
1705     } else {
1706     $self->{set_next_input_character}->($self);
1707     }
1708    
1709    
1710     return ({type => 'DOCTYPE', name => '', error => 1});
1711    
1712     redo A;
1713     } elsif ($self->{next_input_character} == -1) {
1714     $self->{parse_error}->();
1715     $self->{state} = 'data';
1716     ## reconsume
1717    
1718     return ({type => 'DOCTYPE', name => '', error => 1});
1719    
1720     redo A;
1721     } else {
1722     $self->{current_token} = {type => 'DOCTYPE',
1723     name => chr ($self->{next_input_character}),
1724     error => 1};
1725     $self->{state} = 'DOCTYPE name';
1726    
1727     if (@{$self->{char}}) {
1728     $self->{next_input_character} = shift @{$self->{char}};
1729     } else {
1730     $self->{set_next_input_character}->($self);
1731     }
1732    
1733     redo A;
1734     }
1735     } elsif ($self->{state} eq 'DOCTYPE name') {
1736     if ($self->{next_input_character} == 0x0009 or # HT
1737     $self->{next_input_character} == 0x000A or # LF
1738     $self->{next_input_character} == 0x000B or # VT
1739     $self->{next_input_character} == 0x000C or # FF
1740     $self->{next_input_character} == 0x0020) { # SP
1741     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1742     $self->{state} = 'after DOCTYPE name';
1743    
1744     if (@{$self->{char}}) {
1745     $self->{next_input_character} = shift @{$self->{char}};
1746     } else {
1747     $self->{set_next_input_character}->($self);
1748     }
1749    
1750     redo A;
1751     } elsif ($self->{next_input_character} == 0x003E) { # >
1752     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1753     $self->{state} = 'data';
1754    
1755     if (@{$self->{char}}) {
1756     $self->{next_input_character} = shift @{$self->{char}};
1757     } else {
1758     $self->{set_next_input_character}->($self);
1759     }
1760    
1761    
1762     return ($self->{current_token}); # DOCTYPE
1763     undef $self->{current_token};
1764    
1765     redo A;
1766     } elsif (0x0061 <= $self->{next_input_character} and
1767     $self->{next_input_character} <= 0x007A) { # a..z
1768     $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1769     #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1770     ## Stay in the state
1771    
1772     if (@{$self->{char}}) {
1773     $self->{next_input_character} = shift @{$self->{char}};
1774     } else {
1775     $self->{set_next_input_character}->($self);
1776     }
1777    
1778     redo A;
1779     } elsif ($self->{next_input_character} == -1) {
1780     $self->{parse_error}->();
1781     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1782     $self->{state} = 'data';
1783     ## reconsume
1784    
1785     return ($self->{current_token});
1786     undef $self->{current_token};
1787    
1788     redo A;
1789     } else {
1790 wakaba 1.4 $self->{current_token}->{name}
1791     .= chr ($self->{next_input_character}); # DOCTYPE
1792 wakaba 1.1 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1793     ## Stay in the state
1794    
1795     if (@{$self->{char}}) {
1796     $self->{next_input_character} = shift @{$self->{char}};
1797     } else {
1798     $self->{set_next_input_character}->($self);
1799     }
1800    
1801     redo A;
1802     }
1803     } elsif ($self->{state} eq 'after DOCTYPE name') {
1804     if ($self->{next_input_character} == 0x0009 or # HT
1805     $self->{next_input_character} == 0x000A or # LF
1806     $self->{next_input_character} == 0x000B or # VT
1807     $self->{next_input_character} == 0x000C or # FF
1808     $self->{next_input_character} == 0x0020) { # SP
1809     ## Stay in the state
1810    
1811     if (@{$self->{char}}) {
1812     $self->{next_input_character} = shift @{$self->{char}};
1813     } else {
1814     $self->{set_next_input_character}->($self);
1815     }
1816    
1817     redo A;
1818     } elsif ($self->{next_input_character} == 0x003E) { # >
1819     $self->{state} = 'data';
1820    
1821     if (@{$self->{char}}) {
1822     $self->{next_input_character} = shift @{$self->{char}};
1823     } else {
1824     $self->{set_next_input_character}->($self);
1825     }
1826    
1827    
1828     return ($self->{current_token}); # DOCTYPE
1829     undef $self->{current_token};
1830    
1831     redo A;
1832     } elsif ($self->{next_input_character} == -1) {
1833     $self->{parse_error}->();
1834     $self->{state} = 'data';
1835     ## reconsume
1836    
1837     return ($self->{current_token}); # DOCTYPE
1838     undef $self->{current_token};
1839    
1840     redo A;
1841     } else {
1842     $self->{parse_error}->();
1843     $self->{current_token}->{error} = 1; # DOCTYPE
1844     $self->{state} = 'bogus DOCTYPE';
1845    
1846     if (@{$self->{char}}) {
1847     $self->{next_input_character} = shift @{$self->{char}};
1848     } else {
1849     $self->{set_next_input_character}->($self);
1850     }
1851    
1852     redo A;
1853     }
1854     } elsif ($self->{state} eq 'bogus DOCTYPE') {
1855     if ($self->{next_input_character} == 0x003E) { # >
1856     $self->{state} = 'data';
1857    
1858     if (@{$self->{char}}) {
1859     $self->{next_input_character} = shift @{$self->{char}};
1860     } else {
1861     $self->{set_next_input_character}->($self);
1862     }
1863    
1864    
1865     return ($self->{current_token}); # DOCTYPE
1866     undef $self->{current_token};
1867    
1868     redo A;
1869     } elsif ($self->{next_input_character} == -1) {
1870     $self->{parse_error}->();
1871     $self->{state} = 'data';
1872     ## reconsume
1873    
1874     return ($self->{current_token}); # DOCTYPE
1875     undef $self->{current_token};
1876    
1877     redo A;
1878     } else {
1879     ## Stay in the state
1880    
1881     if (@{$self->{char}}) {
1882     $self->{next_input_character} = shift @{$self->{char}};
1883     } else {
1884     $self->{set_next_input_character}->($self);
1885     }
1886    
1887     redo A;
1888     }
1889     } else {
1890     die "$0: $self->{state}: Unknown state";
1891     }
1892     } # A
1893    
1894     die "$0: _get_next_token: unexpected case";
1895     } # _get_next_token
1896    
## Attempts to consume a character reference ("entity") from the input.
##
## On entry |$self->{next_input_character}| holds the code point that
## follows the "&".  Input is advanced by shifting |$self->{char}| (the
## push-back queue) and, once that is empty, by calling the
## |$self->{set_next_input_character}| callback.
##
## Returns a hash reference |{type => 'character', data => $string}| when
## a numeric or named reference was recognised.  Returns |undef| when
## nothing was recognised; in that case |$self->{parse_error}| has been
## called and, for the numeric forms, every character read so far has
## been pushed back so that "#" is the next input character again.
## For an unknown name, the characters read are instead queued as a
## character token on |$self->{token}|.
##
## NOTE: |return undef| (rather than a bare |return|) is kept on purpose
## so that the value seen by existing callers does not change.
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  ## Advance the input by exactly one character.
  my $consume = sub {
    if (@{$self->{char}}) {
      $self->{next_input_character} = shift @{$self->{char}};
    } else {
      $self->{set_next_input_character}->($self);
    }
  };

  ## Common tail of the hexadecimal and decimal forms: swallow an
  ## optional ";" (its absence is a parse error) and build the token.
  my $finish_numeric = sub {
    my $code = shift;

    if ($self->{next_input_character} == 0x003B) { # ;
      $consume->();
    } else {
      $self->{parse_error}->();
    }

    ## TODO: check the definition for |a valid Unicode character|.
    if ($code > 0x10FFFF or $code == 0) {
      $code = 0xFFFD; # REPLACEMENT CHARACTER
      ## ISSUE: Why this is not an error?
    }

    return {type => 'character', data => chr $code};
  };

  if ($self->{next_input_character} == 0x0023) { # #
    $consume->();

    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      my $x_char = $self->{next_input_character};
      $consume->();

      ## Accumulate hexadecimal digits; |$num| stays undefined until at
      ## least one digit has been seen.
      my $num;
      HEX: {
        my $c = $self->{next_input_character};
        my $digit;
        if (0x0030 <= $c and $c <= 0x0039) { # 0..9
          $digit = $c - 0x0030;
        } elsif (0x0061 <= $c and $c <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $digit = $c - 0x0061 + 10;
        } elsif (0x0041 <= $c and $c <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $digit = $c - 0x0041 + 10;
        }
        if (defined $digit) {
          $num = ($num || 0) * 0x10 + $digit;
          $consume->();
          redo HEX;
        }
      } # HEX

      unless (defined $num) { # no hexadecimal digit
        $self->{parse_error}->();
        ## Push back both the "x"/"X" and the character that followed
        ## it; pushing back only the "x" would silently drop one
        ## character of input.
        unshift @{$self->{char}}, ($x_char, $self->{next_input_character});
        $self->{next_input_character} = 0x0023; # #
        return undef;
      }

      return $finish_numeric->($num);
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      my $code = 0;
      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code = $code * 10 + ($self->{next_input_character} - 0x0030);
        $consume->();
      }

      return $finish_numeric->($code);
    } else {
      ## "&#" followed by neither "x" nor a digit: undo the read.
      $self->{parse_error}->();
      unshift @{$self->{char}}, ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    my $entity_name = chr $self->{next_input_character};
    $consume->();

    ## |$value| is the text to emit: the raw characters read so far,
    ## replaced by the entity's character once a name matches.
    my $value = $entity_name;
    my $match;

    while (length ($entity_name) < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x005A) or
            (0x0061 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x007A) or
            (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039))) {
      $entity_name .= chr $self->{next_input_character};
      if (defined $entity_char->{$entity_name}) {
        $value = $entity_char->{$entity_name};
        $match = 1;
      } else {
        $value .= chr $self->{next_input_character};
      }
      $consume->();
    }

    if ($match) {
      if ($self->{next_input_character} == 0x003B) { # ;
        $consume->();
      } else {
        $self->{parse_error}->();
      }

      return {type => 'character', data => $value};
    } else {
      $self->{parse_error}->();
      ## NOTE: No characters are consumed in the spec.
      unshift @{$self->{token}}, ({type => 'character', data => $value});
      return undef;
    }
  } else {
    ## no characters are consumed
    $self->{parse_error}->();
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
2081 wakaba 1.3
## Prepares the tree construction stage: loads What::NanoDOM on demand,
## stores a new What::NanoDOM::Document in |$self->{document}| and calls
## |strict_error_checking (0)| on it.  The value of that call is returned.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  require What::NanoDOM;

  my $document = What::NanoDOM::Document->new;
  $self->{document} = $document;

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  return $document->strict_error_checking (0);
} # _initialize_tree_constructor
2090    
## Finishes the tree construction stage by calling
## |strict_error_checking (1)| on |$self->{document}|; the value of that
## call is returned.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events on
  return $document->strict_error_checking (1);
} # _terminate_tree_constructor
2096    
2097     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2098    
2099     sub _construct_tree ($) {
2100     my ($self) = @_;
2101    
2102     ## When an interactive UA renders the $self->{document} available
2103     ## to the user, or when it begins accepting user input, is
2104     ## not defined.
2105    
2106     ## Append a character: collect it and all subsequent consecutive
2107     ## characters and insert one Text node whose data is concatenation
2108     ## of all those characters. # MUST
2109    
2110     my $token;
2111     $token = $self->_get_next_token;
2112    
2113     my $phase = 'initial'; # MUST
2114    
2115     my $open_elements = [];
2116     my $active_formatting_elements = [];
2117     my $head_element;
2118     my $form_element;
2119     my $insertion_mode = 'before head';
2120    
## Reconstructs the active formatting elements.
##
## Takes one argument, |$insert|: a code reference that is called with
## each freshly cloned element node to place it in the tree.  Walks back
## from the end of |@$active_formatting_elements| to the nearest marker
## or still-open element, then moves forward again, cloning each entry,
## inserting the clone, pushing it onto |@$open_elements| and replacing
## the list entry with it.
my $reconstruct_active_formatting_elements = sub { # MUST
  my ($insert) = @_;

  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3 (done first so that Step 2 can inspect the last entry)
  my $pos = -1;
  my $entry = $active_formatting_elements->[$pos];

  ## Step 2
  return if $entry->[0] eq '#marker';
  foreach my $open (@$open_elements) {
    return if $entry->[0] eq $open->[0];
  }

  REWIND: while (1) {
    ## Step 4: nothing precedes the first entry
    last REWIND if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5
    $pos--;
    $entry = $active_formatting_elements->[$pos];

    ## Step 6: keep rewinding unless this entry is a marker or is
    ## still on the stack of open elements
    my $stop_here = $entry->[0] eq '#marker';
    unless ($stop_here) {
      foreach my $open (@$open_elements) {
        if ($entry->[0] eq $open->[0]) {
          $stop_here = 1;
          last;
        }
      }
    }
    next REWIND unless $stop_here;

    ## Step 7
    $pos++;
    $entry = $active_formatting_elements->[$pos];
    last REWIND;
  } # REWIND

  REOPEN: while (1) {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @$open_elements, $clone;

    ## Step 10
    $active_formatting_elements->[$pos] = $open_elements->[-1];

    ## Step 11: done once the clone occupies the last list position
    last REOPEN if $clone->[0] eq $active_formatting_elements->[-1]->[0];

    ## Step 7'
    $pos++;
    $entry = $active_formatting_elements->[$pos];
  } # REOPEN
}; # $reconstruct_active_formatting_elements
2191    
## Clears |@$active_formatting_elements| up to the last marker: the
## last '#marker' entry and everything after it are removed.  The list
## is left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  my $i = $#$active_formatting_elements;
  while ($i >= 0) {
    if ($active_formatting_elements->[$i]->[0] eq '#marker') {
      splice @$active_formatting_elements, $i;
      return;
    }
    $i--;
  }
}; # $clear_up_to_marker
2200    
## Resets the insertion mode appropriately.
##
## Walks |@$open_elements| from the current node towards the root and
## sets |$insertion_mode| from the first element whose tag name
## (|$node->[1]|) determines a mode.  An |html| element selects
## 'before head' or 'after head' depending on whether |$head_element|
## is defined; reaching the first stack entry without a decision
## selects 'in body'.
my $reset_insertion_mode = sub {
  ## Step 4..13 lookup table (built once, not once per visited node).
  ## |tbody|, |thead| and |tfoot| all select 'in table body': the
  ## algorithm defines no separate head/foot insertion modes.
  my %mode_for = (
    select => 'in select',
    td => 'in cell',
    th => 'in cell',
    tr => 'in row',
    tbody => 'in table body',
    thead => 'in table body',
    tfoot => 'in table body',
    caption => 'in caption',
    colgroup => 'in column group',
    table => 'in table',
    head => 'in body', # not in head!
    body => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $open_elements->[$i];

  ## Step 3
  S3: {
    $last = 1 if $open_elements->[0]->[0] eq $node->[0];
    ## TODO: the element whose inner_html is set is neither td nor th, then $node = the element

    ## Step 4..13
    my $new_mode = $mode_for{$node->[1]};
    if (defined $new_mode) {
      $insertion_mode = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $head_element) {
        $insertion_mode = 'before head';
      } else {
        $insertion_mode = 'after head';
      }
      return;
    }

    ## Step 15
    if ($last) {
      $insertion_mode = 'in body';
      return;
    }

    ## Step 16
    $i--;
    $node = $open_elements->[$i];

    ## Step 17
    redo S3;
  } # S3
}; # $reset_insertion_mode
2253    
## Handles a |style| start tag held in |$token|.
##
## Creates the |style| element, copies the start tag's attributes onto
## it, appends it to |$head_element| (when in the 'in head' insertion
## mode and a head element exists) or to the current node, then reads
## character tokens with the content model flag set to 'CDATA' and
## stores their concatenation as the element's text.  A following
## |</style>| end tag is swallowed; any other token there is a parse
## error and is also skipped.  On return |$token| is the next
## unprocessed token.
my $style_start_tag = sub {
  my $style_el;
  $style_el = $self->{document}->create_element_ns
      (q<http://www.w3.org/1999/xhtml>, [undef, 'style']);

  ## Copy the attributes before |$token| is replaced below, as is done
  ## for every other element created from a start tag.
  for my $attr_name (keys %{ $token->{attributes}}) {
    $style_el->set_attribute_ns (undef, [undef, $attr_name],
                                 $token->{attributes}->{$attr_name}->{value});
  }

  ## $insertion_mode eq 'in head' and ... (always true)
  (($insertion_mode eq 'in head' and defined $head_element)
      ? $head_element : $open_elements->[-1]->[0])
      ->append_child ($style_el);
  $self->{content_model_flag} = 'CDATA';

  my $text = '';
  $token = $self->_get_next_token;
  while ($token->{type} eq 'character') {
    $text .= $token->{data};
    $token = $self->_get_next_token;
  } # stop if non-character token or tokenizer stops tokenising
  if (length $text) {
    $style_el->manakai_append_text ($text);
  }

  $self->{content_model_flag} = 'PCDATA';

  if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
    ## Ignore the token
  } else {
    $self->{parse_error}->();
    ## ISSUE: And ignore?
  }
  $token = $self->_get_next_token;
}; # $style_start_tag
2285    
## Handles a |script| start tag held in |$token|.
##
## Builds the |script| element with the start tag's attributes, reads
## its text with the content model flag set to 'CDATA', and only then
## appends the element to |$head_element| (when in the 'in head'
## insertion mode and a head element exists) or to the current node.
## On return |$token| is the next unprocessed token.
my $script_start_tag = sub {
  my $script_el = $self->{document}->create_element_ns
      (q<http://www.w3.org/1999/xhtml>, [undef, 'script']);

  foreach my $name (keys %{ $token->{attributes}}) {
    my $value = $token->{attributes}->{$name}->{value};
    $script_el->set_attribute_ns (undef, [undef, $name], $value);
  }

  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';

  ## Gather every consecutive character token into one text chunk.
  my @chunks;
  $token = $self->_get_next_token;
  while ($token->{type} eq 'character') {
    push @chunks, $token->{data};
    $token = $self->_get_next_token;
  } # stop if non-character token or tokenizer stops tokenising
  my $text = join '', @chunks;
  $script_el->manakai_append_text ($text) if length $text;

  $self->{content_model_flag} = 'PCDATA';

  ## A matching end tag is simply ignored; anything else is an error.
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    $self->{parse_error}->();
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  ## TODO: inner_html mode then mark as "already executed" and skip
  ## TODO: $old_insertion_point = current insertion point
  ## TODO: insertion point = just before the next input character

  my $parent;
  if ($insertion_mode eq 'in head' and defined $head_element) {
    $parent = $head_element;
  } else {
    $parent = $open_elements->[-1]->[0];
  }
  $parent->append_child ($script_el);

  ## TODO: insertion point = $old_insertion_point (might be "undefined")

  ## TODO: if there is a script that will execute as soon as the parser resume, then...

  $token = $self->_get_next_token;
}; # $script_start_tag
2337    
## Handles an end tag for a formatting element (the "adoption agency"
## steps).
##
## Takes the end tag's name.  Works on |@$active_formatting_elements|
## and |@$open_elements|, whose entries are |[node, tag name]| pairs,
## and re-parents DOM nodes as needed.  The steps are repeated (Step
## 14) until one of the early exits is taken; every exit first
## advances |$token| to the next token.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last entry with this tag name that follows the last
    ## marker in the list of active formatting elements.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      $self->{parse_error}->();
      ## Ignore the token
      $token = $self->_get_next_token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          $self->{parse_error}->();
          ## Ignore the token
          $token = $self->_get_next_token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      $self->{parse_error}->();
      ## Remove the formatting element itself from the list.  It is
      ## not necessarily the last entry, so |pop| would remove the
      ## wrong one.
      splice @$active_formatting_elements,
          $formatting_element_i_in_active, 1;
      $token = $self->_get_next_token; ## TODO: ok?
      return;
    }
    if (not $open_elements->[-1]->[0] eq $formatting_element->[0]) {
      $self->{parse_error}->();
    }

    ## Step 2
    ## Scanning from the current node upwards and overwriting on each
    ## hit leaves the qualifying node nearest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      splice @$open_elements, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      $token = $self->_get_next_token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $open_elements->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is kept as the node of the entry preceding the
    ## formatting element in the list of active formatting elements.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $open_elements->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements
      ## is dropped from the stack and the previous node is examined.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @$open_elements, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $open_elements->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first; it changes while children are moved.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## |$i| tracks the bookmark's index; it is decremented when the
    ## formatting element is removed from in front of it.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#$open_elements) {
      if ($open_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$open_elements, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($open_elements->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## Insert (length 0) the clone right after the furthest block.  A
    ## length of 1 would overwrite whatever element follows the
    ## furthest block on the stack.
    splice @$open_elements, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2520    
## Default insertion callback: appends the given node to the current
## node, i.e. the node of the last entry of |@$open_elements|.
my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $open_elements->[-1]->[0];
  return $current_node->append_child ($child);
}; # $insert_to_current
2524    
## Insertion callback that foster-parents the given node.
##
## When the current node is not one of |table|, |tbody|, |tfoot|,
## |thead| or |tr|, the node is simply appended to the current node.
## Otherwise the last |table| on the stack of open elements is located:
## if it has a parent whose |node_type| is 1, the node is inserted in
## that parent immediately before the table; if not, the node is
## appended to the element preceding the table on the stack.  Without
## any |table| on the stack the first stack entry is used as parent.
my $insert_to_foster = sub {
  my ($child) = @_;

  my $is_table_context = {
    table => 1, tbody => 1, tfoot => 1,
    thead => 1, tr => 1,
  }->{$open_elements->[-1]->[1]};

  unless ($is_table_context) {
    return $open_elements->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);

  ## Index of the last |table| entry, or -1 when there is none.
  my $table_i = $#$open_elements;
  $table_i-- while $table_i >= 0 and
      $open_elements->[$table_i]->[1] ne 'table';

  if ($table_i >= 0) {
    my $table_el = $open_elements->[$table_i]->[0];
    my $parent = $table_el->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      $foster_parent_element = $parent;
      $next_sibling = $table_el;
    } else {
      $foster_parent_element = $open_elements->[$table_i - 1]->[0];
    }
  }

  unless (defined $foster_parent_element) {
    $foster_parent_element = $open_elements->[0]->[0];
  }

  ## An undefined |$next_sibling| is passed through unchanged.
  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2555    
2556 wakaba 1.3 my $in_body = sub {
2557     my $insert = shift;
2558     if ($token->{type} eq 'start tag') {
2559     if ($token->{tag_name} eq 'script') {
2560     $script_start_tag->();
2561     return;
2562     } elsif ($token->{tag_name} eq 'style') {
2563     $style_start_tag->();
2564     return;
2565     } elsif ({
2566 wakaba 1.8 base => 1, link => 1, meta => 1,
2567 wakaba 1.3 }->{$token->{tag_name}}) {
2568     $self->{parse_error}->();
2569     ## NOTE: This is an "as if in head" code clone
2570     my $el;
2571    
2572     $el = $self->{document}->create_element_ns
2573     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2574    
2575     for my $attr_name (keys %{ $token->{attributes}}) {
2576     $el->set_attribute_ns (undef, [undef, $attr_name],
2577     $token->{attributes} ->{$attr_name}->{value});
2578     }
2579    
2580     if (defined $head_element) {
2581     $head_element->append_child ($el);
2582     } else {
2583     $insert->($el);
2584     }
2585    
2586     ## ISSUE: Issue on magical <base> in the spec
2587    
2588     $token = $self->_get_next_token;
2589     return;
2590 wakaba 1.8 } elsif ($token->{tag_name} eq 'title') {
2591     ## NOTE: There is an "as if in head" code clone
2592     my $title_el;
2593    
2594     $title_el = $self->{document}->create_element_ns
2595     (q<http://www.w3.org/1999/xhtml>, [undef, 'title']);
2596    
2597     for my $attr_name (keys %{ $token->{attributes}}) {
2598     $title_el->set_attribute_ns (undef, [undef, $attr_name],
2599     $token->{attributes} ->{$attr_name}->{value});
2600     }
2601    
2602     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
2603     ->append_child ($title_el);
2604     $self->{content_model_flag} = 'RCDATA';
2605    
2606     my $text = '';
2607     $token = $self->_get_next_token;
2608     while ($token->{type} eq 'character') {
2609     $text .= $token->{data};
2610     $token = $self->_get_next_token;
2611     }
2612     if (length $text) {
2613     $title_el->manakai_append_text ($text);
2614     }
2615    
2616     $self->{content_model_flag} = 'PCDATA';
2617    
2618     if ($token->{type} eq 'end tag' and
2619     $token->{tag_name} eq 'title') {
2620     ## Ignore the token
2621     } else {
2622     $self->{parse_error}->();
2623     ## ISSUE: And ignore?
2624     }
2625     $token = $self->_get_next_token;
2626     return;
2627 wakaba 1.3 } elsif ($token->{tag_name} eq 'body') {
2628     $self->{parse_error}->();
2629    
2630     if (@$open_elements == 1 or
2631     $open_elements->[1]->[1] ne 'body') {
2632     ## Ignore the token
2633     } else {
2634     my $body_el = $open_elements->[1]->[0];
2635     for my $attr_name (keys %{$token->{attributes}}) {
2636     unless ($body_el->has_attribute_ns (undef, $attr_name)) {
2637     $body_el->set_attribute_ns
2638     (undef, [undef, $attr_name],
2639     $token->{attributes}->{$attr_name}->{value});
2640     }
2641     }
2642     }
2643     $token = $self->_get_next_token;
2644     return;
2645     } elsif ({
2646     address => 1, blockquote => 1, center => 1, dir => 1,
2647     div => 1, dl => 1, fieldset => 1, listing => 1,
2648     menu => 1, ol => 1, p => 1, ul => 1,
2649     pre => 1,
2650     }->{$token->{tag_name}}) {
2651     ## has a p element in scope
2652     INSCOPE: for (reverse @$open_elements) {
2653     if ($_->[1] eq 'p') {
2654     unshift @{$self->{token}}, $token;
2655     $token = {type => 'end tag', tag_name => 'p'};
2656     return;
2657     } elsif ({
2658     table => 1, caption => 1, td => 1, th => 1,
2659     button => 1, marquee => 1, object => 1, html => 1,
2660     }->{$_->[1]}) {
2661     last INSCOPE;
2662     }
2663     } # INSCOPE
2664    
2665    
2666     {
2667     my $el;
2668    
2669     $el = $self->{document}->create_element_ns
2670     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2671    
2672     for my $attr_name (keys %{ $token->{attributes}}) {
2673     $el->set_attribute_ns (undef, [undef, $attr_name],
2674     $token->{attributes} ->{$attr_name}->{value});
2675     }
2676    
2677     $insert->($el);
2678     push @$open_elements, [$el, $token->{tag_name}];
2679     }
2680    
2681     if ($token->{tag_name} eq 'pre') {
2682     $token = $self->_get_next_token;
2683     if ($token->{type} eq 'character') {
2684     $token->{data} =~ s/^\x0A//;
2685     unless (length $token->{data}) {
2686     $token = $self->_get_next_token;
2687     }
2688     }
2689     } else {
2690     $token = $self->_get_next_token;
2691     }
2692     return;
2693     } elsif ($token->{tag_name} eq 'form') {
2694     if (defined $form_element) {
2695     $self->{parse_error}->();
2696     ## Ignore the token
2697     } else {
2698     ## has a p element in scope
2699     INSCOPE: for (reverse @$open_elements) {
2700     if ($_->[1] eq 'p') {
2701     unshift @{$self->{token}}, $token;
2702     $token = {type => 'end tag', tag_name => 'p'};
2703     return;
2704     } elsif ({
2705     table => 1, caption => 1, td => 1, th => 1,
2706     button => 1, marquee => 1, object => 1, html => 1,
2707     }->{$_->[1]}) {
2708     last INSCOPE;
2709     }
2710     } # INSCOPE
2711    
2712    
2713     {
2714     my $el;
2715    
2716     $el = $self->{document}->create_element_ns
2717     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2718    
2719     for my $attr_name (keys %{ $token->{attributes}}) {
2720     $el->set_attribute_ns (undef, [undef, $attr_name],
2721     $token->{attributes} ->{$attr_name}->{value});
2722     }
2723    
2724     $insert->($el);
2725     push @$open_elements, [$el, $token->{tag_name}];
2726     }
2727    
2728     $form_element = $open_elements->[-1]->[0];
2729     $token = $self->_get_next_token;
2730     return;
2731     }
2732     } elsif ($token->{tag_name} eq 'li') {
2733     ## has a p element in scope
2734     INSCOPE: for (reverse @$open_elements) {
2735     if ($_->[1] eq 'p') {
2736     unshift @{$self->{token}}, $token;
2737     $token = {type => 'end tag', tag_name => 'p'};
2738     return;
2739     } elsif ({
2740     table => 1, caption => 1, td => 1, th => 1,
2741     button => 1, marquee => 1, object => 1, html => 1,
2742     }->{$_->[1]}) {
2743     last INSCOPE;
2744     }
2745     } # INSCOPE
2746    
2747     ## Step 1
2748     my $i = -1;
2749     my $node = $open_elements->[$i];
2750     LI: {
2751     ## Step 2
2752     if ($node->[1] eq 'li') {
2753     splice @$open_elements, $i;
2754     last LI;
2755     }
2756    
2757     ## Step 3
2758     if (not $formatting_category->{$node->[1]} and
2759     #not $phrasing_category->{$node->[1]} and
2760     ($special_category->{$node->[1]} or
2761     $scoping_category->{$node->[1]}) and
2762     $node->[1] ne 'address' and $node->[1] ne 'div') {
2763     last LI;
2764     }
2765    
2766     ## Step 4
2767 wakaba 1.8 $i--;
2768 wakaba 1.3 $node = $open_elements->[$i];
2769     redo LI;
2770     } # LI
2771    
2772    
2773     {
2774     my $el;
2775    
2776     $el = $self->{document}->create_element_ns
2777     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2778    
2779     for my $attr_name (keys %{ $token->{attributes}}) {
2780     $el->set_attribute_ns (undef, [undef, $attr_name],
2781     $token->{attributes} ->{$attr_name}->{value});
2782     }
2783    
2784     $insert->($el);
2785     push @$open_elements, [$el, $token->{tag_name}];
2786     }
2787    
2788     $token = $self->_get_next_token;
2789     return;
2790     } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
2791     ## has a p element in scope
2792     INSCOPE: for (reverse @$open_elements) {
2793     if ($_->[1] eq 'p') {
2794     unshift @{$self->{token}}, $token;
2795     $token = {type => 'end tag', tag_name => 'p'};
2796     return;
2797     } elsif ({
2798     table => 1, caption => 1, td => 1, th => 1,
2799     button => 1, marquee => 1, object => 1, html => 1,
2800     }->{$_->[1]}) {
2801     last INSCOPE;
2802     }
2803     } # INSCOPE
2804    
2805     ## Step 1
2806     my $i = -1;
2807     my $node = $open_elements->[$i];
2808     LI: {
2809     ## Step 2
2810     if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
2811     splice @$open_elements, $i;
2812     last LI;
2813     }
2814    
2815     ## Step 3
2816     if (not $formatting_category->{$node->[1]} and
2817     #not $phrasing_category->{$node->[1]} and
2818     ($special_category->{$node->[1]} or
2819     $scoping_category->{$node->[1]}) and
2820     $node->[1] ne 'address' and $node->[1] ne 'div') {
2821     last LI;
2822     }
2823    
2824     ## Step 4
2825 wakaba 1.8 $i--;
2826 wakaba 1.3 $node = $open_elements->[$i];
2827     redo LI;
2828     } # LI
2829    
2830    
2831     {
2832     my $el;
2833    
2834     $el = $self->{document}->create_element_ns
2835     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2836    
2837     for my $attr_name (keys %{ $token->{attributes}}) {
2838     $el->set_attribute_ns (undef, [undef, $attr_name],
2839     $token->{attributes} ->{$attr_name}->{value});
2840     }
2841    
2842     $insert->($el);
2843     push @$open_elements, [$el, $token->{tag_name}];
2844     }
2845    
2846     $token = $self->_get_next_token;
2847     return;
2848     } elsif ($token->{tag_name} eq 'plaintext') {
2849     ## has a p element in scope
2850     INSCOPE: for (reverse @$open_elements) {
2851     if ($_->[1] eq 'p') {
2852     unshift @{$self->{token}}, $token;
2853     $token = {type => 'end tag', tag_name => 'p'};
2854     return;
2855     } elsif ({
2856     table => 1, caption => 1, td => 1, th => 1,
2857     button => 1, marquee => 1, object => 1, html => 1,
2858     }->{$_->[1]}) {
2859     last INSCOPE;
2860     }
2861     } # INSCOPE
2862    
2863    
2864     {
2865     my $el;
2866    
2867     $el = $self->{document}->create_element_ns
2868     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2869    
2870     for my $attr_name (keys %{ $token->{attributes}}) {
2871     $el->set_attribute_ns (undef, [undef, $attr_name],
2872     $token->{attributes} ->{$attr_name}->{value});
2873     }
2874    
2875     $insert->($el);
2876     push @$open_elements, [$el, $token->{tag_name}];
2877     }
2878    
2879    
2880     $self->{content_model_flag} = 'PLAINTEXT';
2881    
2882     $token = $self->_get_next_token;
2883     return;
2884     } elsif ({
2885     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2886     }->{$token->{tag_name}}) {
2887     ## has a p element in scope
2888     INSCOPE: for (reverse 0..$#$open_elements) {
2889     my $node = $open_elements->[$_];
2890     if ($node->[1] eq 'p') {
2891     unshift @{$self->{token}}, $token;
2892     $token = {type => 'end tag', tag_name => 'p'};
2893     return;
2894     } elsif ({
2895     table => 1, caption => 1, td => 1, th => 1,
2896     button => 1, marquee => 1, object => 1, html => 1,
2897     }->{$node->[1]}) {
2898     last INSCOPE;
2899     }
2900     } # INSCOPE
2901    
2902     ## has an element in scope
2903     my $i;
2904     INSCOPE: for (reverse 0..$#$open_elements) {
2905     my $node = $open_elements->[$_];
2906     if ({
2907     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2908     }->{$node->[1]}) {
2909     $i = $_;
2910     last INSCOPE;
2911     } elsif ({
2912     table => 1, caption => 1, td => 1, th => 1,
2913     button => 1, marquee => 1, object => 1, html => 1,
2914     }->{$node->[1]}) {
2915     last INSCOPE;
2916     }
2917     } # INSCOPE
2918    
2919     if (defined $i) {
2920     $self->{parse_error}->();
2921     splice @$open_elements, $i;
2922     }
2923    
2924    
2925     {
2926     my $el;
2927    
2928     $el = $self->{document}->create_element_ns
2929     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2930    
2931     for my $attr_name (keys %{ $token->{attributes}}) {
2932     $el->set_attribute_ns (undef, [undef, $attr_name],
2933     $token->{attributes} ->{$attr_name}->{value});
2934     }
2935    
2936     $insert->($el);
2937     push @$open_elements, [$el, $token->{tag_name}];
2938     }
2939    
2940    
2941     $token = $self->_get_next_token;
2942     return;
2943     } elsif ($token->{tag_name} eq 'a') {
2944     AFE: for my $i (reverse 0..$#$active_formatting_elements) {
2945     my $node = $active_formatting_elements->[$i];
2946     if ($node->[1] eq 'a') {
2947 wakaba 1.8 $self->{parse_error}-> ('a in a');
2948 wakaba 1.3
2949     unshift @{$self->{token}}, $token;
2950     $token = {type => 'end tag', tag_name => 'a'};
2951     $formatting_end_tag->($token->{tag_name});
2952    
2953 wakaba 1.8 AFE2: for (reverse 0..$#$active_formatting_elements) {
2954     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2955     splice @$active_formatting_elements, $_, 1;
2956     last AFE2;
2957     }
2958     } # AFE2
2959 wakaba 1.3 OE: for (reverse 0..$#$open_elements) {
2960     if ($open_elements->[$_]->[0] eq $node->[0]) {
2961 wakaba 1.8 splice @$open_elements, $_, 1;
2962 wakaba 1.3 last OE;
2963     }
2964     } # OE
2965     last AFE;
2966     } elsif ($node->[0] eq '#marker') {
2967     last AFE;
2968     }
2969     } # AFE
2970    
2971 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
2972 wakaba 1.3
2973    
2974     {
2975     my $el;
2976    
2977     $el = $self->{document}->create_element_ns
2978     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2979    
2980     for my $attr_name (keys %{ $token->{attributes}}) {
2981     $el->set_attribute_ns (undef, [undef, $attr_name],
2982     $token->{attributes} ->{$attr_name}->{value});
2983     }
2984    
2985     $insert->($el);
2986     push @$open_elements, [$el, $token->{tag_name}];
2987     }
2988    
2989     push @$active_formatting_elements, $open_elements->[-1];
2990    
2991     $token = $self->_get_next_token;
2992     return;
2993     } elsif ({ ## Start tags of formatting elements: the new element is also recorded in @$active_formatting_elements.
2994     b => 1, big => 1, em => 1, font => 1, i => 1,
2995     nobr => 1, s => 1, small => 1, strike => 1,
2996     strong => 1, tt => 1, u => 1, ## NOTE: Must list the same names as the formatting end tag branch of this closure.
2997     }->{$token->{tag_name}}) {
2998 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
2999 wakaba 1.3
3000    
3001     {
3002     my $el;
3003    
3004     $el = $self->{document}->create_element_ns
3005     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3006    
3007     for my $attr_name (keys %{ $token->{attributes}}) {
3008     $el->set_attribute_ns (undef, [undef, $attr_name],
3009     $token->{attributes} ->{$attr_name}->{value});
3010     }
3011    
3012     $insert->($el);
3013     push @$open_elements, [$el, $token->{tag_name}];
3014     }
3015    
3016     push @$active_formatting_elements, $open_elements->[-1]; ## same [node, name] pair as on the open element stack
3017    
3018     $token = $self->_get_next_token;
3019     return;
3020     } elsif ($token->{tag_name} eq 'button') {
3021     ## has a button element in scope
3022     INSCOPE: for (reverse 0..$#$open_elements) {
3023     my $node = $open_elements->[$_];
3024     if ($node->[1] eq 'button') {
3025     $self->{parse_error}->();
3026     unshift @{$self->{token}}, $token; ## requeue the start tag; an implied </button> is processed first
3027     $token = {type => 'end tag', tag_name => 'button'};
3028     return;
3029     } elsif ({
3030     table => 1, caption => 1, td => 1, th => 1,
3031     button => 1, marquee => 1, object => 1, html => 1,
3032     }->{$node->[1]}) {
3033     last INSCOPE;
3034     }
3035     } # INSCOPE
3036    
3037 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3038 wakaba 1.3
3039    
3040     {
3041     my $el;
3042    
3043     $el = $self->{document}->create_element_ns
3044     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3045    
3046     for my $attr_name (keys %{ $token->{attributes}}) {
3047     $el->set_attribute_ns (undef, [undef, $attr_name],
3048     $token->{attributes} ->{$attr_name}->{value});
3049     }
3050    
3051     $insert->($el);
3052     push @$open_elements, [$el, $token->{tag_name}];
3053     }
3054    
3055     push @$active_formatting_elements, ['#marker', ''];
3056    
3057     $token = $self->_get_next_token;
3058     return;
3059     } elsif ($token->{tag_name} eq 'marquee' or
3060     $token->{tag_name} eq 'object') {
3061 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3062 wakaba 1.3
3063    
3064     {
3065     my $el;
3066    
3067     $el = $self->{document}->create_element_ns
3068     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3069    
3070     for my $attr_name (keys %{ $token->{attributes}}) {
3071     $el->set_attribute_ns (undef, [undef, $attr_name],
3072     $token->{attributes} ->{$attr_name}->{value});
3073     }
3074    
3075     $insert->($el);
3076     push @$open_elements, [$el, $token->{tag_name}];
3077     }
3078    
3079     push @$active_formatting_elements, ['#marker', ''];
3080    
3081     $token = $self->_get_next_token;
3082     return;
3083     } elsif ($token->{tag_name} eq 'xmp') {
3084 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3085 wakaba 1.3
3086    
3087     {
3088     my $el;
3089    
3090     $el = $self->{document}->create_element_ns
3091     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3092    
3093     for my $attr_name (keys %{ $token->{attributes}}) {
3094     $el->set_attribute_ns (undef, [undef, $attr_name],
3095     $token->{attributes} ->{$attr_name}->{value});
3096     }
3097    
3098     $insert->($el);
3099     push @$open_elements, [$el, $token->{tag_name}];
3100     }
3101    
3102    
3103     $self->{content_model_flag} = 'CDATA';
3104    
3105     $token = $self->_get_next_token;
3106     return;
3107 wakaba 1.7 } elsif ($token->{tag_name} eq 'table') {
3108 wakaba 1.3 ## has a p element in scope
3109     INSCOPE: for (reverse @$open_elements) {
3110     if ($_->[1] eq 'p') {
3111     unshift @{$self->{token}}, $token;
3112     $token = {type => 'end tag', tag_name => 'p'};
3113     return;
3114     } elsif ({
3115     table => 1, caption => 1, td => 1, th => 1,
3116     button => 1, marquee => 1, object => 1, html => 1,
3117     }->{$_->[1]}) {
3118     last INSCOPE;
3119     }
3120     } # INSCOPE
3121    
3122    
3123     {
3124     my $el;
3125    
3126     $el = $self->{document}->create_element_ns
3127     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3128    
3129     for my $attr_name (keys %{ $token->{attributes}}) {
3130     $el->set_attribute_ns (undef, [undef, $attr_name],
3131     $token->{attributes} ->{$attr_name}->{value});
3132     }
3133    
3134     $insert->($el);
3135     push @$open_elements, [$el, $token->{tag_name}];
3136     }
3137    
3138    
3139     $insertion_mode = 'in table';
3140    
3141     $token = $self->_get_next_token;
3142     return;
3143     } elsif ({
3144     area => 1, basefont => 1, bgsound => 1, br => 1,
3145     embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
3146     image => 1,
3147     }->{$token->{tag_name}}) {
3148     if ($token->{tag_name} eq 'image') {
3149     $self->{parse_error}->();
3150     $token->{tag_name} = 'img'; ## |image| is reported as an error and then handled as |img|
3151     }
3152    
3153 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3154 wakaba 1.3
3155    
3156     {
3157     my $el;
3158    
3159     $el = $self->{document}->create_element_ns
3160     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3161    
3162     for my $attr_name (keys %{ $token->{attributes}}) {
3163     $el->set_attribute_ns (undef, [undef, $attr_name],
3164     $token->{attributes} ->{$attr_name}->{value});
3165     }
3166    
3167     $insert->($el);
3168     push @$open_elements, [$el, $token->{tag_name}];
3169     }
3170    
3171     pop @$open_elements; ## the element is not left on the open element stack
3172    
3173     $token = $self->_get_next_token;
3174     return;
3175     } elsif ($token->{tag_name} eq 'hr') {
3176     ## has a p element in scope
3177     INSCOPE: for (reverse @$open_elements) {
3178     if ($_->[1] eq 'p') {
3179     unshift @{$self->{token}}, $token;
3180     $token = {type => 'end tag', tag_name => 'p'};
3181     return;
3182     } elsif ({
3183     table => 1, caption => 1, td => 1, th => 1,
3184     button => 1, marquee => 1, object => 1, html => 1,
3185     }->{$_->[1]}) {
3186     last INSCOPE;
3187     }
3188     } # INSCOPE
3189    
3190    
3191     {
3192     my $el;
3193    
3194     $el = $self->{document}->create_element_ns
3195     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3196    
3197     for my $attr_name (keys %{ $token->{attributes}}) {
3198     $el->set_attribute_ns (undef, [undef, $attr_name],
3199     $token->{attributes} ->{$attr_name}->{value});
3200     }
3201    
3202     $insert->($el);
3203     push @$open_elements, [$el, $token->{tag_name}];
3204     }
3205    
3206     pop @$open_elements; ## the element is not left on the open element stack
3207    
3208     $token = $self->_get_next_token;
3209     return;
3210     } elsif ($token->{tag_name} eq 'input') {
3211 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3212 wakaba 1.3
3213    
3214     {
3215     my $el;
3216    
3217     $el = $self->{document}->create_element_ns
3218     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3219    
3220     for my $attr_name (keys %{ $token->{attributes}}) {
3221     $el->set_attribute_ns (undef, [undef, $attr_name],
3222     $token->{attributes} ->{$attr_name}->{value});
3223     }
3224    
3225     $insert->($el);
3226     push @$open_elements, [$el, $token->{tag_name}];
3227     }
3228    
3229     ## TODO: associate with $form_element if defined
3230     pop @$open_elements;
3231    
3232     $token = $self->_get_next_token;
3233     return;
3234     } elsif ($token->{tag_name} eq 'isindex') {
3235     $self->{parse_error}->();
3236    
3237     if (defined $form_element) {
3238     ## Ignore the token
3239     $token = $self->_get_next_token;
3240     return;
3241     } else {
3242     my $at = $token->{attributes};
3243     $at->{name} = {name => 'name', value => 'isindex'};
3244     my @tokens = ( ## replacement token sequence processed instead of the |isindex| start tag
3245     {type => 'start tag', tag_name => 'form'},
3246     {type => 'start tag', tag_name => 'hr'},
3247     {type => 'start tag', tag_name => 'p'},
3248     {type => 'start tag', tag_name => 'label'},
3249     {type => 'character',
3250 wakaba 1.8 data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
3251 wakaba 1.3 ## TODO: make this configurable
3252     {type => 'start tag', tag_name => 'input', attributes => $at},
3253     #{type => 'character', data => ''}, # SHOULD
3254     {type => 'end tag', tag_name => 'label'},
3255     {type => 'end tag', tag_name => 'p'},
3256     {type => 'start tag', tag_name => 'hr'},
3257     {type => 'end tag', tag_name => 'form'},
3258     );
3259     $token = shift @tokens;
3260     unshift @{$self->{token}}, (@tokens);
3261     return;
3262     }
3263     } elsif ({
3264     textarea => 1,
3265     noembed => 1,
3266     noframes => 1,
3267     noscript => 0, ## TODO: 1 if scripting is enabled
3268     }->{$token->{tag_name}}) {
3269     my $tag_name = $token->{tag_name};
3270     my $el;
3271    
3272     $el = $self->{document}->create_element_ns
3273     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3274    
3275     for my $attr_name (keys %{ $token->{attributes}}) {
3276     $el->set_attribute_ns (undef, [undef, $attr_name],
3277     $token->{attributes} ->{$attr_name}->{value});
3278     }
3279    
3280    
3281     if ($token->{tag_name} eq 'textarea') {
3282     ## TODO: form_element if defined
3283     $self->{content_model_flag} = 'RCDATA';
3284     } else {
3285     $self->{content_model_flag} = 'CDATA';
3286     }
3287    
3288     $insert->($el);
3289    
3290     my $text = ''; ## concatenation of the character tokens that follow the start tag
3291     $token = $self->_get_next_token;
3292     while ($token->{type} eq 'character') {
3293     $text .= $token->{data};
3294     $token = $self->_get_next_token;
3295     }
3296     if (length $text) {
3297     $el->manakai_append_text ($text);
3298     }
3299    
3300     $self->{content_model_flag} = 'PCDATA';
3301    
3302     if ($token->{type} eq 'end tag' and
3303     $token->{tag_name} eq $tag_name) {
3304     ## Ignore the token
3305     } else {
3306     $self->{parse_error}->();
3307     ## ISSUE: And ignore?
3308     }
3309     $token = $self->_get_next_token;
3310     return;
3311 wakaba 1.8 } elsif ($token->{tag_name} eq 'select') {
3312     $reconstruct_active_formatting_elements->($insert_to_current);
3313 wakaba 1.3
3314    
3315     {
3316     my $el;
3317    
3318     $el = $self->{document}->create_element_ns
3319     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3320    
3321     for my $attr_name (keys %{ $token->{attributes}}) {
3322     $el->set_attribute_ns (undef, [undef, $attr_name],
3323     $token->{attributes} ->{$attr_name}->{value});
3324     }
3325    
3326     $insert->($el);
3327     push @$open_elements, [$el, $token->{tag_name}];
3328     }
3329    
3330    
3331     $insertion_mode = 'in select';
3332     $token = $self->_get_next_token;
3333     return;
3334     } elsif ({
3335     caption => 1, col => 1, colgroup => 1, frame => 1,
3336     frameset => 1, head => 1, option => 1, optgroup => 1,
3337     tbody => 1, td => 1, tfoot => 1, th => 1,
3338     thead => 1, tr => 1,
3339     }->{$token->{tag_name}}) {
3340     $self->{parse_error}->();
3341     ## Ignore the token
3342     $token = $self->_get_next_token;
3343     return;
3344    
3345     ## ISSUE: An issue on HTML5 new elements in the spec.
3346     } else {
3347 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3348 wakaba 1.3
3349    
3350     {
3351     my $el;
3352    
3353     $el = $self->{document}->create_element_ns
3354     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3355    
3356     for my $attr_name (keys %{ $token->{attributes}}) {
3357     $el->set_attribute_ns (undef, [undef, $attr_name],
3358     $token->{attributes} ->{$attr_name}->{value});
3359     }
3360    
3361     $insert->($el);
3362     push @$open_elements, [$el, $token->{tag_name}];
3363     }
3364    
3365    
3366     $token = $self->_get_next_token;
3367     return;
3368     }
3369     } elsif ($token->{type} eq 'end tag') {
3370     if ($token->{tag_name} eq 'body') {
3371     if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
3372     ## ISSUE: There is an issue in the spec.
3373     if ($open_elements->[-1]->[1] ne 'body') {
3374     $self->{parse_error}->();
3375     }
3376     $insertion_mode = 'after body';
3377     $token = $self->_get_next_token;
3378     return;
3379     } else {
3380     $self->{parse_error}->();
3381     ## Ignore the token
3382     $token = $self->_get_next_token;
3383     return;
3384     }
3385     } elsif ($token->{tag_name} eq 'html') {
3386     if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
3387     ## ISSUE: There is an issue in the spec.
3388     if ($open_elements->[-1]->[1] ne 'body') {
3389     $self->{parse_error}->();
3390     }
3391     $insertion_mode = 'after body';
3392     ## reprocess
3393     return;
3394     } else {
3395     $self->{parse_error}->();
3396     ## Ignore the token
3397     $token = $self->_get_next_token;
3398     return;
3399     }
3400     } elsif ({
3401     address => 1, blockquote => 1, center => 1, dir => 1,
3402     div => 1, dl => 1, fieldset => 1, listing => 1,
3403     menu => 1, ol => 1, pre => 1, ul => 1,
3404     form => 1,
3405     p => 1,
3406     dd => 1, dt => 1, li => 1,
3407     button => 1, marquee => 1, object => 1,
3408     }->{$token->{tag_name}}) {
3409     ## has an element in scope
3410     my $i;
3411     INSCOPE: for (reverse 0..$#$open_elements) {
3412     my $node = $open_elements->[$_];
3413     if ($node->[1] eq $token->{tag_name}) {
3414     ## generate implied end tags
3415     if ({
3416     dd => ($token->{tag_name} ne 'dd'),
3417     dt => ($token->{tag_name} ne 'dt'),
3418     li => ($token->{tag_name} ne 'li'),
3419     p => ($token->{tag_name} ne 'p'),
3420     td => 1, th => 1, tr => 1,
3421     }->{$open_elements->[-1]->[1]}) {
3422     unshift @{$self->{token}}, $token;
3423     $token = {type => 'end tag',
3424     tag_name => $open_elements->[-1]->[1]}; # MUST
3425     return;
3426     }
3427     $i = $_;
3428     last INSCOPE unless $token->{tag_name} eq 'p';
3429     } elsif ({
3430     table => 1, caption => 1, td => 1, th => 1,
3431     button => 1, marquee => 1, object => 1, html => 1,
3432     }->{$node->[1]}) {
3433     last INSCOPE;
3434     }
3435     } # INSCOPE
3436    
3437     if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
3438     $self->{parse_error}->();
3439     }
3440    
3441     splice @$open_elements, $i if defined $i; ## drops the matched element and everything above it on the stack
3442     undef $form_element if $token->{tag_name} eq 'form';
3443     $clear_up_to_marker->()
3444     if {
3445     button => 1, marquee => 1, object => 1,
3446     }->{$token->{tag_name}};
3447     $token = $self->_get_next_token;
3448     return;
3449     } elsif ({
3450     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3451     }->{$token->{tag_name}}) {
3452     ## has an element in scope
3453     my $i;
3454     INSCOPE: for (reverse 0..$#$open_elements) {
3455     my $node = $open_elements->[$_];
3456     if ({
3457     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3458     }->{$node->[1]}) {
3459     ## generate implied end tags
3460     if ({
3461     dd => 1, dt => 1, li => 1, p => 1,
3462     td => 1, th => 1, tr => 1,
3463     }->{$open_elements->[-1]->[1]}) {
3464     unshift @{$self->{token}}, $token;
3465     $token = {type => 'end tag',
3466     tag_name => $open_elements->[-1]->[1]}; # MUST
3467     return;
3468     }
3469     $i = $_;
3470     last INSCOPE;
3471     } elsif ({
3472     table => 1, caption => 1, td => 1, th => 1,
3473     button => 1, marquee => 1, object => 1, html => 1,
3474     }->{$node->[1]}) {
3475     last INSCOPE;
3476     }
3477     } # INSCOPE
3478    
3479     if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
3480     $self->{parse_error}->();
3481     }
3482    
3483     splice @$open_elements, $i if defined $i;
3484     $token = $self->_get_next_token;
3485     return;
3486     } elsif ({ ## End tags of formatting elements are delegated to $formatting_end_tag.
3487     a => 1,
3488     b => 1, big => 1, em => 1, font => 1, i => 1,
3489     nobr => 1, s => 1, small => 1, strike => 1,
3490     strong => 1, tt => 1, u => 1, ## NOTE: Must list |a| plus the same names as the formatting start tag branch of this closure.
3491     }->{$token->{tag_name}}) {
3492     $formatting_end_tag->($token->{tag_name});
3493     return;
3494     } elsif ({
3495     caption => 1, col => 1, colgroup => 1, frame => 1,
3496     frameset => 1, head => 1, option => 1, optgroup => 1,
3497     tbody => 1, td => 1, tfoot => 1, th => 1,
3498     thead => 1, tr => 1,
3499     area => 1, basefont => 1, bgsound => 1, br => 1,
3500     embed => 1, hr => 1, iframe => 1, image => 1,
3501     img => 1, input => 1, isindex=> 1, noembed => 1,
3502     noframes => 1, param => 1, select => 1, spacer => 1,
3503     table => 1, textarea => 1, wbr => 1,
3504     noscript => 0, ## TODO: if scripting is enabled
3505     }->{$token->{tag_name}}) {
3506     $self->{parse_error}->();
3507     ## Ignore the token
3508     $token = $self->_get_next_token;
3509     return;
3510    
3511     ## ISSUE: Issue on HTML5 new elements in spec
3512    
3513     } else {
3514     ## Step 1
3515     my $node_i = -1;
3516     my $node = $open_elements->[$node_i];
3517    
3518     ## Step 2
3519     S2: {
3520     if ($node->[1] eq $token->{tag_name}) {
3521     ## Step 1
3522     ## generate implied end tags
3523     if ({
3524     dd => 1, dt => 1, li => 1, p => 1,
3525     td => 1, th => 1, tr => 1,
3526     }->{$open_elements->[-1]->[1]}) {
3527     unshift @{$self->{token}}, $token;
3528     $token = {type => 'end tag',
3529     tag_name => $open_elements->[-1]->[1]}; # MUST
3530     return;
3531     }
3532    
3533     ## Step 2
3534     if ($token->{tag_name} ne $open_elements->[-1]->[1]) {
3535     $self->{parse_error}->();
3536     }
3537    
3538     ## Step 3
3539     splice @$open_elements, $node_i;
3540     last S2;
3541     } else {
3542     ## Step 3
3543     if (not $formatting_category->{$node->[1]} and
3544     #not $phrasing_category->{$node->[1]} and
3545     ($special_category->{$node->[1]} or
3546     $scoping_category->{$node->[1]})) {
3547     $self->{parse_error}->();
3548     ## Ignore the token
3549     $token = $self->_get_next_token;
3550     last S2;
3551     }
3552     }
3553    
3554     ## Step 4
3555     $node_i--;
3556     $node = $open_elements->[$node_i];
3557    
3558     ## Step 5;
3559     redo S2;
3560     } # S2
3561     }
3562     }
3563     }; # $in_body
3564    
3565     B: {
3566     if ($phase eq 'initial') {
3567     if ($token->{type} eq 'DOCTYPE') {
3568     if ($token->{error}) {
3569     ## ISSUE: Spec currently left this case undefined.
3570 wakaba 1.7 $self->{parse_error}-> ('missing DOCTYPE');
3571 wakaba 1.3 }
3572     my $doctype = $self->{document}->create_document_type_definition
3573     ($token->{name});
3574     $self->{document}->append_child ($doctype);
3575     $phase = 'root element';
3576     $token = $self->_get_next_token;
3577     redo B;
3578     } elsif ({
3579     comment => 1,
3580     'start tag' => 1,
3581     'end tag' => 1,
3582     'end-of-file' => 1,
3583     }->{$token->{type}}) {
3584     ## ISSUE: Spec currently left this case undefined.
3585 wakaba 1.7 $self->{parse_error}-> ('missing DOCTYPE');
3586 wakaba 1.3 $phase = 'root element';
3587     ## reprocess
3588     redo B;
3589     } elsif ($token->{type} eq 'character') {
3590     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3591     $self->{document}->manakai_append_text ($1);
3592     ## ISSUE: DOM3 Core does not allow Document > Text
3593     unless (length $token->{data}) {
3594     ## Stay in the phase
3595     $token = $self->_get_next_token;
3596     redo B;
3597     }
3598     }
3599     ## ISSUE: Spec currently left this case undefined.
3600 wakaba 1.7 $self->{parse_error}-> ('missing DOCTYPE');
3601 wakaba 1.3 $phase = 'root element';
3602     ## reprocess
3603     redo B;
3604     } else {
3605     die "$0: $token->{type}: Unknown token";
3606     }
3607     } elsif ($phase eq 'root element') {
3608     if ($token->{type} eq 'DOCTYPE') {
3609     $self->{parse_error}->();
3610     ## Ignore the token
3611     ## Stay in the phase
3612     $token = $self->_get_next_token;
3613     redo B;
3614     } elsif ($token->{type} eq 'comment') {
3615     my $comment = $self->{document}->create_comment ($token->{data});
3616     $self->{document}->append_child ($comment);
3617     ## Stay in the phase
3618     $token = $self->_get_next_token;
3619     redo B;
3620     } elsif ($token->{type} eq 'character') {
3621     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3622     $self->{document}->manakai_append_text ($1);
3623     ## ISSUE: DOM3 Core does not allow Document > Text
3624     unless (length $token->{data}) {
3625     ## Stay in the phase
3626     $token = $self->_get_next_token;
3627     redo B;
3628     }
3629     }
3630     #
3631     } elsif ({
3632     'start tag' => 1,
3633     'end tag' => 1,
3634     'end-of-file' => 1,
3635     }->{$token->{type}}) {
3636     ## ISSUE: There is an issue in the spec
3637     #
3638     } else {
3639     die "$0: $token->{type}: Unknown token";
3640     }
3641     my $root_element;
3642     $root_element = $self->{document}->create_element_ns
3643     (q<http://www.w3.org/1999/xhtml>, [undef, 'html']);
3644    
3645     $self->{document}->append_child ($root_element);
3646     $open_elements = [[$root_element, 'html']];
3647     $phase = 'main';
3648     ## reprocess
3649     redo B;
3650     } elsif ($phase eq 'main') {
3651     if ($token->{type} eq 'DOCTYPE') {
3652     $self->{parse_error}->();
3653     ## Ignore the token
3654     ## Stay in the phase
3655     $token = $self->_get_next_token;
3656     redo B;
3657     } elsif ($token->{type} eq 'start tag' and
3658     $token->{tag_name} eq 'html') {
3659     ## TODO: unless it is the first start tag token, parse-error
3660     my $top_el = $open_elements->[0]->[0];
3661     for my $attr_name (keys %{$token->{attributes}}) {
3662     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3663 wakaba 1.8 $top_el->set_attribute_ns
3664     (undef, [undef, $attr_name],
3665     $token->{attributes}->{$attr_name}->{value});
3666 wakaba 1.3 }
3667     }
3668     $token = $self->_get_next_token;
3669     redo B;
3670     } elsif ($token->{type} eq 'end-of-file') {
3671     ## Generate implied end tags
3672     if ({
3673     dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3674     }->{$open_elements->[-1]->[1]}) {
3675     unshift @{$self->{token}}, $token;
3676     $token = {type => 'end tag', tag_name => $open_elements->[-1]->[1]};
3677     redo B;
3678     }
3679    
3680     if (@$open_elements > 2 or
3681     (@$open_elements == 2 and $open_elements->[1]->[1] ne 'body')) {
3682     $self->{parse_error}->();
3683     } else {
3684     ## TODO: inner_html parser and @$open_elements > 1 and $open_elements->[1] ne 'body', then parse-error
3685     }
3686    
3687     ## Stop parsing
3688     last B;
3689    
3690     ## ISSUE: There is an issue in the spec.
3691     } else {
3692     if ($insertion_mode eq 'before head') {
3693     if ($token->{type} eq 'character') {
3694     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3695     $open_elements->[-1]->[0]->manakai_append_text ($1);
3696     unless (length $token->{data}) {
3697     $token = $self->_get_next_token;
3698     redo B;
3699     }
3700     }
3701     ## As if <head>
3702    
3703     $head_element = $self->{document}->create_element_ns
3704     (q<http://www.w3.org/1999/xhtml>, [undef, 'head']);
3705    
3706     $open_elements->[-1]->[0]->append_child ($head_element);
3707     push @$open_elements, [$head_element, 'head'];
3708     $insertion_mode = 'in head';
3709     ## reprocess
3710     redo B;
3711     } elsif ($token->{type} eq 'comment') {
3712     my $comment = $self->{document}->create_comment ($token->{data});
3713     $open_elements->[-1]->[0]->append_child ($comment);
3714     $token = $self->_get_next_token;
3715     redo B;
3716     } elsif ($token->{type} eq 'start tag') {
3717     my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
3718    
3719     $head_element = $self->{document}->create_element_ns
3720     (q<http://www.w3.org/1999/xhtml>, [undef, 'head']);
3721    
3722     for my $attr_name (keys %{ $attr}) {
3723     $head_element->set_attribute_ns (undef, [undef, $attr_name],
3724     $attr ->{$attr_name}->{value});
3725     }
3726    
3727     $open_elements->[-1]->[0]->append_child ($head_element);
3728     push @$open_elements, [$head_element, 'head'];
3729     $insertion_mode = 'in head';
3730     if ($token->{tag_name} eq 'head') {
3731     $token = $self->_get_next_token;
3732     #} elsif ({
3733     # base => 1, link => 1, meta => 1,
3734     # script => 1, style => 1, title => 1,
3735     # }->{$token->{tag_name}}) {
3736     # ## reprocess
3737     } else {
3738     ## reprocess
3739     }
3740     redo B;
3741     } elsif ($token->{type} eq 'end tag') {
3742     if ($token->{tag_name} eq 'html') {
3743     ## As if <head>
3744    
3745     $head_element = $self->{document}->create_element_ns
3746     (q<http://www.w3.org/1999/xhtml>, [undef, 'head']);
3747    
3748     $open_elements->[-1]->[0]->append_child ($head_element);
3749     push @$open_elements, [$head_element, 'head'];
3750     $insertion_mode = 'in head';
3751     ## reprocess
3752     redo B;
3753     } else {
3754     $self->{parse_error}->();
3755     ## Ignore the token
3756 wakaba 1.7 $token = $self->_get_next_token;
3757 wakaba 1.3 redo B;
3758     }
3759     } else {
3760     die "$0: $token->{type}: Unknown type";
3761     }
3762     } elsif ($insertion_mode eq 'in head') {
3763     if ($token->{type} eq 'character') {
3764     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3765     $open_elements->[-1]->[0]->manakai_append_text ($1);
3766     unless (length $token->{data}) {
3767     $token = $self->_get_next_token;
3768     redo B;
3769     }
3770     }
3771    
3772     #
3773     } elsif ($token->{type} eq 'comment') {
3774     my $comment = $self->{document}->create_comment ($token->{data});
3775     $open_elements->[-1]->[0]->append_child ($comment);
3776     $token = $self->_get_next_token;
3777     redo B;
3778     } elsif ($token->{type} eq 'start tag') {
3779     if ($token->{tag_name} eq 'title') {
3780 wakaba 1.8 ## NOTE: There is an "as if in head" code clone
3781     my $title_el;
3782    
3783 wakaba 1.3 $title_el = $self->{document}->create_element_ns
3784     (q<http://www.w3.org/1999/xhtml>, [undef, 'title']);
3785    
3786 wakaba 1.8 for my $attr_name (keys %{ $token->{attributes}}) {
3787     $title_el->set_attribute_ns (undef, [undef, $attr_name],
3788     $token->{attributes} ->{$attr_name}->{value});
3789     }
3790    
3791 wakaba 1.3 (defined $head_element ? $head_element : $open_elements->[-1]->[0])
3792     ->append_child ($title_el);
3793     $self->{content_model_flag} = 'RCDATA';
3794 wakaba 1.8
3795 wakaba 1.3 my $text = '';
3796     $token = $self->_get_next_token;
3797     while ($token->{type} eq 'character') {
3798     $text .= $token->{data};
3799     $token = $self->_get_next_token;
3800     }
3801     if (length $text) {
3802     $title_el->manakai_append_text ($text);
3803     }
3804    
3805     $self->{content_model_flag} = 'PCDATA';
3806    
3807     if ($token->{type} eq 'end tag' and
3808     $token->{tag_name} eq 'title') {
3809     ## Ignore the token
3810     } else {
3811     $self->{parse_error}->();
3812     ## ISSUE: And ignore?
3813     }
3814     $token = $self->_get_next_token;
3815     redo B;
3816     } elsif ($token->{tag_name} eq 'style') {
3817     $style_start_tag->();
3818     redo B;
3819     } elsif ($token->{tag_name} eq 'script') {
3820     $script_start_tag->();
3821     redo B;
3822     } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
3823     ## NOTE: There are "as if in head" code clones
3824     my $el;
3825    
3826     $el = $self->{document}->create_element_ns
3827     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3828    
3829     for my $attr_name (keys %{ $token->{attributes}}) {
3830     $el->set_attribute_ns (undef, [undef, $attr_name],
3831     $token->{attributes} ->{$attr_name}->{value});
3832     }
3833    
3834     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
3835     ->append_child ($el);
3836    
3837     ## ISSUE: Issue on magical <base> in the spec
3838    
3839     $token = $self->_get_next_token;
3840     redo B;
3841     } elsif ($token->{tag_name} eq 'head') {
3842     $self->{parse_error}->();
3843     ## Ignore the token
3844     $token = $self->_get_next_token;
3845     redo B;
3846     } else {
3847     #
3848     }
3849     } elsif ($token->{type} eq 'end tag') {
3850     if ($token->{tag_name} eq 'head') {
3851     if ($open_elements->[-1]->[1] eq 'head') {
3852     pop @$open_elements;
3853     } else {
3854     $self->{parse_error}->();
3855     }
3856     $insertion_mode = 'after head';
3857     $token = $self->_get_next_token;
3858     redo B;
3859     } elsif ($token->{tag_name} eq 'html') {
3860     #
3861     } else {
3862     $self->{parse_error}->();
3863     ## Ignore the token
3864     $token = $self->_get_next_token;
3865     redo B;
3866     }
3867     } else {
3868     #
3869     }
3870    
3871     if ($open_elements->[-1]->[1] eq 'head') {
3872     ## As if </head>
3873     pop @$open_elements;
3874     }
3875     $insertion_mode = 'after head';
3876     ## reprocess
3877     redo B;
3878    
3879     ## ISSUE: An issue in the spec.
3880     } elsif ($insertion_mode eq 'after head') {
3881     if ($token->{type} eq 'character') {
3882     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3883     $open_elements->[-1]->[0]->manakai_append_text ($1);
3884     unless (length $token->{data}) {
3885     $token = $self->_get_next_token;
3886     redo B;
3887     }
3888     }
3889    
3890     #
3891     } elsif ($token->{type} eq 'comment') {
3892     my $comment = $self->{document}->create_comment ($token->{data});
3893     $open_elements->[-1]->[0]->append_child ($comment);
3894     $token = $self->_get_next_token;
3895     redo B;
3896     } elsif ($token->{type} eq 'start tag') {
3897     if ($token->{tag_name} eq 'body') {
3898    
3899     {
3900     my $el;
3901    
3902     $el = $self->{document}->create_element_ns
3903     (q<http://www.w3.org/1999/xhtml>, [undef, 'body']);
3904    
3905     for my $attr_name (keys %{ $token->{attributes}}) {
3906     $el->set_attribute_ns (undef, [undef, $attr_name],
3907     $token->{attributes} ->{$attr_name}->{value});
3908     }
3909    
3910     $open_elements->[-1]->[0]->append_child ($el);
3911     push @$open_elements, [$el, 'body'];
3912     }
3913    
3914     $insertion_mode = 'in body';
3915     $token = $self->_get_next_token;
3916     redo B;
3917     } elsif ($token->{tag_name} eq 'frameset') {
3918    
3919     {
3920     my $el;
3921    
3922     $el = $self->{document}->create_element_ns
3923     (q<http://www.w3.org/1999/xhtml>, [undef, 'frameset']);
3924    
3925     for my $attr_name (keys %{ $token->{attributes}}) {
3926     $el->set_attribute_ns (undef, [undef, $attr_name],
3927     $token->{attributes} ->{$attr_name}->{value});
3928     }
3929    
3930     $open_elements->[-1]->[0]->append_child ($el);
3931     push @$open_elements, [$el, 'frameset'];
3932     }
3933    
3934     $insertion_mode = 'in frameset';
3935     $token = $self->_get_next_token;
3936     redo B;
3937     } elsif ({
3938     base => 1, link => 1, meta => 1,
3939     script=> 1, style => 1, title => 1,
3940     }->{$token->{tag_name}}) {
3941     $self->{parse_error}->();
3942     $insertion_mode = 'in head';
3943     ## reprocess
3944     redo B;
3945     } else {
3946     #
3947     }
3948     } else {
3949     #
3950     }
3951    
3952     ## As if <body>
3953    
3954     {
3955     my $el;
3956    
3957     $el = $self->{document}->create_element_ns
3958     (q<http://www.w3.org/1999/xhtml>, [undef, 'body']);
3959    
3960     $open_elements->[-1]->[0]->append_child ($el);
3961     push @$open_elements, [$el, 'body'];
3962     }
3963    
3964     $insertion_mode = 'in body';
3965     ## reprocess
3966     redo B;
3967     } elsif ($insertion_mode eq 'in body') {
3968     if ($token->{type} eq 'character') {
3969     ## NOTE: There is a code clone of "character in body".
3970 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3971 wakaba 1.3
3972     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3973    
3974     $token = $self->_get_next_token;
3975     redo B;
3976     } elsif ($token->{type} eq 'comment') {
3977     ## NOTE: There is a code clone of "comment in body".
3978     my $comment = $self->{document}->create_comment ($token->{data});
3979     $open_elements->[-1]->[0]->append_child ($comment);
3980     $token = $self->_get_next_token;
3981     redo B;
3982     } else {
3983 wakaba 1.8 $in_body->($insert_to_current);
3984 wakaba 1.3 redo B;
3985     }
3986     } elsif ($insertion_mode eq 'in table') {
3987     if ($token->{type} eq 'character') {
3988 wakaba 1.8 ## NOTE: There are "character in table" code clones.
3989     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3990     $open_elements->[-1]->[0]->manakai_append_text ($1);
3991    
3992     unless (length $token->{data}) {
3993     $token = $self->_get_next_token;
3994     redo B;
3995     }
3996     }
3997 wakaba 1.3
3998 wakaba 1.8 ## As if in body, but insert into foster parent element
3999     ## ISSUE: Spec says that "whenever a node would be inserted
4000     ## into the current node" while characters might not
4001     ## result in a new Text node.
4002     $reconstruct_active_formatting_elements->($insert_to_foster);
4003    
4004     if ({
4005     table => 1, tbody => 1, tfoot => 1,
4006     thead => 1, tr => 1,
4007     }->{$open_elements->[-1]->[1]}) {
4008     # MUST
4009     my $foster_parent_element;
4010     my $next_sibling;
4011     my $prev_sibling;
4012     OE: for (reverse 0..$#$open_elements) {
4013     if ($open_elements->[$_]->[1] eq 'table') {
4014     my $parent = $open_elements->[$_]->[0]->parent_node;
4015     if (defined $parent and $parent->node_type == 1) {
4016     $foster_parent_element = $parent;
4017     $next_sibling = $open_elements->[$_]->[0];
4018     $prev_sibling = $next_sibling->previous_sibling;
4019     } else {
4020     $foster_parent_element = $open_elements->[$_ - 1]->[0];
4021     $prev_sibling = $foster_parent_element->last_child;
4022     }
4023     last OE;
4024     }
4025     } # OE
4026     $foster_parent_element = $open_elements->[0]->[0] and
4027     $prev_sibling = $foster_parent_element->last_child
4028     unless defined $foster_parent_element;
4029     if (defined $prev_sibling and
4030     $prev_sibling->node_type == 3) {
4031     $prev_sibling->manakai_append_text ($token->{data});
4032     } else {
4033     $foster_parent_element->insert_before
4034     ($self->{document}->create_text_node ($token->{data}),
4035     $next_sibling);
4036     }
4037     } else {
4038     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4039     }
4040    
4041 wakaba 1.3 $token = $self->_get_next_token;
4042     redo B;
4043     } elsif ($token->{type} eq 'comment') {
4044     my $comment = $self->{document}->create_comment ($token->{data});
4045     $open_elements->[-1]->[0]->append_child ($comment);
4046     $token = $self->_get_next_token;
4047     redo B;
4048     } elsif ($token->{type} eq 'start tag') {
4049     if ({
4050     caption => 1,
4051     colgroup => 1,
4052     tbody => 1, tfoot => 1, thead => 1,
4053     }->{$token->{tag_name}}) {
4054     ## Clear back to table context
4055     while ($open_elements->[-1]->[1] ne 'table' and
4056     $open_elements->[-1]->[1] ne 'html') {
4057     $self->{parse_error}->();
4058     pop @$open_elements;
4059     }
4060    
4061     push @$active_formatting_elements, ['#marker', '']
4062     if $token->{tag_name} eq 'caption';
4063    
4064    
4065     {
4066     my $el;
4067    
4068     $el = $self->{document}->create_element_ns
4069     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4070    
4071     for my $attr_name (keys %{ $token->{attributes}}) {
4072     $el->set_attribute_ns (undef, [undef, $attr_name],
4073     $token->{attributes} ->{$attr_name}->{value});
4074     }
4075    
4076     $open_elements->[-1]->[0]->append_child ($el);
4077     push @$open_elements, [$el, $token->{tag_name}];
4078     }
4079    
4080     $insertion_mode = {
4081     caption => 'in caption',
4082     colgroup => 'in column group',
4083     tbody => 'in table body',
4084     tfoot => 'in table body',
4085     thead => 'in table body',
4086     }->{$token->{tag_name}};
4087     $token = $self->_get_next_token;
4088     redo B;
4089     } elsif ({
4090     col => 1,
4091     td => 1, th => 1, tr => 1,
4092     }->{$token->{tag_name}}) {
4093     ## Clear back to table context
4094     while ($open_elements->[-1]->[1] ne 'table' and
4095     $open_elements->[-1]->[1] ne 'html') {
4096     $self->{parse_error}->();
4097     pop @$open_elements;
4098     }
4099    
4100    
4101     {
4102     my $el;
4103    
4104     $el = $self->{document}->create_element_ns
4105     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name} eq 'col' ? 'colgroup' : 'tbody']);
4106    
4107     $open_elements->[-1]->[0]->append_child ($el);
4108     push @$open_elements, [$el, $token->{tag_name} eq 'col' ? 'colgroup' : 'tbody'];
4109     }
4110    
4111     $insertion_mode = $token->{tag_name} eq 'col'
4112     ? 'in column group' : 'in table body';
4113     ## reprocess
4114     redo B;
4115     } elsif ($token->{tag_name} eq 'table') {
4116     ## NOTE: There are code clones for this "table in table"
4117     $self->{parse_error}->();
4118    
4119     ## As if </table>
4120     ## have a table element in table scope
4121     my $i;
4122     INSCOPE: for (reverse 0..$#$open_elements) {
4123     my $node = $open_elements->[$_];
4124     if ($node->[1] eq 'table') {
4125     $i = $_;
4126     last INSCOPE;
4127     } elsif ({
4128     table => 1, html => 1,
4129     }->{$node->[1]}) {
4130     last INSCOPE;
4131     }
4132     } # INSCOPE
4133     unless (defined $i) {
4134     $self->{parse_error}->();
4135     ## Ignore tokens </table><table>
4136     $token = $self->_get_next_token;
4137     redo B;
4138     }
4139    
4140     ## generate implied end tags
4141     if ({
4142     dd => 1, dt => 1, li => 1, p => 1,
4143     td => 1, th => 1, tr => 1,
4144     }->{$open_elements->[-1]->[1]}) {
4145     unshift @{$self->{token}}, $token; # <table>
4146     $token = {type => 'end tag', tag_name => 'table'};
4147     unshift @{$self->{token}}, $token;
4148     $token = {type => 'end tag',
4149     tag_name => $open_elements->[-1]->[1]}; # MUST
4150     redo B;
4151     }
4152    
4153     if ($open_elements->[-1]->[1] ne 'table') {
4154     $self->{parse_error}->();
4155     }
4156    
4157     splice @$open_elements, $i;
4158    
4159     $reset_insertion_mode->();
4160    
4161     ## reprocess
4162     redo B;
4163     } else {
4164     #
4165     }
4166     } elsif ($token->{type} eq 'end tag') {
4167     if ($token->{tag_name} eq 'table') {
4168     ## have a table element in table scope
4169     my $i;
4170     INSCOPE: for (reverse 0..$#$open_elements) {
4171     my $node = $open_elements->[$_];
4172     if ($node->[1] eq $token->{tag_name}) {
4173     $i = $_;
4174     last INSCOPE;
4175     } elsif ({
4176     table => 1, html => 1,
4177     }->{$node->[1]}) {
4178     last INSCOPE;
4179     }
4180     } # INSCOPE
4181     unless (defined $i) {
4182     $self->{parse_error}->();
4183     ## Ignore the token
4184     $token = $self->_get_next_token;
4185     redo B;
4186     }
4187    
4188     ## generate implied end tags
4189     if ({
4190     dd => 1, dt => 1, li => 1, p => 1,
4191     td => 1, th => 1, tr => 1,
4192     }->{$open_elements->[-1]->[1]}) {
4193     unshift @{$self->{token}}, $token;
4194     $token = {type => 'end tag',
4195     tag_name => $open_elements->[-1]->[1]}; # MUST
4196     redo B;
4197     }
4198    
4199     if ($open_elements->[-1]->[1] ne 'table') {
4200     $self->{parse_error}->();
4201     }
4202    
4203     splice @$open_elements, $i;
4204    
4205     $reset_insertion_mode->();
4206    
4207     $token = $self->_get_next_token;
4208     redo B;
4209     } elsif ({
4210     body => 1, caption => 1, col => 1, colgroup => 1,
4211     html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
4212     thead => 1, tr => 1,
4213     }->{$token->{tag_name}}) {
4214     $self->{parse_error}->();
4215     ## Ignore the token
4216     $token = $self->_get_next_token;
4217     redo B;
4218     } else {
4219     #
4220     }
4221     } else {
4222     #
4223     }
4224    
4225     $self->{parse_error}->();
4226 wakaba 1.8 $in_body->($insert_to_foster);
4227 wakaba 1.3 redo B;
4228     } elsif ($insertion_mode eq 'in caption') {
4229 wakaba 1.7 if ($token->{type} eq 'character') {
4230     ## NOTE: This is a code clone of "character in body".
4231 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
4232 wakaba 1.7
4233     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4234    
4235     $token = $self->_get_next_token;
4236     redo B;
4237     } elsif ($token->{type} eq 'comment') {
4238     ## NOTE: This is a code clone of "comment in body".
4239     my $comment = $self->{document}->create_comment ($token->{data});
4240     $open_elements->[-1]->[0]->append_child ($comment);
4241     $token = $self->_get_next_token;
4242     redo B;
4243     } elsif ($token->{type} eq 'start tag') {
4244 wakaba 1.3 if ({
4245     caption => 1, col => 1, colgroup => 1, tbody => 1,
4246     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4247     }->{$token->{tag_name}}) {
4248     $self->{parse_error}->();
4249    
4250     ## As if </caption>
4251     ## have a caption element in table scope
4252     my $i; # index of the caption entry in @$open_elements; stays undef if none is in scope
4253     INSCOPE: for (reverse 0..$#$open_elements) { # walk the stack from its last entry back to the first
4254     my $node = $open_elements->[$_];
4255     if ($node->[1] eq 'caption') {
4256     $i = $_;
4257     last INSCOPE;
4258     } elsif ({
4259     table => 1, html => 1,
4260     }->{$node->[1]}) { # table/html end the search: no caption in scope
4261     last INSCOPE;
4262     }
4263     } # INSCOPE
4264     unless (defined $i) {
4265     $self->{parse_error}->();
4266     ## Ignore the token
4267     $token = $self->_get_next_token;
4268     redo B;
4269     }
4270    
4271     ## generate implied end tags
4272     if ({
4273     dd => 1, dt => 1, li => 1, p => 1,
4274     td => 1, th => 1, tr => 1,
4275     }->{$open_elements->[-1]->[1]}) {
4276     unshift @{$self->{token}}, $token; # <?>
4277     $token = {type => 'end tag', tag_name => 'caption'};
4278     unshift @{$self->{token}}, $token;
4279     $token = {type => 'end tag',
4280     tag_name => $open_elements->[-1]->[1]}; # MUST
4281     redo B;
4282     }
4283    
4284     if ($open_elements->[-1]->[1] ne 'caption') {
4285     $self->{parse_error}->();
4286     }
4287    
4288     splice @$open_elements, $i;
4289    
4290     $clear_up_to_marker->();
4291    
4292     $insertion_mode = 'in table';
4293    
4294     ## reprocess
4295     redo B;
4296     } else {
4297     #
4298     }
4299     } elsif ($token->{type} eq 'end tag') {
4300     if ($token->{tag_name} eq 'caption') {
4301     ## have a caption element in table scope
4302     my $i; # index of the matching entry in @$open_elements; stays undef if none is in scope
4303     INSCOPE: for (reverse 0..$#$open_elements) { # walk the stack from its last entry back to the first
4304     my $node = $open_elements->[$_];
4305     if ($node->[1] eq $token->{tag_name}) { # tag_name is 'caption' in this branch
4306     $i = $_;
4307     last INSCOPE;
4308     } elsif ({
4309     table => 1, html => 1,
4310     }->{$node->[1]}) { # table/html end the search: no caption in scope
4311     last INSCOPE;
4312     }
4313     } # INSCOPE
4314     unless (defined $i) {
4315     $self->{parse_error}->();
4316     ## Ignore the token
4317     $token = $self->_get_next_token;
4318     redo B;
4319     }
4320    
4321     ## generate implied end tags
4322     if ({
4323     dd => 1, dt => 1, li => 1, p => 1,
4324     td => 1, th => 1, tr => 1,
4325     }->{$open_elements->[-1]->[1]}) {
4326     unshift @{$self->{token}}, $token;
4327     $token = {type => 'end tag',
4328     tag_name => $open_elements->[-1]->[1]}; # MUST
4329     redo B;
4330     }
4331    
4332     if ($open_elements->[-1]->[1] ne 'caption') {
4333     $self->{parse_error}->();
4334     }
4335    
4336     splice @$open_elements, $i;
4337    
4338     $clear_up_to_marker->();
4339    
4340     $insertion_mode = 'in table';
4341    
4342     $token = $self->_get_next_token;
4343     redo B;
4344     } elsif ($token->{tag_name} eq 'table') {
4345     $self->{parse_error}->();
4346    
4347     ## As if </caption>
4348     ## have a caption element in table scope
4349     my $i; # index of the caption entry in @$open_elements; stays undef if none is in scope
4350     INSCOPE: for (reverse 0..$#$open_elements) { # walk the stack from its last entry back to the first
4351     my $node = $open_elements->[$_];
4352     if ($node->[1] eq 'caption') {
4353     $i = $_;
4354     last INSCOPE;
4355     } elsif ({
4356     table => 1, html => 1,
4357     }->{$node->[1]}) { # table/html end the search: no caption in scope
4358     last INSCOPE;
4359     }
4360     } # INSCOPE
4361     unless (defined $i) {
4362     $self->{parse_error}->();
4363     ## Ignore the token
4364     $token = $self->_get_next_token;
4365     redo B;
4366     }
4367    
4368     ## generate implied end tags
4369     if ({
4370     dd => 1, dt => 1, li => 1, p => 1,
4371     td => 1, th => 1, tr => 1,
4372     }->{$open_elements->[-1]->[1]}) {
4373     unshift @{$self->{token}}, $token; # </table>
4374     $token = {type => 'end tag', tag_name => 'caption'};
4375     unshift @{$self->{token}}, $token;
4376     $token = {type => 'end tag',
4377     tag_name => $open_elements->[-1]->[1]}; # MUST
4378     redo B;
4379     }
4380    
4381     if ($open_elements->[-1]->[1] ne 'caption') {
4382     $self->{parse_error}->();
4383     }
4384    
4385     splice @$open_elements, $i;
4386    
4387     $clear_up_to_marker->();
4388    
4389     $insertion_mode = 'in table';
4390    
4391     ## reprocess
4392     redo B;
4393     } elsif ({
4394     body => 1, col => 1, colgroup => 1,
4395     html => 1, tbody => 1, td => 1, tfoot => 1,
4396     th => 1, thead => 1, tr => 1,
4397     }->{$token->{tag_name}}) {
4398     $self->{parse_error}->();
4399     ## Ignore the token: advance first, since a bare |redo B| would reprocess this same end tag in the same mode forever
4400     $token = $self->_get_next_token; redo B;
4401     } else {
4402     #
4403     }
4404     } else {
4405     #
4406     }
4407    
4408 wakaba 1.8 $in_body->($insert_to_current);
4409 wakaba 1.3 redo B;
4410     } elsif ($insertion_mode eq 'in column group') {
4411     if ($token->{type} eq 'character') {
4412     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4413     $open_elements->[-1]->[0]->manakai_append_text ($1);
4414     unless (length $token->{data}) {
4415     $token = $self->_get_next_token;
4416     redo B;
4417     }
4418     }
4419    
4420     #
4421     } elsif ($token->{type} eq 'comment') {
4422     my $comment = $self->{document}->create_comment ($token->{data});
4423     $open_elements->[-1]->[0]->append_child ($comment);
4424     $token = $self->_get_next_token;
4425     redo B;
4426     } elsif ($token->{type} eq 'start tag') {
4427     if ($token->{tag_name} eq 'col') {
4428    
4429     {
4430     my $el;
4431    
4432     $el = $self->{document}->create_element_ns
4433     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4434    
4435     for my $attr_name (keys %{ $token->{attributes}}) {
4436     $el->set_attribute_ns (undef, [undef, $attr_name],
4437     $token->{attributes} ->{$attr_name}->{value});
4438     }
4439    
4440     $open_elements->[-1]->[0]->append_child ($el);
4441     push @$open_elements, [$el, $token->{tag_name}];
4442     }
4443    
4444     pop @$open_elements;
4445     $token = $self->_get_next_token;
4446     redo B;
4447     } else {
4448     #
4449     }
4450     } elsif ($token->{type} eq 'end tag') {
4451     if ($token->{tag_name} eq 'colgroup') {
4452     if ($open_elements->[-1]->[1] eq 'html') {
4453     $self->{parse_error}->();
4454     ## Ignore the token
4455     $token = $self->_get_next_token;
4456     redo B;
4457     } else {
4458     pop @$open_elements; # colgroup
4459     $insertion_mode = 'in table';
4460     $token = $self->_get_next_token;
4461     redo B;
4462     }
4463     } elsif ($token->{tag_name} eq 'col') {
4464     $self->{parse_error}->();
4465     ## Ignore the token
4466     $token = $self->_get_next_token;
4467     redo B;
4468     } else {
4469     #
4470     }
4471     } else {
4472     #
4473     }
4474    
4475     ## As if </colgroup>
4476     if ($open_elements->[-1]->[1] eq 'html') {
4477     $self->{parse_error}->();
4478     ## Ignore the token
4479     $token = $self->_get_next_token;
4480     redo B;
4481     } else {
4482     pop @$open_elements; # colgroup
4483     $insertion_mode = 'in table';
4484     ## reprocess
4485     redo B;
4486     }
4487     } elsif ($insertion_mode eq 'in table body') {
4488     if ($token->{type} eq 'character') {
4489 wakaba 1.8 ## NOTE: This is a "character in table" code clone.
4490     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4491     $open_elements->[-1]->[0]->manakai_append_text ($1);
4492    
4493     unless (length $token->{data}) {
4494     $token = $self->_get_next_token;
4495     redo B;
4496     }
4497     }
4498 wakaba 1.3
4499 wakaba 1.8 ## As if in body, but insert into foster parent element
4500     ## ISSUE: Spec says that "whenever a node would be inserted
4501     ## into the current node" while characters might not
4502     ## result in a new Text node.
4503     $reconstruct_active_formatting_elements->($insert_to_foster);
4504 wakaba 1.3
4505 wakaba 1.8 if ({
4506     table => 1, tbody => 1, tfoot => 1,
4507     thead => 1, tr => 1,
4508     }->{$open_elements->[-1]->[1]}) {
4509     # MUST
4510     my $foster_parent_element;
4511     my $next_sibling;
4512     my $prev_sibling;
4513     OE: for (reverse 0..$#$open_elements) {
4514     if ($open_elements->[$_]->[1] eq 'table') {
4515     my $parent = $open_elements->[$_]->[0]->parent_node;
4516     if (defined $parent and $parent->node_type == 1) {
4517     $foster_parent_element = $parent;
4518     $next_sibling = $open_elements->[$_]->[0];
4519     $prev_sibling = $next_sibling->previous_sibling;
4520     } else {
4521     $foster_parent_element = $open_elements->[$_ - 1]->[0];
4522     $prev_sibling = $foster_parent_element->last_child;
4523     }
4524     last OE;
4525     }
4526     } # OE
4527     $foster_parent_element = $open_elements->[0]->[0] and
4528     $prev_sibling = $foster_parent_element->last_child
4529     unless defined $foster_parent_element;
4530     if (defined $prev_sibling and
4531     $prev_sibling->node_type == 3) {
4532     $prev_sibling->manakai_append_text ($token->{data});
4533     } else {
4534     $foster_parent_element->insert_before
4535     ($self->{document}->create_text_node ($token->{data}),
4536     $next_sibling);
4537     }
4538     } else {
4539     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4540     }
4541    
4542 wakaba 1.3 $token = $self->_get_next_token;
4543     redo B;
4544     } elsif ($token->{type} eq 'comment') {
4545     ## Copied from 'in table'
4546     my $comment = $self->{document}->create_comment ($token->{data});
4547     $open_elements->[-1]->[0]->append_child ($comment);
4548     $token = $self->_get_next_token;
4549     redo B;
4550     } elsif ($token->{type} eq 'start tag') {
4551     if ({
4552     tr => 1,
4553     th => 1, td => 1,
4554     }->{$token->{tag_name}}) {
4555     ## Clear back to table body context
4556     while (not {
4557     tbody => 1, tfoot => 1, thead => 1, html => 1,
4558     }->{$open_elements->[-1]->[1]}) {
4559     $self->{parse_error}->();
4560     pop @$open_elements;
4561     }
4562    
4563     $insertion_mode = 'in row';
4564     if ($token->{tag_name} eq 'tr') {
4565    
4566     {
4567     my $el;
4568    
4569     $el = $self->{document}->create_element_ns
4570     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4571    
4572     for my $attr_name (keys %{ $token->{attributes}}) {
4573     $el->set_attribute_ns (undef, [undef, $attr_name],
4574     $token->{attributes} ->{$attr_name}->{value});
4575     }
4576    
4577     $open_elements->[-1]->[0]->append_child ($el);
4578     push @$open_elements, [$el, $token->{tag_name}];
4579     }
4580    
4581     $token = $self->_get_next_token;
4582     } else {
4583    
4584     {
4585     my $el;
4586    
4587     $el = $self->{document}->create_element_ns
4588     (q<http://www.w3.org/1999/xhtml>, [undef, 'tr']);
4589    
4590     $open_elements->[-1]->[0]->append_child ($el);
4591     push @$open_elements, [$el, 'tr'];
4592     }
4593    
4594     ## reprocess
4595     }
4596     redo B;
4597     } elsif ({
4598     caption => 1, col => 1, colgroup => 1,
4599     tbody => 1, tfoot => 1, thead => 1,
4600     }->{$token->{tag_name}}) {
4601     ## have an element in table scope
4602     my $i;
4603     INSCOPE: for (reverse 0..$#$open_elements) {
4604     my $node = $open_elements->[$_];
4605     if ({
4606     tbody => 1, thead => 1, tfoot => 1,
4607     }->{$node->[1]}) {
4608     $i = $_;
4609     last INSCOPE;
4610     } elsif ({
4611     table => 1, html => 1,
4612     }->{$node->[1]}) {
4613     last INSCOPE;
4614     }
4615     } # INSCOPE
4616     unless (defined $i) {
4617     $self->{parse_error}->();
4618     ## Ignore the token
4619     $token = $self->_get_next_token;
4620     redo B;
4621     }
4622    
4623     ## Clear back to table body context
4624     while (not {
4625     tbody => 1, tfoot => 1, thead => 1, html => 1,
4626     }->{$open_elements->[-1]->[1]}) {
4627     $self->{parse_error}->();
4628     pop @$open_elements;
4629     }
4630    
4631     ## As if <{current node}>
4632     ## have an element in table scope
4633     ## true by definition
4634    
4635     ## Clear back to table body context
4636     ## nop by definition
4637    
4638     pop @$open_elements;
4639     $insertion_mode = 'in table';
4640     ## reprocess
4641     redo B;
4642     } elsif ($token->{tag_name} eq 'table') {
4643     ## NOTE: This is a code clone of "table in table"
4644     $self->{parse_error}->();
4645    
4646     ## As if </table>
4647     ## have a table element in table scope
4648     my $i;
4649     INSCOPE: for (reverse 0..$#$open_elements) {
4650     my $node = $open_elements->[$_];
4651     if ($node->[1] eq 'table') {
4652     $i = $_;
4653     last INSCOPE;
4654     } elsif ({
4655     table => 1, html => 1,
4656     }->{$node->[1]}) {
4657     last INSCOPE;
4658     }
4659     } # INSCOPE
4660     unless (defined $i) {
4661     $self->{parse_error}->();
4662     ## Ignore tokens </table><table>
4663     $token = $self->_get_next_token;
4664     redo B;
4665     }
4666    
4667     ## generate implied end tags
4668     if ({
4669     dd => 1, dt => 1, li => 1, p => 1,
4670     td => 1, th => 1, tr => 1,
4671     }->{$open_elements->[-1]->[1]}) {
4672     unshift @{$self->{token}}, $token; # <table>
4673     $token = {type => 'end tag', tag_name => 'table'};
4674     unshift @{$self->{token}}, $token;
4675     $token = {type => 'end tag',
4676     tag_name => $open_elements->[-1]->[1]}; # MUST
4677     redo B;
4678     }
4679    
4680     if ($open_elements->[-1]->[1] ne 'table') {
4681     $self->{parse_error}->();
4682     }
4683    
4684     splice @$open_elements, $i;
4685    
4686     $reset_insertion_mode->();
4687    
4688     ## reprocess
4689     redo B;
4690     } else {
4691     #
4692     }
4693     } elsif ($token->{type} eq 'end tag') {
4694     if ({
4695     tbody => 1, tfoot => 1, thead => 1,
4696     }->{$token->{tag_name}}) {
4697     ## have an element in table scope
4698     my $i;
4699     INSCOPE: for (reverse 0..$#$open_elements) {
4700     my $node = $open_elements->[$_];
4701     if ($node->[1] eq $token->{tag_name}) {
4702     $i = $_;
4703     last INSCOPE;
4704     } elsif ({
4705     table => 1, html => 1,
4706     }->{$node->[1]}) {
4707     last INSCOPE;
4708     }
4709     } # INSCOPE
4710     unless (defined $i) {
4711     $self->{parse_error}->();
4712     ## Ignore the token
4713     $token = $self->_get_next_token;
4714     redo B;
4715     }
4716    
4717     ## Clear back to table body context
4718     while (not {
4719     tbody => 1, tfoot => 1, thead => 1, html => 1,
4720     }->{$open_elements->[-1]->[1]}) {
4721     $self->{parse_error}->();
4722     pop @$open_elements;
4723     }
4724    
4725     pop @$open_elements;
4726     $insertion_mode = 'in table';
4727     $token = $self->_get_next_token;
4728     redo B;
4729     } elsif ($token->{tag_name} eq 'table') {
4730     ## have an element in table scope
4731     my $i;
4732     INSCOPE: for (reverse 0..$#$open_elements) {
4733     my $node = $open_elements->[$_];
4734     if ({
4735     tbody => 1, thead => 1, tfoot => 1,
4736     }->{$node->[1]}) {
4737     $i = $_;
4738     last INSCOPE;
4739     } elsif ({
4740     table => 1, html => 1,
4741     }->{$node->[1]}) {
4742     last INSCOPE;
4743     }
4744     } # INSCOPE
4745     unless (defined $i) {
4746     $self->{parse_error}->();
4747     ## Ignore the token
4748     $token = $self->_get_next_token;
4749     redo B;
4750     }
4751    
4752     ## Clear back to table body context
4753     while (not {
4754     tbody => 1, tfoot => 1, thead => 1, html => 1,
4755     }->{$open_elements->[-1]->[1]}) {
4756     $self->{parse_error}->();
4757     pop @$open_elements;
4758     }
4759    
4760     ## As if <{current node}>
4761     ## have an element in table scope
4762     ## true by definition
4763    
4764     ## Clear back to table body context
4765     ## nop by definition
4766    
4767     pop @$open_elements;
4768     $insertion_mode = 'in table';
4769     ## reprocess
4770     redo B;
4771     } elsif ({
4772     body => 1, caption => 1, col => 1, colgroup => 1,
4773     html => 1, td => 1, th => 1, tr => 1,
4774     }->{$token->{tag_name}}) {
4775     $self->{parse_error}->();
4776     ## Ignore the token
4777     $token = $self->_get_next_token;
4778     redo B;
4779     } else {
4780     #
4781     }
4782     } else {
4783     #
4784     }
4785    
4786     ## As if in table
4787     $self->{parse_error}->();
4788 wakaba 1.8 $in_body->($insert_to_foster);
4789 wakaba 1.3 redo B;
4790     } elsif ($insertion_mode eq 'in row') {
4791     if ($token->{type} eq 'character') {
4792 wakaba 1.8 ## NOTE: This is a "character in table" code clone.
4793     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4794     $open_elements->[-1]->[0]->manakai_append_text ($1);
4795    
4796     unless (length $token->{data}) {
4797     $token = $self->_get_next_token;
4798     redo B;
4799     }
4800     }
4801 wakaba 1.3
4802 wakaba 1.8 ## As if in body, but insert into foster parent element
4803     ## ISSUE: Spec says that "whenever a node would be inserted
4804     ## into the current node" while characters might not be
4805     ## result in a new Text node.
4806     $reconstruct_active_formatting_elements->($insert_to_foster);
4807    
4808     if ({
4809     table => 1, tbody => 1, tfoot => 1,
4810     thead => 1, tr => 1,
4811     }->{$open_elements->[-1]->[1]}) {
4812     # MUST
4813     my $foster_parent_element;
4814     my $next_sibling;
4815     my $prev_sibling;
4816     OE: for (reverse 0..$#$open_elements) {
4817     if ($open_elements->[$_]->[1] eq 'table') {
4818     my $parent = $open_elements->[$_]->[0]->parent_node;
4819     if (defined $parent and $parent->node_type == 1) {
4820     $foster_parent_element = $parent;
4821     $next_sibling = $open_elements->[$_]->[0];
4822     $prev_sibling = $next_sibling->previous_sibling;
4823     } else {
4824     $foster_parent_element = $open_elements->[$_ - 1]->[0];
4825     $prev_sibling = $foster_parent_element->last_child;
4826     }
4827     last OE;
4828     }
4829     } # OE
4830     $foster_parent_element = $open_elements->[0]->[0] and
4831     $prev_sibling = $foster_parent_element->last_child
4832     unless defined $foster_parent_element;
4833     if (defined $prev_sibling and
4834     $prev_sibling->node_type == 3) {
4835     $prev_sibling->manakai_append_text ($token->{data});
4836     } else {
4837     $foster_parent_element->insert_before
4838     ($self->{document}->create_text_node ($token->{data}),
4839     $next_sibling);
4840     }
4841     } else {
4842     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4843     }
4844    
4845 wakaba 1.3 $token = $self->_get_next_token;
4846     redo B;
4847     } elsif ($token->{type} eq 'comment') {
4848     ## Copied from 'in table'
4849     my $comment = $self->{document}->create_comment ($token->{data});
4850     $open_elements->[-1]->[0]->append_child ($comment);
4851     $token = $self->_get_next_token;
4852     redo B;
4853     } elsif ($token->{type} eq 'start tag') {
4854     if ($token->{tag_name} eq 'th' or
4855     $token->{tag_name} eq 'td') {
4856     ## Clear back to table row context
4857     while (not {
4858 wakaba 1.7 tr => 1, html => 1,
4859 wakaba 1.3 }->{$open_elements->[-1]->[1]}) {
4860     $self->{parse_error}->();
4861     pop @$open_elements;
4862     }
4863    
4864    
4865     {
4866     my $el;
4867    
4868     $el = $self->{document}->create_element_ns
4869     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4870    
4871     for my $attr_name (keys %{ $token->{attributes}}) {
4872     $el->set_attribute_ns (undef, [undef, $attr_name],
4873     $token->{attributes} ->{$attr_name}->{value});
4874     }
4875    
4876     $open_elements->[-1]->[0]->append_child ($el);
4877     push @$open_elements, [$el, $token->{tag_name}];
4878     }
4879    
4880     $insertion_mode = 'in cell';
4881    
4882     push @$active_formatting_elements, ['#marker', ''];
4883    
4884     $token = $self->_get_next_token;
4885     redo B;
4886     } elsif ({
4887     caption => 1, col => 1, colgroup => 1,
4888     tbody => 1, tfoot => 1, thead => 1, tr => 1,
4889     }->{$token->{tag_name}}) {
4890     ## As if </tr>
4891     ## have an element in table scope
4892     my $i;
4893     INSCOPE: for (reverse 0..$#$open_elements) {
4894     my $node = $open_elements->[$_];
4895     if ($node->[1] eq 'tr') {
4896     $i = $_;
4897     last INSCOPE;
4898     } elsif ({
4899     table => 1, html => 1,
4900     }->{$node->[1]}) {
4901     last INSCOPE;
4902     }
4903     } # INSCOPE
4904     unless (defined $i) {
4905     $self->{parse_error}->();
4906     ## Ignore the token
4907     $token = $self->_get_next_token;
4908     redo B;
4909     }
4910    
4911     ## Clear back to table row context
4912     while (not {
4913     tr => 1, html => 1,
4914     }->{$open_elements->[-1]->[1]}) {
4915     $self->{parse_error}->();
4916     pop @$open_elements;
4917     }
4918    
4919     pop @$open_elements; # tr
4920     $insertion_mode = 'in table body';
4921     ## reprocess
4922     redo B;
4923     } elsif ($token->{tag_name} eq 'table') {
4924     ## NOTE: This is a code clone of "table in table"
4925     $self->{parse_error}->();
4926    
4927     ## As if </table>
4928     ## have a table element in table scope
4929     my $i;
4930     INSCOPE: for (reverse 0..$#$open_elements) {
4931     my $node = $open_elements->[$_];
4932     if ($node->[1] eq 'table') {
4933     $i = $_;
4934     last INSCOPE;
4935     } elsif ({
4936     table => 1, html => 1,
4937     }->{$node->[1]}) {
4938     last INSCOPE;
4939     }
4940     } # INSCOPE
4941     unless (defined $i) {
4942     $self->{parse_error}->();
4943     ## Ignore tokens </table><table>
4944     $token = $self->_get_next_token;
4945     redo B;
4946     }
4947    
4948     ## generate implied end tags
4949     if ({
4950     dd => 1, dt => 1, li => 1, p => 1,
4951     td => 1, th => 1, tr => 1,
4952     }->{$open_elements->[-1]->[1]}) {
4953     unshift @{$self->{token}}, $token; # <table>
4954     $token = {type => 'end tag', tag_name => 'table'};
4955     unshift @{$self->{token}}, $token;
4956     $token = {type => 'end tag',
4957     tag_name => $open_elements->[-1]->[1]}; # MUST
4958     redo B;
4959     }
4960    
4961     if ($open_elements->[-1]->[1] ne 'table') {
4962     $self->{parse_error}->();
4963     }
4964    
4965     splice @$open_elements, $i;
4966    
4967     $reset_insertion_mode->();
4968    
4969     ## reprocess
4970     redo B;
4971     } else {
4972     #
4973     }
4974     } elsif ($token->{type} eq 'end tag') {
4975     if ($token->{tag_name} eq 'tr') {
4976     ## have an element in table scope
4977     my $i;
4978     INSCOPE: for (reverse 0..$#$open_elements) {
4979     my $node = $open_elements->[$_];
4980     if ($node->[1] eq $token->{tag_name}) {
4981     $i = $_;
4982     last INSCOPE;
4983     } elsif ({
4984     table => 1, html => 1,
4985     }->{$node->[1]}) {
4986     last INSCOPE;
4987     }
4988     } # INSCOPE
4989     unless (defined $i) {
4990     $self->{parse_error}->();
4991     ## Ignore the token
4992     $token = $self->_get_next_token;
4993     redo B;
4994     }
4995    
4996     ## Clear back to table row context
4997     while (not {
4998     tr => 1, html => 1,
4999     }->{$open_elements->[-1]->[1]}) {
5000     $self->{parse_error}->();
5001     pop @$open_elements;
5002     }
5003    
5004     pop @$open_elements; # tr
5005     $insertion_mode = 'in table body';
5006     $token = $self->_get_next_token;
5007     redo B;
5008     } elsif ($token->{tag_name} eq 'table') {
5009     ## As if </tr>
5010     ## have an element in table scope
5011     my $i;
5012     INSCOPE: for (reverse 0..$#$open_elements) {
5013     my $node = $open_elements->[$_];
5014     if ($node->[1] eq 'tr') {
5015     $i = $_;
5016     last INSCOPE;
5017     } elsif ({
5018     table => 1, html => 1,
5019     }->{$node->[1]}) {
5020     last INSCOPE;
5021     }
5022     } # INSCOPE
5023     unless (defined $i) {
5024     $self->{parse_error}->();
5025     ## Ignore the token
5026     $token = $self->_get_next_token;
5027     redo B;
5028     }
5029    
5030     ## Clear back to table row context
5031     while (not {
5032     tr => 1, html => 1,
5033     }->{$open_elements->[-1]->[1]}) {
5034     $self->{parse_error}->();
5035     pop @$open_elements;
5036     }
5037    
5038     pop @$open_elements; # tr
5039     $insertion_mode = 'in table body';
5040     ## reprocess
5041     redo B;
5042     } elsif ({
5043     tbody => 1, tfoot => 1, thead => 1,
5044     }->{$token->{tag_name}}) {
5045     ## have an element in table scope
5046     my $i;
5047     INSCOPE: for (reverse 0..$#$open_elements) {
5048     my $node = $open_elements->[$_];
5049     if ($node->[1] eq $token->{tag_name}) {
5050     $i = $_;
5051     last INSCOPE;
5052     } elsif ({
5053     table => 1, html => 1,
5054     }->{$node->[1]}) {
5055     last INSCOPE;
5056     }
5057     } # INSCOPE
5058     unless (defined $i) {
5059     $self->{parse_error}->();
5060     ## Ignore the token
5061     $token = $self->_get_next_token;
5062     redo B;
5063     }
5064    
5065     ## As if </tr>
5066     ## have an element in table scope
5067     my $i;
5068     INSCOPE: for (reverse 0..$#$open_elements) {
5069     my $node = $open_elements->[$_];
5070     if ($node->[1] eq 'tr') {
5071     $i = $_;
5072     last INSCOPE;
5073     } elsif ({
5074     table => 1, html => 1,
5075     }->{$node->[1]}) {
5076     last INSCOPE;
5077     }
5078     } # INSCOPE
5079     unless (defined $i) {
5080     $self->{parse_error}->();
5081     ## Ignore the token
5082     $token = $self->_get_next_token;
5083     redo B;
5084     }
5085    
5086     ## Clear back to table row context
5087     while (not {
5088     tr => 1, html => 1,
5089     }->{$open_elements->[-1]->[1]}) {
5090     $self->{parse_error}->();
5091     pop @$open_elements;
5092     }
5093    
5094     pop @$open_elements; # tr
5095     $insertion_mode = 'in table body';
5096     ## reprocess
5097     redo B;
5098     } elsif ({
5099     body => 1, caption => 1, col => 1,
5100     colgroup => 1, html => 1, td => 1, th => 1,
5101     }->{$token->{tag_name}}) {
5102     $self->{parse_error}->();
5103     ## Ignore the token
5104     $token = $self->_get_next_token;
5105     redo B;
5106     } else {
5107     #
5108     }
5109     } else {
5110     #
5111     }
5112    
5113     ## As if in table
5114     $self->{parse_error}->();
5115 wakaba 1.8 $in_body->($insert_to_foster);
5116 wakaba 1.3 redo B;
5117     } elsif ($insertion_mode eq 'in cell') {
5118     if ($token->{type} eq 'character') {
5119     ## NOTE: This is a code clone of "character in body".
5120 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
5121 wakaba 1.3
5122     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5123    
5124     $token = $self->_get_next_token;
5125     redo B;
5126     } elsif ($token->{type} eq 'comment') {
5127     ## NOTE: This is a code clone of "comment in body".
5128     my $comment = $self->{document}->create_comment ($token->{data});
5129     $open_elements->[-1]->[0]->append_child ($comment);
5130     $token = $self->_get_next_token;
5131     redo B;
5132     } elsif ($token->{type} eq 'start tag') {
5133     if ({
5134     caption => 1, col => 1, colgroup => 1,
5135     tbody => 1, td => 1, tfoot => 1, th => 1,
5136     thead => 1, tr => 1,
5137     }->{$token->{tag_name}}) {
5138     ## have an element in table scope
5139     my $tn;
5140     INSCOPE: for (reverse 0..$#$open_elements) {
5141     my $node = $open_elements->[$_];
5142     if ($node->[1] eq 'td' or $node->[1] eq 'th') {
5143     $tn = $node->[1];
5144     last INSCOPE;
5145     } elsif ({
5146     table => 1, html => 1,
5147     }->{$node->[1]}) {
5148     last INSCOPE;
5149     }
5150     } # INSCOPE
5151     unless (defined $tn) {
5152     $self->{parse_error}->();
5153     ## Ignore the token
5154     $token = $self->_get_next_token;
5155     redo B;
5156     }
5157    
5158     ## Close the cell
5159     unshift @{$self->{token}}, $token; # <?>
5160     $token = {type => 'end tag', tag_name => $tn};
5161     redo B;
5162     } else {
5163     #
5164     }
5165     } elsif ($token->{type} eq 'end tag') {
5166 wakaba 1.7 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
5167 wakaba 1.3 ## have an element in table scope
5168     my $i;
5169     INSCOPE: for (reverse 0..$#$open_elements) {
5170     my $node = $open_elements->[$_];
5171     if ($node->[1] eq $token->{tag_name}) {
5172     $i = $_;
5173     last INSCOPE;
5174     } elsif ({
5175     table => 1, html => 1,
5176     }->{$node->[1]}) {
5177     last INSCOPE;
5178     }
5179     } # INSCOPE
5180     unless (defined $i) {
5181     $self->{parse_error}->();
5182     ## Ignore the token
5183     $token = $self->_get_next_token;
5184     redo B;
5185     }
5186    
5187     ## generate implied end tags
5188     if ({
5189     dd => 1, dt => 1, li => 1, p => 1,
5190     td => ($token->{tag_name} eq 'th'),
5191     th => ($token->{tag_name} eq 'td'),
5192     tr => 1,
5193     }->{$open_elements->[-1]->[1]}) {
5194     unshift @{$self->{token}}, $token;
5195     $token = {type => 'end tag',
5196     tag_name => $open_elements->[-1]->[1]}; # MUST
5197     redo B;
5198     }
5199    
5200     if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
5201     $self->{parse_error}->();
5202     }
5203    
5204     splice @$open_elements, $i;
5205    
5206     $clear_up_to_marker->();
5207    
5208     $insertion_mode = 'in row';
5209    
5210     $token = $self->_get_next_token;
5211     redo B;
5212     } elsif ({
5213     body => 1, caption => 1, col => 1,
5214     colgroup => 1, html => 1,
5215     }->{$token->{tag_name}}) {
5216     $self->{parse_error}->();
5217     ## Ignore the token
5218     $token = $self->_get_next_token;
5219     redo B;
5220     } elsif ({
5221     table => 1, tbody => 1, tfoot => 1,
5222     thead => 1, tr => 1,
5223     }->{$token->{tag_name}}) {
5224     ## have an element in table scope
5225     my $i;
5226     my $tn;
5227     INSCOPE: for (reverse 0..$#$open_elements) {
5228     my $node = $open_elements->[$_];
5229     if ($node->[1] eq $token->{tag_name}) {
5230     $i = $_;
5231     last INSCOPE;
5232     } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
5233     $tn = $node->[1];
5234     ## NOTE: There is exactly one |td| or |th| element
5235     ## in scope in the stack of open elements by definition.
5236     } elsif ({
5237     table => 1, html => 1,
5238     }->{$node->[1]}) {
5239     last INSCOPE;
5240     }
5241     } # INSCOPE
5242     unless (defined $i) {
5243     $self->{parse_error}->();
5244     ## Ignore the token
5245     $token = $self->_get_next_token;
5246     redo B;
5247     }
5248    
5249     ## Close the cell
5250     unshift @{$self->{token}}, $token; # </?>
5251     $token = {type => 'end tag', tag_name => $tn};
5252     redo B;
5253     } else {
5254     #
5255     }
5256     } else {
5257     #
5258     }
5259    
5260 wakaba 1.8 $in_body->($insert_to_current);
5261 wakaba 1.3 redo B;
5262     } elsif ($insertion_mode eq 'in select') {
5263     if ($token->{type} eq 'character') {
5264     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5265     $token = $self->_get_next_token;
5266     redo B;
5267     } elsif ($token->{type} eq 'comment') {
5268     my $comment = $self->{document}->create_comment ($token->{data});
5269     $open_elements->[-1]->[0]->append_child ($comment);
5270     $token = $self->_get_next_token;
5271     redo B;
5272     } elsif ($token->{type} eq 'start tag') {
5273     if ($token->{tag_name} eq 'option') {
5274     if ($open_elements->[-1]->[1] eq 'option') {
5275     ## As if </option>
5276     pop @$open_elements;
5277     }
5278    
5279    
5280     {
5281     my $el;
5282    
5283     $el = $self->{document}->create_element_ns
5284     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5285    
5286     for my $attr_name (keys %{ $token->{attributes}}) {
5287     $el->set_attribute_ns (undef, [undef, $attr_name],
5288     $token->{attributes} ->{$attr_name}->{value});
5289     }
5290    
5291     $open_elements->[-1]->[0]->append_child ($el);
5292     push @$open_elements, [$el, $token->{tag_name}];
5293     }
5294    
5295     $token = $self->_get_next_token;
5296     redo B;
5297     } elsif ($token->{tag_name} eq 'optgroup') {
5298     if ($open_elements->[-1]->[1] eq 'option') {
5299     ## As if </option>
5300     pop @$open_elements;
5301     }
5302    
5303     if ($open_elements->[-1]->[1] eq 'optgroup') {
5304     ## As if </optgroup>
5305     pop @$open_elements;
5306     }
5307    
5308    
5309     {
5310     my $el;
5311    
5312     $el = $self->{document}->create_element_ns
5313     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5314    
5315     for my $attr_name (keys %{ $token->{attributes}}) {
5316     $el->set_attribute_ns (undef, [undef, $attr_name],
5317     $token->{attributes} ->{$attr_name}->{value});
5318     }
5319    
5320     $open_elements->[-1]->[0]->append_child ($el);
5321     push @$open_elements, [$el, $token->{tag_name}];
5322     }
5323    
5324     $token = $self->_get_next_token;
5325     redo B;
5326     } elsif ($token->{tag_name} eq 'select') {
5327     $self->{parse_error}->();
5328     ## As if </select> instead
5329     ## have an element in table scope
5330     my $i;
5331     INSCOPE: for (reverse 0..$#$open_elements) {
5332     my $node = $open_elements->[$_];
5333     if ($node->[1] eq $token->{tag_name}) {
5334     $i = $_;
5335     last INSCOPE;
5336     } elsif ({
5337     table => 1, html => 1,
5338     }->{$node->[1]}) {
5339     last INSCOPE;
5340     }
5341     } # INSCOPE
5342     unless (defined $i) {
5343     $self->{parse_error}->();
5344     ## Ignore the token
5345     $token = $self->_get_next_token;
5346     redo B;
5347     }
5348    
5349     splice @$open_elements, $i;
5350    
5351     $reset_insertion_mode->();
5352    
5353     $token = $self->_get_next_token;
5354     redo B;
5355     } else {
5356     #
5357     }
5358     } elsif ($token->{type} eq 'end tag') {
5359     if ($token->{tag_name} eq 'optgroup') {
5360     if ($open_elements->[-1]->[1] eq 'option' and
5361     $open_elements->[-2]->[1] eq 'optgroup') {
5362     ## As if </option>
5363     splice @$open_elements, -2;
5364     } elsif ($open_elements->[-1]->[1] eq 'optgroup') {
5365     pop @$open_elements;
5366     } else {
5367     $self->{parse_error}->();
5368     ## Ignore the token
5369     }
5370     $token = $self->_get_next_token;
5371     redo B;
5372     } elsif ($token->{tag_name} eq 'option') {
5373     if ($open_elements->[-1]->[1] eq 'option') {
5374     pop @$open_elements;
5375     } else {
5376     $self->{parse_error}->();
5377     ## Ignore the token
5378     }
5379     $token = $self->_get_next_token;
5380     redo B;
5381     } elsif ($token->{tag_name} eq 'select') {
5382     ## have an element in table scope
5383     my $i;
5384     INSCOPE: for (reverse 0..$#$open_elements) {
5385     my $node = $open_elements->[$_];
5386     if ($node->[1] eq $token->{tag_name}) {
5387     $i = $_;
5388     last INSCOPE;
5389     } elsif ({
5390     table => 1, html => 1,
5391     }->{$node->[1]}) {
5392     last INSCOPE;
5393     }
5394     } # INSCOPE
5395     unless (defined $i) {
5396     $self->{parse_error}->();
5397     ## Ignore the token
5398     $token = $self->_get_next_token;
5399     redo B;
5400     }
5401    
5402     splice @$open_elements, $i;
5403    
5404     $reset_insertion_mode->();
5405    
5406     $token = $self->_get_next_token;
5407     redo B;
5408     } elsif ({
5409     caption => 1, table => 1, tbody => 1,
5410     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5411     }->{$token->{tag_name}}) {
5412     $self->{parse_error}->();
5413    
5414     ## have an element in table scope
5415     my $i;
5416     INSCOPE: for (reverse 0..$#$open_elements) {
5417     my $node = $open_elements->[$_];
5418     if ($node->[1] eq $token->{tag_name}) {
5419     $i = $_;
5420     last INSCOPE;
5421     } elsif ({
5422     table => 1, html => 1,
5423     }->{$node->[1]}) {
5424     last INSCOPE;
5425     }
5426     } # INSCOPE
5427     unless (defined $i) {
5428     ## Ignore the token
5429     $token = $self->_get_next_token;
5430     redo B;
5431     }
5432    
5433     ## As if </select>
5434     ## have an element in table scope
5435     undef $i;
5436     INSCOPE: for (reverse 0..$#$open_elements) {
5437     my $node = $open_elements->[$_];
5438     if ($node->[1] eq 'select') {
5439     $i = $_;
5440     last INSCOPE;
5441     } elsif ({
5442     table => 1, html => 1,
5443     }->{$node->[1]}) {
5444     last INSCOPE;
5445     }
5446     } # INSCOPE
5447     unless (defined $i) {
5448     $self->{parse_error}->();
5449     ## Ignore the </select> token
5450     $token = $self->_get_next_token; ## TODO: ok?
5451     redo B;
5452     }
5453    
5454     splice @$open_elements, $i;
5455    
5456     $reset_insertion_mode->();
5457    
5458     ## reprocess
5459     redo B;
5460     } else {
5461     #
5462     }
5463     } else {
5464     #
5465     }
5466    
5467     $self->{parse_error}->();
5468     ## Ignore the token
5469 wakaba 1.8 $token = $self->_get_next_token;
5470 wakaba 1.3 redo B;
5471     } elsif ($insertion_mode eq 'after body') {
5472     if ($token->{type} eq 'character') {
5473     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5474     ## As if in body
5475 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
5476 wakaba 1.3
5477     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5478    
5479     unless (length $token->{data}) {
5480     $token = $self->_get_next_token;
5481     redo B;
5482     }
5483     }
5484    
5485     #
5486     } elsif ($token->{type} eq 'comment') {
5487     my $comment = $self->{document}->create_comment ($token->{data});
5488     $open_elements->[0]->[0]->append_child ($comment);
5489     $token = $self->_get_next_token;
5490     redo B;
5491     } elsif ($token->{type} eq 'end tag') {
5492 wakaba 1.7 if ($token->{tag_name} eq 'html') {
5493 wakaba 1.3 ## TODO: if inner_html, parse-error, ignore the token; otherwise,
5494    
5495     $phase = 'trailing end';
5496     $token = $self->_get_next_token;
5497     redo B;
5498     } else {
5499     #
5500     }
5501     } else {
5502     #
5503     }
5504    
5505     $self->{parse_error}->();
5506     $insertion_mode = 'in body';
5507     ## reprocess
5508     redo B;
5509     } elsif ($insertion_mode eq 'in frameset') {
5510     if ($token->{type} eq 'character') {
5511     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5512     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5513    
5514     unless (length $token->{data}) {
5515     $token = $self->_get_next_token;
5516     redo B;
5517     }
5518     }
5519    
5520     #
5521     } elsif ($token->{type} eq 'comment') {
5522     my $comment = $self->{document}->create_comment ($token->{data});
5523     $open_elements->[-1]->[0]->append_child ($comment);
5524     $token = $self->_get_next_token;
5525     redo B;
5526     } elsif ($token->{type} eq 'start tag') {
5527     if ($token->{tag_name} eq 'frameset') {
5528    
5529     {
5530     my $el;
5531    
5532     $el = $self->{document}->create_element_ns
5533     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5534    
5535     for my $attr_name (keys %{ $token->{attributes}}) {
5536     $el->set_attribute_ns (undef, [undef, $attr_name],
5537     $token->{attributes} ->{$attr_name}->{value});
5538     }
5539    
5540     $open_elements->[-1]->[0]->append_child ($el);
5541     push @$open_elements, [$el, $token->{tag_name}];
5542     }
5543    
5544     $token = $self->_get_next_token;
5545     redo B;
5546     } elsif ($token->{tag_name} eq 'frame') {
5547    
5548     {
5549     my $el;
5550    
5551     $el = $self->{document}->create_element_ns
5552     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5553    
5554     for my $attr_name (keys %{ $token->{attributes}}) {
5555     $el->set_attribute_ns (undef, [undef, $attr_name],
5556     $token->{attributes} ->{$attr_name}->{value});
5557     }
5558    
5559     $open_elements->[-1]->[0]->append_child ($el);
5560     push @$open_elements, [$el, $token->{tag_name}];
5561     }
5562    
5563     pop @$open_elements;
5564     $token = $self->_get_next_token;
5565     redo B;
5566     } elsif ($token->{tag_name} eq 'noframes') {
5567 wakaba 1.8 $in_body->($insert_to_current);
5568 wakaba 1.3 redo B;
5569     } else {
5570     #
5571     }
5572     } elsif ($token->{type} eq 'end tag') {
5573     if ($token->{tag_name} eq 'frameset') {
5574     if ($open_elements->[-1]->[1] eq 'html' and
5575     @$open_elements == 1) {
5576     $self->{parse_error}->();
5577     ## Ignore the token
5578     $token = $self->_get_next_token;
5579     } else {
5580     pop @$open_elements;
5581     $token = $self->_get_next_token;
5582     }
5583    
5584     ## if not inner_html and
5585     if ($open_elements->[-1]->[1] ne 'frameset') {
5586     $insertion_mode = 'after frameset';
5587     }
5588     redo B;
5589     } else {
5590     #
5591     }
5592     } else {
5593     #
5594     }
5595    
5596     $self->{parse_error}->();
5597     ## Ignore the token
5598     $token = $self->_get_next_token;
5599     redo B;
5600     } elsif ($insertion_mode eq 'after frameset') {
5601     if ($token->{type} eq 'character') {
5602     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5603     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5604    
5605     unless (length $token->{data}) {
5606     $token = $self->_get_next_token;
5607     redo B;
5608     }
5609     }
5610    
5611     #
5612     } elsif ($token->{type} eq 'comment') {
5613     my $comment = $self->{document}->create_comment ($token->{data});
5614     $open_elements->[-1]->[0]->append_child ($comment);
5615     $token = $self->_get_next_token;
5616     redo B;
5617     } elsif ($token->{type} eq 'start tag') {
5618     if ($token->{tag_name} eq 'noframes') {
5619 wakaba 1.8 $in_body->($insert_to_current);
5620 wakaba 1.3 redo B;
5621     } else {
5622     #
5623     }
5624     } elsif ($token->{type} eq 'end tag') {
5625     if ($token->{tag_name} eq 'html') {
5626     $phase = 'trailing end';
5627     $token = $self->_get_next_token;
5628     redo B;
5629     } else {
5630     #
5631     }
5632     } else {
5633     #
5634     }
5635    
5636     $self->{parse_error}->();
5637     ## Ignore the token
5638     $token = $self->_get_next_token;
5639     redo B;
5640    
5641     ## ISSUE: An issue in spec there
5642     } else {
5643     die "$0: $insertion_mode: Unknown insertion mode";
5644     }
5645     }
5646     } elsif ($phase eq 'trailing end') {
5647     ## states in the main stage is preserved yet # MUST
5648    
5649     if ($token->{type} eq 'DOCTYPE') {
5650     $self->{parse_error}->();
5651     ## Ignore the token
5652     $token = $self->_get_next_token;
5653     redo B;
5654     } elsif ($token->{type} eq 'comment') {
5655     my $comment = $self->{document}->create_comment ($token->{data});
5656     $self->{document}->append_child ($comment);
5657     $token = $self->_get_next_token;
5658     redo B;
5659     } elsif ($token->{type} eq 'character') {
5660     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5661 wakaba 1.8 my $data = $1;
5662 wakaba 1.3 ## As if in the main phase.
5663     ## NOTE: The insertion mode in the main phase
5664     ## just before the phase has been changed to the trailing
5665     ## end phase is either "after body" or "after frameset".
5666 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current)
5667 wakaba 1.3 if $phase eq 'main';
5668    
5669 wakaba 1.8 $open_elements->[-1]->[0]->manakai_append_text ($data);
5670 wakaba 1.3
5671     unless (length $token->{data}) {
5672     $token = $self->_get_next_token;
5673     redo B;
5674     }
5675     }
5676    
5677     $self->{parse_error}->();
5678     $phase = 'main';
5679     ## reprocess
5680     redo B;
5681     } elsif ($token->{type} eq 'start tag' or
5682     $token->{type} eq 'end tag') {
5683     $self->{parse_error}->();
5684     $phase = 'main';
5685     ## reprocess
5686     redo B;
5687     } elsif ($token->{type} eq 'end-of-file') {
5688     ## Stop parsing
5689     last B;
5690     } else {
5691     die "$0: $token->{type}: Unknown token";
5692     }
5693     }
5694     } # B
5695    
5696     ## Stop parsing # MUST
5697    
5698     ## TODO: script stuffs
5699     } # _construct_tree
5700    
## inner_html ($class, $node, $on_error)
##
## Serializes the descendants of |$node| (not |$node| itself) as an
## HTML fragment.
##
##   $class    - Invocant; not used by the algorithm.
##   $node     - Context node.  Its |child_nodes| are serialized.
##   $on_error - Code reference invoked with the offending node for
##               every node whose type is not element (1), text (3),
##               CDATA section (4), entity reference (5), comment (8)
##               or document type (10).
##
## Returns a reference to the serialized string.
sub inner_html ($$$) {
  my ($class, $node, $on_error) = @_;

  my $html_ns = q<http://www.w3.org/1999/xhtml>;

  ## Elements whose text content is emitted without escaping.
  my %is_cdata_element = (
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
  );

  ## Elements serialized as a start tag only (no content, no end tag).
  my %is_void_element = (
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  );

  ## Step 1
  my $s = '';

  ## The context node or any of its ancestors might already put us
  ## in the "raw text" state.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      my $ns = $parent->namespace_uri;
      if (defined $ns and $ns eq $html_ns and
          $is_cdata_element{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work list consumed from the front.  Besides node
  ## objects it holds two kinds of plain strings: end tags waiting to be
  ## emitted, and the marker 'cdata-out', which ends the raw text state
  ## once the element that started it has been closed.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="';
        my $attr_value = $attr->value;
        ## escape
        $attr_value =~ s/&/&amp;/g;
        $attr_value =~ s/</&lt;/g;
        $attr_value =~ s/>/&gt;/g;
        $attr_value =~ s/"/&quot;/g;
        $s .= $attr_value . '"';
      }
      $s .= '>';

      next C if $is_void_element{$tag_name};

      if (not $in_cdata and $is_cdata_element{$tag_name}) {
        ## Queued first so that it ends up right after the end tag.
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text, CDATA section
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        my $value = $child->data;
        $value =~ s/&/&amp;/g;
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        $s .= $value;
      }
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # Document type
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children replace the reference in place, so they have to go
      ## to the front of the work list.  Appending them to the end would
      ## serialize them after all following siblings and after the end
      ## tags of the ancestors that are still open.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child);
    }
  } # C

  ## Step 3
  return \$s;
} # inner_html
5796 wakaba 1.1
5797     1;
5798 wakaba 1.8 # $Date: 2007/04/30 14:12:02 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24