/[suikacvs]/markup/html/whatpm/What/HTML.pm
Suika

Contents of /markup/html/whatpm/What/HTML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (hide annotations) (download)
Tue May 1 10:37:33 2007 UTC (17 years, 7 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.9: +2 -2 lines
FILE REMOVED
Renamed

1 wakaba 1.1 package What::HTML;
2     use strict;
## $VERSION is derived from the CVS "Revision" keyword below: every run of
## digits in the keyword is collected into @r, then formatted as the first
## number, a dot, and each remaining number zero-padded to two digits
## (so "1.9" becomes "1.09").
3 wakaba 1.10 our $VERSION=do{my @r=(q$Revision: 1.9 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5 wakaba 1.9 ## This is an early version of an HTML parser.
6 wakaba 1.1
## Tag names for which a trailing "/" directly before ">" in a start tag
## is accepted without raising a parse error.  Used as a set: every listed
## name maps to a true value.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
20    
## Lookup table from entity name to the single character it denotes.
## Keys are case-sensitive (|Alpha| and |alpha| are different entries);
## AMP, COPY, GT, LT, QUOT and REG are upper-case aliases of the
## corresponding lower-case names.
## NOTE(review): the code that consults this table is outside this view;
## presumably keys are the text between "&" and ";" -- confirm.
21 wakaba 1.6 my $entity_char = {
22     AElig => "\x{00C6}",
23     Aacute => "\x{00C1}",
24     Acirc => "\x{00C2}",
25     Agrave => "\x{00C0}",
26     Alpha => "\x{0391}",
27     Aring => "\x{00C5}",
28     Atilde => "\x{00C3}",
29     Auml => "\x{00C4}",
30     Beta => "\x{0392}",
31     Ccedil => "\x{00C7}",
32     Chi => "\x{03A7}",
33     Dagger => "\x{2021}",
34     Delta => "\x{0394}",
35     ETH => "\x{00D0}",
36     Eacute => "\x{00C9}",
37     Ecirc => "\x{00CA}",
38     Egrave => "\x{00C8}",
39     Epsilon => "\x{0395}",
40     Eta => "\x{0397}",
41     Euml => "\x{00CB}",
42     Gamma => "\x{0393}",
43     Iacute => "\x{00CD}",
44     Icirc => "\x{00CE}",
45     Igrave => "\x{00CC}",
46     Iota => "\x{0399}",
47     Iuml => "\x{00CF}",
48     Kappa => "\x{039A}",
49     Lambda => "\x{039B}",
50     Mu => "\x{039C}",
51     Ntilde => "\x{00D1}",
52     Nu => "\x{039D}",
53     OElig => "\x{0152}",
54     Oacute => "\x{00D3}",
55     Ocirc => "\x{00D4}",
56     Ograve => "\x{00D2}",
57     Omega => "\x{03A9}",
58     Omicron => "\x{039F}",
59     Oslash => "\x{00D8}",
60     Otilde => "\x{00D5}",
61     Ouml => "\x{00D6}",
62     Phi => "\x{03A6}",
63     Pi => "\x{03A0}",
64     Prime => "\x{2033}",
65     Psi => "\x{03A8}",
66     Rho => "\x{03A1}",
67     Scaron => "\x{0160}",
68     Sigma => "\x{03A3}",
69     THORN => "\x{00DE}",
70     Tau => "\x{03A4}",
71     Theta => "\x{0398}",
72     Uacute => "\x{00DA}",
73     Ucirc => "\x{00DB}",
74     Ugrave => "\x{00D9}",
75     Upsilon => "\x{03A5}",
76     Uuml => "\x{00DC}",
77     Xi => "\x{039E}",
78     Yacute => "\x{00DD}",
79     Yuml => "\x{0178}",
80     Zeta => "\x{0396}",
81     aacute => "\x{00E1}",
82     acirc => "\x{00E2}",
83     acute => "\x{00B4}",
84     aelig => "\x{00E6}",
85     agrave => "\x{00E0}",
86     alefsym => "\x{2135}",
87     alpha => "\x{03B1}",
88     amp => "\x{0026}",
89     AMP => "\x{0026}",
90     and => "\x{2227}",
91     ang => "\x{2220}",
92     apos => "\x{0027}",
93     aring => "\x{00E5}",
94     asymp => "\x{2248}",
95     atilde => "\x{00E3}",
96     auml => "\x{00E4}",
97     bdquo => "\x{201E}",
98     beta => "\x{03B2}",
99     brvbar => "\x{00A6}",
100     bull => "\x{2022}",
101     cap => "\x{2229}",
102     ccedil => "\x{00E7}",
103     cedil => "\x{00B8}",
104     cent => "\x{00A2}",
105     chi => "\x{03C7}",
106     circ => "\x{02C6}",
107     clubs => "\x{2663}",
108     cong => "\x{2245}",
109     copy => "\x{00A9}",
110     COPY => "\x{00A9}",
111     crarr => "\x{21B5}",
112     cup => "\x{222A}",
113     curren => "\x{00A4}",
114     dArr => "\x{21D3}",
115     dagger => "\x{2020}",
116     darr => "\x{2193}",
117     deg => "\x{00B0}",
118     delta => "\x{03B4}",
119     diams => "\x{2666}",
120     divide => "\x{00F7}",
121     eacute => "\x{00E9}",
122     ecirc => "\x{00EA}",
123     egrave => "\x{00E8}",
124     empty => "\x{2205}",
125     emsp => "\x{2003}",
126     ensp => "\x{2002}",
127     epsilon => "\x{03B5}",
128     equiv => "\x{2261}",
129     eta => "\x{03B7}",
130     eth => "\x{00F0}",
131     euml => "\x{00EB}",
132     euro => "\x{20AC}",
133     exist => "\x{2203}",
134     fnof => "\x{0192}",
135     forall => "\x{2200}",
136     frac12 => "\x{00BD}",
137     frac14 => "\x{00BC}",
138     frac34 => "\x{00BE}",
139     frasl => "\x{2044}",
140     gamma => "\x{03B3}",
141     ge => "\x{2265}",
142     gt => "\x{003E}",
143     GT => "\x{003E}",
144     hArr => "\x{21D4}",
145     harr => "\x{2194}",
146     hearts => "\x{2665}",
147     hellip => "\x{2026}",
148     iacute => "\x{00ED}",
149     icirc => "\x{00EE}",
150     iexcl => "\x{00A1}",
151     igrave => "\x{00EC}",
152     image => "\x{2111}",
153     infin => "\x{221E}",
154     int => "\x{222B}",
155     iota => "\x{03B9}",
156     iquest => "\x{00BF}",
157     isin => "\x{2208}",
158     iuml => "\x{00EF}",
159     kappa => "\x{03BA}",
160     lArr => "\x{21D0}",
161     lambda => "\x{03BB}",
162     lang => "\x{2329}",
163     laquo => "\x{00AB}",
164     larr => "\x{2190}",
165     lceil => "\x{2308}",
166     ldquo => "\x{201C}",
167     le => "\x{2264}",
168     lfloor => "\x{230A}",
169     lowast => "\x{2217}",
170     loz => "\x{25CA}",
171     lrm => "\x{200E}",
172     lsaquo => "\x{2039}",
173     lsquo => "\x{2018}",
174     lt => "\x{003C}",
175     LT => "\x{003C}",
176     macr => "\x{00AF}",
177     mdash => "\x{2014}",
178     micro => "\x{00B5}",
179     middot => "\x{00B7}",
180     minus => "\x{2212}",
181     mu => "\x{03BC}",
182     nabla => "\x{2207}",
183     nbsp => "\x{00A0}",
184     ndash => "\x{2013}",
185     ne => "\x{2260}",
186     ni => "\x{220B}",
187     not => "\x{00AC}",
188     notin => "\x{2209}",
189     nsub => "\x{2284}",
190     ntilde => "\x{00F1}",
191     nu => "\x{03BD}",
192     oacute => "\x{00F3}",
193     ocirc => "\x{00F4}",
194     oelig => "\x{0153}",
195     ograve => "\x{00F2}",
196     oline => "\x{203E}",
197     omega => "\x{03C9}",
198     omicron => "\x{03BF}",
199     oplus => "\x{2295}",
200     or => "\x{2228}",
201     ordf => "\x{00AA}",
202     ordm => "\x{00BA}",
203     oslash => "\x{00F8}",
204     otilde => "\x{00F5}",
205     otimes => "\x{2297}",
206     ouml => "\x{00F6}",
207     para => "\x{00B6}",
208     part => "\x{2202}",
209     permil => "\x{2030}",
210     perp => "\x{22A5}",
211     phi => "\x{03C6}",
212     pi => "\x{03C0}",
213     piv => "\x{03D6}",
214     plusmn => "\x{00B1}",
215     pound => "\x{00A3}",
216     prime => "\x{2032}",
217     prod => "\x{220F}",
218     prop => "\x{221D}",
219     psi => "\x{03C8}",
220     quot => "\x{0022}",
221     QUOT => "\x{0022}",
222     rArr => "\x{21D2}",
223     radic => "\x{221A}",
224     rang => "\x{232A}",
225     raquo => "\x{00BB}",
226     rarr => "\x{2192}",
227     rceil => "\x{2309}",
228     rdquo => "\x{201D}",
229     real => "\x{211C}",
230     reg => "\x{00AE}",
231     REG => "\x{00AE}",
232     rfloor => "\x{230B}",
233     rho => "\x{03C1}",
234     rlm => "\x{200F}",
235     rsaquo => "\x{203A}",
236     rsquo => "\x{2019}",
237     sbquo => "\x{201A}",
238     scaron => "\x{0161}",
239     sdot => "\x{22C5}",
240     sect => "\x{00A7}",
241     shy => "\x{00AD}",
242     sigma => "\x{03C3}",
243     sigmaf => "\x{03C2}",
244     sim => "\x{223C}",
245     spades => "\x{2660}",
246     sub => "\x{2282}",
247     sube => "\x{2286}",
248     sum => "\x{2211}",
249     sup => "\x{2283}",
250     sup1 => "\x{00B9}",
251     sup2 => "\x{00B2}",
252     sup3 => "\x{00B3}",
253     supe => "\x{2287}",
254     szlig => "\x{00DF}",
255     tau => "\x{03C4}",
256     there4 => "\x{2234}",
257     theta => "\x{03B8}",
258     thetasym => "\x{03D1}",
259     thinsp => "\x{2009}",
260     thorn => "\x{00FE}",
261     tilde => "\x{02DC}",
262     times => "\x{00D7}",
263     trade => "\x{2122}",
264     uArr => "\x{21D1}",
265     uacute => "\x{00FA}",
266     uarr => "\x{2191}",
267     ucirc => "\x{00FB}",
268     ugrave => "\x{00F9}",
269     uml => "\x{00A8}",
270     upsih => "\x{03D2}",
271     upsilon => "\x{03C5}",
272     uuml => "\x{00FC}",
273     weierp => "\x{2118}",
274     xi => "\x{03BE}",
275     yacute => "\x{00FD}",
276     yen => "\x{00A5}",
277     yuml => "\x{00FF}",
278     zeta => "\x{03B6}",
279     zwj => "\x{200D}",
280     zwnj => "\x{200C}",
281     };
282    
## Set of element names belonging to the "special" category.  Used as a
## set: every listed name maps to a true value.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  )
};
## Set of element names belonging to the "scoping" category.  Used as a
## set: every listed name maps to a true value.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Set of element names belonging to the "formatting" category.  Used as a
## set: every listed name maps to a true value.
## (The key |strike| was previously misspelled |strile|, so <strike> was
## never recognized as a formatting element.)
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
303     # $phrasing_category: all other elements
304    
## parse_string ($class_or_object, $string, $document [, $onerror])
##
## Creates a fresh parser with |new|, installs an input-stream reader over
## |$string|, runs the tokenizer and tree-construction phases, and returns
## |$document|.
##
##   $string    The text to parse.  It is accessed through a reference to
##              the caller's argument rather than copied.
##   $document  Stored as |$self->{document}| and returned at the end.
##   $onerror   Optional code reference invoked on parse errors.  When
##              omitted (or false), a handler that |warn|s with the
##              current character index is used.
##
## The reader stored in |$self->{set_next_input_character}| sets
## |$self->{next_input_character}| to the next code point, or to -1 at the
## end of input.  It normalizes CR and CR LF to a single LF and replaces
## U+0000 and code points above U+10FFFF with U+FFFD.
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $s = \$_[0];
  $self->{document} = $_[1];

  ## Index of the next unread character in |$$s|.
  my $i = 0;

  $self->{set_next_input_character} = sub {
    my $self = shift;

    if ($i >= length ($$s)) {
      $self->{next_input_character} = -1; # end of input
      return;
    }

    my $c = ord substr ($$s, $i++, 1);

    if ($c == 0x000D) { # CR
      ## CR LF is a single newline: skip the LF.  The following character
      ## is only peeked at, never consumed, so that it goes through the
      ## same normalization as any other character on the next call
      ## (otherwise "\r\r" or "\r\0" would leak a raw CR / NULL).
      $i++ if $i < length ($$s) and substr ($$s, $i, 1) eq "\x0A";
      $c = 0x000A; # LF # MUST
    } elsif ($c > 0x10FFFF or $c == 0x0000) { # out of range or NULL
      $c = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }

    $self->{next_input_character} = $c;
  };

  $self->{parse_error} = $_[2] || sub {
    warn "Parse error at character $i\n"; ## TODO: Report (line, column) pair
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
347    
## new ($class)
##
## Returns a new parser object with default callbacks:
##
##   set_next_input_character  Called with the parser as its first
##       argument; reports end of input by setting
##       |next_input_character| to -1.
##   parse_error               Does nothing.
##
## The callbacks deliberately do not close over |$self|: a closure stored
## in the object that also captures the object would form a reference
## cycle and the parser would never be freed.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_input_character} = sub {
    ## $_[0] is the parser; every caller invokes this as ->($self).
    $_[0]->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
359    
360     ## Implementations MUST act as if they used the state machine in the spec
361    
## _initialize_tokenizer ($self)
##
## Resets the tokenizer: enters the |data| state with the |PCDATA| content
## model flag, clears the token and attribute under construction, the
## last emitted start tag name and the last attribute value state,
## empties the reconsume buffer (|char|) and the pending token queue
## (|token|), and reads the first input character through
## |$self->{set_next_input_character}|.
sub _initialize_tokenizer ($) {
  my $self = shift;
  $self->{state} = 'data'; # MUST
  $self->{content_model_flag} = 'PCDATA';
  undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
  undef $self->{current_attribute};
  undef $self->{last_emitted_start_tag_name};
  undef $self->{last_attribute_value_state};
  $self->{char} = [];

  ## The reconsume buffer was emptied just above, so the first character
  ## always comes from the input stream.
  $self->{set_next_input_character}->($self);

  $self->{token} = [];
} # _initialize_tokenizer
381    
382     ## A token has:
383     ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
384     ## 'character', or 'end-of-file'
385     ## ->{name} (DOCTYPE), ->{tag_name} (start tag, end tag)
386     ## ISSUE: the spec need s/tagname/tag name/
387     ## ->{error} == 1 or 0 (DOCTYPE)
388     ## ->{attributes} isa HASH (start tag, end tag)
389     ## ->{data} (comment, character)
390    
391     ## Macros
392     ## Macros MUST be preceded by three EXCLAMATION MARKs.
393     ## emit ($token)
394     ## Emits the specified token.
395    
396     ## Emitted token MUST immediately be handled by the tree construction state.
397    
398     ## Before each step, UA MAY check to see if either one of the scripts in
399     ## "list of scripts that will execute as soon as possible" or the first
400     ## script in the "list of scripts that will execute asynchronously",
401     ## has completed loading. If one has, then it MUST be executed
402     ## and removed from the list.
403    
404     sub _get_next_token ($) {
405     my $self = shift;
406     if (@{$self->{token}}) {
407     return shift @{$self->{token}};
408     }
409    
410     A: {
411     if ($self->{state} eq 'data') {
412     if ($self->{next_input_character} == 0x0026) { # &
413     if ($self->{content_model_flag} eq 'PCDATA' or
414     $self->{content_model_flag} eq 'RCDATA') {
415     $self->{state} = 'entity data';
416    
417     if (@{$self->{char}}) {
418     $self->{next_input_character} = shift @{$self->{char}};
419     } else {
420     $self->{set_next_input_character}->($self);
421     }
422    
423     redo A;
424     } else {
425     #
426     }
427     } elsif ($self->{next_input_character} == 0x003C) { # <
428     if ($self->{content_model_flag} ne 'PLAINTEXT') {
429     $self->{state} = 'tag open';
430    
431     if (@{$self->{char}}) {
432     $self->{next_input_character} = shift @{$self->{char}};
433     } else {
434     $self->{set_next_input_character}->($self);
435     }
436    
437     redo A;
438     } else {
439     #
440     }
441     } elsif ($self->{next_input_character} == -1) {
442     return ({type => 'end-of-file'});
443     last A; ## TODO: ok?
444     }
445     # Anything else
446     my $token = {type => 'character',
447     data => chr $self->{next_input_character}};
448     ## Stay in the data state
449    
450     if (@{$self->{char}}) {
451     $self->{next_input_character} = shift @{$self->{char}};
452     } else {
453     $self->{set_next_input_character}->($self);
454     }
455    
456    
457     return ($token);
458    
459     redo A;
460     } elsif ($self->{state} eq 'entity data') {
461     ## (cannot happen in CDATA state)
462    
463     my $token = $self->_tokenize_attempt_to_consume_an_entity;
464    
465     $self->{state} = 'data';
466     # next-input-character is already done
467    
468     unless (defined $token) {
469     return ({type => 'character', data => '&'});
470     } else {
471     return ($token);
472     }
473    
474     redo A;
475     } elsif ($self->{state} eq 'tag open') {
476     if ($self->{content_model_flag} eq 'RCDATA' or
477     $self->{content_model_flag} eq 'CDATA') {
478     if ($self->{next_input_character} == 0x002F) { # /
479    
480     if (@{$self->{char}}) {
481     $self->{next_input_character} = shift @{$self->{char}};
482     } else {
483     $self->{set_next_input_character}->($self);
484     }
485    
486     $self->{state} = 'close tag open';
487     redo A;
488     } else {
489     ## reconsume
490     $self->{state} = 'data';
491    
492 wakaba 1.8 return ({type => 'character', data => '<'});
493 wakaba 1.1
494     redo A;
495     }
496     } elsif ($self->{content_model_flag} eq 'PCDATA') {
497     if ($self->{next_input_character} == 0x0021) { # !
498     $self->{state} = 'markup declaration open';
499    
500     if (@{$self->{char}}) {
501     $self->{next_input_character} = shift @{$self->{char}};
502     } else {
503     $self->{set_next_input_character}->($self);
504     }
505    
506     redo A;
507     } elsif ($self->{next_input_character} == 0x002F) { # /
508     $self->{state} = 'close tag open';
509    
510     if (@{$self->{char}}) {
511     $self->{next_input_character} = shift @{$self->{char}};
512     } else {
513     $self->{set_next_input_character}->($self);
514     }
515    
516     redo A;
517     } elsif (0x0041 <= $self->{next_input_character} and
518     $self->{next_input_character} <= 0x005A) { # A..Z
519     $self->{current_token}
520     = {type => 'start tag',
521     tag_name => chr ($self->{next_input_character} + 0x0020)};
522     $self->{state} = 'tag name';
523    
524     if (@{$self->{char}}) {
525     $self->{next_input_character} = shift @{$self->{char}};
526     } else {
527     $self->{set_next_input_character}->($self);
528     }
529    
530     redo A;
531     } elsif (0x0061 <= $self->{next_input_character} and
532     $self->{next_input_character} <= 0x007A) { # a..z
533     $self->{current_token} = {type => 'start tag',
534     tag_name => chr ($self->{next_input_character})};
535     $self->{state} = 'tag name';
536    
537     if (@{$self->{char}}) {
538     $self->{next_input_character} = shift @{$self->{char}};
539     } else {
540     $self->{set_next_input_character}->($self);
541     }
542    
543     redo A;
544     } elsif ($self->{next_input_character} == 0x003E) { # >
545     $self->{parse_error}->();
546     $self->{state} = 'data';
547    
548     if (@{$self->{char}}) {
549     $self->{next_input_character} = shift @{$self->{char}};
550     } else {
551     $self->{set_next_input_character}->($self);
552     }
553    
554    
555 wakaba 1.4 return ({type => 'character', data => '<>'});
556 wakaba 1.1
557     redo A;
558     } elsif ($self->{next_input_character} == 0x003F) { # ?
559     $self->{parse_error}->();
560     $self->{state} = 'bogus comment';
561     ## $self->{next_input_character} is intentionally left as is
562     redo A;
563     } else {
564     $self->{parse_error}->();
565     $self->{state} = 'data';
566     ## reconsume
567    
568     return ({type => 'character', data => '<'});
569    
570     redo A;
571     }
572     } else {
573     die "$0: $self->{content_model_flag}: Unknown content model flag";
574     }
575     } elsif ($self->{state} eq 'close tag open') {
576     if ($self->{content_model_flag} eq 'RCDATA' or
577     $self->{content_model_flag} eq 'CDATA') {
578     my @next_char;
579     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
580     push @next_char, $self->{next_input_character};
581     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
582     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
583     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
584    
585     if (@{$self->{char}}) {
586     $self->{next_input_character} = shift @{$self->{char}};
587     } else {
588     $self->{set_next_input_character}->($self);
589     }
590    
591     next TAGNAME;
592     } else {
593     $self->{parse_error}->();
594     $self->{next_input_character} = shift @next_char; # reconsume
595     unshift @{$self->{char}}, (@next_char);
596     $self->{state} = 'data';
597    
598     return ({type => 'character', data => '</'});
599    
600     redo A;
601     }
602     }
603 wakaba 1.3 push @next_char, $self->{next_input_character};
604 wakaba 1.1
605 wakaba 1.3 unless ($self->{next_input_character} == 0x0009 or # HT
606     $self->{next_input_character} == 0x000A or # LF
607     $self->{next_input_character} == 0x000B or # VT
608     $self->{next_input_character} == 0x000C or # FF
609     $self->{next_input_character} == 0x0020 or # SP
610     $self->{next_input_character} == 0x003E or # >
611     $self->{next_input_character} == 0x002F or # /
612     $self->{next_input_character} == 0x003C or # <
613 wakaba 1.1 $self->{next_input_character} == -1) {
614     $self->{parse_error}->();
615     $self->{next_input_character} = shift @next_char; # reconsume
616     unshift @{$self->{char}}, (@next_char);
617     $self->{state} = 'data';
618    
619     return ({type => 'character', data => '</'});
620    
621     redo A;
622     } else {
623     $self->{next_input_character} = shift @next_char;
624     unshift @{$self->{char}}, (@next_char);
625     # and consume...
626     }
627     }
628    
629     if (0x0041 <= $self->{next_input_character} and
630     $self->{next_input_character} <= 0x005A) { # A..Z
631     $self->{current_token} = {type => 'end tag',
632     tag_name => chr ($self->{next_input_character} + 0x0020)};
633     $self->{state} = 'tag name';
634    
635     if (@{$self->{char}}) {
636     $self->{next_input_character} = shift @{$self->{char}};
637     } else {
638     $self->{set_next_input_character}->($self);
639     }
640    
641     redo A;
642     } elsif (0x0061 <= $self->{next_input_character} and
643     $self->{next_input_character} <= 0x007A) { # a..z
644     $self->{current_token} = {type => 'end tag',
645     tag_name => chr ($self->{next_input_character})};
646     $self->{state} = 'tag name';
647    
648     if (@{$self->{char}}) {
649     $self->{next_input_character} = shift @{$self->{char}};
650     } else {
651     $self->{set_next_input_character}->($self);
652     }
653    
654     redo A;
655     } elsif ($self->{next_input_character} == 0x003E) { # >
656     $self->{parse_error}->();
657     $self->{state} = 'data';
658    
659     if (@{$self->{char}}) {
660     $self->{next_input_character} = shift @{$self->{char}};
661     } else {
662     $self->{set_next_input_character}->($self);
663     }
664    
665     redo A;
666     } elsif ($self->{next_input_character} == -1) {
667     $self->{parse_error}->();
668     $self->{state} = 'data';
669     # reconsume
670    
671     return ({type => 'character', data => '</'});
672    
673     redo A;
674     } else {
675     $self->{parse_error}->();
676     $self->{state} = 'bogus comment';
677     ## $self->{next_input_character} is intentionally left as is
678     redo A;
679     }
680     } elsif ($self->{state} eq 'tag name') {
681     if ($self->{next_input_character} == 0x0009 or # HT
682     $self->{next_input_character} == 0x000A or # LF
683     $self->{next_input_character} == 0x000B or # VT
684     $self->{next_input_character} == 0x000C or # FF
685     $self->{next_input_character} == 0x0020) { # SP
686     $self->{state} = 'before attribute name';
687    
688     if (@{$self->{char}}) {
689     $self->{next_input_character} = shift @{$self->{char}};
690     } else {
691     $self->{set_next_input_character}->($self);
692     }
693    
694     redo A;
695     } elsif ($self->{next_input_character} == 0x003E) { # >
696     if ($self->{current_token}->{type} eq 'start tag') {
697     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
698     } elsif ($self->{current_token}->{type} eq 'end tag') {
699     $self->{content_model_flag} = 'PCDATA'; # MUST
700 wakaba 1.3 if ($self->{current_token}->{attributes}) {
701 wakaba 1.1 $self->{parse_error}->();
702     }
703     } else {
704     die "$0: $self->{current_token}->{type}: Unknown token type";
705     }
706     $self->{state} = 'data';
707    
708     if (@{$self->{char}}) {
709     $self->{next_input_character} = shift @{$self->{char}};
710     } else {
711     $self->{set_next_input_character}->($self);
712     }
713    
714    
715     return ($self->{current_token}); # start tag or end tag
716     undef $self->{current_token};
717    
718     redo A;
719     } elsif (0x0041 <= $self->{next_input_character} and
720     $self->{next_input_character} <= 0x005A) { # A..Z
721     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
722     # start tag or end tag
723     ## Stay in this state
724    
725     if (@{$self->{char}}) {
726     $self->{next_input_character} = shift @{$self->{char}};
727     } else {
728     $self->{set_next_input_character}->($self);
729     }
730    
731     redo A;
732     } elsif ($self->{next_input_character} == 0x003C or # <
733     $self->{next_input_character} == -1) {
734     $self->{parse_error}->();
735     if ($self->{current_token}->{type} eq 'start tag') {
736     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
737     } elsif ($self->{current_token}->{type} eq 'end tag') {
738     $self->{content_model_flag} = 'PCDATA'; # MUST
739 wakaba 1.3 if ($self->{current_token}->{attributes}) {
740 wakaba 1.1 $self->{parse_error}->();
741     }
742     } else {
743     die "$0: $self->{current_token}->{type}: Unknown token type";
744     }
745     $self->{state} = 'data';
746     # reconsume
747    
748     return ($self->{current_token}); # start tag or end tag
749     undef $self->{current_token};
750    
751     redo A;
752     } elsif ($self->{next_input_character} == 0x002F) { # /
753    
754     if (@{$self->{char}}) {
755     $self->{next_input_character} = shift @{$self->{char}};
756     } else {
757     $self->{set_next_input_character}->($self);
758     }
759    
760     if ($self->{next_input_character} == 0x003E and # >
761     $self->{current_token}->{type} eq 'start tag' and
762     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
763     # permitted slash
764     #
765     } else {
766     $self->{parse_error}->();
767     }
768     $self->{state} = 'before attribute name';
769     # next-input-character is already done
770     redo A;
771     } else {
772     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
773     # start tag or end tag
774     ## Stay in the state
775    
776     if (@{$self->{char}}) {
777     $self->{next_input_character} = shift @{$self->{char}};
778     } else {
779     $self->{set_next_input_character}->($self);
780     }
781    
782     redo A;
783     }
784     } elsif ($self->{state} eq 'before attribute name') {
785     if ($self->{next_input_character} == 0x0009 or # HT
786     $self->{next_input_character} == 0x000A or # LF
787     $self->{next_input_character} == 0x000B or # VT
788     $self->{next_input_character} == 0x000C or # FF
789     $self->{next_input_character} == 0x0020) { # SP
790     ## Stay in the state
791    
792     if (@{$self->{char}}) {
793     $self->{next_input_character} = shift @{$self->{char}};
794     } else {
795     $self->{set_next_input_character}->($self);
796     }
797    
798     redo A;
799     } elsif ($self->{next_input_character} == 0x003E) { # >
800     if ($self->{current_token}->{type} eq 'start tag') {
801     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
802     } elsif ($self->{current_token}->{type} eq 'end tag') {
803     $self->{content_model_flag} = 'PCDATA'; # MUST
804 wakaba 1.3 if ($self->{current_token}->{attributes}) {
805 wakaba 1.1 $self->{parse_error}->();
806     }
807     } else {
808     die "$0: $self->{current_token}->{type}: Unknown token type";
809     }
810     $self->{state} = 'data';
811    
812     if (@{$self->{char}}) {
813     $self->{next_input_character} = shift @{$self->{char}};
814     } else {
815     $self->{set_next_input_character}->($self);
816     }
817    
818    
819     return ($self->{current_token}); # start tag or end tag
820     undef $self->{current_token};
821    
822     redo A;
823     } elsif (0x0041 <= $self->{next_input_character} and
824     $self->{next_input_character} <= 0x005A) { # A..Z
825     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
826     value => ''};
827     $self->{state} = 'attribute name';
828    
829     if (@{$self->{char}}) {
830     $self->{next_input_character} = shift @{$self->{char}};
831     } else {
832     $self->{set_next_input_character}->($self);
833     }
834    
835     redo A;
836     } elsif ($self->{next_input_character} == 0x002F) { # /
837    
838     if (@{$self->{char}}) {
839     $self->{next_input_character} = shift @{$self->{char}};
840     } else {
841     $self->{set_next_input_character}->($self);
842     }
843    
844     if ($self->{next_input_character} == 0x003E and # >
845     $self->{current_token}->{type} eq 'start tag' and
846     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
847     # permitted slash
848     #
849     } else {
850     $self->{parse_error}->();
851     }
852     ## Stay in the state
853     # next-input-character is already done
854     redo A;
855     } elsif ($self->{next_input_character} == 0x003C or # <
856     $self->{next_input_character} == -1) {
857     $self->{parse_error}->();
858     if ($self->{current_token}->{type} eq 'start tag') {
859     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
860     } elsif ($self->{current_token}->{type} eq 'end tag') {
861     $self->{content_model_flag} = 'PCDATA'; # MUST
862 wakaba 1.3 if ($self->{current_token}->{attributes}) {
863 wakaba 1.1 $self->{parse_error}->();
864     }
865     } else {
866     die "$0: $self->{current_token}->{type}: Unknown token type";
867     }
868     $self->{state} = 'data';
869     # reconsume
870    
871     return ($self->{current_token}); # start tag or end tag
872     undef $self->{current_token};
873    
874     redo A;
875     } else {
876     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
877     value => ''};
878     $self->{state} = 'attribute name';
879    
880     if (@{$self->{char}}) {
881     $self->{next_input_character} = shift @{$self->{char}};
882     } else {
883     $self->{set_next_input_character}->($self);
884     }
885    
886     redo A;
887     }
888     } elsif ($self->{state} eq 'attribute name') {
889     my $before_leave = sub {
890 wakaba 1.3 if (exists $self->{current_token}->{attributes} # start tag or end tag
891 wakaba 1.1 ->{$self->{current_attribute}->{name}}) { # MUST
892     $self->{parse_error}->();
893     ## Discard $self->{current_attribute} # MUST
894     } else {
895 wakaba 1.3 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
896 wakaba 1.1 = $self->{current_attribute};
897     }
898     }; # $before_leave
899    
900     if ($self->{next_input_character} == 0x0009 or # HT
901     $self->{next_input_character} == 0x000A or # LF
902     $self->{next_input_character} == 0x000B or # VT
903     $self->{next_input_character} == 0x000C or # FF
904     $self->{next_input_character} == 0x0020) { # SP
905     $before_leave->();
906     $self->{state} = 'after attribute name';
907    
908     if (@{$self->{char}}) {
909     $self->{next_input_character} = shift @{$self->{char}};
910     } else {
911     $self->{set_next_input_character}->($self);
912     }
913    
914     redo A;
915     } elsif ($self->{next_input_character} == 0x003D) { # =
916     $before_leave->();
917     $self->{state} = 'before attribute value';
918    
919     if (@{$self->{char}}) {
920     $self->{next_input_character} = shift @{$self->{char}};
921     } else {
922     $self->{set_next_input_character}->($self);
923     }
924    
925     redo A;
926     } elsif ($self->{next_input_character} == 0x003E) { # >
927     $before_leave->();
928     if ($self->{current_token}->{type} eq 'start tag') {
929     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
930     } elsif ($self->{current_token}->{type} eq 'end tag') {
931     $self->{content_model_flag} = 'PCDATA'; # MUST
932 wakaba 1.3 if ($self->{current_token}->{attributes}) {
933 wakaba 1.1 $self->{parse_error}->();
934     }
935     } else {
936     die "$0: $self->{current_token}->{type}: Unknown token type";
937     }
938     $self->{state} = 'data';
939    
940     if (@{$self->{char}}) {
941     $self->{next_input_character} = shift @{$self->{char}};
942     } else {
943     $self->{set_next_input_character}->($self);
944     }
945    
946    
947     return ($self->{current_token}); # start tag or end tag
948     undef $self->{current_token};
949    
950     redo A;
951     } elsif (0x0041 <= $self->{next_input_character} and
952     $self->{next_input_character} <= 0x005A) { # A..Z
953     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
954     ## Stay in the state
955    
956     if (@{$self->{char}}) {
957     $self->{next_input_character} = shift @{$self->{char}};
958     } else {
959     $self->{set_next_input_character}->($self);
960     }
961    
962     redo A;
963     } elsif ($self->{next_input_character} == 0x002F) { # /
964     $before_leave->();
965    
966     if (@{$self->{char}}) {
967     $self->{next_input_character} = shift @{$self->{char}};
968     } else {
969     $self->{set_next_input_character}->($self);
970     }
971    
972     if ($self->{next_input_character} == 0x003E and # >
973     $self->{current_token}->{type} eq 'start tag' and
974     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
975     # permitted slash
976     #
977     } else {
978     $self->{parse_error}->();
979     }
980     $self->{state} = 'before attribute name';
981     # next-input-character is already done
982     redo A;
983     } elsif ($self->{next_input_character} == 0x003C or # <
984     $self->{next_input_character} == -1) {
985     $self->{parse_error}->();
986     $before_leave->();
987     if ($self->{current_token}->{type} eq 'start tag') {
988     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
989     } elsif ($self->{current_token}->{type} eq 'end tag') {
990     $self->{content_model_flag} = 'PCDATA'; # MUST
991 wakaba 1.3 if ($self->{current_token}->{attributes}) {
992 wakaba 1.1 $self->{parse_error}->();
993     }
994     } else {
995     die "$0: $self->{current_token}->{type}: Unknown token type";
996     }
997     $self->{state} = 'data';
998     # reconsume
999    
1000     return ($self->{current_token}); # start tag or end tag
1001     undef $self->{current_token};
1002    
1003     redo A;
1004     } else {
1005     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
1006     ## Stay in the state
1007    
1008     if (@{$self->{char}}) {
1009     $self->{next_input_character} = shift @{$self->{char}};
1010     } else {
1011     $self->{set_next_input_character}->($self);
1012     }
1013    
1014     redo A;
1015     }
1016     } elsif ($self->{state} eq 'after attribute name') {
1017     if ($self->{next_input_character} == 0x0009 or # HT
1018     $self->{next_input_character} == 0x000A or # LF
1019     $self->{next_input_character} == 0x000B or # VT
1020     $self->{next_input_character} == 0x000C or # FF
1021     $self->{next_input_character} == 0x0020) { # SP
1022     ## Stay in the state
1023    
1024     if (@{$self->{char}}) {
1025     $self->{next_input_character} = shift @{$self->{char}};
1026     } else {
1027     $self->{set_next_input_character}->($self);
1028     }
1029    
1030     redo A;
1031     } elsif ($self->{next_input_character} == 0x003D) { # =
1032     $self->{state} = 'before attribute value';
1033    
1034     if (@{$self->{char}}) {
1035     $self->{next_input_character} = shift @{$self->{char}};
1036     } else {
1037     $self->{set_next_input_character}->($self);
1038     }
1039    
1040     redo A;
1041     } elsif ($self->{next_input_character} == 0x003E) { # >
1042     if ($self->{current_token}->{type} eq 'start tag') {
1043     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1044     } elsif ($self->{current_token}->{type} eq 'end tag') {
1045     $self->{content_model_flag} = 'PCDATA'; # MUST
1046 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1047 wakaba 1.1 $self->{parse_error}->();
1048     }
1049     } else {
1050     die "$0: $self->{current_token}->{type}: Unknown token type";
1051     }
1052     $self->{state} = 'data';
1053    
1054     if (@{$self->{char}}) {
1055     $self->{next_input_character} = shift @{$self->{char}};
1056     } else {
1057     $self->{set_next_input_character}->($self);
1058     }
1059    
1060    
1061     return ($self->{current_token}); # start tag or end tag
1062     undef $self->{current_token};
1063    
1064     redo A;
1065     } elsif (0x0041 <= $self->{next_input_character} and
1066     $self->{next_input_character} <= 0x005A) { # A..Z
1067     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
1068     value => ''};
1069     $self->{state} = 'attribute name';
1070    
1071     if (@{$self->{char}}) {
1072     $self->{next_input_character} = shift @{$self->{char}};
1073     } else {
1074     $self->{set_next_input_character}->($self);
1075     }
1076    
1077     redo A;
1078     } elsif ($self->{next_input_character} == 0x002F) { # /
1079    
1080     if (@{$self->{char}}) {
1081     $self->{next_input_character} = shift @{$self->{char}};
1082     } else {
1083     $self->{set_next_input_character}->($self);
1084     }
1085    
1086     if ($self->{next_input_character} == 0x003E and # >
1087     $self->{current_token}->{type} eq 'start tag' and
1088     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1089     # permitted slash
1090     #
1091     } else {
1092     $self->{parse_error}->();
1093     }
1094     $self->{state} = 'before attribute name';
1095     # next-input-character is already done
1096     redo A;
1097     } elsif ($self->{next_input_character} == 0x003C or # <
1098     $self->{next_input_character} == -1) {
1099     $self->{parse_error}->();
1100     if ($self->{current_token}->{type} eq 'start tag') {
1101     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1102     } elsif ($self->{current_token}->{type} eq 'end tag') {
1103     $self->{content_model_flag} = 'PCDATA'; # MUST
1104 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1105 wakaba 1.1 $self->{parse_error}->();
1106     }
1107     } else {
1108     die "$0: $self->{current_token}->{type}: Unknown token type";
1109     }
1110     $self->{state} = 'data';
1111     # reconsume
1112    
1113     return ($self->{current_token}); # start tag or end tag
1114     undef $self->{current_token};
1115    
1116     redo A;
1117     } else {
1118     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
1119     value => ''};
1120     $self->{state} = 'attribute name';
1121    
1122     if (@{$self->{char}}) {
1123     $self->{next_input_character} = shift @{$self->{char}};
1124     } else {
1125     $self->{set_next_input_character}->($self);
1126     }
1127    
1128     redo A;
1129     }
1130     } elsif ($self->{state} eq 'before attribute value') {
1131     if ($self->{next_input_character} == 0x0009 or # HT
1132     $self->{next_input_character} == 0x000A or # LF
1133     $self->{next_input_character} == 0x000B or # VT
1134     $self->{next_input_character} == 0x000C or # FF
1135     $self->{next_input_character} == 0x0020) { # SP
1136     ## Stay in the state
1137    
1138     if (@{$self->{char}}) {
1139     $self->{next_input_character} = shift @{$self->{char}};
1140     } else {
1141     $self->{set_next_input_character}->($self);
1142     }
1143    
1144     redo A;
1145     } elsif ($self->{next_input_character} == 0x0022) { # "
1146     $self->{state} = 'attribute value (double-quoted)';
1147    
1148     if (@{$self->{char}}) {
1149     $self->{next_input_character} = shift @{$self->{char}};
1150     } else {
1151     $self->{set_next_input_character}->($self);
1152     }
1153    
1154     redo A;
1155     } elsif ($self->{next_input_character} == 0x0026) { # &
1156     $self->{state} = 'attribute value (unquoted)';
1157     ## reconsume
1158     redo A;
1159     } elsif ($self->{next_input_character} == 0x0027) { # '
1160     $self->{state} = 'attribute value (single-quoted)';
1161    
1162     if (@{$self->{char}}) {
1163     $self->{next_input_character} = shift @{$self->{char}};
1164     } else {
1165     $self->{set_next_input_character}->($self);
1166     }
1167    
1168     redo A;
1169     } elsif ($self->{next_input_character} == 0x003E) { # >
1170     if ($self->{current_token}->{type} eq 'start tag') {
1171     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1172     } elsif ($self->{current_token}->{type} eq 'end tag') {
1173     $self->{content_model_flag} = 'PCDATA'; # MUST
1174 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1175 wakaba 1.1 $self->{parse_error}->();
1176     }
1177     } else {
1178     die "$0: $self->{current_token}->{type}: Unknown token type";
1179     }
1180     $self->{state} = 'data';
1181    
1182     if (@{$self->{char}}) {
1183     $self->{next_input_character} = shift @{$self->{char}};
1184     } else {
1185     $self->{set_next_input_character}->($self);
1186     }
1187    
1188    
1189     return ($self->{current_token}); # start tag or end tag
1190     undef $self->{current_token};
1191    
1192     redo A;
1193     } elsif ($self->{next_input_character} == 0x003C or # <
1194     $self->{next_input_character} == -1) {
1195     $self->{parse_error}->();
1196     if ($self->{current_token}->{type} eq 'start tag') {
1197     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1198     } elsif ($self->{current_token}->{type} eq 'end tag') {
1199     $self->{content_model_flag} = 'PCDATA'; # MUST
1200 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1201 wakaba 1.1 $self->{parse_error}->();
1202     }
1203     } else {
1204     die "$0: $self->{current_token}->{type}: Unknown token type";
1205     }
1206     $self->{state} = 'data';
1207     ## reconsume
1208    
1209     return ($self->{current_token}); # start tag or end tag
1210     undef $self->{current_token};
1211    
1212     redo A;
1213     } else {
1214     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1215     $self->{state} = 'attribute value (unquoted)';
1216    
1217     if (@{$self->{char}}) {
1218     $self->{next_input_character} = shift @{$self->{char}};
1219     } else {
1220     $self->{set_next_input_character}->($self);
1221     }
1222    
1223     redo A;
1224     }
1225     } elsif ($self->{state} eq 'attribute value (double-quoted)') {
1226     if ($self->{next_input_character} == 0x0022) { # "
1227     $self->{state} = 'before attribute name';
1228    
1229     if (@{$self->{char}}) {
1230     $self->{next_input_character} = shift @{$self->{char}};
1231     } else {
1232     $self->{set_next_input_character}->($self);
1233     }
1234    
1235     redo A;
1236     } elsif ($self->{next_input_character} == 0x0026) { # &
1237     $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
1238     $self->{state} = 'entity in attribute value';
1239    
1240     if (@{$self->{char}}) {
1241     $self->{next_input_character} = shift @{$self->{char}};
1242     } else {
1243     $self->{set_next_input_character}->($self);
1244     }
1245    
1246     redo A;
1247     } elsif ($self->{next_input_character} == -1) {
1248     $self->{parse_error}->();
1249     if ($self->{current_token}->{type} eq 'start tag') {
1250     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1251     } elsif ($self->{current_token}->{type} eq 'end tag') {
1252     $self->{content_model_flag} = 'PCDATA'; # MUST
1253 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1254 wakaba 1.1 $self->{parse_error}->();
1255     }
1256     } else {
1257     die "$0: $self->{current_token}->{type}: Unknown token type";
1258     }
1259     $self->{state} = 'data';
1260     ## reconsume
1261    
1262     return ($self->{current_token}); # start tag or end tag
1263     undef $self->{current_token};
1264    
1265     redo A;
1266     } else {
1267     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1268     ## Stay in the state
1269    
1270     if (@{$self->{char}}) {
1271     $self->{next_input_character} = shift @{$self->{char}};
1272     } else {
1273     $self->{set_next_input_character}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} eq 'attribute value (single-quoted)') {
1279     if ($self->{next_input_character} == 0x0027) { # '
1280     $self->{state} = 'before attribute name';
1281    
1282     if (@{$self->{char}}) {
1283     $self->{next_input_character} = shift @{$self->{char}};
1284     } else {
1285     $self->{set_next_input_character}->($self);
1286     }
1287    
1288     redo A;
1289     } elsif ($self->{next_input_character} == 0x0026) { # &
1290     $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
1291     $self->{state} = 'entity in attribute value';
1292    
1293     if (@{$self->{char}}) {
1294     $self->{next_input_character} = shift @{$self->{char}};
1295     } else {
1296     $self->{set_next_input_character}->($self);
1297     }
1298    
1299     redo A;
1300     } elsif ($self->{next_input_character} == -1) {
1301     $self->{parse_error}->();
1302     if ($self->{current_token}->{type} eq 'start tag') {
1303     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1304     } elsif ($self->{current_token}->{type} eq 'end tag') {
1305     $self->{content_model_flag} = 'PCDATA'; # MUST
1306 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1307 wakaba 1.1 $self->{parse_error}->();
1308     }
1309     } else {
1310     die "$0: $self->{current_token}->{type}: Unknown token type";
1311     }
1312     $self->{state} = 'data';
1313     ## reconsume
1314    
1315     return ($self->{current_token}); # start tag or end tag
1316     undef $self->{current_token};
1317    
1318     redo A;
1319     } else {
1320     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1321     ## Stay in the state
1322    
1323     if (@{$self->{char}}) {
1324     $self->{next_input_character} = shift @{$self->{char}};
1325     } else {
1326     $self->{set_next_input_character}->($self);
1327     }
1328    
1329     redo A;
1330     }
1331     } elsif ($self->{state} eq 'attribute value (unquoted)') {
1332     if ($self->{next_input_character} == 0x0009 or # HT
1333     $self->{next_input_character} == 0x000A or # LF
1334     $self->{next_input_character} == 0x000B or # VT
1335     $self->{next_input_character} == 0x000C or # FF
1336     $self->{next_input_character} == 0x0020) { # SP
1337     $self->{state} = 'before attribute name';
1338    
1339     if (@{$self->{char}}) {
1340     $self->{next_input_character} = shift @{$self->{char}};
1341     } else {
1342     $self->{set_next_input_character}->($self);
1343     }
1344    
1345     redo A;
1346     } elsif ($self->{next_input_character} == 0x0026) { # &
1347     $self->{last_attribute_value_state} = 'attribute value (unquoted)';
1348     $self->{state} = 'entity in attribute value';
1349    
1350     if (@{$self->{char}}) {
1351     $self->{next_input_character} = shift @{$self->{char}};
1352     } else {
1353     $self->{set_next_input_character}->($self);
1354     }
1355    
1356     redo A;
1357     } elsif ($self->{next_input_character} == 0x003E) { # >
1358     if ($self->{current_token}->{type} eq 'start tag') {
1359     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1360     } elsif ($self->{current_token}->{type} eq 'end tag') {
1361     $self->{content_model_flag} = 'PCDATA'; # MUST
1362 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1363 wakaba 1.1 $self->{parse_error}->();
1364     }
1365     } else {
1366     die "$0: $self->{current_token}->{type}: Unknown token type";
1367     }
1368     $self->{state} = 'data';
1369    
1370     if (@{$self->{char}}) {
1371     $self->{next_input_character} = shift @{$self->{char}};
1372     } else {
1373     $self->{set_next_input_character}->($self);
1374     }
1375    
1376    
1377     return ($self->{current_token}); # start tag or end tag
1378     undef $self->{current_token};
1379    
1380     redo A;
1381     } elsif ($self->{next_input_character} == 0x003C or # <
1382     $self->{next_input_character} == -1) {
1383     $self->{parse_error}->();
1384     if ($self->{current_token}->{type} eq 'start tag') {
1385     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1386     } elsif ($self->{current_token}->{type} eq 'end tag') {
1387     $self->{content_model_flag} = 'PCDATA'; # MUST
1388 wakaba 1.3 if ($self->{current_token}->{attributes}) {
1389 wakaba 1.1 $self->{parse_error}->();
1390     }
1391     } else {
1392     die "$0: $self->{current_token}->{type}: Unknown token type";
1393     }
1394     $self->{state} = 'data';
1395     ## reconsume
1396    
1397     return ($self->{current_token}); # start tag or end tag
1398     undef $self->{current_token};
1399    
1400     redo A;
1401     } else {
1402     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1403     ## Stay in the state
1404    
1405     if (@{$self->{char}}) {
1406     $self->{next_input_character} = shift @{$self->{char}};
1407     } else {
1408     $self->{set_next_input_character}->($self);
1409     }
1410    
1411     redo A;
1412     }
1413     } elsif ($self->{state} eq 'entity in attribute value') {
1414     my $token = $self->_tokenize_attempt_to_consume_an_entity;
1415    
1416     unless (defined $token) {
1417     $self->{current_attribute}->{value} .= '&';
1418     } else {
1419     $self->{current_attribute}->{value} .= $token->{data};
1420     ## ISSUE: spec says "append the returned character token to the current attribute's value"
1421     }
1422    
1423     $self->{state} = $self->{last_attribute_value_state};
1424     # next-input-character is already done
1425     redo A;
1426     } elsif ($self->{state} eq 'bogus comment') {
1427     ## (only happens in the PCDATA state)
1428    
1429     my $token = {type => 'comment', data => ''};
1430    
1431     BC: {
1432     if ($self->{next_input_character} == 0x003E) { # >
1433     $self->{state} = 'data';
1434    
1435     if (@{$self->{char}}) {
1436     $self->{next_input_character} = shift @{$self->{char}};
1437     } else {
1438     $self->{set_next_input_character}->($self);
1439     }
1440    
1441    
1442     return ($token);
1443    
1444     redo A;
1445     } elsif ($self->{next_input_character} == -1) {
1446     $self->{state} = 'data';
1447     ## reconsume
1448    
1449     return ($token);
1450    
1451     redo A;
1452     } else {
1453     $token->{data} .= chr ($self->{next_input_character});
1454    
1455     if (@{$self->{char}}) {
1456     $self->{next_input_character} = shift @{$self->{char}};
1457     } else {
1458     $self->{set_next_input_character}->($self);
1459     }
1460    
1461     redo BC;
1462     }
1463     } # BC
1464     } elsif ($self->{state} eq 'markup declaration open') {
1465     ## (only happens in the PCDATA state)
1466    
1467     my @next_char;
1468     push @next_char, $self->{next_input_character};
1469    
1470     if ($self->{next_input_character} == 0x002D) { # -
1471    
1472     if (@{$self->{char}}) {
1473     $self->{next_input_character} = shift @{$self->{char}};
1474     } else {
1475     $self->{set_next_input_character}->($self);
1476     }
1477    
1478     push @next_char, $self->{next_input_character};
1479     if ($self->{next_input_character} == 0x002D) { # -
1480     $self->{current_token} = {type => 'comment', data => ''};
1481     $self->{state} = 'comment';
1482    
1483     if (@{$self->{char}}) {
1484     $self->{next_input_character} = shift @{$self->{char}};
1485     } else {
1486     $self->{set_next_input_character}->($self);
1487     }
1488    
1489     redo A;
1490     }
1491     } elsif ($self->{next_input_character} == 0x0044 or # D
1492     $self->{next_input_character} == 0x0064) { # d
1493    
1494     if (@{$self->{char}}) {
1495     $self->{next_input_character} = shift @{$self->{char}};
1496     } else {
1497     $self->{set_next_input_character}->($self);
1498     }
1499    
1500     push @next_char, $self->{next_input_character};
1501     if ($self->{next_input_character} == 0x004F or # O
1502     $self->{next_input_character} == 0x006F) { # o
1503    
1504     if (@{$self->{char}}) {
1505     $self->{next_input_character} = shift @{$self->{char}};
1506     } else {
1507     $self->{set_next_input_character}->($self);
1508     }
1509    
1510     push @next_char, $self->{next_input_character};
1511     if ($self->{next_input_character} == 0x0043 or # C
1512     $self->{next_input_character} == 0x0063) { # c
1513    
1514     if (@{$self->{char}}) {
1515     $self->{next_input_character} = shift @{$self->{char}};
1516     } else {
1517     $self->{set_next_input_character}->($self);
1518     }
1519    
1520     push @next_char, $self->{next_input_character};
1521     if ($self->{next_input_character} == 0x0054 or # T
1522     $self->{next_input_character} == 0x0074) { # t
1523    
1524     if (@{$self->{char}}) {
1525     $self->{next_input_character} = shift @{$self->{char}};
1526     } else {
1527     $self->{set_next_input_character}->($self);
1528     }
1529    
1530     push @next_char, $self->{next_input_character};
1531     if ($self->{next_input_character} == 0x0059 or # Y
1532     $self->{next_input_character} == 0x0079) { # y
1533    
1534     if (@{$self->{char}}) {
1535     $self->{next_input_character} = shift @{$self->{char}};
1536     } else {
1537     $self->{set_next_input_character}->($self);
1538     }
1539    
1540     push @next_char, $self->{next_input_character};
1541     if ($self->{next_input_character} == 0x0050 or # P
1542     $self->{next_input_character} == 0x0070) { # p
1543    
1544     if (@{$self->{char}}) {
1545     $self->{next_input_character} = shift @{$self->{char}};
1546     } else {
1547     $self->{set_next_input_character}->($self);
1548     }
1549    
1550     push @next_char, $self->{next_input_character};
1551     if ($self->{next_input_character} == 0x0045 or # E
1552     $self->{next_input_character} == 0x0065) { # e
1553     ## ISSUE: What a stupid code this is!
1554     $self->{state} = 'DOCTYPE';
1555    
1556     if (@{$self->{char}}) {
1557     $self->{next_input_character} = shift @{$self->{char}};
1558     } else {
1559     $self->{set_next_input_character}->($self);
1560     }
1561    
1562     redo A;
1563     }
1564     }
1565     }
1566     }
1567     }
1568     }
1569     }
1570    
1571     $self->{parse_error}->();
1572     $self->{next_input_character} = shift @next_char;
1573     unshift @{$self->{char}}, (@next_char);
1574     $self->{state} = 'bogus comment';
1575     redo A;
1576    
1577     ## ISSUE: typos in spec: chacacters, is is a parse error
1578     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1579     } elsif ($self->{state} eq 'comment') {
1580     if ($self->{next_input_character} == 0x002D) { # -
1581     $self->{state} = 'comment dash';
1582    
1583     if (@{$self->{char}}) {
1584     $self->{next_input_character} = shift @{$self->{char}};
1585     } else {
1586     $self->{set_next_input_character}->($self);
1587     }
1588    
1589     redo A;
1590     } elsif ($self->{next_input_character} == -1) {
1591     $self->{parse_error}->();
1592     $self->{state} = 'data';
1593     ## reconsume
1594    
1595     return ($self->{current_token}); # comment
1596     undef $self->{current_token};
1597    
1598     redo A;
1599     } else {
1600     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1601     ## Stay in the state
1602    
1603     if (@{$self->{char}}) {
1604     $self->{next_input_character} = shift @{$self->{char}};
1605     } else {
1606     $self->{set_next_input_character}->($self);
1607     }
1608    
1609     redo A;
1610     }
1611     } elsif ($self->{state} eq 'comment dash') {
1612     if ($self->{next_input_character} == 0x002D) { # -
1613     $self->{state} = 'comment end';
1614    
1615     if (@{$self->{char}}) {
1616     $self->{next_input_character} = shift @{$self->{char}};
1617     } else {
1618     $self->{set_next_input_character}->($self);
1619     }
1620    
1621     redo A;
1622     } elsif ($self->{next_input_character} == -1) {
1623     $self->{parse_error}->();
1624     $self->{state} = 'data';
1625     ## reconsume
1626    
1627     return ($self->{current_token}); # comment
1628     undef $self->{current_token};
1629    
1630     redo A;
1631     } else {
1632     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1633     $self->{state} = 'comment';
1634    
1635     if (@{$self->{char}}) {
1636     $self->{next_input_character} = shift @{$self->{char}};
1637     } else {
1638     $self->{set_next_input_character}->($self);
1639     }
1640    
1641     redo A;
1642     }
1643     } elsif ($self->{state} eq 'comment end') {
1644     if ($self->{next_input_character} == 0x003E) { # >
1645     $self->{state} = 'data';
1646    
1647     if (@{$self->{char}}) {
1648     $self->{next_input_character} = shift @{$self->{char}};
1649     } else {
1650     $self->{set_next_input_character}->($self);
1651     }
1652    
1653    
1654     return ($self->{current_token}); # comment
1655     undef $self->{current_token};
1656    
1657     redo A;
1658     } elsif ($self->{next_input_character} == 0x002D) { # -
1659     $self->{parse_error}->();
1660     $self->{current_token}->{data} .= '-'; # comment
1661     ## Stay in the state
1662    
1663     if (@{$self->{char}}) {
1664     $self->{next_input_character} = shift @{$self->{char}};
1665     } else {
1666     $self->{set_next_input_character}->($self);
1667     }
1668    
1669     redo A;
1670     } elsif ($self->{next_input_character} == -1) {
1671     $self->{parse_error}->();
1672     $self->{state} = 'data';
1673     ## reconsume
1674    
1675     return ($self->{current_token}); # comment
1676     undef $self->{current_token};
1677    
1678     redo A;
1679     } else {
1680     $self->{parse_error}->();
1681     $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1682     $self->{state} = 'comment';
1683    
1684     if (@{$self->{char}}) {
1685     $self->{next_input_character} = shift @{$self->{char}};
1686     } else {
1687     $self->{set_next_input_character}->($self);
1688     }
1689    
1690     redo A;
1691     }
1692     } elsif ($self->{state} eq 'DOCTYPE') {
1693     if ($self->{next_input_character} == 0x0009 or # HT
1694     $self->{next_input_character} == 0x000A or # LF
1695     $self->{next_input_character} == 0x000B or # VT
1696     $self->{next_input_character} == 0x000C or # FF
1697     $self->{next_input_character} == 0x0020) { # SP
1698     $self->{state} = 'before DOCTYPE name';
1699    
1700     if (@{$self->{char}}) {
1701     $self->{next_input_character} = shift @{$self->{char}};
1702     } else {
1703     $self->{set_next_input_character}->($self);
1704     }
1705    
1706     redo A;
1707     } else {
1708     $self->{parse_error}->();
1709     $self->{state} = 'before DOCTYPE name';
1710     ## reconsume
1711     redo A;
1712     }
1713     } elsif ($self->{state} eq 'before DOCTYPE name') {
1714     if ($self->{next_input_character} == 0x0009 or # HT
1715     $self->{next_input_character} == 0x000A or # LF
1716     $self->{next_input_character} == 0x000B or # VT
1717     $self->{next_input_character} == 0x000C or # FF
1718     $self->{next_input_character} == 0x0020) { # SP
1719     ## Stay in the state
1720    
1721     if (@{$self->{char}}) {
1722     $self->{next_input_character} = shift @{$self->{char}};
1723     } else {
1724     $self->{set_next_input_character}->($self);
1725     }
1726    
1727     redo A;
1728     } elsif (0x0061 <= $self->{next_input_character} and
1729     $self->{next_input_character} <= 0x007A) { # a..z
1730     $self->{current_token} = {type => 'DOCTYPE',
1731     name => chr ($self->{next_input_character} - 0x0020),
1732     error => 1};
1733     $self->{state} = 'DOCTYPE name';
1734    
1735     if (@{$self->{char}}) {
1736     $self->{next_input_character} = shift @{$self->{char}};
1737     } else {
1738     $self->{set_next_input_character}->($self);
1739     }
1740    
1741     redo A;
1742     } elsif ($self->{next_input_character} == 0x003E) { # >
1743     $self->{parse_error}->();
1744     $self->{state} = 'data';
1745    
1746     if (@{$self->{char}}) {
1747     $self->{next_input_character} = shift @{$self->{char}};
1748     } else {
1749     $self->{set_next_input_character}->($self);
1750     }
1751    
1752    
1753     return ({type => 'DOCTYPE', name => '', error => 1});
1754    
1755     redo A;
1756     } elsif ($self->{next_input_character} == -1) {
1757     $self->{parse_error}->();
1758     $self->{state} = 'data';
1759     ## reconsume
1760    
1761     return ({type => 'DOCTYPE', name => '', error => 1});
1762    
1763     redo A;
1764     } else {
1765     $self->{current_token} = {type => 'DOCTYPE',
1766     name => chr ($self->{next_input_character}),
1767     error => 1};
1768     $self->{state} = 'DOCTYPE name';
1769    
1770     if (@{$self->{char}}) {
1771     $self->{next_input_character} = shift @{$self->{char}};
1772     } else {
1773     $self->{set_next_input_character}->($self);
1774     }
1775    
1776     redo A;
1777     }
1778     } elsif ($self->{state} eq 'DOCTYPE name') {
1779     if ($self->{next_input_character} == 0x0009 or # HT
1780     $self->{next_input_character} == 0x000A or # LF
1781     $self->{next_input_character} == 0x000B or # VT
1782     $self->{next_input_character} == 0x000C or # FF
1783     $self->{next_input_character} == 0x0020) { # SP
1784     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1785     $self->{state} = 'after DOCTYPE name';
1786    
1787     if (@{$self->{char}}) {
1788     $self->{next_input_character} = shift @{$self->{char}};
1789     } else {
1790     $self->{set_next_input_character}->($self);
1791     }
1792    
1793     redo A;
1794     } elsif ($self->{next_input_character} == 0x003E) { # >
1795     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1796     $self->{state} = 'data';
1797    
1798     if (@{$self->{char}}) {
1799     $self->{next_input_character} = shift @{$self->{char}};
1800     } else {
1801     $self->{set_next_input_character}->($self);
1802     }
1803    
1804    
1805     return ($self->{current_token}); # DOCTYPE
1806     undef $self->{current_token};
1807    
1808     redo A;
1809     } elsif (0x0061 <= $self->{next_input_character} and
1810     $self->{next_input_character} <= 0x007A) { # a..z
1811     $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1812     #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1813     ## Stay in the state
1814    
1815     if (@{$self->{char}}) {
1816     $self->{next_input_character} = shift @{$self->{char}};
1817     } else {
1818     $self->{set_next_input_character}->($self);
1819     }
1820    
1821     redo A;
1822     } elsif ($self->{next_input_character} == -1) {
1823     $self->{parse_error}->();
1824     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1825     $self->{state} = 'data';
1826     ## reconsume
1827    
1828     return ($self->{current_token});
1829     undef $self->{current_token};
1830    
1831     redo A;
1832     } else {
1833 wakaba 1.4 $self->{current_token}->{name}
1834     .= chr ($self->{next_input_character}); # DOCTYPE
1835 wakaba 1.1 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1836     ## Stay in the state
1837    
1838     if (@{$self->{char}}) {
1839     $self->{next_input_character} = shift @{$self->{char}};
1840     } else {
1841     $self->{set_next_input_character}->($self);
1842     }
1843    
1844     redo A;
1845     }
1846     } elsif ($self->{state} eq 'after DOCTYPE name') {
1847     if ($self->{next_input_character} == 0x0009 or # HT
1848     $self->{next_input_character} == 0x000A or # LF
1849     $self->{next_input_character} == 0x000B or # VT
1850     $self->{next_input_character} == 0x000C or # FF
1851     $self->{next_input_character} == 0x0020) { # SP
1852     ## Stay in the state
1853    
1854     if (@{$self->{char}}) {
1855     $self->{next_input_character} = shift @{$self->{char}};
1856     } else {
1857     $self->{set_next_input_character}->($self);
1858     }
1859    
1860     redo A;
1861     } elsif ($self->{next_input_character} == 0x003E) { # >
1862     $self->{state} = 'data';
1863    
1864     if (@{$self->{char}}) {
1865     $self->{next_input_character} = shift @{$self->{char}};
1866     } else {
1867     $self->{set_next_input_character}->($self);
1868     }
1869    
1870    
1871     return ($self->{current_token}); # DOCTYPE
1872     undef $self->{current_token};
1873    
1874     redo A;
1875     } elsif ($self->{next_input_character} == -1) {
1876     $self->{parse_error}->();
1877     $self->{state} = 'data';
1878     ## reconsume
1879    
1880     return ($self->{current_token}); # DOCTYPE
1881     undef $self->{current_token};
1882    
1883     redo A;
1884     } else {
1885     $self->{parse_error}->();
1886     $self->{current_token}->{error} = 1; # DOCTYPE
1887     $self->{state} = 'bogus DOCTYPE';
1888    
1889     if (@{$self->{char}}) {
1890     $self->{next_input_character} = shift @{$self->{char}};
1891     } else {
1892     $self->{set_next_input_character}->($self);
1893     }
1894    
1895     redo A;
1896     }
1897     } elsif ($self->{state} eq 'bogus DOCTYPE') {
1898     if ($self->{next_input_character} == 0x003E) { # >
1899     $self->{state} = 'data';
1900    
1901     if (@{$self->{char}}) {
1902     $self->{next_input_character} = shift @{$self->{char}};
1903     } else {
1904     $self->{set_next_input_character}->($self);
1905     }
1906    
1907    
1908     return ($self->{current_token}); # DOCTYPE
1909     undef $self->{current_token};
1910    
1911     redo A;
1912     } elsif ($self->{next_input_character} == -1) {
1913     $self->{parse_error}->();
1914     $self->{state} = 'data';
1915     ## reconsume
1916    
1917     return ($self->{current_token}); # DOCTYPE
1918     undef $self->{current_token};
1919    
1920     redo A;
1921     } else {
1922     ## Stay in the state
1923    
1924     if (@{$self->{char}}) {
1925     $self->{next_input_character} = shift @{$self->{char}};
1926     } else {
1927     $self->{set_next_input_character}->($self);
1928     }
1929    
1930     redo A;
1931     }
1932     } else {
1933     die "$0: $self->{state}: Unknown state";
1934     }
1935     } # A
1936    
1937     die "$0: _get_next_token: unexpected case";
1938     } # _get_next_token
1939    
## Attempt to consume an entity (character reference) starting at the
## current |next_input_character|.
##
## Three forms are recognised:
##   - hexadecimal numeric reference:  #x... / #X... (optionally ended by ;)
##   - decimal numeric reference:      #...          (optionally ended by ;)
##   - named reference:                a name looked up in |$entity_char|
##
## Returns a hash reference {type => 'character', data => $string} when
## a reference was recognised, or |undef| otherwise.  The
## |$self->{parse_error}| callback is invoked (without arguments) for a
## missing ";", for a "#" / "#x" that is not followed by any digit, for
## an unknown name and for an input that cannot start an entity at all.
##
## When "#" or "#x" is not followed by a digit, every character that was
## read ahead is pushed back onto |$self->{char}| and
## |next_input_character| is restored to "#", so that the caller sees
## the input exactly as it was before the call.
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  ## Move to the next input character: take it from the pushed-back
  ## queue |$self->{char}| if there is one, otherwise ask the
  ## |set_next_input_character| callback to supply it.
  my $advance = sub {
    if (@{$self->{char}}) {
      $self->{next_input_character} = shift @{$self->{char}};
    } else {
      $self->{set_next_input_character}->($self);
    }
  };

  if ($self->{next_input_character} == 0x0023) { # #
    $advance->();

    my $num;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      X: {
        ## On the first pass this is the "x" / "X" itself; it is only
        ## used while |$num| is still undefined (i.e. on that pass).
        my $x_char = $self->{next_input_character};
        $advance->();

        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0030;
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0060 + 9;
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0040 + 9;
          redo X;
        } elsif (not defined $num) { # no hexadecimal digit
          $self->{parse_error}->();
          ## Push back both the "x" and the character read after it.
          ## (Pushing back only the "x" would silently drop the
          ## character that follows it from the input stream.)
          unshift @{$self->{char}},
              ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          $advance->();
        } else {
          $self->{parse_error}->();
        }

        ## TODO: check the definition for |a valid Unicode character|.
        if ($num > 1114111 or $num == 0) {
          $num = 0xFFFD; # REPLACEMENT CHARACTER
          ## ISSUE: Why this is not an error?
        }

        return {type => 'character', data => chr $num};
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      my $code = $self->{next_input_character} - 0x0030;
      $advance->();

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;
        $advance->();
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        $advance->();
      } else {
        $self->{parse_error}->();
      }

      ## TODO: check the definition for |a valid Unicode character|.
      if ($code > 1114111 or $code == 0) {
        $code = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      }

      return {type => 'character', data => chr $code};
    } else {
      ## "#" not followed by "x", "X" or a digit: restore the input.
      $self->{parse_error}->();
      unshift @{$self->{char}}, ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    my $entity_name = chr $self->{next_input_character};
    $advance->();

    ## |$value| holds the replacement text of the longest name matched
    ## so far followed by any characters read after that match (or
    ## simply all characters read, if nothing has matched yet).
    my $value = $entity_name;
    my $match;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x005A) or
            (0x0061 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x007A) or
            (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039))) {
      $entity_name .= chr $self->{next_input_character};
      if (defined $entity_char->{$entity_name}) {
        $value = $entity_char->{$entity_name};
        $match = 1;
      } else {
        $value .= chr $self->{next_input_character};
      }
      $advance->();
    }

    if ($match) {
      if ($self->{next_input_character} == 0x003B) { # ;
        $advance->();
      } else {
        $self->{parse_error}->();
      }

      return {type => 'character', data => $value};
    } else {
      $self->{parse_error}->();
      ## NOTE: No characters are consumed in the spec.
      ## Here the characters already read are queued as a character
      ## token on |$self->{token}| instead of being pushed back.
      unshift @{$self->{token}}, ({type => 'character', data => $value});
      return undef;
    }
  } else {
    ## no characters are consumed
    $self->{parse_error}->();
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
2124 wakaba 1.3
## Prepare the document for tree construction by switching its strict
## error checking off.  Returns whatever |strict_error_checking| returns.
## NOTE: $self->{document} MUST be specified before this method is called
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  ## TODO: Mark the Document as an HTML document # MUST
  return $document->strict_error_checking (0);
} # _initialize_tree_constructor
2133    
## Finish tree construction by switching the document's strict error
## checking back on.  Returns whatever |strict_error_checking| returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};
  ## TODO: Turn mutation events on
  return $document->strict_error_checking (1);
} # _terminate_tree_constructor
2139    
2140     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2141    
2142     sub _construct_tree ($) {
2143     my ($self) = @_;
2144    
2145     ## When an interactive UA renders the $self->{document} available
2146     ## to the user, or when it begins accepting user input, is
2147     ## not defined.
2148    
2149     ## Append a character: collect it and all subsequent consecutive
2150     ## characters and insert one Text node whose data is concatenation
2151     ## of all those characters. # MUST
2152    
2153     my $token;
2154     $token = $self->_get_next_token;
2155    
2156     my $phase = 'initial'; # MUST
2157    
2158     my $open_elements = [];
2159     my $active_formatting_elements = [];
2160     my $head_element;
2161     my $form_element;
2162     my $insertion_mode = 'before head';
2163    
## Reconstruct the active formatting elements.
##
## Argument: a code reference ($insert) that is called with each newly
## cloned node in order to place it in the tree.
##
## Entries of @$active_formatting_elements and @$open_elements are array
## references: element 0 is a node (or the string '#marker' in the
## active formatting list) and element 1 is copied along unchanged when
## an entry is cloned.  Nodes are compared with |eq|.
##
## The closure first walks backwards from the last entry of
## @$active_formatting_elements until it finds a marker, an entry that
## is also in @$open_elements, or the first entry of the list; it then
## walks forward again, cloning each entry, inserting the clone via
## $insert, pushing it onto @$open_elements and replacing the list entry
## with the clone, until the last entry has been replaced.
2164     my $reconstruct_active_formatting_elements = sub { # MUST
2165 wakaba 1.8 my $insert = shift;
2166    
2167 wakaba 1.3 ## Step 1
2168     return unless @$active_formatting_elements;
2169    
2170     ## Step 3
## $i is a negative index counted from the end of the list.
2171     my $i = -1;
2172     my $entry = $active_formatting_elements->[$i];
2173    
2174     ## Step 2
## Nothing to do if the last entry is a marker or is already open.
2175     return if $entry->[0] eq '#marker';
2176     for (@$open_elements) {
2177     if ($entry->[0] eq $_->[0]) {
2178     return;
2179     }
2180     }
2181    
2182     S4: {
2183 wakaba 1.8 ## Step 4
## Stop rewinding once $entry is the first entry of the list.
2184 wakaba 1.3 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2185    
2186     ## Step 5
2187     $i--;
2188     $entry = $active_formatting_elements->[$i];
2189    
2190     ## Step 6
## Keep rewinding (redo S4) while $entry is neither a marker nor
## present in @$open_elements.
2191     if ($entry->[0] eq '#marker') {
2192     #
2193     } else {
2194     my $in_open_elements;
2195     OE: for (@$open_elements) {
2196     if ($entry->[0] eq $_->[0]) {
2197 wakaba 1.8 $in_open_elements = 1;
2198     last OE;
2199     }
2200 wakaba 1.3 }
2201     if ($in_open_elements) {
2202     #
2203     } else {
2204     redo S4;
2205     }
2206     }
2207    
2208     ## Step 7
## Step forward to the entry just after the marker / open element.
2209     $i++;
2210     $entry = $active_formatting_elements->[$i];
2211     } # S4
2212    
2213     S7: {
2214     ## Step 8
## Shallow clone: clone_node is called with a false argument.
2215 wakaba 1.8 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2216 wakaba 1.3
2217     ## Step 9
2218 wakaba 1.8 $insert->($clone->[0]);
2219     push @$open_elements, $clone;
2220 wakaba 1.3
2221     ## Step 10
2222     $active_formatting_elements->[$i] = $open_elements->[-1];
2223 wakaba 1.8
2224     ## Step 11
## Continue with the following entry until the clone just stored is
## the last entry of the list.
2225     unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2226 wakaba 1.3 ## Step 7'
2227     $i++;
2228     $entry = $active_formatting_elements->[$i];
2229    
2230     redo S7;
2231     }
2232     } # S7
2233     }; # $reconstruct_active_formatting_elements
2234    
## Clear the list of active formatting elements up to the last marker:
## the last '#marker' entry and everything after it are removed.  The
## list is left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  my $index = $#$active_formatting_elements;
  while ($index >= 0) {
    if ($active_formatting_elements->[$index]->[0] eq '#marker') {
      ## Two-argument splice: drop everything from the marker onwards.
      splice @$active_formatting_elements, $index;
      return;
    }
    $index--;
  }
}; # $clear_up_to_marker
2243    
## Reset the insertion mode appropriately: walk the stack of open
## elements from the current node towards the root and set
## $insertion_mode according to the first element whose name decides it.
my $reset_insertion_mode = sub {
  ## Step 4..13: element name => insertion mode
  my %mode_of = (
    select   => 'in select',
    td       => 'in cell',
    th       => 'in cell',
    tr       => 'in row',
    tbody    => 'in table body',
    thead    => 'in table head',
    tfoot    => 'in table foot',
    caption  => 'in caption',
    colgroup => 'in column group',
    table    => 'in table',
    head     => 'in body', # not in head!
    body     => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $is_last;

  ## Step 2: start at the current node (negative index from the end);
  ## Step 16, 17: move one element up the stack on each iteration.
  for (my $depth = -1; ; $depth--) {
    my $node = $open_elements->[$depth];

    ## Step 3
    if ($open_elements->[0]->[0] eq $node->[0]) {
      $is_last = 1;
    }
    ## TODO: the element whose inner_html is set is neither td nor th, then $node = the element

    ## Step 4..13
    my $mode = $mode_of{$node->[1]};
    if (defined $mode) {
      $insertion_mode = $mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      $insertion_mode = defined $head_element ? 'after head' : 'before head';
      return;
    }

    ## Step 15
    if ($is_last) {
      $insertion_mode = 'in body';
      return;
    }
  }
}; # $reset_insertion_mode
2296    
## Process a |style| start tag.  On entry $token is the start tag token;
## on return $token is the token following the element's end tag (or
## following whatever token ended the text).
##
## A |style| element is created with the attributes of the start tag,
## appended to the head element (when in the 'in head' insertion mode
## and a head element exists) or otherwise to the current node, and
## filled with the text of all character tokens read while the
## tokenizer's content model flag is 'CDATA'.
my $style_start_tag = sub {
  my $style_el;
  $style_el = $self->{document}->create_element_ns
      (q<http://www.w3.org/1999/xhtml>, [undef, 'style']);

  ## Copy the attributes of the start tag to the element, as is done
  ## for |script| and every other element created from a token.
  for my $attr_name (keys %{ $token->{attributes}}) {
    $style_el->set_attribute_ns (undef, [undef, $attr_name],
        $token->{attributes} ->{$attr_name}->{value});
  }

  ## $insertion_mode eq 'in head' and ... (always true)
  (($insertion_mode eq 'in head' and defined $head_element)
   ? $head_element : $open_elements->[-1]->[0])
      ->append_child ($style_el);

  ## The flag must be set before the next token is requested so that
  ## the element content is tokenized as CDATA.
  $self->{content_model_flag} = 'CDATA';

  my $text = '';
  $token = $self->_get_next_token;
  while ($token->{type} eq 'character') {
    $text .= $token->{data};
    $token = $self->_get_next_token;
  } # stop if non-character token or tokenizer stops tokenising
  if (length $text) {
    $style_el->manakai_append_text ($text);
  }

  $self->{content_model_flag} = 'PCDATA';

  if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
    ## Ignore the token
  } else {
    $self->{parse_error}->();
    ## ISSUE: And ignore?
  }
  $token = $self->_get_next_token;
}; # $style_start_tag
2328    
## Process a |script| start tag.  On entry $token is the start tag
## token; on return $token is the token following the element's end tag
## (or following whatever token ended the text).
##
## The |script| element receives the attributes of the start tag and
## the text of all character tokens read while the content model flag
## is 'CDATA'.  It is appended to its parent only after its content has
## been collected.
my $script_start_tag = sub {
  my $script_el = $self->{document}->create_element_ns
      (q<http://www.w3.org/1999/xhtml>, [undef, 'script']);

  my $attrs = $token->{attributes};
  for my $name (keys %$attrs) {
    $script_el->set_attribute_ns
        (undef, [undef, $name], $attrs->{$name}->{value});
  }

  ## TODO: mark as "parser-inserted"

  ## Collect the element content as CDATA.
  $self->{content_model_flag} = 'CDATA';

  my @chunk;
  for ($token = $self->_get_next_token;
       $token->{type} eq 'character';
       $token = $self->_get_next_token) {
    push @chunk, $token->{data};
  } # stop if non-character token or tokenizer stops tokenising
  my $content = join '', @chunk;
  $script_el->manakai_append_text ($content) if length $content;

  $self->{content_model_flag} = 'PCDATA';

  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    ## Anything other than the matching end tag is an error.
    $self->{parse_error}->();
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  ## TODO: inner_html mode then mark as "already executed" and skip
  {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    my $parent;
    if ($insertion_mode eq 'in head' and defined $head_element) {
      $parent = $head_element;
    } else {
      $parent = $open_elements->[-1]->[0];
    }
    $parent->append_child ($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }

  $token = $self->_get_next_token;
}; # $script_start_tag
2380    
## Process an end tag of a formatting element.
##
## Argument: the tag name of the end tag ($tag_name).
##
## The step numbering appears to follow the HTML5 "adoption agency
## algorithm" (NOTE(review): confirm against the spec revision this
## file targets).  The whole procedure is repeated (|redo FET|) until
## one of the early |return| paths is taken; each of those paths first
## replaces $token with the next token.
##
## Entries of @$active_formatting_elements and @$open_elements are array
## references [node, tag name]; nodes are compared with |eq|.
2381     my $formatting_end_tag = sub {
2382     my $tag_name = shift;
2383    
2384     FET: {
2385     ## Step 1
## Find the last entry named $tag_name in the list of active
## formatting elements, not looking past the last marker.
2386     my $formatting_element;
2387     my $formatting_element_i_in_active;
2388     AFE: for (reverse 0..$#$active_formatting_elements) {
2389     if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
2390     $formatting_element = $active_formatting_elements->[$_];
2391     $formatting_element_i_in_active = $_;
2392     last AFE;
2393     } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
2394     last AFE;
2395     }
2396     } # AFE
2397     unless (defined $formatting_element) {
2398     $self->{parse_error}->();
2399     ## Ignore the token
2400     $token = $self->_get_next_token;
2401     return;
2402     }
2403     ## has an element in scope
## Walk the stack from the current node upwards; $in_scope becomes
## false once one of the scope-limiting elements below has been passed.
2404     my $in_scope = 1;
2405     my $formatting_element_i_in_open;
2406     INSCOPE: for (reverse 0..$#$open_elements) {
2407     my $node = $open_elements->[$_];
2408     if ($node->[0] eq $formatting_element->[0]) {
2409     if ($in_scope) {
2410     $formatting_element_i_in_open = $_;
2411     last INSCOPE;
2412     } else { # in open elements but not in scope
2413     $self->{parse_error}->();
2414     ## Ignore the token
2415     $token = $self->_get_next_token;
2416     return;
2417     }
2418     } elsif ({
2419     table => 1, caption => 1, td => 1, th => 1,
2420     button => 1, marquee => 1, object => 1, html => 1,
2421     }->{$node->[1]}) {
2422     $in_scope = 0;
2423     }
2424     } # INSCOPE
## The formatting element is not in the stack of open elements at all.
2425     unless (defined $formatting_element_i_in_open) {
2426     $self->{parse_error}->();
## NOTE(review): |pop| removes the LAST entry of the list, which is
## $formatting_element only when $formatting_element_i_in_active is the
## last index -- confirm whether
## |splice @$active_formatting_elements, $formatting_element_i_in_active, 1|
## was intended.
2427     pop @$active_formatting_elements; # $formatting_element
2428     $token = $self->_get_next_token; ## TODO: ok?
2429     return;
2430     }
## Parse error (but processing continues) if the formatting element is
## not the current node.
2431     if (not $open_elements->[-1]->[0] eq $formatting_element->[0]) {
2432     $self->{parse_error}->();
2433     }
2434    
2435     ## Step 2
## Scanning from the current node up to the formatting element, each
## qualifying node overwrites the previous one, so the result is the
## qualifying node closest to the formatting element.
2436     my $furthest_block;
2437     my $furthest_block_i_in_open;
2438     OE: for (reverse 0..$#$open_elements) {
2439     my $node = $open_elements->[$_];
2440     if (not $formatting_category->{$node->[1]} and
2441     #not $phrasing_category->{$node->[1]} and
2442     ($special_category->{$node->[1]} or
2443     $scoping_category->{$node->[1]})) {
2444     $furthest_block = $node;
2445     $furthest_block_i_in_open = $_;
2446     } elsif ($node->[0] eq $formatting_element->[0]) {
2447     last OE;
2448     }
2449     } # OE
2450    
2451     ## Step 3
## No furthest block: pop the stack up to and including the formatting
## element, drop it from the active list, and finish.
2452     unless (defined $furthest_block) { # MUST
2453     splice @$open_elements, $formatting_element_i_in_open;
2454     splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
2455     $token = $self->_get_next_token;
2456     return;
2457     }
2458    
2459     ## Step 4
2460     my $common_ancestor_node = $open_elements->[$formatting_element_i_in_open - 1];
2461    
2462     ## Step 5
2463     my $furthest_block_parent = $furthest_block->[0]->parent_node;
2464     if (defined $furthest_block_parent) {
2465     $furthest_block_parent->remove_child ($furthest_block->[0]);
2466     }
2467    
2468     ## Step 6
## The bookmark is remembered as the node of the entry that precedes
## the formatting element in the active list.
## NOTE(review): when $formatting_element_i_in_active is 0 the index
## becomes -1, i.e. the LAST entry of the list -- confirm this is
## acceptable.
2469     my $bookmark_prev_el
2470     = $active_formatting_elements->[$formatting_element_i_in_active - 1]
2471     ->[0];
2472    
2473     ## Step 7
2474     my $node = $furthest_block;
2475     my $node_i_in_open = $furthest_block_i_in_open;
2476     my $last_node = $furthest_block;
2477     S7: {
2478     ## Step 1
2479     $node_i_in_open--;
2480     $node = $open_elements->[$node_i_in_open];
2481    
2482     ## Step 2
## If $node is not in the active list, remove it from the stack of
## open elements and restart the inner loop; otherwise remember its
## index in the active list.
2483     my $node_i_in_active;
2484     S7S2: {
2485     for (reverse 0..$#$active_formatting_elements) {
2486     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2487     $node_i_in_active = $_;
2488     last S7S2;
2489     }
2490     }
2491     splice @$open_elements, $node_i_in_open, 1;
2492     redo S7;
2493     } # S7S2
2494    
2495     ## Step 3
2496     last S7 if $node->[0] eq $formatting_element->[0];
2497    
2498     ## Step 4
2499     if ($last_node->[0] eq $furthest_block->[0]) {
2500     $bookmark_prev_el = $node->[0];
2501     }
2502    
2503     ## Step 5
## A node that already has children is replaced, in both lists, by a
## shallow clone of itself.
2504     if ($node->[0]->has_child_nodes ()) {
2505     my $clone = [$node->[0]->clone_node (0), $node->[1]];
2506     $active_formatting_elements->[$node_i_in_active] = $clone;
2507     $open_elements->[$node_i_in_open] = $clone;
2508     $node = $clone;
2509     }
2510    
2511     ## Step 6
2512 wakaba 1.7 $node->[0]->append_child ($last_node->[0]);
2513 wakaba 1.3
2514     ## Step 7
2515     $last_node = $node;
2516    
2517     ## Step 8
2518     redo S7;
2519     } # S7
2520    
2521     ## Step 8
2522 wakaba 1.7 $common_ancestor_node->[0]->append_child ($last_node->[0]);
2523 wakaba 1.3
2524     ## Step 9
2525     my $clone = [$formatting_element->[0]->clone_node (0),
2526     $formatting_element->[1]];
2527    
2528     ## Step 10
## The child list is copied into @cn first, so the loop does not
## iterate over the list while append_child is moving nodes.
2529     my @cn = @{$furthest_block->[0]->child_nodes};
2530     $clone->[0]->append_child ($_) for @cn;
2531    
2532     ## Step 11
2533     $furthest_block->[0]->append_child ($clone->[0]);
2534    
2535     ## Step 12
## Remove the formatting element from the active list and insert the
## clone after the bookmark entry.  $i is the index of the bookmark
## entry; it is decremented when the formatting element is removed
## from a position before it.
## NOTE(review): if no entry matches $bookmark_prev_el, $i stays
## undefined and the clone is inserted at index 1 -- confirm.
2536     my $i;
2537     AFE: for (reverse 0..$#$active_formatting_elements) {
2538     if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
2539     splice @$active_formatting_elements, $_, 1;
2540     $i-- and last AFE if defined $i;
2541     } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
2542     $i = $_;
2543     }
2544     } # AFE
2545     splice @$active_formatting_elements, $i + 1, 0, $clone;
2546    
2547     ## Step 13
## Same procedure for the stack of open elements, relative to the
## furthest block.
2548     undef $i;
2549     OE: for (reverse 0..$#$open_elements) {
2550     if ($open_elements->[$_]->[0] eq $formatting_element->[0]) {
2551     splice @$open_elements, $_, 1;
2552     $i-- and last OE if defined $i;
2553     } elsif ($open_elements->[$_]->[0] eq $furthest_block->[0]) {
2554     $i = $_;
2555     }
2556     } # OE
## NOTE(review): this splice uses length 1 (it REPLACES the entry after
## the furthest block, if any) whereas the corresponding splice in
## Step 12 uses length 0 (pure insertion) -- confirm which is intended.
2557     splice @$open_elements, $i + 1, 1, $clone;
2558    
2559     ## Step 14
2560     redo FET;
2561     } # FET
2562     }; # $formatting_end_tag
2563    
## Insert a node as the last child of the current node (the last
## entry of the stack of open elements).  Returns whatever
## |append_child| returns.
my $insert_to_current = sub {
  my ($new_child) = @_;
  my $current_node = $open_elements->[-1]->[0];
  return $current_node->append_child ($new_child);
}; # $insert_to_current
2567    
## Insert a node, foster-parenting it when the current node is a
## table-structure element (table, tbody, tfoot, thead or tr).
##
## In that case the node is inserted just before the innermost open
## |table| element if that table has an element parent; otherwise it is
## appended to the element just above the table in the stack of open
## elements.  When no |table| is open, the first entry of the stack is
## used as the foster parent.  In any other case the node is simply
## appended to the current node.
my $insert_to_foster = sub {
  my ($new_child) = @_;

  my %is_table_context = map { $_ => 1 } qw(table tbody tfoot thead tr);
  unless ($is_table_context{$open_elements->[-1]->[1]}) {
    return $open_elements->[-1]->[0]->append_child ($new_child);
  }

  # MUST
  my ($foster_parent, $reference_node);
  my $index = $#$open_elements;
  while ($index >= 0) {
    my $entry = $open_elements->[$index];
    if ($entry->[1] eq 'table') {
      my $table_parent = $entry->[0]->parent_node;
      if (defined $table_parent and $table_parent->node_type == 1) {
        ## Insert immediately before the table itself.
        $foster_parent = $table_parent;
        $reference_node = $entry->[0];
      } else {
        ## Append to the element preceding the table in the stack.
        $foster_parent = $open_elements->[$index - 1]->[0];
      }
      last;
    }
    $index--;
  }
  unless (defined $foster_parent) {
    $foster_parent = $open_elements->[0]->[0];
  }
  ## An undefined reference node is passed through unchanged.
  return $foster_parent->insert_before ($new_child, $reference_node);
}; # $insert_to_foster
2598    
2599 wakaba 1.3 my $in_body = sub {
2600     my $insert = shift;
2601     if ($token->{type} eq 'start tag') {
2602     if ($token->{tag_name} eq 'script') {
2603     $script_start_tag->();
2604     return;
2605     } elsif ($token->{tag_name} eq 'style') {
2606     $style_start_tag->();
2607     return;
2608     } elsif ({
2609 wakaba 1.8 base => 1, link => 1, meta => 1,
2610 wakaba 1.3 }->{$token->{tag_name}}) {
2611 wakaba 1.9 $self->{parse_error}-> ($token->{tag_name}.' in body');
2612 wakaba 1.3 ## NOTE: This is an "as if in head" code clone
2613     my $el;
2614    
2615     $el = $self->{document}->create_element_ns
2616     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2617    
2618     for my $attr_name (keys %{ $token->{attributes}}) {
2619     $el->set_attribute_ns (undef, [undef, $attr_name],
2620     $token->{attributes} ->{$attr_name}->{value});
2621     }
2622    
2623     if (defined $head_element) {
2624     $head_element->append_child ($el);
2625     } else {
2626     $insert->($el);
2627     }
2628    
2629     $token = $self->_get_next_token;
2630     return;
2631 wakaba 1.8 } elsif ($token->{tag_name} eq 'title') {
2632 wakaba 1.9 $self->{parse_error}-> ('title in body');
2633 wakaba 1.8 ## NOTE: There is an "as if in head" code clone
2634     my $title_el;
2635    
2636     $title_el = $self->{document}->create_element_ns
2637     (q<http://www.w3.org/1999/xhtml>, [undef, 'title']);
2638    
2639     for my $attr_name (keys %{ $token->{attributes}}) {
2640     $title_el->set_attribute_ns (undef, [undef, $attr_name],
2641     $token->{attributes} ->{$attr_name}->{value});
2642     }
2643    
2644     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
2645     ->append_child ($title_el);
2646     $self->{content_model_flag} = 'RCDATA';
2647    
2648     my $text = '';
2649     $token = $self->_get_next_token;
2650     while ($token->{type} eq 'character') {
2651     $text .= $token->{data};
2652     $token = $self->_get_next_token;
2653     }
2654     if (length $text) {
2655     $title_el->manakai_append_text ($text);
2656     }
2657    
2658     $self->{content_model_flag} = 'PCDATA';
2659    
2660     if ($token->{type} eq 'end tag' and
2661     $token->{tag_name} eq 'title') {
2662     ## Ignore the token
2663     } else {
2664     $self->{parse_error}->();
2665     ## ISSUE: And ignore?
2666     }
2667     $token = $self->_get_next_token;
2668     return;
2669 wakaba 1.3 } elsif ($token->{tag_name} eq 'body') {
2670     $self->{parse_error}->();
2671    
2672     if (@$open_elements == 1 or
2673     $open_elements->[1]->[1] ne 'body') {
2674     ## Ignore the token
2675     } else {
2676     my $body_el = $open_elements->[1]->[0];
2677     for my $attr_name (keys %{$token->{attributes}}) {
2678     unless ($body_el->has_attribute_ns (undef, $attr_name)) {
2679     $body_el->set_attribute_ns
2680     (undef, [undef, $attr_name],
2681     $token->{attributes}->{$attr_name}->{value});
2682     }
2683     }
2684     }
2685     $token = $self->_get_next_token;
2686     return;
2687     } elsif ({
2688     address => 1, blockquote => 1, center => 1, dir => 1,
2689     div => 1, dl => 1, fieldset => 1, listing => 1,
2690     menu => 1, ol => 1, p => 1, ul => 1,
2691     pre => 1,
2692     }->{$token->{tag_name}}) {
2693     ## has a p element in scope
2694     INSCOPE: for (reverse @$open_elements) {
2695     if ($_->[1] eq 'p') {
2696     unshift @{$self->{token}}, $token;
2697     $token = {type => 'end tag', tag_name => 'p'};
2698     return;
2699     } elsif ({
2700     table => 1, caption => 1, td => 1, th => 1,
2701     button => 1, marquee => 1, object => 1, html => 1,
2702     }->{$_->[1]}) {
2703     last INSCOPE;
2704     }
2705     } # INSCOPE
2706    
2707    
2708     {
2709     my $el;
2710    
2711     $el = $self->{document}->create_element_ns
2712     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2713    
2714     for my $attr_name (keys %{ $token->{attributes}}) {
2715     $el->set_attribute_ns (undef, [undef, $attr_name],
2716     $token->{attributes} ->{$attr_name}->{value});
2717     }
2718    
2719     $insert->($el);
2720     push @$open_elements, [$el, $token->{tag_name}];
2721     }
2722    
2723     if ($token->{tag_name} eq 'pre') {
2724     $token = $self->_get_next_token;
2725     if ($token->{type} eq 'character') {
2726     $token->{data} =~ s/^\x0A//;
2727     unless (length $token->{data}) {
2728     $token = $self->_get_next_token;
2729     }
2730     }
2731     } else {
2732     $token = $self->_get_next_token;
2733     }
2734     return;
2735     } elsif ($token->{tag_name} eq 'form') {
2736     if (defined $form_element) {
2737     $self->{parse_error}->();
2738     ## Ignore the token
2739     } else {
2740     ## has a p element in scope
2741     INSCOPE: for (reverse @$open_elements) {
2742     if ($_->[1] eq 'p') {
2743     unshift @{$self->{token}}, $token;
2744     $token = {type => 'end tag', tag_name => 'p'};
2745     return;
2746     } elsif ({
2747     table => 1, caption => 1, td => 1, th => 1,
2748     button => 1, marquee => 1, object => 1, html => 1,
2749     }->{$_->[1]}) {
2750     last INSCOPE;
2751     }
2752     } # INSCOPE
2753    
2754    
2755     {
2756     my $el;
2757    
2758     $el = $self->{document}->create_element_ns
2759     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2760    
2761     for my $attr_name (keys %{ $token->{attributes}}) {
2762     $el->set_attribute_ns (undef, [undef, $attr_name],
2763     $token->{attributes} ->{$attr_name}->{value});
2764     }
2765    
2766     $insert->($el);
2767     push @$open_elements, [$el, $token->{tag_name}];
2768     }
2769    
2770     $form_element = $open_elements->[-1]->[0];
2771     $token = $self->_get_next_token;
2772     return;
2773     }
2774     } elsif ($token->{tag_name} eq 'li') {
2775     ## has a p element in scope
2776     INSCOPE: for (reverse @$open_elements) {
2777     if ($_->[1] eq 'p') {
2778     unshift @{$self->{token}}, $token;
2779     $token = {type => 'end tag', tag_name => 'p'};
2780     return;
2781     } elsif ({
2782     table => 1, caption => 1, td => 1, th => 1,
2783     button => 1, marquee => 1, object => 1, html => 1,
2784     }->{$_->[1]}) {
2785     last INSCOPE;
2786     }
2787     } # INSCOPE
2788    
2789     ## Step 1
2790     my $i = -1;
2791     my $node = $open_elements->[$i];
2792     LI: {
2793     ## Step 2
2794     if ($node->[1] eq 'li') {
2795     splice @$open_elements, $i;
2796     last LI;
2797     }
2798    
2799     ## Step 3
2800     if (not $formatting_category->{$node->[1]} and
2801     #not $phrasing_category->{$node->[1]} and
2802     ($special_category->{$node->[1]} or
2803     $scoping_category->{$node->[1]}) and
2804     $node->[1] ne 'address' and $node->[1] ne 'div') {
2805     last LI;
2806     }
2807    
2808     ## Step 4
2809 wakaba 1.8 $i--;
2810 wakaba 1.3 $node = $open_elements->[$i];
2811     redo LI;
2812     } # LI
2813    
2814    
2815     {
2816     my $el;
2817    
2818     $el = $self->{document}->create_element_ns
2819     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2820    
2821     for my $attr_name (keys %{ $token->{attributes}}) {
2822     $el->set_attribute_ns (undef, [undef, $attr_name],
2823     $token->{attributes} ->{$attr_name}->{value});
2824     }
2825    
2826     $insert->($el);
2827     push @$open_elements, [$el, $token->{tag_name}];
2828     }
2829    
2830     $token = $self->_get_next_token;
2831     return;
2832     } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
2833     ## has a p element in scope
2834     INSCOPE: for (reverse @$open_elements) {
2835     if ($_->[1] eq 'p') {
2836     unshift @{$self->{token}}, $token;
2837     $token = {type => 'end tag', tag_name => 'p'};
2838     return;
2839     } elsif ({
2840     table => 1, caption => 1, td => 1, th => 1,
2841     button => 1, marquee => 1, object => 1, html => 1,
2842     }->{$_->[1]}) {
2843     last INSCOPE;
2844     }
2845     } # INSCOPE
2846    
2847     ## Step 1
2848     my $i = -1;
2849     my $node = $open_elements->[$i];
2850     LI: {
2851     ## Step 2
2852     if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
2853     splice @$open_elements, $i;
2854     last LI;
2855     }
2856    
2857     ## Step 3
2858     if (not $formatting_category->{$node->[1]} and
2859     #not $phrasing_category->{$node->[1]} and
2860     ($special_category->{$node->[1]} or
2861     $scoping_category->{$node->[1]}) and
2862     $node->[1] ne 'address' and $node->[1] ne 'div') {
2863     last LI;
2864     }
2865    
2866     ## Step 4
2867 wakaba 1.8 $i--;
2868 wakaba 1.3 $node = $open_elements->[$i];
2869     redo LI;
2870     } # LI
2871    
2872    
2873     {
2874     my $el;
2875    
2876     $el = $self->{document}->create_element_ns
2877     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2878    
2879     for my $attr_name (keys %{ $token->{attributes}}) {
2880     $el->set_attribute_ns (undef, [undef, $attr_name],
2881     $token->{attributes} ->{$attr_name}->{value});
2882     }
2883    
2884     $insert->($el);
2885     push @$open_elements, [$el, $token->{tag_name}];
2886     }
2887    
2888     $token = $self->_get_next_token;
2889     return;
2890     } elsif ($token->{tag_name} eq 'plaintext') {
2891     ## has a p element in scope
2892     INSCOPE: for (reverse @$open_elements) {
2893     if ($_->[1] eq 'p') {
2894     unshift @{$self->{token}}, $token;
2895     $token = {type => 'end tag', tag_name => 'p'};
2896     return;
2897     } elsif ({
2898     table => 1, caption => 1, td => 1, th => 1,
2899     button => 1, marquee => 1, object => 1, html => 1,
2900     }->{$_->[1]}) {
2901     last INSCOPE;
2902     }
2903     } # INSCOPE
2904    
2905    
2906     {
2907     my $el;
2908    
2909     $el = $self->{document}->create_element_ns
2910     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2911    
2912     for my $attr_name (keys %{ $token->{attributes}}) {
2913     $el->set_attribute_ns (undef, [undef, $attr_name],
2914     $token->{attributes} ->{$attr_name}->{value});
2915     }
2916    
2917     $insert->($el);
2918     push @$open_elements, [$el, $token->{tag_name}];
2919     }
2920    
2921    
2922     $self->{content_model_flag} = 'PLAINTEXT';
2923    
2924     $token = $self->_get_next_token;
2925     return;
2926     } elsif ({
2927     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2928     }->{$token->{tag_name}}) {
2929     ## has a p element in scope
2930     INSCOPE: for (reverse 0..$#$open_elements) {
2931     my $node = $open_elements->[$_];
2932     if ($node->[1] eq 'p') {
2933     unshift @{$self->{token}}, $token;
2934     $token = {type => 'end tag', tag_name => 'p'};
2935     return;
2936     } elsif ({
2937     table => 1, caption => 1, td => 1, th => 1,
2938     button => 1, marquee => 1, object => 1, html => 1,
2939     }->{$node->[1]}) {
2940     last INSCOPE;
2941     }
2942     } # INSCOPE
2943    
2944     ## has an element in scope
2945     my $i;
2946     INSCOPE: for (reverse 0..$#$open_elements) {
2947     my $node = $open_elements->[$_];
2948     if ({
2949     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2950     }->{$node->[1]}) {
2951     $i = $_;
2952     last INSCOPE;
2953     } elsif ({
2954     table => 1, caption => 1, td => 1, th => 1,
2955     button => 1, marquee => 1, object => 1, html => 1,
2956     }->{$node->[1]}) {
2957     last INSCOPE;
2958     }
2959     } # INSCOPE
2960    
2961     if (defined $i) {
2962     $self->{parse_error}->();
2963     splice @$open_elements, $i;
2964     }
2965    
2966    
2967     {
2968     my $el;
2969    
2970     $el = $self->{document}->create_element_ns
2971     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2972    
2973     for my $attr_name (keys %{ $token->{attributes}}) {
2974     $el->set_attribute_ns (undef, [undef, $attr_name],
2975     $token->{attributes} ->{$attr_name}->{value});
2976     }
2977    
2978     $insert->($el);
2979     push @$open_elements, [$el, $token->{tag_name}];
2980     }
2981    
2982    
2983     $token = $self->_get_next_token;
2984     return;
2985     } elsif ($token->{tag_name} eq 'a') {
2986     AFE: for my $i (reverse 0..$#$active_formatting_elements) {
2987     my $node = $active_formatting_elements->[$i];
2988     if ($node->[1] eq 'a') {
2989 wakaba 1.8 $self->{parse_error}-> ('a in a');
2990 wakaba 1.3
2991     unshift @{$self->{token}}, $token;
2992     $token = {type => 'end tag', tag_name => 'a'};
2993     $formatting_end_tag->($token->{tag_name});
2994    
2995 wakaba 1.8 AFE2: for (reverse 0..$#$active_formatting_elements) {
2996     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2997     splice @$active_formatting_elements, $_, 1;
2998     last AFE2;
2999     }
3000     } # AFE2
3001 wakaba 1.3 OE: for (reverse 0..$#$open_elements) {
3002     if ($open_elements->[$_]->[0] eq $node->[0]) {
3003 wakaba 1.8 splice @$open_elements, $_, 1;
3004 wakaba 1.3 last OE;
3005     }
3006     } # OE
3007     last AFE;
3008     } elsif ($node->[0] eq '#marker') {
3009     last AFE;
3010     }
3011     } # AFE
3012    
3013 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3014 wakaba 1.3
3015    
3016     {
3017     my $el;
3018    
3019     $el = $self->{document}->create_element_ns
3020     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3021    
3022     for my $attr_name (keys %{ $token->{attributes}}) {
3023     $el->set_attribute_ns (undef, [undef, $attr_name],
3024     $token->{attributes} ->{$attr_name}->{value});
3025     }
3026    
3027     $insert->($el);
3028     push @$open_elements, [$el, $token->{tag_name}];
3029     }
3030    
3031     push @$active_formatting_elements, $open_elements->[-1];
3032    
3033     $token = $self->_get_next_token;
3034     return;
3035     } elsif ({
         ## Start tags of formatting elements other than <a>.
         ## This name list must stay identical to the formatting end tag
         ## list further down (minus |a|): an element that is pushed onto
         ## |$active_formatting_elements| here is only passed to
         ## |$formatting_end_tag| if its end tag name is in that list too.
3036     b => 1, big => 1, em => 1, font => 1, i => 1,
3037     nobr => 1, s => 1, small => 1, strike => 1,
3038     strong => 1, tt => 1, u => 1,
3039     }->{$token->{tag_name}}) {
3040 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3041 wakaba 1.3 
3042    
3043     {
3044     my $el;
3045    
3046     $el = $self->{document}->create_element_ns
3047     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3048    
3049     for my $attr_name (keys %{ $token->{attributes}}) {
3050     $el->set_attribute_ns (undef, [undef, $attr_name],
3051     $token->{attributes} ->{$attr_name}->{value});
3052     }
3053    
3054     $insert->($el);
3055     push @$open_elements, [$el, $token->{tag_name}];
3056     }
3057    
         ## The new element is also recorded as an active formatting element.
3058     push @$active_formatting_elements, $open_elements->[-1];
3059    
3060     $token = $self->_get_next_token;
3061     return;
3062     } elsif ($token->{tag_name} eq 'button') {
3063     ## has a button element in scope
3064     INSCOPE: for (reverse 0..$#$open_elements) {
3065     my $node = $open_elements->[$_];
3066     if ($node->[1] eq 'button') {
         ## Nested <button>: report a parse error, push the start tag
         ## back onto the token queue and continue with an implied
         ## </button> end tag instead.
3067     $self->{parse_error}->();
3068     unshift @{$self->{token}}, $token;
3069     $token = {type => 'end tag', tag_name => 'button'};
3070     return;
3071     } elsif ({
3072     table => 1, caption => 1, td => 1, th => 1,
3073     button => 1, marquee => 1, object => 1, html => 1,
3074     }->{$node->[1]}) {
3075     last INSCOPE;
3076     }
3077     } # INSCOPE
3078    
3079 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3080 wakaba 1.3 
3081    
3082     {
3083     my $el;
3084    
3085     $el = $self->{document}->create_element_ns
3086     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3087    
3088     for my $attr_name (keys %{ $token->{attributes}}) {
3089     $el->set_attribute_ns (undef, [undef, $attr_name],
3090     $token->{attributes} ->{$attr_name}->{value});
3091     }
3092    
3093     $insert->($el);
3094     push @$open_elements, [$el, $token->{tag_name}];
3095     }
3096    
         ## Push a marker entry onto the list of active formatting elements.
3097     push @$active_formatting_elements, ['#marker', ''];
3098    
3099     $token = $self->_get_next_token;
3100     return;
3101     } elsif ($token->{tag_name} eq 'marquee' or
3102     $token->{tag_name} eq 'object') {
         ## Insert the element and push a marker entry onto the list of
         ## active formatting elements.
3103 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3104 wakaba 1.3 
3105    
3106     {
3107     my $el;
3108    
3109     $el = $self->{document}->create_element_ns
3110     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3111    
3112     for my $attr_name (keys %{ $token->{attributes}}) {
3113     $el->set_attribute_ns (undef, [undef, $attr_name],
3114     $token->{attributes} ->{$attr_name}->{value});
3115     }
3116    
3117     $insert->($el);
3118     push @$open_elements, [$el, $token->{tag_name}];
3119     }
3120    
3121     push @$active_formatting_elements, ['#marker', ''];
3122    
3123     $token = $self->_get_next_token;
3124     return;
3125     } elsif ($token->{tag_name} eq 'xmp') {
         ## Insert the element, then set |content_model_flag| to CDATA
         ## before the next token is fetched.
3126 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3127 wakaba 1.3 
3128    
3129     {
3130     my $el;
3131    
3132     $el = $self->{document}->create_element_ns
3133     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3134    
3135     for my $attr_name (keys %{ $token->{attributes}}) {
3136     $el->set_attribute_ns (undef, [undef, $attr_name],
3137     $token->{attributes} ->{$attr_name}->{value});
3138     }
3139    
3140     $insert->($el);
3141     push @$open_elements, [$el, $token->{tag_name}];
3142     }
3143    
3144    
3145     $self->{content_model_flag} = 'CDATA';
3146    
3147     $token = $self->_get_next_token;
3148     return;
3149 wakaba 1.7 } elsif ($token->{tag_name} eq 'table') {
3150 wakaba 1.3 ## has a p element in scope
3151     INSCOPE: for (reverse @$open_elements) {
3152     if ($_->[1] eq 'p') {
3153     unshift @{$self->{token}}, $token;
3154     $token = {type => 'end tag', tag_name => 'p'};
3155     return;
3156     } elsif ({
3157     table => 1, caption => 1, td => 1, th => 1,
3158     button => 1, marquee => 1, object => 1, html => 1,
3159     }->{$_->[1]}) {
3160     last INSCOPE;
3161     }
3162     } # INSCOPE
3163    
3164    
3165     {
3166     my $el;
3167    
3168     $el = $self->{document}->create_element_ns
3169     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3170    
3171     for my $attr_name (keys %{ $token->{attributes}}) {
3172     $el->set_attribute_ns (undef, [undef, $attr_name],
3173     $token->{attributes} ->{$attr_name}->{value});
3174     }
3175    
3176     $insert->($el);
3177     push @$open_elements, [$el, $token->{tag_name}];
3178     }
3179    
3180    
3181     $insertion_mode = 'in table';
3182    
3183     $token = $self->_get_next_token;
3184     return;
3185     } elsif ({
3186     area => 1, basefont => 1, bgsound => 1, br => 1,
3187     embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
3188     image => 1,
3189     }->{$token->{tag_name}}) {
         ## These elements are inserted and then immediately popped off
         ## the stack of open elements again.  <image> is a parse error
         ## and is handled as <img>.
3190     if ($token->{tag_name} eq 'image') {
3191     $self->{parse_error}->();
3192     $token->{tag_name} = 'img';
3193     }
3194    
3195 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3196 wakaba 1.3 
3197    
3198     {
3199     my $el;
3200    
3201     $el = $self->{document}->create_element_ns
3202     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3203    
3204     for my $attr_name (keys %{ $token->{attributes}}) {
3205     $el->set_attribute_ns (undef, [undef, $attr_name],
3206     $token->{attributes} ->{$attr_name}->{value});
3207     }
3208    
3209     $insert->($el);
3210     push @$open_elements, [$el, $token->{tag_name}];
3211     }
3212    
3213     pop @$open_elements;
3214    
3215     $token = $self->_get_next_token;
3216     return;
3217     } elsif ($token->{tag_name} eq 'hr') {
3218     ## has a p element in scope
3219     INSCOPE: for (reverse @$open_elements) {
3220     if ($_->[1] eq 'p') {
3221     unshift @{$self->{token}}, $token;
3222     $token = {type => 'end tag', tag_name => 'p'};
3223     return;
3224     } elsif ({
3225     table => 1, caption => 1, td => 1, th => 1,
3226     button => 1, marquee => 1, object => 1, html => 1,
3227     }->{$_->[1]}) {
3228     last INSCOPE;
3229     }
3230     } # INSCOPE
3231    
3232    
3233     {
3234     my $el;
3235    
3236     $el = $self->{document}->create_element_ns
3237     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3238    
3239     for my $attr_name (keys %{ $token->{attributes}}) {
3240     $el->set_attribute_ns (undef, [undef, $attr_name],
3241     $token->{attributes} ->{$attr_name}->{value});
3242     }
3243    
3244     $insert->($el);
3245     push @$open_elements, [$el, $token->{tag_name}];
3246     }
3247    
3248     pop @$open_elements;
3249    
3250     $token = $self->_get_next_token;
3251     return;
3252     } elsif ($token->{tag_name} eq 'input') {
3253 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3254 wakaba 1.3 
3255    
3256     {
3257     my $el;
3258    
3259     $el = $self->{document}->create_element_ns
3260     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3261    
3262     for my $attr_name (keys %{ $token->{attributes}}) {
3263     $el->set_attribute_ns (undef, [undef, $attr_name],
3264     $token->{attributes} ->{$attr_name}->{value});
3265     }
3266    
3267     $insert->($el);
3268     push @$open_elements, [$el, $token->{tag_name}];
3269     }
3270    
3271     ## TODO: associate with $form_element if defined
3272     pop @$open_elements;
3273    
3274     $token = $self->_get_next_token;
3275     return;
3276     } elsif ($token->{tag_name} eq 'isindex') {
         ## <isindex> is always a parse error.  It is ignored when
         ## |$form_element| is defined; otherwise it is replaced by the
         ## token sequence built below, which reuses the attributes of
         ## the <isindex> token (with |name| forced to "isindex") for
         ## the generated <input>.
3277     $self->{parse_error}->();
3278    
3279     if (defined $form_element) {
3280     ## Ignore the token
3281     $token = $self->_get_next_token;
3282     return;
3283     } else {
3284     my $at = $token->{attributes};
3285     $at->{name} = {name => 'name', value => 'isindex'};
3286     my @tokens = (
3287     {type => 'start tag', tag_name => 'form'},
3288     {type => 'start tag', tag_name => 'hr'},
3289     {type => 'start tag', tag_name => 'p'},
3290     {type => 'start tag', tag_name => 'label'},
3291     {type => 'character',
3292 wakaba 1.8 data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
3293 wakaba 1.3 ## TODO: make this configurable
3294     {type => 'start tag', tag_name => 'input', attributes => $at},
3295     #{type => 'character', data => ''}, # SHOULD
3296     {type => 'end tag', tag_name => 'label'},
3297     {type => 'end tag', tag_name => 'p'},
3298     {type => 'start tag', tag_name => 'hr'},
3299     {type => 'end tag', tag_name => 'form'},
3300     );
         ## The first generated token becomes the current token; the
         ## rest are queued in front of any tokens already pending.
3301     $token = shift @tokens;
3302     unshift @{$self->{token}}, (@tokens);
3303     return;
3304     }
3305     } elsif ({
3306     textarea => 1,
3307     noembed => 1,
3308     noframes => 1,
3309     noscript => 0, ## TODO: 1 if scripting is enabled
3310     }->{$token->{tag_name}}) {
         ## The element is inserted but not pushed onto the stack of open
         ## elements; the character tokens that follow are concatenated
         ## and appended to it as text (RCDATA for <textarea>, CDATA for
         ## the others), after which the content model flag is reset to
         ## PCDATA.
3311     my $tag_name = $token->{tag_name};
3312     my $el;
3313    
3314     $el = $self->{document}->create_element_ns
3315     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3316    
3317     for my $attr_name (keys %{ $token->{attributes}}) {
3318     $el->set_attribute_ns (undef, [undef, $attr_name],
3319     $token->{attributes} ->{$attr_name}->{value});
3320     }
3321    
3322    
3323     if ($token->{tag_name} eq 'textarea') {
3324     ## TODO: form_element if defined
3325     $self->{content_model_flag} = 'RCDATA';
3326     } else {
3327     $self->{content_model_flag} = 'CDATA';
3328     }
3329    
3330     $insert->($el);
3331    
3332     my $text = '';
3333     $token = $self->_get_next_token;
3334     while ($token->{type} eq 'character') {
3335     $text .= $token->{data};
3336     $token = $self->_get_next_token;
3337     }
3338     if (length $text) {
3339     $el->manakai_append_text ($text);
3340     }
3341    
3342     $self->{content_model_flag} = 'PCDATA';
3343    
3344     if ($token->{type} eq 'end tag' and
3345     $token->{tag_name} eq $tag_name) {
3346     ## Ignore the token
3347     } else {
3348     $self->{parse_error}->();
3349     ## ISSUE: And ignore?
3350     }
3351     $token = $self->_get_next_token;
3352     return;
3353 wakaba 1.8 } elsif ($token->{tag_name} eq 'select') {
         ## Insert the element and switch to the 'in select' insertion mode.
3354     $reconstruct_active_formatting_elements->($insert_to_current);
3355 wakaba 1.3 
3356    
3357     {
3358     my $el;
3359    
3360     $el = $self->{document}->create_element_ns
3361     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3362    
3363     for my $attr_name (keys %{ $token->{attributes}}) {
3364     $el->set_attribute_ns (undef, [undef, $attr_name],
3365     $token->{attributes} ->{$attr_name}->{value});
3366     }
3367    
3368     $insert->($el);
3369     push @$open_elements, [$el, $token->{tag_name}];
3370     }
3371    
3372    
3373     $insertion_mode = 'in select';
3374     $token = $self->_get_next_token;
3375     return;
3376     } elsif ({
3377     caption => 1, col => 1, colgroup => 1, frame => 1,
3378     frameset => 1, head => 1, option => 1, optgroup => 1,
3379     tbody => 1, td => 1, tfoot => 1, th => 1,
3380     thead => 1, tr => 1,
3381     }->{$token->{tag_name}}) {
3382     $self->{parse_error}->();
3383     ## Ignore the token
3384     $token = $self->_get_next_token;
3385     return;
3386    
3387     ## ISSUE: An issue on HTML5 new elements in the spec.
3388     } else {
         ## Any other start tag: insert it as an ordinary element.
3389 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
3390 wakaba 1.3 
3391    
3392     {
3393     my $el;
3394    
3395     $el = $self->{document}->create_element_ns
3396     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3397    
3398     for my $attr_name (keys %{ $token->{attributes}}) {
3399     $el->set_attribute_ns (undef, [undef, $attr_name],
3400     $token->{attributes} ->{$attr_name}->{value});
3401     }
3402    
3403     $insert->($el);
3404     push @$open_elements, [$el, $token->{tag_name}];
3405     }
3406    
3407    
3408     $token = $self->_get_next_token;
3409     return;
3410     }
3411     } elsif ($token->{type} eq 'end tag') {
3412     if ($token->{tag_name} eq 'body') {
3413     if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
3414     ## ISSUE: There is an issue in the spec.
3415     if ($open_elements->[-1]->[1] ne 'body') {
3416     $self->{parse_error}->();
3417     }
3418     $insertion_mode = 'after body';
3419     $token = $self->_get_next_token;
3420     return;
3421     } else {
3422     $self->{parse_error}->();
3423     ## Ignore the token
3424     $token = $self->_get_next_token;
3425     return;
3426     }
3427     } elsif ($token->{tag_name} eq 'html') {
3428     if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
3429     ## ISSUE: There is an issue in the spec.
3430     if ($open_elements->[-1]->[1] ne 'body') {
3431     $self->{parse_error}->();
3432     }
3433     $insertion_mode = 'after body';
3434     ## reprocess
3435     return;
3436     } else {
3437     $self->{parse_error}->();
3438     ## Ignore the token
3439     $token = $self->_get_next_token;
3440     return;
3441     }
3442     } elsif ({
3443     address => 1, blockquote => 1, center => 1, dir => 1,
3444     div => 1, dl => 1, fieldset => 1, listing => 1,
3445     menu => 1, ol => 1, pre => 1, ul => 1,
3446     form => 1,
3447     p => 1,
3448     dd => 1, dt => 1, li => 1,
3449     button => 1, marquee => 1, object => 1,
3450     }->{$token->{tag_name}}) {
3451     ## has an element in scope
3452     my $i;
3453     INSCOPE: for (reverse 0..$#$open_elements) {
3454     my $node = $open_elements->[$_];
3455     if ($node->[1] eq $token->{tag_name}) {
3456     ## generate implied end tags
         ## (an implied end tag is not generated for the current node
         ## when it has the same name as the end tag being processed)
3457     if ({
3458     dd => ($token->{tag_name} ne 'dd'),
3459     dt => ($token->{tag_name} ne 'dt'),
3460     li => ($token->{tag_name} ne 'li'),
3461     p => ($token->{tag_name} ne 'p'),
3462     td => 1, th => 1, tr => 1,
3463     }->{$open_elements->[-1]->[1]}) {
3464     unshift @{$self->{token}}, $token;
3465     $token = {type => 'end tag',
3466     tag_name => $open_elements->[-1]->[1]}; # MUST
3467     return;
3468     }
3469     $i = $_;
3470     last INSCOPE unless $token->{tag_name} eq 'p';
3471     } elsif ({
3472     table => 1, caption => 1, td => 1, th => 1,
3473     button => 1, marquee => 1, object => 1, html => 1,
3474     }->{$node->[1]}) {
3475     last INSCOPE;
3476     }
3477     } # INSCOPE
3478    
3479     if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
3480     $self->{parse_error}->();
3481     }
3482    
         ## Pop everything from the matched element (index |$i|) upwards.
3483     splice @$open_elements, $i if defined $i;
3484     undef $form_element if $token->{tag_name} eq 'form';
3485     $clear_up_to_marker->()
3486     if {
3487     button => 1, marquee => 1, object => 1,
3488     }->{$token->{tag_name}};
3489     $token = $self->_get_next_token;
3490     return;
3491     } elsif ({
3492     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3493     }->{$token->{tag_name}}) {
3494     ## has an element in scope
3495     my $i;
3496     INSCOPE: for (reverse 0..$#$open_elements) {
3497     my $node = $open_elements->[$_];
3498     if ({
3499     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3500     }->{$node->[1]}) {
3501     ## generate implied end tags
3502     if ({
3503     dd => 1, dt => 1, li => 1, p => 1,
3504     td => 1, th => 1, tr => 1,
3505     }->{$open_elements->[-1]->[1]}) {
3506     unshift @{$self->{token}}, $token;
3507     $token = {type => 'end tag',
3508     tag_name => $open_elements->[-1]->[1]}; # MUST
3509     return;
3510     }
3511     $i = $_;
3512     last INSCOPE;
3513     } elsif ({
3514     table => 1, caption => 1, td => 1, th => 1,
3515     button => 1, marquee => 1, object => 1, html => 1,
3516     }->{$node->[1]}) {
3517     last INSCOPE;
3518     }
3519     } # INSCOPE
3520    
3521     if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
3522     $self->{parse_error}->();
3523     }
3524    
3525     splice @$open_elements, $i if defined $i;
3526     $token = $self->_get_next_token;
3527     return;
3528     } elsif ({
         ## End tags of formatting elements are delegated to
         ## |$formatting_end_tag|.  Apart from |a|, this list must name
         ## exactly the elements accepted by the formatting start tag
         ## branch above, so that every element pushed onto
         ## |$active_formatting_elements| there is closed through here.
3529     a => 1,
3530     b => 1, big => 1, em => 1, font => 1, i => 1,
3531     nobr => 1, s => 1, small => 1, strike => 1,
3532     strong => 1, tt => 1, u => 1,
3533     }->{$token->{tag_name}}) {
3534     $formatting_end_tag->($token->{tag_name});
3535     return;
3536     } elsif ({
3537     caption => 1, col => 1, colgroup => 1, frame => 1,
3538     frameset => 1, head => 1, option => 1, optgroup => 1,
3539     tbody => 1, td => 1, tfoot => 1, th => 1,
3540     thead => 1, tr => 1,
3541     area => 1, basefont => 1, bgsound => 1, br => 1,
3542     embed => 1, hr => 1, iframe => 1, image => 1,
3543     img => 1, input => 1, isindex=> 1, noembed => 1,
3544     noframes => 1, param => 1, select => 1, spacer => 1,
3545     table => 1, textarea => 1, wbr => 1,
3546     noscript => 0, ## TODO: if scripting is enabled
3547     }->{$token->{tag_name}}) {
3548     $self->{parse_error}->();
3549     ## Ignore the token
3550     $token = $self->_get_next_token;
3551     return;
3552    
3553     ## ISSUE: Issue on HTML5 new elements in spec
3554    
3555     } else {
3556     ## Step 1
3557     my $node_i = -1;
3558     my $node = $open_elements->[$node_i];
3559    
3560     ## Step 2
3561     S2: {
3562     if ($node->[1] eq $token->{tag_name}) {
3563     ## Step 1
3564     ## generate implied end tags
3565     if ({
3566     dd => 1, dt => 1, li => 1, p => 1,
3567     td => 1, th => 1, tr => 1,
3568     }->{$open_elements->[-1]->[1]}) {
3569     unshift @{$self->{token}}, $token;
3570     $token = {type => 'end tag',
3571     tag_name => $open_elements->[-1]->[1]}; # MUST
3572     return;
3573     }
3574    
3575     ## Step 2
3576     if ($token->{tag_name} ne $open_elements->[-1]->[1]) {
3577     $self->{parse_error}->();
3578     }
3579    
3580     ## Step 3
3581     splice @$open_elements, $node_i;
3582     last S2;
3583     } else {
3584     ## Step 3
3585     if (not $formatting_category->{$node->[1]} and
3586     #not $phrasing_category->{$node->[1]} and
3587     ($special_category->{$node->[1]} or
3588     $scoping_category->{$node->[1]})) {
3589     $self->{parse_error}->();
3590     ## Ignore the token
3591     $token = $self->_get_next_token;
3592     last S2;
3593     }
3594     }
3595    
3596     ## Step 4
3597     $node_i--;
3598     $node = $open_elements->[$node_i];
3599    
3600     ## Step 5;
3601     redo S2;
3602     } # S2
3603     }
3604     }
3605     }; # $in_body
3606    
3607     B: {
3608     if ($phase eq 'initial') {
3609     if ($token->{type} eq 'DOCTYPE') {
3610     if ($token->{error}) {
3611     ## ISSUE: Spec currently left this case undefined.
3612 wakaba 1.7 $self->{parse_error}-> ('missing DOCTYPE');
3613 wakaba 1.3 }
3614     my $doctype = $self->{document}->create_document_type_definition
3615     ($token->{name});
3616     $self->{document}->append_child ($doctype);
3617     $phase = 'root element';
3618     $token = $self->_get_next_token;
3619     redo B;
3620     } elsif ({
3621     comment => 1,
3622     'start tag' => 1,
3623     'end tag' => 1,
3624     'end-of-file' => 1,
3625     }->{$token->{type}}) {
3626     ## ISSUE: Spec currently left this case undefined.
3627 wakaba 1.7 $self->{parse_error}-> ('missing DOCTYPE');
3628 wakaba 1.3 $phase = 'root element';
3629     ## reprocess
3630     redo B;
3631     } elsif ($token->{type} eq 'character') {
3632     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3633     $self->{document}->manakai_append_text ($1);
3634     ## ISSUE: DOM3 Core does not allow Document > Text
3635     unless (length $token->{data}) {
3636     ## Stay in the phase
3637     $token = $self->_get_next_token;
3638     redo B;
3639     }
3640     }
3641     ## ISSUE: Spec currently left this case undefined.
3642 wakaba 1.7 $self->{parse_error}-> ('missing DOCTYPE');
3643 wakaba 1.3 $phase = 'root element';
3644     ## reprocess
3645     redo B;
3646     } else {
3647     die "$0: $token->{type}: Unknown token";
3648     }
3649     } elsif ($phase eq 'root element') {
3650     if ($token->{type} eq 'DOCTYPE') {
3651     $self->{parse_error}->();
3652     ## Ignore the token
3653     ## Stay in the phase
3654     $token = $self->_get_next_token;
3655     redo B;
3656     } elsif ($token->{type} eq 'comment') {
3657     my $comment = $self->{document}->create_comment ($token->{data});
3658     $self->{document}->append_child ($comment);
3659     ## Stay in the phase
3660     $token = $self->_get_next_token;
3661     redo B;
3662     } elsif ($token->{type} eq 'character') {
3663     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3664     $self->{document}->manakai_append_text ($1);
3665     ## ISSUE: DOM3 Core does not allow Document > Text
3666     unless (length $token->{data}) {
3667     ## Stay in the phase
3668     $token = $self->_get_next_token;
3669     redo B;
3670     }
3671     }
3672     #
3673     } elsif ({
3674     'start tag' => 1,
3675     'end tag' => 1,
3676     'end-of-file' => 1,
3677     }->{$token->{type}}) {
3678     ## ISSUE: There is an issue in the spec
3679     #
3680     } else {
3681     die "$0: $token->{type}: Unknown token";
3682     }
3683     my $root_element;
3684     $root_element = $self->{document}->create_element_ns
3685     (q<http://www.w3.org/1999/xhtml>, [undef, 'html']);
3686    
3687     $self->{document}->append_child ($root_element);
3688     $open_elements = [[$root_element, 'html']];
3689     $phase = 'main';
3690     ## reprocess
3691     redo B;
3692     } elsif ($phase eq 'main') {
3693     if ($token->{type} eq 'DOCTYPE') {
3694     $self->{parse_error}->();
3695     ## Ignore the token
3696     ## Stay in the phase
3697     $token = $self->_get_next_token;
3698     redo B;
3699     } elsif ($token->{type} eq 'start tag' and
3700     $token->{tag_name} eq 'html') {
3701     ## TODO: unless it is the first start tag token, parse-error
3702     my $top_el = $open_elements->[0]->[0];
3703     for my $attr_name (keys %{$token->{attributes}}) {
3704     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3705 wakaba 1.8 $top_el->set_attribute_ns
3706     (undef, [undef, $attr_name],
3707     $token->{attributes}->{$attr_name}->{value});
3708 wakaba 1.3 }
3709     }
3710     $token = $self->_get_next_token;
3711     redo B;
3712     } elsif ($token->{type} eq 'end-of-file') {
3713     ## Generate implied end tags
3714     if ({
3715     dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3716     }->{$open_elements->[-1]->[1]}) {
3717     unshift @{$self->{token}}, $token;
3718     $token = {type => 'end tag', tag_name => $open_elements->[-1]->[1]};
3719     redo B;
3720     }
3721    
3722     if (@$open_elements > 2 or
3723     (@$open_elements == 2 and $open_elements->[1]->[1] ne 'body')) {
3724     $self->{parse_error}->();
3725     } else {
3726     ## TODO: inner_html parser and @$open_elements > 1 and $open_elements->[1] ne 'body', then parse-error
3727     }
3728    
3729     ## Stop parsing
3730     last B;
3731    
3732     ## ISSUE: There is an issue in the spec.
3733     } else {
3734     if ($insertion_mode eq 'before head') {
3735     if ($token->{type} eq 'character') {
3736     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3737     $open_elements->[-1]->[0]->manakai_append_text ($1);
3738     unless (length $token->{data}) {
3739     $token = $self->_get_next_token;
3740     redo B;
3741     }
3742     }
3743     ## As if <head>
3744    
3745     $head_element = $self->{document}->create_element_ns
3746     (q<http://www.w3.org/1999/xhtml>, [undef, 'head']);
3747    
3748     $open_elements->[-1]->[0]->append_child ($head_element);
3749     push @$open_elements, [$head_element, 'head'];
3750     $insertion_mode = 'in head';
3751     ## reprocess
3752     redo B;
3753     } elsif ($token->{type} eq 'comment') {
3754     my $comment = $self->{document}->create_comment ($token->{data});
3755     $open_elements->[-1]->[0]->append_child ($comment);
3756     $token = $self->_get_next_token;
3757     redo B;
3758     } elsif ($token->{type} eq 'start tag') {
3759     my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
3760    
3761     $head_element = $self->{document}->create_element_ns
3762     (q<http://www.w3.org/1999/xhtml>, [undef, 'head']);
3763    
3764     for my $attr_name (keys %{ $attr}) {
3765     $head_element->set_attribute_ns (undef, [undef, $attr_name],
3766     $attr ->{$attr_name}->{value});
3767     }
3768    
3769     $open_elements->[-1]->[0]->append_child ($head_element);
3770     push @$open_elements, [$head_element, 'head'];
3771     $insertion_mode = 'in head';
3772     if ($token->{tag_name} eq 'head') {
3773     $token = $self->_get_next_token;
3774     #} elsif ({
3775     # base => 1, link => 1, meta => 1,
3776     # script => 1, style => 1, title => 1,
3777     # }->{$token->{tag_name}}) {
3778     # ## reprocess
3779     } else {
3780     ## reprocess
3781     }
3782     redo B;
3783     } elsif ($token->{type} eq 'end tag') {
3784     if ($token->{tag_name} eq 'html') {
3785     ## As if <head>
3786    
3787     $head_element = $self->{document}->create_element_ns
3788     (q<http://www.w3.org/1999/xhtml>, [undef, 'head']);
3789    
3790     $open_elements->[-1]->[0]->append_child ($head_element);
3791     push @$open_elements, [$head_element, 'head'];
3792     $insertion_mode = 'in head';
3793     ## reprocess
3794     redo B;
3795     } else {
3796     $self->{parse_error}->();
3797     ## Ignore the token
3798 wakaba 1.7 $token = $self->_get_next_token;
3799 wakaba 1.3 redo B;
3800     }
3801     } else {
3802     die "$0: $token->{type}: Unknown type";
3803     }
3804     } elsif ($insertion_mode eq 'in head') {
3805     if ($token->{type} eq 'character') {
3806     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3807     $open_elements->[-1]->[0]->manakai_append_text ($1);
3808     unless (length $token->{data}) {
3809     $token = $self->_get_next_token;
3810     redo B;
3811     }
3812     }
3813    
3814     #
3815     } elsif ($token->{type} eq 'comment') {
3816     my $comment = $self->{document}->create_comment ($token->{data});
3817     $open_elements->[-1]->[0]->append_child ($comment);
3818     $token = $self->_get_next_token;
3819     redo B;
3820     } elsif ($token->{type} eq 'start tag') {
3821     if ($token->{tag_name} eq 'title') {
3822 wakaba 1.8 ## NOTE: There is an "as if in head" code clone
3823     my $title_el;
3824    
3825 wakaba 1.3 $title_el = $self->{document}->create_element_ns
3826     (q<http://www.w3.org/1999/xhtml>, [undef, 'title']);
3827    
3828 wakaba 1.8 for my $attr_name (keys %{ $token->{attributes}}) {
3829     $title_el->set_attribute_ns (undef, [undef, $attr_name],
3830     $token->{attributes} ->{$attr_name}->{value});
3831     }
3832    
3833 wakaba 1.3 (defined $head_element ? $head_element : $open_elements->[-1]->[0])
3834     ->append_child ($title_el);
3835     $self->{content_model_flag} = 'RCDATA';
3836 wakaba 1.8
3837 wakaba 1.3 my $text = '';
3838     $token = $self->_get_next_token;
3839     while ($token->{type} eq 'character') {
3840     $text .= $token->{data};
3841     $token = $self->_get_next_token;
3842     }
3843     if (length $text) {
3844     $title_el->manakai_append_text ($text);
3845     }
3846    
3847     $self->{content_model_flag} = 'PCDATA';
3848    
3849     if ($token->{type} eq 'end tag' and
3850     $token->{tag_name} eq 'title') {
3851     ## Ignore the token
3852     } else {
3853     $self->{parse_error}->();
3854     ## ISSUE: And ignore?
3855     }
3856     $token = $self->_get_next_token;
3857     redo B;
3858     } elsif ($token->{tag_name} eq 'style') {
3859     $style_start_tag->();
3860     redo B;
3861     } elsif ($token->{tag_name} eq 'script') {
3862     $script_start_tag->();
3863     redo B;
3864     } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
3865     ## NOTE: There are "as if in head" code clones
3866     my $el;
3867    
3868     $el = $self->{document}->create_element_ns
3869     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3870    
3871     for my $attr_name (keys %{ $token->{attributes}}) {
3872     $el->set_attribute_ns (undef, [undef, $attr_name],
3873     $token->{attributes} ->{$attr_name}->{value});
3874     }
3875    
3876     (defined $head_element ? $head_element : $open_elements->[-1]->[0])
3877     ->append_child ($el);
3878    
3879     $token = $self->_get_next_token;
3880     redo B;
3881     } elsif ($token->{tag_name} eq 'head') {
3882     $self->{parse_error}->();
3883     ## Ignore the token
3884     $token = $self->_get_next_token;
3885     redo B;
3886     } else {
3887     #
3888     }
3889     } elsif ($token->{type} eq 'end tag') {
3890     if ($token->{tag_name} eq 'head') {
3891     if ($open_elements->[-1]->[1] eq 'head') {
3892     pop @$open_elements;
3893     } else {
3894     $self->{parse_error}->();
3895     }
3896     $insertion_mode = 'after head';
3897     $token = $self->_get_next_token;
3898     redo B;
3899     } elsif ($token->{tag_name} eq 'html') {
3900     #
3901     } else {
3902     $self->{parse_error}->();
3903     ## Ignore the token
3904     $token = $self->_get_next_token;
3905     redo B;
3906     }
3907     } else {
3908     #
3909     }
3910    
3911     if ($open_elements->[-1]->[1] eq 'head') {
3912     ## As if </head>
3913     pop @$open_elements;
3914     }
3915     $insertion_mode = 'after head';
3916     ## reprocess
3917     redo B;
3918    
3919     ## ISSUE: An issue in the spec.
3920     } elsif ($insertion_mode eq 'after head') {
3921     if ($token->{type} eq 'character') {
3922     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3923     $open_elements->[-1]->[0]->manakai_append_text ($1);
3924     unless (length $token->{data}) {
3925     $token = $self->_get_next_token;
3926     redo B;
3927     }
3928     }
3929    
3930     #
3931     } elsif ($token->{type} eq 'comment') {
3932     my $comment = $self->{document}->create_comment ($token->{data});
3933     $open_elements->[-1]->[0]->append_child ($comment);
3934     $token = $self->_get_next_token;
3935     redo B;
3936     } elsif ($token->{type} eq 'start tag') {
3937     if ($token->{tag_name} eq 'body') {
3938    
3939     {
3940     my $el;
3941    
3942     $el = $self->{document}->create_element_ns
3943     (q<http://www.w3.org/1999/xhtml>, [undef, 'body']);
3944    
3945     for my $attr_name (keys %{ $token->{attributes}}) {
3946     $el->set_attribute_ns (undef, [undef, $attr_name],
3947     $token->{attributes} ->{$attr_name}->{value});
3948     }
3949    
3950     $open_elements->[-1]->[0]->append_child ($el);
3951     push @$open_elements, [$el, 'body'];
3952     }
3953    
3954     $insertion_mode = 'in body';
3955     $token = $self->_get_next_token;
3956     redo B;
3957     } elsif ($token->{tag_name} eq 'frameset') {
3958    
3959     {
3960     my $el;
3961    
3962     $el = $self->{document}->create_element_ns
3963     (q<http://www.w3.org/1999/xhtml>, [undef, 'frameset']);
3964    
3965     for my $attr_name (keys %{ $token->{attributes}}) {
3966     $el->set_attribute_ns (undef, [undef, $attr_name],
3967     $token->{attributes} ->{$attr_name}->{value});
3968     }
3969    
3970     $open_elements->[-1]->[0]->append_child ($el);
3971     push @$open_elements, [$el, 'frameset'];
3972     }
3973    
3974     $insertion_mode = 'in frameset';
3975     $token = $self->_get_next_token;
3976     redo B;
3977     } elsif ({
3978     base => 1, link => 1, meta => 1,
3979     script=> 1, style => 1, title => 1,
3980     }->{$token->{tag_name}}) {
3981     $self->{parse_error}->();
3982     $insertion_mode = 'in head';
3983     ## reprocess
3984     redo B;
3985     } else {
3986     #
3987     }
3988     } else {
3989     #
3990     }
3991    
3992     ## As if <body>
3993    
3994     {
3995     my $el;
3996    
3997     $el = $self->{document}->create_element_ns
3998     (q<http://www.w3.org/1999/xhtml>, [undef, 'body']);
3999    
4000     $open_elements->[-1]->[0]->append_child ($el);
4001     push @$open_elements, [$el, 'body'];
4002     }
4003    
4004     $insertion_mode = 'in body';
4005     ## reprocess
4006     redo B;
4007     } elsif ($insertion_mode eq 'in body') {
4008     if ($token->{type} eq 'character') {
4009     ## NOTE: There is a code clone of "character in body".
4010 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
4011 wakaba 1.3
4012     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4013    
4014     $token = $self->_get_next_token;
4015     redo B;
4016     } elsif ($token->{type} eq 'comment') {
4017     ## NOTE: There is a code clone of "comment in body".
4018     my $comment = $self->{document}->create_comment ($token->{data});
4019     $open_elements->[-1]->[0]->append_child ($comment);
4020     $token = $self->_get_next_token;
4021     redo B;
4022     } else {
4023 wakaba 1.8 $in_body->($insert_to_current);
4024 wakaba 1.3 redo B;
4025     }
4026     } elsif ($insertion_mode eq 'in table') {
4027     if ($token->{type} eq 'character') {
4028 wakaba 1.8 ## NOTE: There are "character in table" code clones.
4029     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4030     $open_elements->[-1]->[0]->manakai_append_text ($1);
4031    
4032     unless (length $token->{data}) {
4033     $token = $self->_get_next_token;
4034     redo B;
4035     }
4036     }
4037 wakaba 1.3
4038 wakaba 1.8 ## As if in body, but insert into foster parent element
4039     ## ISSUE: Spec says that "whenever a node would be inserted
4040     ## into the current node" while characters might not
4041     ## result in a new Text node.
4042     $reconstruct_active_formatting_elements->($insert_to_foster);
4043    
4044     if ({
4045     table => 1, tbody => 1, tfoot => 1,
4046     thead => 1, tr => 1,
4047     }->{$open_elements->[-1]->[1]}) {
4048     # MUST
4049     my $foster_parent_element;
4050     my $next_sibling;
4051     my $prev_sibling;
4052     OE: for (reverse 0..$#$open_elements) {
4053     if ($open_elements->[$_]->[1] eq 'table') {
4054     my $parent = $open_elements->[$_]->[0]->parent_node;
4055     if (defined $parent and $parent->node_type == 1) {
4056     $foster_parent_element = $parent;
4057     $next_sibling = $open_elements->[$_]->[0];
4058     $prev_sibling = $next_sibling->previous_sibling;
4059     } else {
4060     $foster_parent_element = $open_elements->[$_ - 1]->[0];
4061     $prev_sibling = $foster_parent_element->last_child;
4062     }
4063     last OE;
4064     }
4065     } # OE
4066     $foster_parent_element = $open_elements->[0]->[0] and
4067     $prev_sibling = $foster_parent_element->last_child
4068     unless defined $foster_parent_element;
4069     if (defined $prev_sibling and
4070     $prev_sibling->node_type == 3) {
4071     $prev_sibling->manakai_append_text ($token->{data});
4072     } else {
4073     $foster_parent_element->insert_before
4074     ($self->{document}->create_text_node ($token->{data}),
4075     $next_sibling);
4076     }
4077     } else {
4078     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4079     }
4080    
4081 wakaba 1.3 $token = $self->_get_next_token;
4082     redo B;
4083     } elsif ($token->{type} eq 'comment') {
4084     my $comment = $self->{document}->create_comment ($token->{data});
4085     $open_elements->[-1]->[0]->append_child ($comment);
4086     $token = $self->_get_next_token;
4087     redo B;
4088     } elsif ($token->{type} eq 'start tag') {
4089     if ({
4090     caption => 1,
4091     colgroup => 1,
4092     tbody => 1, tfoot => 1, thead => 1,
4093     }->{$token->{tag_name}}) {
4094     ## Clear back to table context
4095     while ($open_elements->[-1]->[1] ne 'table' and
4096     $open_elements->[-1]->[1] ne 'html') {
4097     $self->{parse_error}->();
4098     pop @$open_elements;
4099     }
4100    
4101     push @$active_formatting_elements, ['#marker', '']
4102     if $token->{tag_name} eq 'caption';
4103    
4104    
4105     {
4106     my $el;
4107    
4108     $el = $self->{document}->create_element_ns
4109     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4110    
4111     for my $attr_name (keys %{ $token->{attributes}}) {
4112     $el->set_attribute_ns (undef, [undef, $attr_name],
4113     $token->{attributes} ->{$attr_name}->{value});
4114     }
4115    
4116     $open_elements->[-1]->[0]->append_child ($el);
4117     push @$open_elements, [$el, $token->{tag_name}];
4118     }
4119    
4120     $insertion_mode = {
4121     caption => 'in caption',
4122     colgroup => 'in column group',
4123     tbody => 'in table body',
4124     tfoot => 'in table body',
4125     thead => 'in table body',
4126     }->{$token->{tag_name}};
4127     $token = $self->_get_next_token;
4128     redo B;
4129     } elsif ({
4130     col => 1,
4131     td => 1, th => 1, tr => 1,
4132     }->{$token->{tag_name}}) {
4133     ## Clear back to table context
4134     while ($open_elements->[-1]->[1] ne 'table' and
4135     $open_elements->[-1]->[1] ne 'html') {
4136     $self->{parse_error}->();
4137     pop @$open_elements;
4138     }
4139    
4140    
4141     {
4142     my $el;
4143    
4144     $el = $self->{document}->create_element_ns
4145     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name} eq 'col' ? 'colgroup' : 'tbody']);
4146    
4147     $open_elements->[-1]->[0]->append_child ($el);
4148     push @$open_elements, [$el, $token->{tag_name} eq 'col' ? 'colgroup' : 'tbody'];
4149     }
4150    
4151     $insertion_mode = $token->{tag_name} eq 'col'
4152     ? 'in column group' : 'in table body';
4153     ## reprocess
4154     redo B;
4155     } elsif ($token->{tag_name} eq 'table') {
4156     ## NOTE: There are code clones for this "table in table"
4157     $self->{parse_error}->();
4158    
4159     ## As if </table>
4160     ## have a table element in table scope
4161     my $i;
4162     INSCOPE: for (reverse 0..$#$open_elements) {
4163     my $node = $open_elements->[$_];
4164     if ($node->[1] eq 'table') {
4165     $i = $_;
4166     last INSCOPE;
4167     } elsif ({
4168     table => 1, html => 1,
4169     }->{$node->[1]}) {
4170     last INSCOPE;
4171     }
4172     } # INSCOPE
4173     unless (defined $i) {
4174     $self->{parse_error}->();
4175     ## Ignore tokens </table><table>
4176     $token = $self->_get_next_token;
4177     redo B;
4178     }
4179    
4180     ## generate implied end tags
4181     if ({
4182     dd => 1, dt => 1, li => 1, p => 1,
4183     td => 1, th => 1, tr => 1,
4184     }->{$open_elements->[-1]->[1]}) {
4185     unshift @{$self->{token}}, $token; # <table>
4186     $token = {type => 'end tag', tag_name => 'table'};
4187     unshift @{$self->{token}}, $token;
4188     $token = {type => 'end tag',
4189     tag_name => $open_elements->[-1]->[1]}; # MUST
4190     redo B;
4191     }
4192    
4193     if ($open_elements->[-1]->[1] ne 'table') {
4194     $self->{parse_error}->();
4195     }
4196    
4197     splice @$open_elements, $i;
4198    
4199     $reset_insertion_mode->();
4200    
4201     ## reprocess
4202     redo B;
4203     } else {
4204     #
4205     }
4206     } elsif ($token->{type} eq 'end tag') {
4207     if ($token->{tag_name} eq 'table') {
4208     ## have a table element in table scope
4209     my $i;
4210     INSCOPE: for (reverse 0..$#$open_elements) {
4211     my $node = $open_elements->[$_];
4212     if ($node->[1] eq $token->{tag_name}) {
4213     $i = $_;
4214     last INSCOPE;
4215     } elsif ({
4216     table => 1, html => 1,
4217     }->{$node->[1]}) {
4218     last INSCOPE;
4219     }
4220     } # INSCOPE
4221     unless (defined $i) {
4222     $self->{parse_error}->();
4223     ## Ignore the token
4224     $token = $self->_get_next_token;
4225     redo B;
4226     }
4227    
4228     ## generate implied end tags
4229     if ({
4230     dd => 1, dt => 1, li => 1, p => 1,
4231     td => 1, th => 1, tr => 1,
4232     }->{$open_elements->[-1]->[1]}) {
4233     unshift @{$self->{token}}, $token;
4234     $token = {type => 'end tag',
4235     tag_name => $open_elements->[-1]->[1]}; # MUST
4236     redo B;
4237     }
4238    
4239     if ($open_elements->[-1]->[1] ne 'table') {
4240     $self->{parse_error}->();
4241     }
4242    
4243     splice @$open_elements, $i;
4244    
4245     $reset_insertion_mode->();
4246    
4247     $token = $self->_get_next_token;
4248     redo B;
4249     } elsif ({
4250     body => 1, caption => 1, col => 1, colgroup => 1,
4251     html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
4252     thead => 1, tr => 1,
4253     }->{$token->{tag_name}}) {
4254     $self->{parse_error}->();
4255     ## Ignore the token
4256     $token = $self->_get_next_token;
4257     redo B;
4258     } else {
4259     #
4260     }
4261     } else {
4262     #
4263     }
4264    
4265     $self->{parse_error}->();
4266 wakaba 1.8 $in_body->($insert_to_foster);
4267 wakaba 1.3 redo B;
4268     } elsif ($insertion_mode eq 'in caption') {
4269 wakaba 1.7 if ($token->{type} eq 'character') {
4270     ## NOTE: This is a code clone of "character in body".
4271 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
4272 wakaba 1.7
4273     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4274    
4275     $token = $self->_get_next_token;
4276     redo B;
4277     } elsif ($token->{type} eq 'comment') {
4278     ## NOTE: This is a code clone of "comment in body".
4279     my $comment = $self->{document}->create_comment ($token->{data});
4280     $open_elements->[-1]->[0]->append_child ($comment);
4281     $token = $self->_get_next_token;
4282     redo B;
4283     } elsif ($token->{type} eq 'start tag') {
4284 wakaba 1.3 if ({
4285     caption => 1, col => 1, colgroup => 1, tbody => 1,
4286     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4287     }->{$token->{tag_name}}) {
4288     $self->{parse_error}->();
4289    
4290     ## As if </caption>
4291     ## have a table element in table scope
4292     my $i;
4293     INSCOPE: for (reverse 0..$#$open_elements) {
4294     my $node = $open_elements->[$_];
4295     if ($node->[1] eq 'caption') {
4296     $i = $_;
4297     last INSCOPE;
4298     } elsif ({
4299     table => 1, html => 1,
4300     }->{$node->[1]}) {
4301     last INSCOPE;
4302     }
4303     } # INSCOPE
4304     unless (defined $i) {
4305     $self->{parse_error}->();
4306     ## Ignore the token
4307     $token = $self->_get_next_token;
4308     redo B;
4309     }
4310    
4311     ## generate implied end tags
4312     if ({
4313     dd => 1, dt => 1, li => 1, p => 1,
4314     td => 1, th => 1, tr => 1,
4315     }->{$open_elements->[-1]->[1]}) {
4316     unshift @{$self->{token}}, $token; # <?>
4317     $token = {type => 'end tag', tag_name => 'caption'};
4318     unshift @{$self->{token}}, $token;
4319     $token = {type => 'end tag',
4320     tag_name => $open_elements->[-1]->[1]}; # MUST
4321     redo B;
4322     }
4323    
4324     if ($open_elements->[-1]->[1] ne 'caption') {
4325     $self->{parse_error}->();
4326     }
4327    
4328     splice @$open_elements, $i;
4329    
4330     $clear_up_to_marker->();
4331    
4332     $insertion_mode = 'in table';
4333    
4334     ## reprocess
4335     redo B;
4336     } else {
4337     #
4338     }
4339     } elsif ($token->{type} eq 'end tag') {
4340     if ($token->{tag_name} eq 'caption') {
4341     ## have a table element in table scope
4342     my $i;
4343     INSCOPE: for (reverse 0..$#$open_elements) {
4344     my $node = $open_elements->[$_];
4345     if ($node->[1] eq $token->{tag_name}) {
4346     $i = $_;
4347     last INSCOPE;
4348     } elsif ({
4349     table => 1, html => 1,
4350     }->{$node->[1]}) {
4351     last INSCOPE;
4352     }
4353     } # INSCOPE
4354     unless (defined $i) {
4355     $self->{parse_error}->();
4356     ## Ignore the token
4357     $token = $self->_get_next_token;
4358     redo B;
4359     }
4360    
4361     ## generate implied end tags
4362     if ({
4363     dd => 1, dt => 1, li => 1, p => 1,
4364     td => 1, th => 1, tr => 1,
4365     }->{$open_elements->[-1]->[1]}) {
4366     unshift @{$self->{token}}, $token;
4367     $token = {type => 'end tag',
4368     tag_name => $open_elements->[-1]->[1]}; # MUST
4369     redo B;
4370     }
4371    
4372     if ($open_elements->[-1]->[1] ne 'caption') {
4373     $self->{parse_error}->();
4374     }
4375    
4376     splice @$open_elements, $i;
4377    
4378     $clear_up_to_marker->();
4379    
4380     $insertion_mode = 'in table';
4381    
4382     $token = $self->_get_next_token;
4383     redo B;
4384     } elsif ($token->{tag_name} eq 'table') {
4385     $self->{parse_error}->();
4386    
4387     ## As if </caption>
4388     ## have a table element in table scope
4389     my $i;
4390     INSCOPE: for (reverse 0..$#$open_elements) {
4391     my $node = $open_elements->[$_];
4392     if ($node->[1] eq 'caption') {
4393     $i = $_;
4394     last INSCOPE;
4395     } elsif ({
4396     table => 1, html => 1,
4397     }->{$node->[1]}) {
4398     last INSCOPE;
4399     }
4400     } # INSCOPE
4401     unless (defined $i) {
4402     $self->{parse_error}->();
4403     ## Ignore the token
4404     $token = $self->_get_next_token;
4405     redo B;
4406     }
4407    
4408     ## generate implied end tags
4409     if ({
4410     dd => 1, dt => 1, li => 1, p => 1,
4411     td => 1, th => 1, tr => 1,
4412     }->{$open_elements->[-1]->[1]}) {
4413     unshift @{$self->{token}}, $token; # </table>
4414     $token = {type => 'end tag', tag_name => 'caption'};
4415     unshift @{$self->{token}}, $token;
4416     $token = {type => 'end tag',
4417     tag_name => $open_elements->[-1]->[1]}; # MUST
4418     redo B;
4419     }
4420    
4421     if ($open_elements->[-1]->[1] ne 'caption') {
4422     $self->{parse_error}->();
4423     }
4424    
4425     splice @$open_elements, $i;
4426    
4427     $clear_up_to_marker->();
4428    
4429     $insertion_mode = 'in table';
4430    
4431     ## reprocess
4432     redo B;
4433     } elsif ({
4434     body => 1, col => 1, colgroup => 1,
4435     html => 1, tbody => 1, td => 1, tfoot => 1,
4436     th => 1, thead => 1, tr => 1,
4437     }->{$token->{tag_name}}) {
4438     $self->{parse_error}->();
4439     ## Ignore the token
         ## The next token has to be fetched before looping: "redo B"
         ## re-enters the loop body with the current $token, so without
         ## this the same stray end tag would be reprocessed (and
         ## reported as a parse error) forever.
         $token = $self->_get_next_token;
4440     redo B;
4441     } else {
4442     #
4443     }
4444     } else {
4445     #
4446     }
4447    
4448 wakaba 1.8 $in_body->($insert_to_current);
4449 wakaba 1.3 redo B;
4450     } elsif ($insertion_mode eq 'in column group') {
4451     if ($token->{type} eq 'character') {
4452     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4453     $open_elements->[-1]->[0]->manakai_append_text ($1);
4454     unless (length $token->{data}) {
4455     $token = $self->_get_next_token;
4456     redo B;
4457     }
4458     }
4459    
4460     #
4461     } elsif ($token->{type} eq 'comment') {
4462     my $comment = $self->{document}->create_comment ($token->{data});
4463     $open_elements->[-1]->[0]->append_child ($comment);
4464     $token = $self->_get_next_token;
4465     redo B;
4466     } elsif ($token->{type} eq 'start tag') {
4467     if ($token->{tag_name} eq 'col') {
4468    
4469     {
4470     my $el;
4471    
4472     $el = $self->{document}->create_element_ns
4473     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4474    
4475     for my $attr_name (keys %{ $token->{attributes}}) {
4476     $el->set_attribute_ns (undef, [undef, $attr_name],
4477     $token->{attributes} ->{$attr_name}->{value});
4478     }
4479    
4480     $open_elements->[-1]->[0]->append_child ($el);
4481     push @$open_elements, [$el, $token->{tag_name}];
4482     }
4483    
4484     pop @$open_elements;
4485     $token = $self->_get_next_token;
4486     redo B;
4487     } else {
4488     #
4489     }
4490     } elsif ($token->{type} eq 'end tag') {
4491     if ($token->{tag_name} eq 'colgroup') {
4492     if ($open_elements->[-1]->[1] eq 'html') {
4493     $self->{parse_error}->();
4494     ## Ignore the token
4495     $token = $self->_get_next_token;
4496     redo B;
4497     } else {
4498     pop @$open_elements; # colgroup
4499     $insertion_mode = 'in table';
4500     $token = $self->_get_next_token;
4501     redo B;
4502     }
4503     } elsif ($token->{tag_name} eq 'col') {
4504     $self->{parse_error}->();
4505     ## Ignore the token
4506     $token = $self->_get_next_token;
4507     redo B;
4508     } else {
4509     #
4510     }
4511     } else {
4512     #
4513     }
4514    
4515     ## As if </colgroup>
4516     if ($open_elements->[-1]->[1] eq 'html') {
4517     $self->{parse_error}->();
4518     ## Ignore the token
4519     $token = $self->_get_next_token;
4520     redo B;
4521     } else {
4522     pop @$open_elements; # colgroup
4523     $insertion_mode = 'in table';
4524     ## reprocess
4525     redo B;
4526     }
4527     } elsif ($insertion_mode eq 'in table body') {
4528     if ($token->{type} eq 'character') {
4529 wakaba 1.8 ## NOTE: This is a "character in table" code clone.
4530     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4531     $open_elements->[-1]->[0]->manakai_append_text ($1);
4532    
4533     unless (length $token->{data}) {
4534     $token = $self->_get_next_token;
4535     redo B;
4536     }
4537     }
4538 wakaba 1.3
4539 wakaba 1.8 ## As if in body, but insert into foster parent element
4540     ## ISSUE: Spec says that "whenever a node would be inserted
4541     ## into the current node" while characters might not
4542     ## result in a new Text node.
4543     $reconstruct_active_formatting_elements->($insert_to_foster);
4544 wakaba 1.3
4545 wakaba 1.8 if ({
4546     table => 1, tbody => 1, tfoot => 1,
4547     thead => 1, tr => 1,
4548     }->{$open_elements->[-1]->[1]}) {
4549     # MUST
4550     my $foster_parent_element;
4551     my $next_sibling;
4552     my $prev_sibling;
4553     OE: for (reverse 0..$#$open_elements) {
4554     if ($open_elements->[$_]->[1] eq 'table') {
4555     my $parent = $open_elements->[$_]->[0]->parent_node;
4556     if (defined $parent and $parent->node_type == 1) {
4557     $foster_parent_element = $parent;
4558     $next_sibling = $open_elements->[$_]->[0];
4559     $prev_sibling = $next_sibling->previous_sibling;
4560     } else {
4561     $foster_parent_element = $open_elements->[$_ - 1]->[0];
4562     $prev_sibling = $foster_parent_element->last_child;
4563     }
4564     last OE;
4565     }
4566     } # OE
4567     $foster_parent_element = $open_elements->[0]->[0] and
4568     $prev_sibling = $foster_parent_element->last_child
4569     unless defined $foster_parent_element;
4570     if (defined $prev_sibling and
4571     $prev_sibling->node_type == 3) {
4572     $prev_sibling->manakai_append_text ($token->{data});
4573     } else {
4574     $foster_parent_element->insert_before
4575     ($self->{document}->create_text_node ($token->{data}),
4576     $next_sibling);
4577     }
4578     } else {
4579     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4580     }
4581    
4582 wakaba 1.3 $token = $self->_get_next_token;
4583     redo B;
4584     } elsif ($token->{type} eq 'comment') {
4585     ## Copied from 'in table'
4586     my $comment = $self->{document}->create_comment ($token->{data});
4587     $open_elements->[-1]->[0]->append_child ($comment);
4588     $token = $self->_get_next_token;
4589     redo B;
4590     } elsif ($token->{type} eq 'start tag') {
4591     if ({
4592     tr => 1,
4593     th => 1, td => 1,
4594     }->{$token->{tag_name}}) {
4595     ## Clear back to table body context
4596     while (not {
4597     tbody => 1, tfoot => 1, thead => 1, html => 1,
4598     }->{$open_elements->[-1]->[1]}) {
4599     $self->{parse_error}->();
4600     pop @$open_elements;
4601     }
4602    
4603     $insertion_mode = 'in row';
4604     if ($token->{tag_name} eq 'tr') {
4605    
4606     {
4607     my $el;
4608    
4609     $el = $self->{document}->create_element_ns
4610     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4611    
4612     for my $attr_name (keys %{ $token->{attributes}}) {
4613     $el->set_attribute_ns (undef, [undef, $attr_name],
4614     $token->{attributes} ->{$attr_name}->{value});
4615     }
4616    
4617     $open_elements->[-1]->[0]->append_child ($el);
4618     push @$open_elements, [$el, $token->{tag_name}];
4619     }
4620    
4621     $token = $self->_get_next_token;
4622     } else {
4623    
4624     {
4625     my $el;
4626    
4627     $el = $self->{document}->create_element_ns
4628     (q<http://www.w3.org/1999/xhtml>, [undef, 'tr']);
4629    
4630     $open_elements->[-1]->[0]->append_child ($el);
4631     push @$open_elements, [$el, 'tr'];
4632     }
4633    
4634     ## reprocess
4635     }
4636     redo B;
4637     } elsif ({
4638     caption => 1, col => 1, colgroup => 1,
4639     tbody => 1, tfoot => 1, thead => 1,
4640     }->{$token->{tag_name}}) {
4641     ## have an element in table scope
4642     my $i;
4643     INSCOPE: for (reverse 0..$#$open_elements) {
4644     my $node = $open_elements->[$_];
4645     if ({
4646     tbody => 1, thead => 1, tfoot => 1,
4647     }->{$node->[1]}) {
4648     $i = $_;
4649     last INSCOPE;
4650     } elsif ({
4651     table => 1, html => 1,
4652     }->{$node->[1]}) {
4653     last INSCOPE;
4654     }
4655     } # INSCOPE
4656     unless (defined $i) {
4657     $self->{parse_error}->();
4658     ## Ignore the token
4659     $token = $self->_get_next_token;
4660     redo B;
4661     }
4662    
4663     ## Clear back to table body context
4664     while (not {
4665     tbody => 1, tfoot => 1, thead => 1, html => 1,
4666     }->{$open_elements->[-1]->[1]}) {
4667     $self->{parse_error}->();
4668     pop @$open_elements;
4669     }
4670    
4671     ## As if <{current node}>
4672     ## have an element in table scope
4673     ## true by definition
4674    
4675     ## Clear back to table body context
4676     ## nop by definition
4677    
4678     pop @$open_elements;
4679     $insertion_mode = 'in table';
4680     ## reprocess
4681     redo B;
4682     } elsif ($token->{tag_name} eq 'table') {
4683     ## NOTE: This is a code clone of "table in table"
4684     $self->{parse_error}->();
4685    
4686     ## As if </table>
4687     ## have a table element in table scope
4688     my $i;
4689     INSCOPE: for (reverse 0..$#$open_elements) {
4690     my $node = $open_elements->[$_];
4691     if ($node->[1] eq 'table') {
4692     $i = $_;
4693     last INSCOPE;
4694     } elsif ({
4695     table => 1, html => 1,
4696     }->{$node->[1]}) {
4697     last INSCOPE;
4698     }
4699     } # INSCOPE
4700     unless (defined $i) {
4701     $self->{parse_error}->();
4702     ## Ignore tokens </table><table>
4703     $token = $self->_get_next_token;
4704     redo B;
4705     }
4706    
4707     ## generate implied end tags
4708     if ({
4709     dd => 1, dt => 1, li => 1, p => 1,
4710     td => 1, th => 1, tr => 1,
4711     }->{$open_elements->[-1]->[1]}) {
4712     unshift @{$self->{token}}, $token; # <table>
4713     $token = {type => 'end tag', tag_name => 'table'};
4714     unshift @{$self->{token}}, $token;
4715     $token = {type => 'end tag',
4716     tag_name => $open_elements->[-1]->[1]}; # MUST
4717     redo B;
4718     }
4719    
4720     if ($open_elements->[-1]->[1] ne 'table') {
4721     $self->{parse_error}->();
4722     }
4723    
4724     splice @$open_elements, $i;
4725    
4726     $reset_insertion_mode->();
4727    
4728     ## reprocess
4729     redo B;
4730     } else {
4731     #
4732     }
4733     } elsif ($token->{type} eq 'end tag') {
4734     if ({
4735     tbody => 1, tfoot => 1, thead => 1,
4736     }->{$token->{tag_name}}) {
4737     ## have an element in table scope
4738     my $i;
4739     INSCOPE: for (reverse 0..$#$open_elements) {
4740     my $node = $open_elements->[$_];
4741     if ($node->[1] eq $token->{tag_name}) {
4742     $i = $_;
4743     last INSCOPE;
4744     } elsif ({
4745     table => 1, html => 1,
4746     }->{$node->[1]}) {
4747     last INSCOPE;
4748     }
4749     } # INSCOPE
4750     unless (defined $i) {
4751     $self->{parse_error}->();
4752     ## Ignore the token
4753     $token = $self->_get_next_token;
4754     redo B;
4755     }
4756    
4757     ## Clear back to table body context
4758     while (not {
4759     tbody => 1, tfoot => 1, thead => 1, html => 1,
4760     }->{$open_elements->[-1]->[1]}) {
4761     $self->{parse_error}->();
4762     pop @$open_elements;
4763     }
4764    
4765     pop @$open_elements;
4766     $insertion_mode = 'in table';
4767     $token = $self->_get_next_token;
4768     redo B;
4769     } elsif ($token->{tag_name} eq 'table') {
4770     ## have an element in table scope
4771     my $i;
4772     INSCOPE: for (reverse 0..$#$open_elements) {
4773     my $node = $open_elements->[$_];
4774     if ({
4775     tbody => 1, thead => 1, tfoot => 1,
4776     }->{$node->[1]}) {
4777     $i = $_;
4778     last INSCOPE;
4779     } elsif ({
4780     table => 1, html => 1,
4781     }->{$node->[1]}) {
4782     last INSCOPE;
4783     }
4784     } # INSCOPE
4785     unless (defined $i) {
4786     $self->{parse_error}->();
4787     ## Ignore the token
4788     $token = $self->_get_next_token;
4789     redo B;
4790     }
4791    
4792     ## Clear back to table body context
4793     while (not {
4794     tbody => 1, tfoot => 1, thead => 1, html => 1,
4795     }->{$open_elements->[-1]->[1]}) {
4796     $self->{parse_error}->();
4797     pop @$open_elements;
4798     }
4799    
4800     ## As if <{current node}>
4801     ## have an element in table scope
4802     ## true by definition
4803    
4804     ## Clear back to table body context
4805     ## nop by definition
4806    
4807     pop @$open_elements;
4808     $insertion_mode = 'in table';
4809     ## reprocess
4810     redo B;
4811     } elsif ({
4812     body => 1, caption => 1, col => 1, colgroup => 1,
4813     html => 1, td => 1, th => 1, tr => 1,
4814     }->{$token->{tag_name}}) {
4815     $self->{parse_error}->();
4816     ## Ignore the token
4817     $token = $self->_get_next_token;
4818     redo B;
4819     } else {
4820     #
4821     }
4822     } else {
4823     #
4824     }
4825    
4826     ## As if in table
4827     $self->{parse_error}->();
4828 wakaba 1.8 $in_body->($insert_to_foster);
4829 wakaba 1.3 redo B;
4830     } elsif ($insertion_mode eq 'in row') {
4831     if ($token->{type} eq 'character') {
4832 wakaba 1.8 ## NOTE: This is a "character in table" code clone.
4833     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4834     $open_elements->[-1]->[0]->manakai_append_text ($1);
4835    
4836     unless (length $token->{data}) {
4837     $token = $self->_get_next_token;
4838     redo B;
4839     }
4840     }
4841 wakaba 1.3
4842 wakaba 1.8 ## As if in body, but insert into foster parent element
4843     ## ISSUE: Spec says that "whenever a node would be inserted
4844     ## into the current node" while characters might not be
4845     ## result in a new Text node.
4846     $reconstruct_active_formatting_elements->($insert_to_foster);
4847    
4848     if ({
4849     table => 1, tbody => 1, tfoot => 1,
4850     thead => 1, tr => 1,
4851     }->{$open_elements->[-1]->[1]}) {
4852     # MUST
4853     my $foster_parent_element;
4854     my $next_sibling;
4855     my $prev_sibling;
4856     OE: for (reverse 0..$#$open_elements) {
4857     if ($open_elements->[$_]->[1] eq 'table') {
4858     my $parent = $open_elements->[$_]->[0]->parent_node;
4859     if (defined $parent and $parent->node_type == 1) {
4860     $foster_parent_element = $parent;
4861     $next_sibling = $open_elements->[$_]->[0];
4862     $prev_sibling = $next_sibling->previous_sibling;
4863     } else {
4864     $foster_parent_element = $open_elements->[$_ - 1]->[0];
4865     $prev_sibling = $foster_parent_element->last_child;
4866     }
4867     last OE;
4868     }
4869     } # OE
4870     $foster_parent_element = $open_elements->[0]->[0] and
4871     $prev_sibling = $foster_parent_element->last_child
4872     unless defined $foster_parent_element;
4873     if (defined $prev_sibling and
4874     $prev_sibling->node_type == 3) {
4875     $prev_sibling->manakai_append_text ($token->{data});
4876     } else {
4877     $foster_parent_element->insert_before
4878     ($self->{document}->create_text_node ($token->{data}),
4879     $next_sibling);
4880     }
4881     } else {
4882     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4883     }
4884    
4885 wakaba 1.3 $token = $self->_get_next_token;
4886     redo B;
4887     } elsif ($token->{type} eq 'comment') {
4888     ## Copied from 'in table'
4889     my $comment = $self->{document}->create_comment ($token->{data});
4890     $open_elements->[-1]->[0]->append_child ($comment);
4891     $token = $self->_get_next_token;
4892     redo B;
4893     } elsif ($token->{type} eq 'start tag') {
4894     if ($token->{tag_name} eq 'th' or
4895     $token->{tag_name} eq 'td') {
4896     ## Clear back to table row context
4897     while (not {
4898 wakaba 1.7 tr => 1, html => 1,
4899 wakaba 1.3 }->{$open_elements->[-1]->[1]}) {
4900     $self->{parse_error}->();
4901     pop @$open_elements;
4902     }
4903    
4904    
4905     {
4906     my $el;
4907    
4908     $el = $self->{document}->create_element_ns
4909     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4910    
4911     for my $attr_name (keys %{ $token->{attributes}}) {
4912     $el->set_attribute_ns (undef, [undef, $attr_name],
4913     $token->{attributes} ->{$attr_name}->{value});
4914     }
4915    
4916     $open_elements->[-1]->[0]->append_child ($el);
4917     push @$open_elements, [$el, $token->{tag_name}];
4918     }
4919    
4920     $insertion_mode = 'in cell';
4921    
4922     push @$active_formatting_elements, ['#marker', ''];
4923    
4924     $token = $self->_get_next_token;
4925     redo B;
4926     } elsif ({
4927     caption => 1, col => 1, colgroup => 1,
4928     tbody => 1, tfoot => 1, thead => 1, tr => 1,
4929     }->{$token->{tag_name}}) {
4930     ## As if </tr>
4931     ## have an element in table scope
4932     my $i;
4933     INSCOPE: for (reverse 0..$#$open_elements) {
4934     my $node = $open_elements->[$_];
4935     if ($node->[1] eq 'tr') {
4936     $i = $_;
4937     last INSCOPE;
4938     } elsif ({
4939     table => 1, html => 1,
4940     }->{$node->[1]}) {
4941     last INSCOPE;
4942     }
4943     } # INSCOPE
4944     unless (defined $i) {
4945     $self->{parse_error}->();
4946     ## Ignore the token
4947     $token = $self->_get_next_token;
4948     redo B;
4949     }
4950    
4951     ## Clear back to table row context
4952     while (not {
4953     tr => 1, html => 1,
4954     }->{$open_elements->[-1]->[1]}) {
4955     $self->{parse_error}->();
4956     pop @$open_elements;
4957     }
4958    
4959     pop @$open_elements; # tr
4960     $insertion_mode = 'in table body';
4961     ## reprocess
4962     redo B;
4963     } elsif ($token->{tag_name} eq 'table') {
4964     ## NOTE: This is a code clone of "table in table"
4965     $self->{parse_error}->();
4966    
4967     ## As if </table>
4968     ## have a table element in table scope
4969     my $i;
4970     INSCOPE: for (reverse 0..$#$open_elements) {
4971     my $node = $open_elements->[$_];
4972     if ($node->[1] eq 'table') {
4973     $i = $_;
4974     last INSCOPE;
4975     } elsif ({
4976     table => 1, html => 1,
4977     }->{$node->[1]}) {
4978     last INSCOPE;
4979     }
4980     } # INSCOPE
4981     unless (defined $i) {
4982     $self->{parse_error}->();
4983     ## Ignore tokens </table><table>
4984     $token = $self->_get_next_token;
4985     redo B;
4986     }
4987    
4988     ## generate implied end tags
4989     if ({
4990     dd => 1, dt => 1, li => 1, p => 1,
4991     td => 1, th => 1, tr => 1,
4992     }->{$open_elements->[-1]->[1]}) {
4993     unshift @{$self->{token}}, $token; # <table>
4994     $token = {type => 'end tag', tag_name => 'table'};
4995     unshift @{$self->{token}}, $token;
4996     $token = {type => 'end tag',
4997     tag_name => $open_elements->[-1]->[1]}; # MUST
4998     redo B;
4999     }
5000    
5001     if ($open_elements->[-1]->[1] ne 'table') {
5002     $self->{parse_error}->();
5003     }
5004    
5005     splice @$open_elements, $i;
5006    
5007     $reset_insertion_mode->();
5008    
5009     ## reprocess
5010     redo B;
5011     } else {
5012     #
5013     }
5014     } elsif ($token->{type} eq 'end tag') {
5015     if ($token->{tag_name} eq 'tr') {
5016     ## have an element in table scope
5017     my $i;
5018     INSCOPE: for (reverse 0..$#$open_elements) {
5019     my $node = $open_elements->[$_];
5020     if ($node->[1] eq $token->{tag_name}) {
5021     $i = $_;
5022     last INSCOPE;
5023     } elsif ({
5024     table => 1, html => 1,
5025     }->{$node->[1]}) {
5026     last INSCOPE;
5027     }
5028     } # INSCOPE
5029     unless (defined $i) {
5030     $self->{parse_error}->();
5031     ## Ignore the token
5032     $token = $self->_get_next_token;
5033     redo B;
5034     }
5035    
5036     ## Clear back to table row context
5037     while (not {
5038     tr => 1, html => 1,
5039     }->{$open_elements->[-1]->[1]}) {
5040     $self->{parse_error}->();
5041     pop @$open_elements;
5042     }
5043    
5044     pop @$open_elements; # tr
5045     $insertion_mode = 'in table body';
5046     $token = $self->_get_next_token;
5047     redo B;
5048     } elsif ($token->{tag_name} eq 'table') {
5049     ## As if </tr>
5050     ## have an element in table scope
5051     my $i;
5052     INSCOPE: for (reverse 0..$#$open_elements) {
5053     my $node = $open_elements->[$_];
5054     if ($node->[1] eq 'tr') {
5055     $i = $_;
5056     last INSCOPE;
5057     } elsif ({
5058     table => 1, html => 1,
5059     }->{$node->[1]}) {
5060     last INSCOPE;
5061     }
5062     } # INSCOPE
5063     unless (defined $i) {
5064     $self->{parse_error}->();
5065     ## Ignore the token
5066     $token = $self->_get_next_token;
5067     redo B;
5068     }
5069    
5070     ## Clear back to table row context
5071     while (not {
5072     tr => 1, html => 1,
5073     }->{$open_elements->[-1]->[1]}) {
5074     $self->{parse_error}->();
5075     pop @$open_elements;
5076     }
5077    
5078     pop @$open_elements; # tr
5079     $insertion_mode = 'in table body';
5080     ## reprocess
5081     redo B;
5082     } elsif ({
5083     tbody => 1, tfoot => 1, thead => 1,
5084     }->{$token->{tag_name}}) {
5085     ## have an element in table scope
5086     my $i;
5087     INSCOPE: for (reverse 0..$#$open_elements) {
5088     my $node = $open_elements->[$_];
5089     if ($node->[1] eq $token->{tag_name}) {
5090     $i = $_;
5091     last INSCOPE;
5092     } elsif ({
5093     table => 1, html => 1,
5094     }->{$node->[1]}) {
5095     last INSCOPE;
5096     }
5097     } # INSCOPE
5098     unless (defined $i) {
5099     $self->{parse_error}->();
5100     ## Ignore the token
5101     $token = $self->_get_next_token;
5102     redo B;
5103     }
5104    
5105     ## As if </tr>
5106     ## have an element in table scope
5107     my $i;
5108     INSCOPE: for (reverse 0..$#$open_elements) {
5109     my $node = $open_elements->[$_];
5110     if ($node->[1] eq 'tr') {
5111     $i = $_;
5112     last INSCOPE;
5113     } elsif ({
5114     table => 1, html => 1,
5115     }->{$node->[1]}) {
5116     last INSCOPE;
5117     }
5118     } # INSCOPE
5119     unless (defined $i) {
5120     $self->{parse_error}->();
5121     ## Ignore the token
5122     $token = $self->_get_next_token;
5123     redo B;
5124     }
5125    
5126     ## Clear back to table row context
5127     while (not {
5128     tr => 1, html => 1,
5129     }->{$open_elements->[-1]->[1]}) {
5130     $self->{parse_error}->();
5131     pop @$open_elements;
5132     }
5133    
5134     pop @$open_elements; # tr
5135     $insertion_mode = 'in table body';
5136     ## reprocess
5137     redo B;
5138     } elsif ({
5139     body => 1, caption => 1, col => 1,
5140     colgroup => 1, html => 1, td => 1, th => 1,
5141     }->{$token->{tag_name}}) {
5142     $self->{parse_error}->();
5143     ## Ignore the token
5144     $token = $self->_get_next_token;
5145     redo B;
5146     } else {
5147     #
5148     }
5149     } else {
5150     #
5151     }
5152    
5153     ## As if in table
5154     $self->{parse_error}->();
5155 wakaba 1.8 $in_body->($insert_to_foster);
5156 wakaba 1.3 redo B;
5157     } elsif ($insertion_mode eq 'in cell') {
5158     if ($token->{type} eq 'character') {
5159     ## NOTE: This is a code clone of "character in body".
5160 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
5161 wakaba 1.3
5162     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5163    
5164     $token = $self->_get_next_token;
5165     redo B;
5166     } elsif ($token->{type} eq 'comment') {
5167     ## NOTE: This is a code clone of "comment in body".
5168     my $comment = $self->{document}->create_comment ($token->{data});
5169     $open_elements->[-1]->[0]->append_child ($comment);
5170     $token = $self->_get_next_token;
5171     redo B;
5172     } elsif ($token->{type} eq 'start tag') {
5173     if ({
5174     caption => 1, col => 1, colgroup => 1,
5175     tbody => 1, td => 1, tfoot => 1, th => 1,
5176     thead => 1, tr => 1,
5177     }->{$token->{tag_name}}) {
5178     ## have an element in table scope
5179     my $tn;
5180     INSCOPE: for (reverse 0..$#$open_elements) {
5181     my $node = $open_elements->[$_];
5182     if ($node->[1] eq 'td' or $node->[1] eq 'th') {
5183     $tn = $node->[1];
5184     last INSCOPE;
5185     } elsif ({
5186     table => 1, html => 1,
5187     }->{$node->[1]}) {
5188     last INSCOPE;
5189     }
5190     } # INSCOPE
5191     unless (defined $tn) {
5192     $self->{parse_error}->();
5193     ## Ignore the token
5194     $token = $self->_get_next_token;
5195     redo B;
5196     }
5197    
5198     ## Close the cell
5199     unshift @{$self->{token}}, $token; # <?>
5200     $token = {type => 'end tag', tag_name => $tn};
5201     redo B;
5202     } else {
5203     #
5204     }
5205     } elsif ($token->{type} eq 'end tag') {
5206 wakaba 1.7 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
5207 wakaba 1.3 ## have an element in table scope
5208     my $i;
5209     INSCOPE: for (reverse 0..$#$open_elements) {
5210     my $node = $open_elements->[$_];
5211     if ($node->[1] eq $token->{tag_name}) {
5212     $i = $_;
5213     last INSCOPE;
5214     } elsif ({
5215     table => 1, html => 1,
5216     }->{$node->[1]}) {
5217     last INSCOPE;
5218     }
5219     } # INSCOPE
5220     unless (defined $i) {
5221     $self->{parse_error}->();
5222     ## Ignore the token
5223     $token = $self->_get_next_token;
5224     redo B;
5225     }
5226    
5227     ## generate implied end tags
5228     if ({
5229     dd => 1, dt => 1, li => 1, p => 1,
5230     td => ($token->{tag_name} eq 'th'),
5231     th => ($token->{tag_name} eq 'td'),
5232     tr => 1,
5233     }->{$open_elements->[-1]->[1]}) {
5234     unshift @{$self->{token}}, $token;
5235     $token = {type => 'end tag',
5236     tag_name => $open_elements->[-1]->[1]}; # MUST
5237     redo B;
5238     }
5239    
5240     if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
5241     $self->{parse_error}->();
5242     }
5243    
5244     splice @$open_elements, $i;
5245    
5246     $clear_up_to_marker->();
5247    
5248     $insertion_mode = 'in row';
5249    
5250     $token = $self->_get_next_token;
5251     redo B;
5252     } elsif ({
5253     body => 1, caption => 1, col => 1,
5254     colgroup => 1, html => 1,
5255     }->{$token->{tag_name}}) {
5256     $self->{parse_error}->();
5257     ## Ignore the token
5258     $token = $self->_get_next_token;
5259     redo B;
5260     } elsif ({
5261     table => 1, tbody => 1, tfoot => 1,
5262     thead => 1, tr => 1,
5263     }->{$token->{tag_name}}) {
5264     ## have an element in table scope
5265     my $i;
5266     my $tn;
5267     INSCOPE: for (reverse 0..$#$open_elements) {
5268     my $node = $open_elements->[$_];
5269     if ($node->[1] eq $token->{tag_name}) {
5270     $i = $_;
5271     last INSCOPE;
5272     } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
5273     $tn = $node->[1];
5274     ## NOTE: There is exactly one |td| or |th| element
5275     ## in scope in the stack of open elements by definition.
5276     } elsif ({
5277     table => 1, html => 1,
5278     }->{$node->[1]}) {
5279     last INSCOPE;
5280     }
5281     } # INSCOPE
5282     unless (defined $i) {
5283     $self->{parse_error}->();
5284     ## Ignore the token
5285     $token = $self->_get_next_token;
5286     redo B;
5287     }
5288    
5289     ## Close the cell
5290     unshift @{$self->{token}}, $token; # </?>
5291     $token = {type => 'end tag', tag_name => $tn};
5292     redo B;
5293     } else {
5294     #
5295     }
5296     } else {
5297     #
5298     }
5299    
5300 wakaba 1.8 $in_body->($insert_to_current);
5301 wakaba 1.3 redo B;
5302     } elsif ($insertion_mode eq 'in select') {
5303     if ($token->{type} eq 'character') {
5304     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5305     $token = $self->_get_next_token;
5306     redo B;
5307     } elsif ($token->{type} eq 'comment') {
5308     my $comment = $self->{document}->create_comment ($token->{data});
5309     $open_elements->[-1]->[0]->append_child ($comment);
5310     $token = $self->_get_next_token;
5311     redo B;
5312     } elsif ($token->{type} eq 'start tag') {
5313     if ($token->{tag_name} eq 'option') {
5314     if ($open_elements->[-1]->[1] eq 'option') {
5315     ## As if </option>
5316     pop @$open_elements;
5317     }
5318    
5319    
5320     {
5321     my $el;
5322    
5323     $el = $self->{document}->create_element_ns
5324     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5325    
5326     for my $attr_name (keys %{ $token->{attributes}}) {
5327     $el->set_attribute_ns (undef, [undef, $attr_name],
5328     $token->{attributes} ->{$attr_name}->{value});
5329     }
5330    
5331     $open_elements->[-1]->[0]->append_child ($el);
5332     push @$open_elements, [$el, $token->{tag_name}];
5333     }
5334    
5335     $token = $self->_get_next_token;
5336     redo B;
5337     } elsif ($token->{tag_name} eq 'optgroup') {
5338     if ($open_elements->[-1]->[1] eq 'option') {
5339     ## As if </option>
5340     pop @$open_elements;
5341     }
5342    
5343     if ($open_elements->[-1]->[1] eq 'optgroup') {
5344     ## As if </optgroup>
5345     pop @$open_elements;
5346     }
5347    
5348    
5349     {
5350     my $el;
5351    
5352     $el = $self->{document}->create_element_ns
5353     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5354    
5355     for my $attr_name (keys %{ $token->{attributes}}) {
5356     $el->set_attribute_ns (undef, [undef, $attr_name],
5357     $token->{attributes} ->{$attr_name}->{value});
5358     }
5359    
5360     $open_elements->[-1]->[0]->append_child ($el);
5361     push @$open_elements, [$el, $token->{tag_name}];
5362     }
5363    
5364     $token = $self->_get_next_token;
5365     redo B;
5366     } elsif ($token->{tag_name} eq 'select') {
5367     $self->{parse_error}->();
5368     ## As if </select> instead
5369     ## have an element in table scope
5370     my $i;
5371     INSCOPE: for (reverse 0..$#$open_elements) {
5372     my $node = $open_elements->[$_];
5373     if ($node->[1] eq $token->{tag_name}) {
5374     $i = $_;
5375     last INSCOPE;
5376     } elsif ({
5377     table => 1, html => 1,
5378     }->{$node->[1]}) {
5379     last INSCOPE;
5380     }
5381     } # INSCOPE
5382     unless (defined $i) {
5383     $self->{parse_error}->();
5384     ## Ignore the token
5385     $token = $self->_get_next_token;
5386     redo B;
5387     }
5388    
5389     splice @$open_elements, $i;
5390    
5391     $reset_insertion_mode->();
5392    
5393     $token = $self->_get_next_token;
5394     redo B;
5395     } else {
5396     #
5397     }
5398     } elsif ($token->{type} eq 'end tag') {
5399     if ($token->{tag_name} eq 'optgroup') {
5400     if ($open_elements->[-1]->[1] eq 'option' and
5401     $open_elements->[-2]->[1] eq 'optgroup') {
5402     ## As if </option>
5403     splice @$open_elements, -2;
5404     } elsif ($open_elements->[-1]->[1] eq 'optgroup') {
5405     pop @$open_elements;
5406     } else {
5407     $self->{parse_error}->();
5408     ## Ignore the token
5409     }
5410     $token = $self->_get_next_token;
5411     redo B;
5412     } elsif ($token->{tag_name} eq 'option') {
5413     if ($open_elements->[-1]->[1] eq 'option') {
5414     pop @$open_elements;
5415     } else {
5416     $self->{parse_error}->();
5417     ## Ignore the token
5418     }
5419     $token = $self->_get_next_token;
5420     redo B;
5421     } elsif ($token->{tag_name} eq 'select') {
5422     ## have an element in table scope
5423     my $i;
5424     INSCOPE: for (reverse 0..$#$open_elements) {
5425     my $node = $open_elements->[$_];
5426     if ($node->[1] eq $token->{tag_name}) {
5427     $i = $_;
5428     last INSCOPE;
5429     } elsif ({
5430     table => 1, html => 1,
5431     }->{$node->[1]}) {
5432     last INSCOPE;
5433     }
5434     } # INSCOPE
5435     unless (defined $i) {
5436     $self->{parse_error}->();
5437     ## Ignore the token
5438     $token = $self->_get_next_token;
5439     redo B;
5440     }
5441    
5442     splice @$open_elements, $i;
5443    
5444     $reset_insertion_mode->();
5445    
5446     $token = $self->_get_next_token;
5447     redo B;
5448     } elsif ({
5449     caption => 1, table => 1, tbody => 1,
5450     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5451     }->{$token->{tag_name}}) {
5452     $self->{parse_error}->();
5453    
5454     ## have an element in table scope
5455     my $i;
5456     INSCOPE: for (reverse 0..$#$open_elements) {
5457     my $node = $open_elements->[$_];
5458     if ($node->[1] eq $token->{tag_name}) {
5459     $i = $_;
5460     last INSCOPE;
5461     } elsif ({
5462     table => 1, html => 1,
5463     }->{$node->[1]}) {
5464     last INSCOPE;
5465     }
5466     } # INSCOPE
5467     unless (defined $i) {
5468     ## Ignore the token
5469     $token = $self->_get_next_token;
5470     redo B;
5471     }
5472    
5473     ## As if </select>
5474     ## have an element in table scope
5475     undef $i;
5476     INSCOPE: for (reverse 0..$#$open_elements) {
5477     my $node = $open_elements->[$_];
5478     if ($node->[1] eq 'select') {
5479     $i = $_;
5480     last INSCOPE;
5481     } elsif ({
5482     table => 1, html => 1,
5483     }->{$node->[1]}) {
5484     last INSCOPE;
5485     }
5486     } # INSCOPE
5487     unless (defined $i) {
5488     $self->{parse_error}->();
5489     ## Ignore the </select> token
5490     $token = $self->_get_next_token; ## TODO: ok?
5491     redo B;
5492     }
5493    
5494     splice @$open_elements, $i;
5495    
5496     $reset_insertion_mode->();
5497    
5498     ## reprocess
5499     redo B;
5500     } else {
5501     #
5502     }
5503     } else {
5504     #
5505     }
5506    
5507     $self->{parse_error}->();
5508     ## Ignore the token
5509 wakaba 1.8 $token = $self->_get_next_token;
5510 wakaba 1.3 redo B;
5511     } elsif ($insertion_mode eq 'after body') {
5512     if ($token->{type} eq 'character') {
5513     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5514     ## As if in body
5515 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current);
5516 wakaba 1.3
5517     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5518    
5519     unless (length $token->{data}) {
5520     $token = $self->_get_next_token;
5521     redo B;
5522     }
5523     }
5524    
5525     #
5526     } elsif ($token->{type} eq 'comment') {
5527     my $comment = $self->{document}->create_comment ($token->{data});
5528     $open_elements->[0]->[0]->append_child ($comment);
5529     $token = $self->_get_next_token;
5530     redo B;
5531     } elsif ($token->{type} eq 'end tag') {
5532 wakaba 1.7 if ($token->{tag_name} eq 'html') {
5533 wakaba 1.3 ## TODO: if inner_html, parse-error, ignore the token; otherwise,
5534    
5535     $phase = 'trailing end';
5536     $token = $self->_get_next_token;
5537     redo B;
5538     } else {
5539     #
5540     }
5541     } else {
5542     #
5543     }
5544    
5545 wakaba 1.9 $self->{parse_error}-> ('data after body');
5546 wakaba 1.3 $insertion_mode = 'in body';
5547     ## reprocess
5548     redo B;
5549     } elsif ($insertion_mode eq 'in frameset') {
5550     if ($token->{type} eq 'character') {
5551     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5552     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5553    
5554     unless (length $token->{data}) {
5555     $token = $self->_get_next_token;
5556     redo B;
5557     }
5558     }
5559    
5560     #
5561     } elsif ($token->{type} eq 'comment') {
5562     my $comment = $self->{document}->create_comment ($token->{data});
5563     $open_elements->[-1]->[0]->append_child ($comment);
5564     $token = $self->_get_next_token;
5565     redo B;
5566     } elsif ($token->{type} eq 'start tag') {
5567     if ($token->{tag_name} eq 'frameset') {
5568    
5569     {
5570     my $el;
5571    
5572     $el = $self->{document}->create_element_ns
5573     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5574    
5575     for my $attr_name (keys %{ $token->{attributes}}) {
5576     $el->set_attribute_ns (undef, [undef, $attr_name],
5577     $token->{attributes} ->{$attr_name}->{value});
5578     }
5579    
5580     $open_elements->[-1]->[0]->append_child ($el);
5581     push @$open_elements, [$el, $token->{tag_name}];
5582     }
5583    
5584     $token = $self->_get_next_token;
5585     redo B;
5586     } elsif ($token->{tag_name} eq 'frame') {
5587    
5588     {
5589     my $el;
5590    
5591     $el = $self->{document}->create_element_ns
5592     (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5593    
5594     for my $attr_name (keys %{ $token->{attributes}}) {
5595     $el->set_attribute_ns (undef, [undef, $attr_name],
5596     $token->{attributes} ->{$attr_name}->{value});
5597     }
5598    
5599     $open_elements->[-1]->[0]->append_child ($el);
5600     push @$open_elements, [$el, $token->{tag_name}];
5601     }
5602    
5603     pop @$open_elements;
5604     $token = $self->_get_next_token;
5605     redo B;
5606     } elsif ($token->{tag_name} eq 'noframes') {
5607 wakaba 1.8 $in_body->($insert_to_current);
5608 wakaba 1.3 redo B;
5609     } else {
5610     #
5611     }
5612     } elsif ($token->{type} eq 'end tag') {
5613     if ($token->{tag_name} eq 'frameset') {
5614     if ($open_elements->[-1]->[1] eq 'html' and
5615     @$open_elements == 1) {
5616     $self->{parse_error}->();
5617     ## Ignore the token
5618     $token = $self->_get_next_token;
5619     } else {
5620     pop @$open_elements;
5621     $token = $self->_get_next_token;
5622     }
5623    
5624     ## if not inner_html and
5625     if ($open_elements->[-1]->[1] ne 'frameset') {
5626     $insertion_mode = 'after frameset';
5627     }
5628     redo B;
5629     } else {
5630     #
5631     }
5632     } else {
5633     #
5634     }
5635    
5636     $self->{parse_error}->();
5637     ## Ignore the token
5638     $token = $self->_get_next_token;
5639     redo B;
5640     } elsif ($insertion_mode eq 'after frameset') {
5641     if ($token->{type} eq 'character') {
5642     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5643     $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5644    
5645     unless (length $token->{data}) {
5646     $token = $self->_get_next_token;
5647     redo B;
5648     }
5649     }
5650    
5651     #
5652     } elsif ($token->{type} eq 'comment') {
5653     my $comment = $self->{document}->create_comment ($token->{data});
5654     $open_elements->[-1]->[0]->append_child ($comment);
5655     $token = $self->_get_next_token;
5656     redo B;
5657     } elsif ($token->{type} eq 'start tag') {
5658     if ($token->{tag_name} eq 'noframes') {
5659 wakaba 1.8 $in_body->($insert_to_current);
5660 wakaba 1.3 redo B;
5661     } else {
5662     #
5663     }
5664     } elsif ($token->{type} eq 'end tag') {
5665     if ($token->{tag_name} eq 'html') {
5666     $phase = 'trailing end';
5667     $token = $self->_get_next_token;
5668     redo B;
5669     } else {
5670     #
5671     }
5672     } else {
5673     #
5674     }
5675    
5676     $self->{parse_error}->();
5677     ## Ignore the token
5678     $token = $self->_get_next_token;
5679     redo B;
5680    
5681     ## ISSUE: An issue in spec there
5682     } else {
5683     die "$0: $insertion_mode: Unknown insertion mode";
5684     }
5685     }
5686     } elsif ($phase eq 'trailing end') {
5687     ## states in the main stage is preserved yet # MUST
5688    
5689     if ($token->{type} eq 'DOCTYPE') {
5690     $self->{parse_error}->();
5691     ## Ignore the token
5692     $token = $self->_get_next_token;
5693     redo B;
5694     } elsif ($token->{type} eq 'comment') {
5695     my $comment = $self->{document}->create_comment ($token->{data});
5696     $self->{document}->append_child ($comment);
5697     $token = $self->_get_next_token;
5698     redo B;
5699     } elsif ($token->{type} eq 'character') {
5700     if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5701 wakaba 1.8 my $data = $1;
5702 wakaba 1.3 ## As if in the main phase.
5703     ## NOTE: The insertion mode in the main phase
5704     ## just before the phase has been changed to the trailing
5705     ## end phase is either "after body" or "after frameset".
5706 wakaba 1.8 $reconstruct_active_formatting_elements->($insert_to_current)
5707 wakaba 1.3 if $phase eq 'main';
5708    
5709 wakaba 1.8 $open_elements->[-1]->[0]->manakai_append_text ($data);
5710 wakaba 1.3
5711     unless (length $token->{data}) {
5712     $token = $self->_get_next_token;
5713     redo B;
5714     }
5715     }
5716    
5717     $self->{parse_error}->();
5718     $phase = 'main';
5719     ## reprocess
5720     redo B;
5721     } elsif ($token->{type} eq 'start tag' or
5722     $token->{type} eq 'end tag') {
5723     $self->{parse_error}->();
5724     $phase = 'main';
5725     ## reprocess
5726     redo B;
5727     } elsif ($token->{type} eq 'end-of-file') {
5728     ## Stop parsing
5729     last B;
5730     } else {
5731     die "$0: $token->{type}: Unknown token";
5732     }
5733     }
5734     } # B
5735    
5736     ## Stop parsing # MUST
5737    
5738     ## TODO: script stuffs
5739     } # _construct_tree
5740    
## get_inner_html ($class, $node, $on_error)
##
## Serializes the children of |$node| (not |$node| itself) into an
## HTML string, following the three steps of the |innerHTML| getter
## algorithm referred to by the "Step" comments below.
##
## Arguments:
##   $class    - Invocant; not used by this method.
##   $node     - The node whose |child_nodes| are serialized.  Its
##               ancestor chain (including |$node| itself) is inspected
##               to decide whether text starts out unescaped.
##   $on_error - Optional code reference.  It is invoked with the
##               offending node for every node whose |node_type| is
##               not one of 1, 3, 4, 5, 8 or 10.
##
## Returns a reference to the resulting string.
sub get_inner_html ($$$) {
  my ($class, $node, $on_error) = @_;

  ## Elements whose text content is emitted as is, without escaping.
  my $cdata_element = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
  };

  ## Step 1
  my $s = '';

  ## If |$node| or any of its ancestors is one of the elements above
  ## in the XHTML namespace, text is emitted unescaped from the start.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      ## The namespace URI might be |undef| (null namespace), so it
      ## has to be checked before it is compared as a string.
      my $ns = $parent->namespace_uri;
      if (defined $ns and $ns eq 'http://www.w3.org/1999/xhtml' and
          $cdata_element->{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work list in document order.  In addition to node
  ## objects it holds plain strings: an end tag to be emitted after
  ## the children of an element, or the sentinel 'cdata-out' that
  ## turns escaping on again after the content of such an element.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="';
        my $attr_value = $attr->value;
        ## escape
        $attr_value =~ s/&/&amp;/g;
        $attr_value =~ s/</&lt;/g;
        $attr_value =~ s/>/&gt;/g;
        $attr_value =~ s/"/&quot;/g;
        $s .= $attr_value . '"';
      }
      $s .= '>';

      ## These elements have neither content nor an end tag.
      next C if {
        area => 1, base => 1, basefont => 1, bgsound => 1,
        br => 1, col => 1, embed => 1, frame => 1, hr => 1,
        img => 1, input => 1, link => 1, meta => 1, param => 1,
        spacer => 1, wbr => 1,
      }->{$tag_name};

      ## The sentinel is queued first so that it ends up just after
      ## the end tag queued by the |unshift| below.
      if (not $in_cdata and $cdata_element->{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text or CDATASection
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        my $value = $child->data;
        $value =~ s/&/&amp;/g;
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        $s .= $value;
      }
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # DocumentType
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children replace the reference in place.  They have to go
      ## to the front of the work list: if appended at the end, they
      ## would be emitted after the following siblings and end tags,
      ## and after any pending 'cdata-out' sentinel.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5837 wakaba 1.1
5838     1;
5839 wakaba 1.10 # $Date: 2007/05/01 07:46:42 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24