/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

revision 1.7 by wakaba, Wed May 30 12:24:50 2007 UTC revision 1.20 by wakaba, Sat Jun 23 14:25:05 2007 UTC
# Line 2  package Whatpm::HTML; Line 2  package Whatpm::HTML;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5  ## This is an early version of an HTML parser.  ## ISSUE:
6    ## var doc = implementation.createDocument (null, null, null);
7    ## doc.write ('');
8    ## alert (doc.compatMode);
9    
10  my $permitted_slash_tag_name = {  my $permitted_slash_tag_name = {
11    base => 1,    base => 1,
# Line 18  my $permitted_slash_tag_name = { Line 21  my $permitted_slash_tag_name = {
21    input => 1,    input => 1,
22  };  };
23    
 my $entity_char = {  
   AElig => "\x{00C6}",  
   Aacute => "\x{00C1}",  
   Acirc => "\x{00C2}",  
   Agrave => "\x{00C0}",  
   Alpha => "\x{0391}",  
   Aring => "\x{00C5}",  
   Atilde => "\x{00C3}",  
   Auml => "\x{00C4}",  
   Beta => "\x{0392}",  
   Ccedil => "\x{00C7}",  
   Chi => "\x{03A7}",  
   Dagger => "\x{2021}",  
   Delta => "\x{0394}",  
   ETH => "\x{00D0}",  
   Eacute => "\x{00C9}",  
   Ecirc => "\x{00CA}",  
   Egrave => "\x{00C8}",  
   Epsilon => "\x{0395}",  
   Eta => "\x{0397}",  
   Euml => "\x{00CB}",  
   Gamma => "\x{0393}",  
   Iacute => "\x{00CD}",  
   Icirc => "\x{00CE}",  
   Igrave => "\x{00CC}",  
   Iota => "\x{0399}",  
   Iuml => "\x{00CF}",  
   Kappa => "\x{039A}",  
   Lambda => "\x{039B}",  
   Mu => "\x{039C}",  
   Ntilde => "\x{00D1}",  
   Nu => "\x{039D}",  
   OElig => "\x{0152}",  
   Oacute => "\x{00D3}",  
   Ocirc => "\x{00D4}",  
   Ograve => "\x{00D2}",  
   Omega => "\x{03A9}",  
   Omicron => "\x{039F}",  
   Oslash => "\x{00D8}",  
   Otilde => "\x{00D5}",  
   Ouml => "\x{00D6}",  
   Phi => "\x{03A6}",  
   Pi => "\x{03A0}",  
   Prime => "\x{2033}",  
   Psi => "\x{03A8}",  
   Rho => "\x{03A1}",  
   Scaron => "\x{0160}",  
   Sigma => "\x{03A3}",  
   THORN => "\x{00DE}",  
   Tau => "\x{03A4}",  
   Theta => "\x{0398}",  
   Uacute => "\x{00DA}",  
   Ucirc => "\x{00DB}",  
   Ugrave => "\x{00D9}",  
   Upsilon => "\x{03A5}",  
   Uuml => "\x{00DC}",  
   Xi => "\x{039E}",  
   Yacute => "\x{00DD}",  
   Yuml => "\x{0178}",  
   Zeta => "\x{0396}",  
   aacute => "\x{00E1}",  
   acirc => "\x{00E2}",  
   acute => "\x{00B4}",  
   aelig => "\x{00E6}",  
   agrave => "\x{00E0}",  
   alefsym => "\x{2135}",  
   alpha => "\x{03B1}",  
   amp => "\x{0026}",  
   AMP => "\x{0026}",  
   and => "\x{2227}",  
   ang => "\x{2220}",  
   apos => "\x{0027}",  
   aring => "\x{00E5}",  
   asymp => "\x{2248}",  
   atilde => "\x{00E3}",  
   auml => "\x{00E4}",  
   bdquo => "\x{201E}",  
   beta => "\x{03B2}",  
   brvbar => "\x{00A6}",  
   bull => "\x{2022}",  
   cap => "\x{2229}",  
   ccedil => "\x{00E7}",  
   cedil => "\x{00B8}",  
   cent => "\x{00A2}",  
   chi => "\x{03C7}",  
   circ => "\x{02C6}",  
   clubs => "\x{2663}",  
   cong => "\x{2245}",  
   copy => "\x{00A9}",  
   COPY => "\x{00A9}",  
   crarr => "\x{21B5}",  
   cup => "\x{222A}",  
   curren => "\x{00A4}",  
   dArr => "\x{21D3}",  
   dagger => "\x{2020}",  
   darr => "\x{2193}",  
   deg => "\x{00B0}",  
   delta => "\x{03B4}",  
   diams => "\x{2666}",  
   divide => "\x{00F7}",  
   eacute => "\x{00E9}",  
   ecirc => "\x{00EA}",  
   egrave => "\x{00E8}",  
   empty => "\x{2205}",  
   emsp => "\x{2003}",  
   ensp => "\x{2002}",  
   epsilon => "\x{03B5}",  
   equiv => "\x{2261}",  
   eta => "\x{03B7}",  
   eth => "\x{00F0}",  
   euml => "\x{00EB}",  
   euro => "\x{20AC}",  
   exist => "\x{2203}",  
   fnof => "\x{0192}",  
   forall => "\x{2200}",  
   frac12 => "\x{00BD}",  
   frac14 => "\x{00BC}",  
   frac34 => "\x{00BE}",  
   frasl => "\x{2044}",  
   gamma => "\x{03B3}",  
   ge => "\x{2265}",  
   gt => "\x{003E}",  
   GT => "\x{003E}",  
   hArr => "\x{21D4}",  
   harr => "\x{2194}",  
   hearts => "\x{2665}",  
   hellip => "\x{2026}",  
   iacute => "\x{00ED}",  
   icirc => "\x{00EE}",  
   iexcl => "\x{00A1}",  
   igrave => "\x{00EC}",  
   image => "\x{2111}",  
   infin => "\x{221E}",  
   int => "\x{222B}",  
   iota => "\x{03B9}",  
   iquest => "\x{00BF}",  
   isin => "\x{2208}",  
   iuml => "\x{00EF}",  
   kappa => "\x{03BA}",  
   lArr => "\x{21D0}",  
   lambda => "\x{03BB}",  
   lang => "\x{2329}",  
   laquo => "\x{00AB}",  
   larr => "\x{2190}",  
   lceil => "\x{2308}",  
   ldquo => "\x{201C}",  
   le => "\x{2264}",  
   lfloor => "\x{230A}",  
   lowast => "\x{2217}",  
   loz => "\x{25CA}",  
   lrm => "\x{200E}",  
   lsaquo => "\x{2039}",  
   lsquo => "\x{2018}",  
   lt => "\x{003C}",  
   LT => "\x{003C}",  
   macr => "\x{00AF}",  
   mdash => "\x{2014}",  
   micro => "\x{00B5}",  
   middot => "\x{00B7}",  
   minus => "\x{2212}",  
   mu => "\x{03BC}",  
   nabla => "\x{2207}",  
   nbsp => "\x{00A0}",  
   ndash => "\x{2013}",  
   ne => "\x{2260}",  
   ni => "\x{220B}",  
   not => "\x{00AC}",  
   notin => "\x{2209}",  
   nsub => "\x{2284}",  
   ntilde => "\x{00F1}",  
   nu => "\x{03BD}",  
   oacute => "\x{00F3}",  
   ocirc => "\x{00F4}",  
   oelig => "\x{0153}",  
   ograve => "\x{00F2}",  
   oline => "\x{203E}",  
   omega => "\x{03C9}",  
   omicron => "\x{03BF}",  
   oplus => "\x{2295}",  
   or => "\x{2228}",  
   ordf => "\x{00AA}",  
   ordm => "\x{00BA}",  
   oslash => "\x{00F8}",  
   otilde => "\x{00F5}",  
   otimes => "\x{2297}",  
   ouml => "\x{00F6}",  
   para => "\x{00B6}",  
   part => "\x{2202}",  
   permil => "\x{2030}",  
   perp => "\x{22A5}",  
   phi => "\x{03C6}",  
   pi => "\x{03C0}",  
   piv => "\x{03D6}",  
   plusmn => "\x{00B1}",  
   pound => "\x{00A3}",  
   prime => "\x{2032}",  
   prod => "\x{220F}",  
   prop => "\x{221D}",  
   psi => "\x{03C8}",  
   quot => "\x{0022}",  
   QUOT => "\x{0022}",  
   rArr => "\x{21D2}",  
   radic => "\x{221A}",  
   rang => "\x{232A}",  
   raquo => "\x{00BB}",  
   rarr => "\x{2192}",  
   rceil => "\x{2309}",  
   rdquo => "\x{201D}",  
   real => "\x{211C}",  
   reg => "\x{00AE}",  
   REG => "\x{00AE}",  
   rfloor => "\x{230B}",  
   rho => "\x{03C1}",  
   rlm => "\x{200F}",  
   rsaquo => "\x{203A}",  
   rsquo => "\x{2019}",  
   sbquo => "\x{201A}",  
   scaron => "\x{0161}",  
   sdot => "\x{22C5}",  
   sect => "\x{00A7}",  
   shy => "\x{00AD}",  
   sigma => "\x{03C3}",  
   sigmaf => "\x{03C2}",  
   sim => "\x{223C}",  
   spades => "\x{2660}",  
   sub => "\x{2282}",  
   sube => "\x{2286}",  
   sum => "\x{2211}",  
   sup => "\x{2283}",  
   sup1 => "\x{00B9}",  
   sup2 => "\x{00B2}",  
   sup3 => "\x{00B3}",  
   supe => "\x{2287}",  
   szlig => "\x{00DF}",  
   tau => "\x{03C4}",  
   there4 => "\x{2234}",  
   theta => "\x{03B8}",  
   thetasym => "\x{03D1}",  
   thinsp => "\x{2009}",  
   thorn => "\x{00FE}",  
   tilde => "\x{02DC}",  
   times => "\x{00D7}",  
   trade => "\x{2122}",  
   uArr => "\x{21D1}",  
   uacute => "\x{00FA}",  
   uarr => "\x{2191}",  
   ucirc => "\x{00FB}",  
   ugrave => "\x{00F9}",  
   uml => "\x{00A8}",  
   upsih => "\x{03D2}",  
   upsilon => "\x{03C5}",  
   uuml => "\x{00FC}",  
   weierp => "\x{2118}",  
   xi => "\x{03BE}",  
   yacute => "\x{00FD}",  
   yen => "\x{00A5}",  
   yuml => "\x{00FF}",  
   zeta => "\x{03B6}",  
   zwj => "\x{200D}",  
   zwnj => "\x{200C}",  
 }; # $entity_char  
   
 ## <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8562>  
24  my $c1_entity_char = {  my $c1_entity_char = {
25       128, 8364,    0x80 => 0x20AC,
26       129, 65533,    0x81 => 0xFFFD,
27       130, 8218,    0x82 => 0x201A,
28       131, 402,    0x83 => 0x0192,
29       132, 8222,    0x84 => 0x201E,
30       133, 8230,    0x85 => 0x2026,
31       134, 8224,    0x86 => 0x2020,
32       135, 8225,    0x87 => 0x2021,
33       136, 710,    0x88 => 0x02C6,
34       137, 8240,    0x89 => 0x2030,
35       138, 352,    0x8A => 0x0160,
36       139, 8249,    0x8B => 0x2039,
37       140, 338,    0x8C => 0x0152,
38       141, 65533,    0x8D => 0xFFFD,
39       142, 381,    0x8E => 0x017D,
40       143, 65533,    0x8F => 0xFFFD,
41       144, 65533,    0x90 => 0xFFFD,
42       145, 8216,    0x91 => 0x2018,
43       146, 8217,    0x92 => 0x2019,
44       147, 8220,    0x93 => 0x201C,
45       148, 8221,    0x94 => 0x201D,
46       149, 8226,    0x95 => 0x2022,
47       150, 8211,    0x96 => 0x2013,
48       151, 8212,    0x97 => 0x2014,
49       152, 732,    0x98 => 0x02DC,
50       153, 8482,    0x99 => 0x2122,
51       154, 353,    0x9A => 0x0161,
52       155, 8250,    0x9B => 0x203A,
53       156, 339,    0x9C => 0x0153,
54       157, 65533,    0x9D => 0xFFFD,
55       158, 382,    0x9E => 0x017E,
56       159, 376,    0x9F => 0x0178,
57  }; # $c1_entity_char  }; # $c1_entity_char
58    
59  my $special_category = {  my $special_category = {
# Line 350  sub parse_string ($$$;$) { Line 90  sub parse_string ($$$;$) {
90    my $column = 0;    my $column = 0;
91    $self->{set_next_input_character} = sub {    $self->{set_next_input_character} = sub {
92      my $self = shift;      my $self = shift;
93    
94        pop @{$self->{prev_input_character}};
95        unshift @{$self->{prev_input_character}}, $self->{next_input_character};
96    
97      $self->{next_input_character} = -1 and return if $i >= length $$s;      $self->{next_input_character} = -1 and return if $i >= length $$s;
98      $self->{next_input_character} = ord substr $$s, $i++, 1;      $self->{next_input_character} = ord substr $$s, $i++, 1;
99      $column++;      $column++;
# Line 358  sub parse_string ($$$;$) { Line 102  sub parse_string ($$$;$) {
102        $line++;        $line++;
103        $column = 0;        $column = 0;
104      } elsif ($self->{next_input_character} == 0x000D) { # CR      } elsif ($self->{next_input_character} == 0x000D) { # CR
105        if ($i >= length $$s) {        $i++ if substr ($$s, $i, 1) eq "\x0A";
         #  
       } else {  
         my $next_char = ord substr $$s, $i++, 1;  
         if ($next_char == 0x000A) { # LF  
           #  
         } else {  
           push @{$self->{char}}, $next_char;  
         }  
       }  
106        $self->{next_input_character} = 0x000A; # LF # MUST        $self->{next_input_character} = 0x000A; # LF # MUST
107        $line++;        $line++;
108        $column = 0;        $column = 0;
109      } elsif ($self->{next_input_character} > 0x10FFFF) {      } elsif ($self->{next_input_character} > 0x10FFFF) {
110        $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST        $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
111      } elsif ($self->{next_input_character} == 0x0000) { # NULL      } elsif ($self->{next_input_character} == 0x0000) { # NULL
112          !!!parse-error (type => 'NULL');
113        $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST        $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
114      }      }
115    };    };
116      $self->{prev_input_character} = [-1, -1, -1];
117      $self->{next_input_character} = -1;
118    
119    my $onerror = $_[2] || sub {    my $onerror = $_[2] || sub {
120      my (%opt) = @_;      my (%opt) = @_;
# Line 420  sub _initialize_tokenizer ($) { Line 158  sub _initialize_tokenizer ($) {
158    # $self->{next_input_character}    # $self->{next_input_character}
159    !!!next-input-character;    !!!next-input-character;
160    $self->{token} = [];    $self->{token} = [];
161      # $self->{escape}
162  } # _initialize_tokenizer  } # _initialize_tokenizer
163    
164  ## A token has:  ## A token has:
165  ##   ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',  ##   ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
166  ##       'character', or 'end-of-file'  ##       'character', or 'end-of-file'
167  ##   ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))  ##   ->{name} (DOCTYPE, start tag (tag name), end tag (tag name))
168      ## ISSUE: the spec need s/tagname/tag name/  ##   ->{public_identifier} (DOCTYPE)
169  ##   ->{error} == 1 or 0 (DOCTYPE)  ##   ->{system_identifier} (DOCTYPE)
170    ##   ->{correct} == 1 or 0 (DOCTYPE)
171  ##   ->{attributes} isa HASH (start tag, end tag)  ##   ->{attributes} isa HASH (start tag, end tag)
172  ##   ->{data} (comment, character)  ##   ->{data} (comment, character)
173    
 ## Macros  
 ##   Macros MUST be preceded by three EXCLAMATION MARKs.  
 ##   emit ($token)  
 ##     Emits the specified token.  
   
174  ## Emitted token MUST immediately be handled by the tree construction state.  ## Emitted token MUST immediately be handled by the tree construction state.
175    
176  ## Before each step, UA MAY check to see if either one of the scripts in  ## Before each step, UA MAY check to see if either one of the scripts in
# Line 461  sub _get_next_token ($) { Line 196  sub _get_next_token ($) {
196          } else {          } else {
197            #            #
198          }          }
199          } elsif ($self->{next_input_character} == 0x002D) { # -
200            if ($self->{content_model_flag} eq 'RCDATA' or
201                $self->{content_model_flag} eq 'CDATA') {
202              unless ($self->{escape}) {
203                if ($self->{prev_input_character}->[0] == 0x002D and # -
204                    $self->{prev_input_character}->[1] == 0x0021 and # !
205                    $self->{prev_input_character}->[2] == 0x003C) { # <
206                  $self->{escape} = 1;
207                }
208              }
209            }
210            
211            #
212        } elsif ($self->{next_input_character} == 0x003C) { # <        } elsif ($self->{next_input_character} == 0x003C) { # <
213          if ($self->{content_model_flag} ne 'PLAINTEXT') {          if ($self->{content_model_flag} eq 'PCDATA' or
214                (($self->{content_model_flag} eq 'CDATA' or
215                  $self->{content_model_flag} eq 'RCDATA') and
216                 not $self->{escape})) {
217            $self->{state} = 'tag open';            $self->{state} = 'tag open';
218            !!!next-input-character;            !!!next-input-character;
219            redo A;            redo A;
220          } else {          } else {
221            #            #
222          }          }
223          } elsif ($self->{next_input_character} == 0x003E) { # >
224            if ($self->{escape} and
225                ($self->{content_model_flag} eq 'RCDATA' or
226                 $self->{content_model_flag} eq 'CDATA')) {
227              if ($self->{prev_input_character}->[0] == 0x002D and # -
228                  $self->{prev_input_character}->[1] == 0x002D) { # -
229                delete $self->{escape};
230              }
231            }
232            
233            #
234        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
235          !!!emit ({type => 'end-of-file'});          !!!emit ({type => 'end-of-file'});
236          last A; ## TODO: ok?          last A; ## TODO: ok?
# Line 573  sub _get_next_token ($) { Line 335  sub _get_next_token ($) {
335              !!!next-input-character;              !!!next-input-character;
336              next TAGNAME;              next TAGNAME;
337            } else {            } else {
             !!!parse-error (type => 'unmatched end tag');  
338              $self->{next_input_character} = shift @next_char; # reconsume              $self->{next_input_character} = shift @next_char; # reconsume
339              !!!back-next-input-character (@next_char);              !!!back-next-input-character (@next_char);
340              $self->{state} = 'data';              $self->{state} = 'data';
# Line 592  sub _get_next_token ($) { Line 353  sub _get_next_token ($) {
353                  $self->{next_input_character} == 0x0020 or # SP                  $self->{next_input_character} == 0x0020 or # SP
354                  $self->{next_input_character} == 0x003E or # >                  $self->{next_input_character} == 0x003E or # >
355                  $self->{next_input_character} == 0x002F or # /                  $self->{next_input_character} == 0x002F or # /
                 $self->{next_input_character} == 0x003C or # <  
356                  $self->{next_input_character} == -1) {                  $self->{next_input_character} == -1) {
           !!!parse-error (type => 'unmatched end tag');  
357            $self->{next_input_character} = shift @next_char; # reconsume            $self->{next_input_character} = shift @next_char; # reconsume
358            !!!back-next-input-character (@next_char);            !!!back-next-input-character (@next_char);
359            $self->{state} = 'data';            $self->{state} = 'data';
# Line 676  sub _get_next_token ($) { Line 435  sub _get_next_token ($) {
435          ## Stay in this state          ## Stay in this state
436          !!!next-input-character;          !!!next-input-character;
437          redo A;          redo A;
438        } elsif ($self->{next_input_character} == 0x003C or # <        } elsif ($self->{next_input_character} == -1) {
                $self->{next_input_character} == -1) {  
439          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
440          if ($self->{current_token}->{type} eq 'start tag') {          if ($self->{current_token}->{type} eq 'start tag') {
441            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
# Line 763  sub _get_next_token ($) { Line 521  sub _get_next_token ($) {
521          ## Stay in the state          ## Stay in the state
522          # next-input-character is already done          # next-input-character is already done
523          redo A;          redo A;
524        } elsif ($self->{next_input_character} == 0x003C or # <        } elsif ($self->{next_input_character} == -1) {
                $self->{next_input_character} == -1) {  
525          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
526          if ($self->{current_token}->{type} eq 'start tag') {          if ($self->{current_token}->{type} eq 'start tag') {
527            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
# Line 855  sub _get_next_token ($) { Line 612  sub _get_next_token ($) {
612          $self->{state} = 'before attribute name';          $self->{state} = 'before attribute name';
613          # next-input-character is already done          # next-input-character is already done
614          redo A;          redo A;
615        } elsif ($self->{next_input_character} == 0x003C or # <        } elsif ($self->{next_input_character} == -1) {
                $self->{next_input_character} == -1) {  
616          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
617          $before_leave->();          $before_leave->();
618          if ($self->{current_token}->{type} eq 'start tag') {          if ($self->{current_token}->{type} eq 'start tag') {
# Line 933  sub _get_next_token ($) { Line 689  sub _get_next_token ($) {
689          $self->{state} = 'before attribute name';          $self->{state} = 'before attribute name';
690          # next-input-character is already done          # next-input-character is already done
691          redo A;          redo A;
692        } elsif ($self->{next_input_character} == 0x003C or # <        } elsif ($self->{next_input_character} == -1) {
                $self->{next_input_character} == -1) {  
693          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
694          if ($self->{current_token}->{type} eq 'start tag') {          if ($self->{current_token}->{type} eq 'start tag') {
695            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
# Line 999  sub _get_next_token ($) { Line 754  sub _get_next_token ($) {
754          undef $self->{current_token};          undef $self->{current_token};
755    
756          redo A;          redo A;
757        } elsif ($self->{next_input_character} == 0x003C or # <        } elsif ($self->{next_input_character} == -1) {
                $self->{next_input_character} == -1) {  
758          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
759          if ($self->{current_token}->{type} eq 'start tag') {          if ($self->{current_token}->{type} eq 'start tag') {
760            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
# Line 1127  sub _get_next_token ($) { Line 881  sub _get_next_token ($) {
881          undef $self->{current_token};          undef $self->{current_token};
882    
883          redo A;          redo A;
884        } elsif ($self->{next_input_character} == 0x003C or # <        } elsif ($self->{next_input_character} == -1) {
                $self->{next_input_character} == -1) {  
885          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
886          if ($self->{current_token}->{type} eq 'start tag') {          if ($self->{current_token}->{type} eq 'start tag') {
887            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
# Line 1348  sub _get_next_token ($) { Line 1101  sub _get_next_token ($) {
1101          ## Stay in the state          ## Stay in the state
1102          !!!next-input-character;          !!!next-input-character;
1103          redo A;          redo A;
       } elsif (0x0061 <= $self->{next_input_character} and  
                $self->{next_input_character} <= 0x007A) { # a..z  
 ## ISSUE: "Set the token's name name to the" in the spec  
         $self->{current_token} = {type => 'DOCTYPE',  
                           name => chr ($self->{next_input_character} - 0x0020),  
                           error => 1};  
         $self->{state} = 'DOCTYPE name';  
         !!!next-input-character;  
         redo A;  
1104        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1105          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1106          $self->{state} = 'data';          $self->{state} = 'data';
1107          !!!next-input-character;          !!!next-input-character;
1108    
1109          !!!emit ({type => 'DOCTYPE', name => '', error => 1});          !!!emit ({type => 'DOCTYPE'}); # incorrect
1110    
1111          redo A;          redo A;
1112        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
# Line 1370  sub _get_next_token ($) { Line 1114  sub _get_next_token ($) {
1114          $self->{state} = 'data';          $self->{state} = 'data';
1115          ## reconsume          ## reconsume
1116    
1117          !!!emit ({type => 'DOCTYPE', name => '', error => 1});          !!!emit ({type => 'DOCTYPE'}); # incorrect
1118    
1119          redo A;          redo A;
1120        } else {        } else {
1121          $self->{current_token} = {type => 'DOCTYPE',          $self->{current_token}
1122                            name => chr ($self->{next_input_character}),              = {type => 'DOCTYPE',
1123                            error => 1};                 name => chr ($self->{next_input_character}),
1124                   correct => 1};
1125  ## ISSUE: "Set the token's name name to the" in the spec  ## ISSUE: "Set the token's name name to the" in the spec
1126          $self->{state} = 'DOCTYPE name';          $self->{state} = 'DOCTYPE name';
1127          !!!next-input-character;          !!!next-input-character;
1128          redo A;          redo A;
1129        }        }
1130      } elsif ($self->{state} eq 'DOCTYPE name') {      } elsif ($self->{state} eq 'DOCTYPE name') {
1131    ## ISSUE: Redundant "First," in the spec.
1132        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
1133            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
1134            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
1135            $self->{next_input_character} == 0x000C or # FF            $self->{next_input_character} == 0x000C or # FF
1136            $self->{next_input_character} == 0x0020) { # SP            $self->{next_input_character} == 0x0020) { # SP
         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE  
1137          $self->{state} = 'after DOCTYPE name';          $self->{state} = 'after DOCTYPE name';
1138          !!!next-input-character;          !!!next-input-character;
1139          redo A;          redo A;
1140        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE  
1141          $self->{state} = 'data';          $self->{state} = 'data';
1142          !!!next-input-character;          !!!next-input-character;
1143    
# Line 1401  sub _get_next_token ($) { Line 1145  sub _get_next_token ($) {
1145          undef $self->{current_token};          undef $self->{current_token};
1146    
1147          redo A;          redo A;
       } elsif (0x0061 <= $self->{next_input_character} and  
                $self->{next_input_character} <= 0x007A) { # a..z  
         $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE  
         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');  
         ## Stay in the state  
         !!!next-input-character;  
         redo A;  
1148        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1149          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE  
1150          $self->{state} = 'data';          $self->{state} = 'data';
1151          ## reconsume          ## reconsume
1152    
1153          !!!emit ($self->{current_token});          delete $self->{current_token}->{correct};
1154            !!!emit ($self->{current_token}); # DOCTYPE
1155          undef $self->{current_token};          undef $self->{current_token};
1156    
1157          redo A;          redo A;
1158        } else {        } else {
1159          $self->{current_token}->{name}          $self->{current_token}->{name}
1160            .= chr ($self->{next_input_character}); # DOCTYPE            .= chr ($self->{next_input_character}); # DOCTYPE
         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');  
1161          ## Stay in the state          ## Stay in the state
1162          !!!next-input-character;          !!!next-input-character;
1163          redo A;          redo A;
# Line 1448  sub _get_next_token ($) { Line 1184  sub _get_next_token ($) {
1184          $self->{state} = 'data';          $self->{state} = 'data';
1185          ## reconsume          ## reconsume
1186    
1187            delete $self->{current_token}->{correct};
1188            !!!emit ($self->{current_token}); # DOCTYPE
1189            undef $self->{current_token};
1190    
1191            redo A;
1192          } elsif ($self->{next_input_character} == 0x0050 or # P
1193                   $self->{next_input_character} == 0x0070) { # p
1194            !!!next-input-character;
1195            if ($self->{next_input_character} == 0x0055 or # U
1196                $self->{next_input_character} == 0x0075) { # u
1197              !!!next-input-character;
1198              if ($self->{next_input_character} == 0x0042 or # B
1199                  $self->{next_input_character} == 0x0062) { # b
1200                !!!next-input-character;
1201                if ($self->{next_input_character} == 0x004C or # L
1202                    $self->{next_input_character} == 0x006C) { # l
1203                  !!!next-input-character;
1204                  if ($self->{next_input_character} == 0x0049 or # I
1205                      $self->{next_input_character} == 0x0069) { # i
1206                    !!!next-input-character;
1207                    if ($self->{next_input_character} == 0x0043 or # C
1208                        $self->{next_input_character} == 0x0063) { # c
1209                      $self->{state} = 'before DOCTYPE public identifier';
1210                      !!!next-input-character;
1211                      redo A;
1212                    }
1213                  }
1214                }
1215              }
1216            }
1217    
1218            #
1219          } elsif ($self->{next_input_character} == 0x0053 or # S
1220                   $self->{next_input_character} == 0x0073) { # s
1221            !!!next-input-character;
1222            if ($self->{next_input_character} == 0x0059 or # Y
1223                $self->{next_input_character} == 0x0079) { # y
1224              !!!next-input-character;
1225              if ($self->{next_input_character} == 0x0053 or # S
1226                  $self->{next_input_character} == 0x0073) { # s
1227                !!!next-input-character;
1228                if ($self->{next_input_character} == 0x0054 or # T
1229                    $self->{next_input_character} == 0x0074) { # t
1230                  !!!next-input-character;
1231                  if ($self->{next_input_character} == 0x0045 or # E
1232                      $self->{next_input_character} == 0x0065) { # e
1233                    !!!next-input-character;
1234                    if ($self->{next_input_character} == 0x004D or # M
1235                        $self->{next_input_character} == 0x006D) { # m
1236                      $self->{state} = 'before DOCTYPE system identifier';
1237                      !!!next-input-character;
1238                      redo A;
1239                    }
1240                  }
1241                }
1242              }
1243            }
1244    
1245            #
1246          } else {
1247            !!!next-input-character;
1248            #
1249          }
1250    
1251          !!!parse-error (type => 'string after DOCTYPE name');
1252          $self->{state} = 'bogus DOCTYPE';
1253          # next-input-character is already done
1254          redo A;
1255        } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
1256          if ({
1257                0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1258                #0x000D => 1, # HT, LF, VT, FF, SP, CR
1259              }->{$self->{next_input_character}}) {
1260            ## Stay in the state
1261            !!!next-input-character;
1262            redo A;
1263          } elsif ($self->{next_input_character} eq 0x0022) { # "
1264            $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1265            $self->{state} = 'DOCTYPE public identifier (double-quoted)';
1266            !!!next-input-character;
1267            redo A;
1268          } elsif ($self->{next_input_character} eq 0x0027) { # '
1269            $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1270            $self->{state} = 'DOCTYPE public identifier (single-quoted)';
1271            !!!next-input-character;
1272            redo A;
1273          } elsif ($self->{next_input_character} eq 0x003E) { # >
1274            !!!parse-error (type => 'no PUBLIC literal');
1275    
1276            $self->{state} = 'data';
1277            !!!next-input-character;
1278    
1279            delete $self->{current_token}->{correct};
1280            !!!emit ($self->{current_token}); # DOCTYPE
1281            undef $self->{current_token};
1282    
1283            redo A;
1284          } elsif ($self->{next_input_character} == -1) {
1285            !!!parse-error (type => 'unclosed DOCTYPE');
1286    
1287            $self->{state} = 'data';
1288            ## reconsume
1289    
1290            delete $self->{current_token}->{correct};
1291            !!!emit ($self->{current_token}); # DOCTYPE
1292            undef $self->{current_token};
1293    
1294            redo A;
1295          } else {
1296            !!!parse-error (type => 'string after PUBLIC');
1297            $self->{state} = 'bogus DOCTYPE';
1298            !!!next-input-character;
1299            redo A;
1300          }
1301        } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
1302          if ($self->{next_input_character} == 0x0022) { # "
1303            $self->{state} = 'after DOCTYPE public identifier';
1304            !!!next-input-character;
1305            redo A;
1306          } elsif ($self->{next_input_character} == -1) {
1307            !!!parse-error (type => 'unclosed PUBLIC literal');
1308    
1309            $self->{state} = 'data';
1310            ## reconsume
1311    
1312            delete $self->{current_token}->{correct};
1313            !!!emit ($self->{current_token}); # DOCTYPE
1314            undef $self->{current_token};
1315    
1316            redo A;
1317          } else {
1318            $self->{current_token}->{public_identifier} # DOCTYPE
1319                .= chr $self->{next_input_character};
1320            ## Stay in the state
1321            !!!next-input-character;
1322            redo A;
1323          }
1324        } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
1325          if ($self->{next_input_character} == 0x0027) { # '
1326            $self->{state} = 'after DOCTYPE public identifier';
1327            !!!next-input-character;
1328            redo A;
1329          } elsif ($self->{next_input_character} == -1) {
1330            !!!parse-error (type => 'unclosed PUBLIC literal');
1331    
1332            $self->{state} = 'data';
1333            ## reconsume
1334    
1335            delete $self->{current_token}->{correct};
1336            !!!emit ($self->{current_token}); # DOCTYPE
1337            undef $self->{current_token};
1338    
1339            redo A;
1340          } else {
1341            $self->{current_token}->{public_identifier} # DOCTYPE
1342                .= chr $self->{next_input_character};
1343            ## Stay in the state
1344            !!!next-input-character;
1345            redo A;
1346          }
1347        } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
1348          if ({
1349                0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1350                #0x000D => 1, # HT, LF, VT, FF, SP, CR
1351              }->{$self->{next_input_character}}) {
1352            ## Stay in the state
1353            !!!next-input-character;
1354            redo A;
1355          } elsif ($self->{next_input_character} == 0x0022) { # "
1356            $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1357            $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1358            !!!next-input-character;
1359            redo A;
1360          } elsif ($self->{next_input_character} == 0x0027) { # '
1361            $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1362            $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1363            !!!next-input-character;
1364            redo A;
1365          } elsif ($self->{next_input_character} == 0x003E) { # >
1366            $self->{state} = 'data';
1367            !!!next-input-character;
1368    
1369            !!!emit ($self->{current_token}); # DOCTYPE
1370            undef $self->{current_token};
1371    
1372            redo A;
1373          } elsif ($self->{next_input_character} == -1) {
1374            !!!parse-error (type => 'unclosed DOCTYPE');
1375    
1376            $self->{state} = 'data';
1377            ## reconsume
1378    
1379            delete $self->{current_token}->{correct};
1380            !!!emit ($self->{current_token}); # DOCTYPE
1381            undef $self->{current_token};
1382    
1383            redo A;
1384          } else {
1385            !!!parse-error (type => 'string after PUBLIC literal');
1386            $self->{state} = 'bogus DOCTYPE';
1387            !!!next-input-character;
1388            redo A;
1389          }
1390        } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
1391          if ({
1392                0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1393                #0x000D => 1, # HT, LF, VT, FF, SP, CR
1394              }->{$self->{next_input_character}}) {
1395            ## Stay in the state
1396            !!!next-input-character;
1397            redo A;
1398          } elsif ($self->{next_input_character} == 0x0022) { # "
1399            $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1400            $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1401            !!!next-input-character;
1402            redo A;
1403          } elsif ($self->{next_input_character} == 0x0027) { # '
1404            $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1405            $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1406            !!!next-input-character;
1407            redo A;
1408          } elsif ($self->{next_input_character} == 0x003E) { # >
1409            !!!parse-error (type => 'no SYSTEM literal');
1410            $self->{state} = 'data';
1411            !!!next-input-character;
1412    
1413            delete $self->{current_token}->{correct};
1414            !!!emit ($self->{current_token}); # DOCTYPE
1415            undef $self->{current_token};
1416    
1417            redo A;
1418          } elsif ($self->{next_input_character} == -1) {
1419            !!!parse-error (type => 'unclosed DOCTYPE');
1420    
1421            $self->{state} = 'data';
1422            ## reconsume
1423    
1424            delete $self->{current_token}->{correct};
1425            !!!emit ($self->{current_token}); # DOCTYPE
1426            undef $self->{current_token};
1427    
1428            redo A;
1429          } else {
1430            !!!parse-error (type => 'string after PUBLIC literal');
1431            $self->{state} = 'bogus DOCTYPE';
1432            !!!next-input-character;
1433            redo A;
1434          }
1435        } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
1436          if ($self->{next_input_character} == 0x0022) { # "
1437            $self->{state} = 'after DOCTYPE system identifier';
1438            !!!next-input-character;
1439            redo A;
1440          } elsif ($self->{next_input_character} == -1) {
1441            !!!parse-error (type => 'unclosed SYSTEM literal');
1442    
1443            $self->{state} = 'data';
1444            ## reconsume
1445    
1446            delete $self->{current_token}->{correct};
1447            !!!emit ($self->{current_token}); # DOCTYPE
1448            undef $self->{current_token};
1449    
1450            redo A;
1451          } else {
1452            $self->{current_token}->{system_identifier} # DOCTYPE
1453                .= chr $self->{next_input_character};
1454            ## Stay in the state
1455            !!!next-input-character;
1456            redo A;
1457          }
1458        } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
1459          if ($self->{next_input_character} == 0x0027) { # '
1460            $self->{state} = 'after DOCTYPE system identifier';
1461            !!!next-input-character;
1462            redo A;
1463          } elsif ($self->{next_input_character} == -1) {
1464            !!!parse-error (type => 'unclosed SYSTEM literal');
1465    
1466            $self->{state} = 'data';
1467            ## reconsume
1468    
1469            delete $self->{current_token}->{correct};
1470            !!!emit ($self->{current_token}); # DOCTYPE
1471            undef $self->{current_token};
1472    
1473            redo A;
1474          } else {
1475            $self->{current_token}->{system_identifier} # DOCTYPE
1476                .= chr $self->{next_input_character};
1477            ## Stay in the state
1478            !!!next-input-character;
1479            redo A;
1480          }
1481        } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
1482          if ({
1483                0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1484                #0x000D => 1, # HT, LF, VT, FF, SP, CR
1485              }->{$self->{next_input_character}}) {
1486            ## Stay in the state
1487            !!!next-input-character;
1488            redo A;
1489          } elsif ($self->{next_input_character} == 0x003E) { # >
1490            $self->{state} = 'data';
1491            !!!next-input-character;
1492    
1493            !!!emit ($self->{current_token}); # DOCTYPE
1494            undef $self->{current_token};
1495    
1496            redo A;
1497          } elsif ($self->{next_input_character} == -1) {
1498            !!!parse-error (type => 'unclosed DOCTYPE');
1499    
1500            $self->{state} = 'data';
1501            ## reconsume
1502    
1503            delete $self->{current_token}->{correct};
1504          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
1505          undef $self->{current_token};          undef $self->{current_token};
1506    
1507          redo A;          redo A;
1508        } else {        } else {
1509          !!!parse-error (type => 'string after DOCTYPE name');          !!!parse-error (type => 'string after SYSTEM literal');
         $self->{current_token}->{error} = 1; # DOCTYPE  
1510          $self->{state} = 'bogus DOCTYPE';          $self->{state} = 'bogus DOCTYPE';
1511          !!!next-input-character;          !!!next-input-character;
1512          redo A;          redo A;
# Line 1464  sub _get_next_token ($) { Line 1516  sub _get_next_token ($) {
1516          $self->{state} = 'data';          $self->{state} = 'data';
1517          !!!next-input-character;          !!!next-input-character;
1518    
1519            delete $self->{current_token}->{correct};
1520          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
1521          undef $self->{current_token};          undef $self->{current_token};
1522    
# Line 1473  sub _get_next_token ($) { Line 1526  sub _get_next_token ($) {
1526          $self->{state} = 'data';          $self->{state} = 'data';
1527          ## reconsume          ## reconsume
1528    
1529            delete $self->{current_token}->{correct};
1530          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
1531          undef $self->{current_token};          undef $self->{current_token};
1532    
# Line 1492  sub _get_next_token ($) { Line 1546  sub _get_next_token ($) {
1546    
1547  sub _tokenize_attempt_to_consume_an_entity ($) {  sub _tokenize_attempt_to_consume_an_entity ($) {
1548    my $self = shift;    my $self = shift;
1549      
1550    if ($self->{next_input_character} == 0x0023) { # #    if ({
1551           0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1552           0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
1553          }->{$self->{next_input_character}}) {
1554        ## Don't consume
1555        ## No error
1556        return undef;
1557      } elsif ($self->{next_input_character} == 0x0023) { # #
1558      !!!next-input-character;      !!!next-input-character;
1559      if ($self->{next_input_character} == 0x0078 or # x      if ($self->{next_input_character} == 0x0078 or # x
1560          $self->{next_input_character} == 0x0058) { # X          $self->{next_input_character} == 0x0058) { # X
# Line 1538  sub _tokenize_attempt_to_consume_an_enti Line 1599  sub _tokenize_attempt_to_consume_an_enti
1599            $num = 0xFFFD; # REPLACEMENT CHARACTER            $num = 0xFFFD; # REPLACEMENT CHARACTER
1600            ## ISSUE: Why is this not an error?            ## ISSUE: Why is this not an error?
1601          } elsif (0x80 <= $num and $num <= 0x9F) {          } elsif (0x80 <= $num and $num <= 0x9F) {
1602            ## NOTE: <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8562>            !!!parse-error (type => sprintf 'c1 entity:U+%04X', $num);
           ## ISSUE: Not in the spec yet; parse error?  
1603            $num = $c1_entity_char->{$num};            $num = $c1_entity_char->{$num};
1604          }          }
1605    
# Line 1569  sub _tokenize_attempt_to_consume_an_enti Line 1629  sub _tokenize_attempt_to_consume_an_enti
1629          $code = 0xFFFD; # REPLACEMENT CHARACTER          $code = 0xFFFD; # REPLACEMENT CHARACTER
1630          ## ISSUE: Why is this not an error?          ## ISSUE: Why is this not an error?
1631        } elsif (0x80 <= $code and $code <= 0x9F) {        } elsif (0x80 <= $code and $code <= 0x9F) {
1632          ## NOTE: <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8562>          !!!parse-error (type => sprintf 'c1 entity:U+%04X', $code);
         ## ISSUE: Not in the spec yet; parse error?  
1633          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
1634        }        }
1635                
# Line 1590  sub _tokenize_attempt_to_consume_an_enti Line 1649  sub _tokenize_attempt_to_consume_an_enti
1649    
1650      my $value = $entity_name;      my $value = $entity_name;
1651      my $match;      my $match;
1652        require Whatpm::_NamedEntityList;
1653        our $EntityChar;
1654    
1655      while (length $entity_name < 10 and      while (length $entity_name < 10 and
1656             ## NOTE: Some number greater than the maximum length of entity name             ## NOTE: Some number greater than the maximum length of entity name
1657             ((0x0041 <= $self->{next_input_character} and             ((0x0041 <= $self->{next_input_character} and # A
1658               $self->{next_input_character} <= 0x005A) or               $self->{next_input_character} <= 0x005A) or # Z
1659              (0x0061 <= $self->{next_input_character} and              (0x0061 <= $self->{next_input_character} and # a
1660               $self->{next_input_character} <= 0x007A) or               $self->{next_input_character} <= 0x007A) or # z
1661              (0x0030 <= $self->{next_input_character} and              (0x0030 <= $self->{next_input_character} and # 0
1662               $self->{next_input_character} <= 0x0039))) {               $self->{next_input_character} <= 0x0039) or # 9
1663                $self->{next_input_character} == 0x003B)) { # ;
1664        $entity_name .= chr $self->{next_input_character};        $entity_name .= chr $self->{next_input_character};
1665        if (defined $entity_char->{$entity_name}) {        if (defined $EntityChar->{$entity_name}) {
1666          $value = $entity_char->{$entity_name};          $value = $EntityChar->{$entity_name};
1667          $match = 1;          if ($self->{next_input_character} == 0x003B) { # ;
1668              $match = 1;
1669              !!!next-input-character;
1670              last;
1671            } else {
1672              $match = -1;
1673            }
1674        } else {        } else {
1675          $value .= chr $self->{next_input_character};          $value .= chr $self->{next_input_character};
1676        }        }
1677        !!!next-input-character;        !!!next-input-character;
1678      }      }
1679            
1680      if ($match) {      if ($match > 0) {
1681        if ($self->{next_input_character} == 0x003B) { # ;        return {type => 'character', data => $value};
1682          !!!next-input-character;      } elsif ($match < 0) {
1683        } else {        !!!parse-error (type => 'refc');
         !!!parse-error (type => 'refc');  
       }  
   
1684        return {type => 'character', data => $value};        return {type => 'character', data => $value};
1685      } else {      } else {
1686        !!!parse-error (type => 'bare ero');        !!!parse-error (type => 'bare ero');
# Line 1636  sub _initialize_tree_constructor ($) { Line 1701  sub _initialize_tree_constructor ($) {
1701    $self->{document}->strict_error_checking (0);    $self->{document}->strict_error_checking (0);
1702    ## TODO: Turn mutation events off # MUST    ## TODO: Turn mutation events off # MUST
1703    ## TODO: Turn loose Document option (manakai extension) on    ## TODO: Turn loose Document option (manakai extension) on
1704    ## TODO: Mark the Document as an HTML document # MUST    $self->{document}->manakai_is_html (1); # MUST
1705  } # _initialize_tree_constructor  } # _initialize_tree_constructor
1706    
1707  sub _terminate_tree_constructor ($) {  sub _terminate_tree_constructor ($) {
# Line 1676  sub _construct_tree ($) { Line 1741  sub _construct_tree ($) {
1741    
1742  sub _tree_construction_initial ($) {  sub _tree_construction_initial ($) {
1743    my $self = shift;    my $self = shift;
1744    B: {    INITIAL: {
1745        if ($token->{type} eq 'DOCTYPE') {      if ($token->{type} eq 'DOCTYPE') {
1746          if ($token->{error}) {        ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
1747            ## ISSUE: Spec currently left this case undefined.        ## error, switch to a conformance checking mode for another
1748            !!!parse-error (type => 'bogus DOCTYPE');        ## language.
1749          }        my $doctype_name = $token->{name};
1750          my $doctype = $self->{document}->create_document_type_definition        $doctype_name = '' unless defined $doctype_name;
1751            ($token->{name});        $doctype_name =~ tr/a-z/A-Z/;
1752          $self->{document}->append_child ($doctype);        if (not defined $token->{name} or # <!DOCTYPE>
1753          #$phase = 'root element';            defined $token->{public_identifier} or
1754          !!!next-token;            defined $token->{system_identifier}) {
1755          #redo B;          !!!parse-error (type => 'not HTML5');
1756          return;        } elsif ($doctype_name ne 'HTML') {
1757        } elsif ({          ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
1758                  comment => 1,          !!!parse-error (type => 'not HTML5');
1759                  'start tag' => 1,        }
1760                  'end tag' => 1,        
1761                  'end-of-file' => 1,        my $doctype = $self->{document}->create_document_type_definition
1762                 }->{$token->{type}}) {          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
1763          ## ISSUE: Spec currently left this case undefined.        $doctype->public_id ($token->{public_identifier})
1764          !!!parse-error (type => 'missing DOCTYPE');            if defined $token->{public_identifier};
1765          #$phase = 'root element';        $doctype->system_id ($token->{system_identifier})
1766          ## reprocess            if defined $token->{system_identifier};
1767          #redo B;        ## NOTE: Other DocumentType attributes are null or empty lists.
1768          return;        ## ISSUE: internalSubset = null??
1769        } elsif ($token->{type} eq 'character') {        $self->{document}->append_child ($doctype);
1770          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {        
1771            $self->{document}->manakai_append_text ($1);        if (not $token->{correct} or $doctype_name ne 'HTML') {
1772            ## ISSUE: DOM3 Core does not allow Document > Text          $self->{document}->manakai_compat_mode ('quirks');
1773            unless (length $token->{data}) {        } elsif (defined $token->{public_identifier}) {
1774              ## Stay in the phase          my $pubid = $token->{public_identifier};
1775              !!!next-token;          $pubid =~ tr/a-z/A-z/;
1776              redo B;          if ({
1777              "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
1778              "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
1779              "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
1780              "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
1781              "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
1782              "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
1783              "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
1784              "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
1785              "-//IETF//DTD HTML 2.0//EN" => 1,
1786              "-//IETF//DTD HTML 2.1E//EN" => 1,
1787              "-//IETF//DTD HTML 3.0//EN" => 1,
1788              "-//IETF//DTD HTML 3.0//EN//" => 1,
1789              "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
1790              "-//IETF//DTD HTML 3.2//EN" => 1,
1791              "-//IETF//DTD HTML 3//EN" => 1,
1792              "-//IETF//DTD HTML LEVEL 0//EN" => 1,
1793              "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
1794              "-//IETF//DTD HTML LEVEL 1//EN" => 1,
1795              "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
1796              "-//IETF//DTD HTML LEVEL 2//EN" => 1,
1797              "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
1798              "-//IETF//DTD HTML LEVEL 3//EN" => 1,
1799              "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
1800              "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
1801              "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
1802              "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
1803              "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
1804              "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
1805              "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
1806              "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
1807              "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
1808              "-//IETF//DTD HTML STRICT//EN" => 1,
1809              "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
1810              "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
1811              "-//IETF//DTD HTML//EN" => 1,
1812              "-//IETF//DTD HTML//EN//2.0" => 1,
1813              "-//IETF//DTD HTML//EN//3.0" => 1,
1814              "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
1815              "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
1816              "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
1817              "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
1818              "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
1819              "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
1820              "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
1821              "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
1822              "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
1823              "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
1824              "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
1825              "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
1826              "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
1827              "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
1828              "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
1829              "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
1830              "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
1831              "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
1832              "-//W3C//DTD HTML 3.2//EN" => 1,
1833              "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
1834              "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
1835              "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
1836              "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
1837              "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
1838              "-//W3C//DTD W3 HTML//EN" => 1,
1839              "-//W3O//DTD W3 HTML 3.0//EN" => 1,
1840              "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
1841              "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
1842              "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
1843              "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
1844              "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
1845              "HTML" => 1,
1846            }->{$pubid}) {
1847              $self->{document}->manakai_compat_mode ('quirks');
1848            } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
1849                     $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
1850              if (defined $token->{system_identifier}) {
1851                $self->{document}->manakai_compat_mode ('quirks');
1852              } else {
1853                $self->{document}->manakai_compat_mode ('limited quirks');
1854            }            }
1855            } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 Frameset//EN" or
1856                     $pubid eq "-//W3C//DTD XHTML 1.0 Transitional//EN") {
1857              $self->{document}->manakai_compat_mode ('limited quirks');
1858            }
1859          }
1860          if (defined $token->{system_identifier}) {
1861            my $sysid = $token->{system_identifier};
1862            $sysid =~ tr/A-Z/a-z/;
1863            if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
1864              $self->{document}->manakai_compat_mode ('quirks');
1865          }          }
         ## ISSUE: Spec currently left this case undefined.  
         !!!parse-error (type => 'missing DOCTYPE');  
         #$phase = 'root element';  
         ## reprocess  
         #redo B;  
         return;  
       } else {  
         die "$0: $token->{type}: Unknown token";  
1866        }        }
1867      } # B        
1868          ## Go to the root element phase.
1869          !!!next-token;
1870          return;
1871        } elsif ({
1872                  'start tag' => 1,
1873                  'end tag' => 1,
1874                  'end-of-file' => 1,
1875                 }->{$token->{type}}) {
1876          !!!parse-error (type => 'no DOCTYPE');
1877          $self->{document}->manakai_compat_mode ('quirks');
1878          ## Go to the root element phase
1879          ## reprocess
1880          return;
1881        } elsif ($token->{type} eq 'character') {
1882          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
1883            ## Ignore the token
1884            unless (length $token->{data}) {
1885              ## Stay in the phase
1886              !!!next-token;
1887              redo INITIAL;
1888            }
1889          }
1890    
1891          !!!parse-error (type => 'no DOCTYPE');
1892          $self->{document}->manakai_compat_mode ('quirks');
1893          ## Go to the root element phase
1894          ## reprocess
1895          return;
1896        } elsif ($token->{type} eq 'comment') {
1897          my $comment = $self->{document}->create_comment ($token->{data});
1898          $self->{document}->append_child ($comment);
1899          
1900          ## Stay in the phase.
1901          !!!next-token;
1902          redo INITIAL;
1903        } else {
1904          die "$0: $token->{type}: Unknown token";
1905        }
1906      } # INITIAL
1907  } # _tree_construction_initial  } # _tree_construction_initial
1908    
1909  sub _tree_construction_root_element ($) {  sub _tree_construction_root_element ($) {
# Line 1926  sub _tree_construction_main ($) { Line 2109  sub _tree_construction_main ($) {
2109       ? $self->{head_element} : $self->{open_elements}->[-1]->[0])       ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
2110        ->append_child ($style_el);        ->append_child ($style_el);
2111      $self->{content_model_flag} = 'CDATA';      $self->{content_model_flag} = 'CDATA';
2112        delete $self->{escape}; # MUST
2113                                
2114      my $text = '';      my $text = '';
2115      !!!next-token;      !!!next-token;
# Line 1954  sub _tree_construction_main ($) { Line 2138  sub _tree_construction_main ($) {
2138      ## TODO: mark as "parser-inserted"      ## TODO: mark as "parser-inserted"
2139    
2140      $self->{content_model_flag} = 'CDATA';      $self->{content_model_flag} = 'CDATA';
2141        delete $self->{escape}; # MUST
2142            
2143      my $text = '';      my $text = '';
2144      !!!next-token;      !!!next-token;
# Line 2243  sub _tree_construction_main ($) { Line 2428  sub _tree_construction_main ($) {
2428          (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])          (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
2429            ->append_child ($title_el);            ->append_child ($title_el);
2430          $self->{content_model_flag} = 'RCDATA';          $self->{content_model_flag} = 'RCDATA';
2431            delete $self->{escape}; # MUST
2432                    
2433          my $text = '';          my $text = '';
2434          !!!next-token;          !!!next-token;
# Line 2363  sub _tree_construction_main ($) { Line 2549  sub _tree_construction_main ($) {
2549          LI: {          LI: {
2550            ## Step 2            ## Step 2
2551            if ($node->[1] eq 'li') {            if ($node->[1] eq 'li') {
2552                if ($i != -1) {
2553                  !!!parse-error (type => 'end tag missing:'.
2554                                  $self->{open_elements}->[-1]->[1]);
2555                  ## TODO: test
2556                }
2557              splice @{$self->{open_elements}}, $i;              splice @{$self->{open_elements}}, $i;
2558              last LI;              last LI;
2559            }            }
# Line 2406  sub _tree_construction_main ($) { Line 2597  sub _tree_construction_main ($) {
2597          LI: {          LI: {
2598            ## Step 2            ## Step 2
2599            if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {            if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
2600                if ($i != -1) {
2601                  !!!parse-error (type => 'end tag missing:'.
2602                                  $self->{open_elements}->[-1]->[1]);
2603                  ## TODO: test
2604                }
2605              splice @{$self->{open_elements}}, $i;              splice @{$self->{open_elements}}, $i;
2606              last LI;              last LI;
2607            }            }
# Line 2530  sub _tree_construction_main ($) { Line 2726  sub _tree_construction_main ($) {
2726          return;          return;
2727        } elsif ({        } elsif ({
2728                  b => 1, big => 1, em => 1, font => 1, i => 1,                  b => 1, big => 1, em => 1, font => 1, i => 1,
2729                  nobr => 1, s => 1, small => 1, strile => 1,                  s => 1, small => 1, strile => 1,
2730                  strong => 1, tt => 1, u => 1,                  strong => 1, tt => 1, u => 1,
2731                 }->{$token->{tag_name}}) {                 }->{$token->{tag_name}}) {
2732          $reconstruct_active_formatting_elements->($insert_to_current);          $reconstruct_active_formatting_elements->($insert_to_current);
# Line 2540  sub _tree_construction_main ($) { Line 2736  sub _tree_construction_main ($) {
2736                    
2737          !!!next-token;          !!!next-token;
2738          return;          return;
2739          } elsif ($token->{tag_name} eq 'nobr') {
2740            $reconstruct_active_formatting_elements->($insert_to_current);
2741    
2742            ## has a |nobr| element in scope
2743            INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2744              my $node = $self->{open_elements}->[$_];
2745              if ($node->[1] eq 'nobr') {
2746                !!!back-token;
2747                $token = {type => 'end tag', tag_name => 'nobr'};
2748                return;
2749              } elsif ({
2750                        table => 1, caption => 1, td => 1, th => 1,
2751                        button => 1, marquee => 1, object => 1, html => 1,
2752                       }->{$node->[1]}) {
2753                last INSCOPE;
2754              }
2755            } # INSCOPE
2756            
2757            !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2758            push @$active_formatting_elements, $self->{open_elements}->[-1];
2759            
2760            !!!next-token;
2761            return;
2762        } elsif ($token->{tag_name} eq 'button') {        } elsif ($token->{tag_name} eq 'button') {
2763          ## has a button element in scope          ## has a button element in scope
2764          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
# Line 2579  sub _tree_construction_main ($) { Line 2798  sub _tree_construction_main ($) {
2798          !!!insert-element-t ($token->{tag_name}, $token->{attributes});          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2799                    
2800          $self->{content_model_flag} = 'CDATA';          $self->{content_model_flag} = 'CDATA';
2801            delete $self->{escape}; # MUST
2802                    
2803          !!!next-token;          !!!next-token;
2804          return;          return;
# Line 2695  sub _tree_construction_main ($) { Line 2915  sub _tree_construction_main ($) {
2915          } else {          } else {
2916            $self->{content_model_flag} = 'CDATA';            $self->{content_model_flag} = 'CDATA';
2917          }          }
2918            delete $self->{escape}; # MUST
2919                    
2920          $insert->($el);          $insert->($el);
2921                    
2922          my $text = '';          my $text = '';
2923          !!!next-token;          if ($token->{tag_name} eq 'textarea') {
2924              !!!next-token;
2925              if ($token->{type} eq 'character') {
2926                $token->{data} =~ s/^\x0A//;
2927                unless (length $token->{data}) {
2928                  !!!next-token;
2929                }
2930              }
2931            } else {
2932              !!!next-token;
2933            }
2934          while ($token->{type} eq 'character') {          while ($token->{type} eq 'character') {
2935            $text .= $token->{data};            $text .= $token->{data};
2936            !!!next-token;            !!!next-token;
# Line 2715  sub _tree_construction_main ($) { Line 2946  sub _tree_construction_main ($) {
2946            ## Ignore the token            ## Ignore the token
2947          } else {          } else {
2948            if ($token->{tag_name} eq 'textarea') {            if ($token->{tag_name} eq 'textarea') {
             !!!parse-error (type => 'in CDATA:#'.$token->{type});  
           } else {  
2949              !!!parse-error (type => 'in RCDATA:#'.$token->{type});              !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2950              } else {
2951                !!!parse-error (type => 'in CDATA:#'.$token->{type});
2952            }            }
2953            ## ISSUE: And ignore?            ## ISSUE: And ignore?
2954          }          }
# Line 2753  sub _tree_construction_main ($) { Line 2984  sub _tree_construction_main ($) {
2984        }        }
2985      } elsif ($token->{type} eq 'end tag') {      } elsif ($token->{type} eq 'end tag') {
2986        if ($token->{tag_name} eq 'body') {        if ($token->{tag_name} eq 'body') {
2987          if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {          if (@{$self->{open_elements}} > 1 and
2988            ## ISSUE: There is an issue in the spec.              $self->{open_elements}->[1]->[1] eq 'body') {
2989            if ($self->{open_elements}->[-1]->[1] ne 'body') {            for (@{$self->{open_elements}}) {
2990              !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);              unless ({
2991                           dd => 1, dt => 1, li => 1, p => 1, td => 1,
2992                           th => 1, tr => 1, body => 1, html => 1,
2993                        }->{$_->[1]}) {
2994                  !!!parse-error (type => 'not closed:'.$_->[1]);
2995                }
2996            }            }
2997    
2998            $self->{insertion_mode} = 'after body';            $self->{insertion_mode} = 'after body';
2999            !!!next-token;            !!!next-token;
3000            return;            return;
# Line 2786  sub _tree_construction_main ($) { Line 3023  sub _tree_construction_main ($) {
3023                  address => 1, blockquote => 1, center => 1, dir => 1,                  address => 1, blockquote => 1, center => 1, dir => 1,
3024                  div => 1, dl => 1, fieldset => 1, listing => 1,                  div => 1, dl => 1, fieldset => 1, listing => 1,
3025                  menu => 1, ol => 1, pre => 1, ul => 1,                  menu => 1, ol => 1, pre => 1, ul => 1,
                 form => 1,  
3026                  p => 1,                  p => 1,
3027                  dd => 1, dt => 1, li => 1,                  dd => 1, dt => 1, li => 1,
3028                  button => 1, marquee => 1, object => 1,                  button => 1, marquee => 1, object => 1,
# Line 2824  sub _tree_construction_main ($) { Line 3060  sub _tree_construction_main ($) {
3060          }          }
3061                    
3062          splice @{$self->{open_elements}}, $i if defined $i;          splice @{$self->{open_elements}}, $i if defined $i;
         undef $self->{form_element} if $token->{tag_name} eq 'form';  
3063          $clear_up_to_marker->()          $clear_up_to_marker->()
3064            if {            if {
3065              button => 1, marquee => 1, object => 1,              button => 1, marquee => 1, object => 1,
3066            }->{$token->{tag_name}};            }->{$token->{tag_name}};
3067          !!!next-token;          !!!next-token;
3068          return;          return;
3069          } elsif ($token->{tag_name} eq 'form') {
3070            ## has an element in scope
3071            INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3072              my $node = $self->{open_elements}->[$_];
3073              if ($node->[1] eq $token->{tag_name}) {
3074                ## generate implied end tags
3075                if ({
3076                     dd => 1, dt => 1, li => 1, p => 1,
3077                     td => 1, th => 1, tr => 1,
3078                    }->{$self->{open_elements}->[-1]->[1]}) {
3079                  !!!back-token;
3080                  $token = {type => 'end tag',
3081                            tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3082                  return;
3083                }
3084                last INSCOPE;
3085              } elsif ({
3086                        table => 1, caption => 1, td => 1, th => 1,
3087                        button => 1, marquee => 1, object => 1, html => 1,
3088                       }->{$node->[1]}) {
3089                last INSCOPE;
3090              }
3091            } # INSCOPE
3092            
3093            if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
3094              pop @{$self->{open_elements}};
3095            } else {
3096              !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3097            }
3098    
3099            undef $self->{form_element};
3100            !!!next-token;
3101            return;
3102        } elsif ({        } elsif ({
3103                  h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,                  h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3104                 }->{$token->{tag_name}}) {                 }->{$token->{tag_name}}) {
# Line 2875  sub _tree_construction_main ($) { Line 3143  sub _tree_construction_main ($) {
3143                  strong => 1, tt => 1, u => 1,                  strong => 1, tt => 1, u => 1,
3144                 }->{$token->{tag_name}}) {                 }->{$token->{tag_name}}) {
3145          $formatting_end_tag->($token->{tag_name});          $formatting_end_tag->($token->{tag_name});
3146    ## TODO: <http://html5.org/tools/web-apps-tracker?from=883&to=884>
3147          return;          return;
3148        } elsif ({        } elsif ({
3149                  caption => 1, col => 1, colgroup => 1, frame => 1,                  caption => 1, col => 1, colgroup => 1, frame => 1,
# Line 3075  sub _tree_construction_main ($) { Line 3344  sub _tree_construction_main ($) {
3344                (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])                (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
3345                  ->append_child ($title_el);                  ->append_child ($title_el);
3346                $self->{content_model_flag} = 'RCDATA';                $self->{content_model_flag} = 'RCDATA';
3347                  delete $self->{escape}; # MUST
3348    
3349                my $text = '';                my $text = '';
3350                !!!next-token;                !!!next-token;
# Line 4846  sub set_inner_html ($$$) { Line 5116  sub set_inner_html ($$$) {
5116      ## NOTE: Most of this code is copied from |parse_string|      ## NOTE: Most of this code is copied from |parse_string|
5117    
5118      ## Step 1 # MUST      ## Step 1 # MUST
5119      my $doc = $node->owner_document->implementation->create_document;      my $this_doc = $node->owner_document;
5120      ## TODO: Mark as HTML document      my $doc = $this_doc->implementation->create_document;
5121        $doc->manakai_is_html (1);
5122      my $p = $class->new;      my $p = $class->new;
5123      $p->{document} = $doc;      $p->{document} = $doc;
5124    
# Line 4857  sub set_inner_html ($$$) { Line 5128  sub set_inner_html ($$$) {
5128      my $column = 0;      my $column = 0;
5129      $p->{set_next_input_character} = sub {      $p->{set_next_input_character} = sub {
5130        my $self = shift;        my $self = shift;
5131    
5132          pop @{$self->{prev_input_character}};
5133          unshift @{$self->{prev_input_character}}, $self->{next_input_character};
5134    
5135        $self->{next_input_character} = -1 and return if $i >= length $$s;        $self->{next_input_character} = -1 and return if $i >= length $$s;
5136        $self->{next_input_character} = ord substr $$s, $i++, 1;        $self->{next_input_character} = ord substr $$s, $i++, 1;
5137        $column++;        $column++;
# Line 4865  sub set_inner_html ($$$) { Line 5140  sub set_inner_html ($$$) {
5140          $line++;          $line++;
5141          $column = 0;          $column = 0;
5142        } elsif ($self->{next_input_character} == 0x000D) { # CR        } elsif ($self->{next_input_character} == 0x000D) { # CR
5143          if ($i >= length $$s) {          $i++ if substr ($$s, $i, 1) eq "\x0A";
           #  
         } else {  
           my $next_char = ord substr $$s, $i++, 1;  
           if ($next_char == 0x000A) { # LF  
             #  
           } else {  
             push @{$self->{char}}, $next_char;  
           }  
         }  
5144          $self->{next_input_character} = 0x000A; # LF # MUST          $self->{next_input_character} = 0x000A; # LF # MUST
5145          $line++;          $line++;
5146          $column = 0;          $column = 0;
5147        } elsif ($self->{next_input_character} > 0x10FFFF) {        } elsif ($self->{next_input_character} > 0x10FFFF) {
5148          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5149        } elsif ($self->{next_input_character} == 0x0000) { # NULL        } elsif ($self->{next_input_character} == 0x0000) { # NULL
5150            !!!parse-error (type => 'NULL');
5151          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5152        }        }
5153      };      };
5154        $p->{prev_input_character} = [-1, -1, -1];
5155        $p->{next_input_character} = -1;
5156            
5157      my $ponerror = $onerror || sub {      my $ponerror = $onerror || sub {
5158        my (%opt) = @_;        my (%opt) = @_;
# Line 4962  sub set_inner_html ($$$) { Line 5231  sub set_inner_html ($$$) {
5231      ## Step 12 # MUST      ## Step 12 # MUST
5232      @cn = @{$root->child_nodes};      @cn = @{$root->child_nodes};
5233      for (@cn) {      for (@cn) {
5234          $this_doc->adopt_node ($_);
5235        $node->append_child ($_);        $node->append_child ($_);
5236      }      }
5237      ## ISSUE: adopt_node? mutation events?      ## ISSUE: mutation events?
5238    
5239      $p->_terminate_tree_constructor;      $p->_terminate_tree_constructor;
5240    } else {    } else {

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.20

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24