/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.22 by wakaba,
Sun Oct 19 10:12:54 2008 UTC
+revision 1.34 by wakaba,
Sat Sep  5 11:31:58 2009 UTC
 Line 105 
 sub COMMENT_START_STATE () { 14 }
  sub COMMENT_START_DASH_STATE () { 15 }
  sub COMMENT_STATE () { 16 }
  sub COMMENT_END_STATE () { 17 }
+ sub COMMENT_END_BANG_STATE () { 102 }
+ sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
  sub COMMENT_END_DASH_STATE () { 18 }
  sub BOGUS_COMMENT_STATE () { 19 }
  sub DOCTYPE_STATE () { 20 }
-Line 204 
 sub FOREIGN_EL () { 0b1_00000000000 }
+Line 206 
 sub FOREIGN_EL () { 0b1_00000000000 }
  ## Character reference mappings
  my $charref_map = {
+0x00 => 0xFFFD, # REPLACEMENT CHARACTER
 0x0D => 0x000A,
 0x80 => 0x20AC,
-0x81 => 0xFFFD,
+0x81 => 0x0081,
 0x82 => 0x201A,
 0x83 => 0x0192,
 0x84 => 0x201E,
-Line 218 
 my $charref_map = {
+Line 221 
 my $charref_map = {
 0x8A => 0x0160,
 0x8B => 0x2039,
 0x8C => 0x0152,
-0x8D => 0xFFFD,
+0x8D => 0x008D,
 0x8E => 0x017D,
-0x8F => 0xFFFD,
+0x8F => 0x008F,
-0x90 => 0xFFFD,
+0x90 => 0x0090,
 0x91 => 0x2018,
 0x92 => 0x2019,
 0x93 => 0x201C,
-Line 234 
 my $charref_map = {
+Line 237 
 my $charref_map = {
 0x9A => 0x0161,
 0x9B => 0x203A,
 0x9C => 0x0153,
-0x9D => 0xFFFD,
+0x9D => 0x009D,
 0x9E => 0x017E,
 0x9F => 0x0178,
  }; # $charref_map
- $charref_map->{$_} = 0xFFFD
+ $charref_map->{$_} = $_
-     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
+     for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
-0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
+0xD800..0xDFFF, 0xFDD0..0xFDEF,
 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
-Line 860 
 sub _get_next_token ($) {
+Line 863 
 sub _get_next_token ($) {
          $self->{s_kwd} = '';
          # reconsume
-         !!!emit ($self->{ct}); # start tag or end tag
+         ## Discard the token.
+         #!!!emit ($self->{ct}); # start tag or end tag
          redo A;
        } elsif ($self->{nc} == 0x002F) { # /
-Line 941 
 sub _get_next_token ($) {
+Line 945 
 sub _get_next_token ($) {
          $self->{s_kwd} = '';
          # reconsume
-         !!!emit ($self->{ct}); # start tag or end tag
+         ## Discard the token.
+         #!!!emit ($self->{ct}); # start tag or end tag
          redo A;
        } else {
          if ({
 0x0022 => 1, # "
 0x0027 => 1, # '
+0x003C => 1, # <
 0x003D => 1, # =
              }->{$self->{nc}}) {
            !!!cp (55);
-Line 1066 
 sub _get_next_token ($) {
+Line 1072 
 sub _get_next_token ($) {
          $self->{s_kwd} = '';
          # reconsume
-         !!!emit ($self->{ct}); # start tag or end tag
+         ## Discard the token.
+         #!!!emit ($self->{ct}); # start tag or end tag
          redo A;
        } else {
-         if ($self->{nc} == 0x0022 or # "
+         if ({
-             $self->{nc} == 0x0027) { # '
+0x0022 => 1, # "
+0x0027 => 1, # '
+0x003C => 1, # <
+             }->{$self->{nc}}) {
            !!!cp (69);
            ## XML5: Not a parse error.
            !!!parse-error (type => 'bad attribute name');
-Line 1170 
 sub _get_next_token ($) {
+Line 1180 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          # reconsume
-         !!!emit ($self->{ct}); # start tag or end tag
+         ## Discard the token.
+         #!!!emit ($self->{ct}); # start tag or end tag
          redo A;
        } else {
-Line 1182 
 sub _get_next_token ($) {
+Line 1193 
 sub _get_next_token ($) {
            !!!cp (78.2);
          }
-         if ($self->{nc} == 0x0022 or # "
+         if ({
-             $self->{nc} == 0x0027) { # '
+0x0022 => 1, # "
+0x0027 => 1, # '
+0x003C => 1, # <
+             }->{$self->{nc}}) {
            !!!cp (78);
            ## XML5: Not a parse error.
            !!!parse-error (type => 'bad attribute name');
-Line 1266 
 sub _get_next_token ($) {
+Line 1280 
 sub _get_next_token ($) {
          $self->{s_kwd} = '';
          ## reconsume
-         !!!emit ($self->{ct}); # start tag or end tag
+         ## Discard the token.
+         #!!!emit ($self->{ct}); # start tag or end tag
          redo A;
        } else {
-         if ($self->{nc} == 0x003D) { # =
+         if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
            !!!cp (93);
            ## XML5: Not a parse error.
            !!!parse-error (type => 'bad attribute value');
-Line 1316 
 sub _get_next_token ($) {
+Line 1331 
 sub _get_next_token ($) {
          $self->{state} = ENTITY_STATE;
          !!!next-input-character;
          redo A;
+       } elsif ($self->{is_xml} and
+                $is_space->{$self->{nc}}) {
+         !!!cp (97.1);
+         $self->{ca}->{value} .= ' ';
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
        } elsif ($self->{nc} == -1) {
          !!!parse-error (type => 'unclosed attribute value');
          if ($self->{ct}->{type} == START_TAG_TOKEN) {
-Line 1340 
 sub _get_next_token ($) {
+Line 1362 
 sub _get_next_token ($) {
            $self->{state} = DATA_STATE;
            $self->{s_kwd} = '';
            ## reconsume
-           !!!emit ($self->{ct}); # end tag
+           ## Discard the token.
+           #!!!emit ($self->{ct}); # end tag
            redo A;
          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
            ## XML5: No parse error above; not defined yet.
            push @{$self->{ct}->{attrdefs}}, $self->{ca};
            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
            ## Reconsume.
-           !!!emit ($self->{ct}); # ATTLIST
+           ## Discard the token.
+           #!!!emit ($self->{ct}); # ATTLIST
            redo A;
          } else {
            die "$0: $self->{ct}->{type}: Unknown token type";
-Line 1363 
 sub _get_next_token ($) {
+Line 1391 
 sub _get_next_token ($) {
          }
          $self->{ca}->{value} .= chr ($self->{nc});
          $self->{read_until}->($self->{ca}->{value},
-                               q["&<],
+                               qq["&<\x09\x0C\x20],
                                length $self->{ca}->{value});
          ## Stay in the state
-Line 1400 
 sub _get_next_token ($) {
+Line 1428 
 sub _get_next_token ($) {
          $self->{state} = ENTITY_STATE;
          !!!next-input-character;
          redo A;
+       } elsif ($self->{is_xml} and
+                $is_space->{$self->{nc}}) {
+         !!!cp (103.1);
+         $self->{ca}->{value} .= ' ';
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
        } elsif ($self->{nc} == -1) {
          !!!parse-error (type => 'unclosed attribute value');
          if ($self->{ct}->{type} == START_TAG_TOKEN) {
-Line 1409 
 sub _get_next_token ($) {
+Line 1444 
 sub _get_next_token ($) {
            $self->{state} = DATA_STATE;
            $self->{s_kwd} = '';
            ## reconsume
-           !!!emit ($self->{ct}); # start tag
+           ## Discard the token.
+           #!!!emit ($self->{ct}); # start tag
            redo A;
          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
-Line 1424 
 sub _get_next_token ($) {
+Line 1462 
 sub _get_next_token ($) {
            $self->{state} = DATA_STATE;
            $self->{s_kwd} = '';
            ## reconsume
-           !!!emit ($self->{ct}); # end tag
+           ## Discard the token.
+           #!!!emit ($self->{ct}); # end tag
            redo A;
          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
            ## XML5: No parse error above; not defined yet.
            push @{$self->{ct}->{attrdefs}}, $self->{ca};
            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
            ## Reconsume.
-           !!!emit ($self->{ct}); # ATTLIST
+           ## Discard the token.
+           #!!!emit ($self->{ct}); # ATTLIST
            redo A;
          } else {
            die "$0: $self->{ct}->{type}: Unknown token type";
-Line 1447 
 sub _get_next_token ($) {
+Line 1491 
 sub _get_next_token ($) {
          }
          $self->{ca}->{value} .= chr ($self->{nc});
          $self->{read_until}->($self->{ca}->{value},
-                               q['&<],
+                               qq['&<\x09\x0C\x20],
                                length $self->{ca}->{value});
          ## Stay in the state
-Line 1526 
 sub _get_next_token ($) {
+Line 1570 
 sub _get_next_token ($) {
            $self->{state} = DATA_STATE;
            $self->{s_kwd} = '';
            ## reconsume
-           !!!emit ($self->{ct}); # start tag
+           ## Discard the token.
+           #!!!emit ($self->{ct}); # start tag
            redo A;
          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
            !!!parse-error (type => 'unclosed tag');
-Line 1542 
 sub _get_next_token ($) {
+Line 1589 
 sub _get_next_token ($) {
            $self->{state} = DATA_STATE;
            $self->{s_kwd} = '';
            ## reconsume
-           !!!emit ($self->{ct}); # end tag
+           ## Discard the token.
+           #!!!emit ($self->{ct}); # end tag
            redo A;
          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
            !!!parse-error (type => 'unclosed md'); ## TODO: type
            push @{$self->{ct}->{attrdefs}}, $self->{ca};
            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
            ## Reconsume.
-           !!!emit ($self->{ct}); # ATTLIST
+           ## Discard the token.
+           #!!!emit ($self->{ct}); # ATTLIST
            redo A;
          } else {
            die "$0: $self->{ct}->{type}: Unknown token type";
-Line 1559 
 sub _get_next_token ($) {
+Line 1612 
 sub _get_next_token ($) {
 0x0022 => 1, # "
 0x0027 => 1, # '
 0x003D => 1, # =
+0x003C => 1, # <
              }->{$self->{nc}}) {
            !!!cp (115);
            ## XML5: Not a parse error.
-Line 1568 
 sub _get_next_token ($) {
+Line 1622 
 sub _get_next_token ($) {
          }
          $self->{ca}->{value} .= chr ($self->{nc});
          $self->{read_until}->($self->{ca}->{value},
-                               q["'=& >],
+                               qq["'=& \x09\x0C>],
                                length $self->{ca}->{value});
          ## Stay in the state
-Line 1628 
 sub _get_next_token ($) {
+Line 1682 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          $self->{s_kwd} = '';
          ## Reconsume.
-         !!!emit ($self->{ct}); # start tag or end tag
+         ## Discard the token.
+         #!!!emit ($self->{ct}); # start tag or end tag
          redo A;
        } else {
          !!!cp ('124.1');
-Line 1685 
 sub _get_next_token ($) {
+Line 1742 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          $self->{s_kwd} = '';
          ## Reconsume.
-         !!!emit ($self->{ct}); # start tag or end tag
+         ## Discard the token.
+         #!!!emit ($self->{ct}); # start tag or end tag
          redo A;
        } else {
          !!!cp ('124.4');
-Line 2060 
 sub _get_next_token ($) {
+Line 2120 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} == COMMENT_END_STATE) {
+     } elsif ($self->{state} == COMMENT_END_STATE or
+              $self->{state} == COMMENT_END_BANG_STATE) {
        ## XML5: "Comment end state" and "DOCTYPE comment end state".
+       ## (No comment end bang state.)
        if ($self->{nc} == 0x003E) { # >
          if ($self->{in_subset}) {
-Line 2078 
 sub _get_next_token ($) {
+Line 2140 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{nc} == 0x002D) { # -
-         !!!cp (152);
+         if ($self->{state} == COMMENT_END_BANG_STATE) {
-         ## XML5: Not a parse error.
+           !!!cp (154.3);
-         !!!parse-error (type => 'dash in comment',
+           $self->{ct}->{data} .= '--!'; # comment
-                         line => $self->{line_prev},
+           $self->{state} = COMMENT_END_DASH_STATE;
-                         column => $self->{column_prev});
+         } else {
-         $self->{ct}->{data} .= '-'; # comment
+           !!!cp (152);
-         ## Stay in the state
+           ## XML5: Not a parse error.
+           !!!parse-error (type => 'dash in comment',
+                           line => $self->{line_prev},
+                           column => $self->{column_prev});
+           $self->{ct}->{data} .= '-'; # comment
+           ## Stay in the state
+         }
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{state} != COMMENT_END_BANG_STATE and
+                $is_space->{$self->{nc}}) {
+         !!!cp (152.1);
+         !!!parse-error (type => 'comment end space'); # XXX error type
+         $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
+         $self->{state} = COMMENT_END_SPACE_STATE;
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{state} != COMMENT_END_BANG_STATE and
+                $self->{nc} == 0x0021) { # !
+         !!!cp (152.2);
+         !!!parse-error (type => 'comment end bang'); # XXX error type
+         $self->{state} = COMMENT_END_BANG_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{nc} == -1) {
-Line 2097 
 sub _get_next_token ($) {
+Line 2180 
 sub _get_next_token ($) {
            $self->{state} = DATA_STATE;
            $self->{s_kwd} = '';
          }
-         ## reconsume
+         ## Reconsume.
          !!!emit ($self->{ct}); # comment
          redo A;
        } else {
          !!!cp (154);
-         ## XML5: Not a parse error.
+         if ($self->{state} == COMMENT_END_BANG_STATE) {
-         !!!parse-error (type => 'dash in comment',
+           $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
-                         line => $self->{line_prev},
+         } else {
-                         column => $self->{column_prev});
+           $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
-         $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
+         }
          $self->{state} = COMMENT_STATE;
          !!!next-input-character;
          redo A;
        }
+     } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
+       ## XML5: Not exist.
+       if ($self->{nc} == 0x003E) { # >
+         if ($self->{in_subset}) {
+           !!!cp (154.4);
+           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
+         } else {
+           !!!cp (154.5);
+           $self->{state} = DATA_STATE;
+           $self->{s_kwd} = '';
+         }
+         !!!next-input-character;
+         !!!emit ($self->{ct}); # comment
+         redo A;
+       } elsif ($is_space->{$self->{nc}}) {
+         !!!cp (154.6);
+         $self->{ct}->{data} .= chr ($self->{nc}); # comment
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{nc} == -1) {
+         !!!parse-error (type => 'unclosed comment');
+         if ($self->{in_subset}) {
+           !!!cp (154.7);
+           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
+         } else {
+           !!!cp (154.8);
+           $self->{state} = DATA_STATE;
+           $self->{s_kwd} = '';
+         }
+         ## Reconsume.
+         !!!emit ($self->{ct}); # comment
+         redo A;
+       } else {
+         !!!cp (154.9);
+         $self->{ct}->{data} .= chr ($self->{nc}); # comment
+         $self->{state} = COMMENT_STATE;
+         !!!next-input-character;
+         redo A;
+       }
      } elsif ($self->{state} == DOCTYPE_STATE) {
        if ($is_space->{$self->{nc}}) {
          !!!cp (155);
          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
          !!!next-input-character;
          redo A;
+       } elsif ($self->{nc} == -1) {
+         !!!cp (155.1);
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{ct}->{quirks} = 1;
+         $self->{state} = DATA_STATE;
+         ## Reconsume.
+         !!!emit ($self->{ct}); # DOCTYPE (quirks)
+         redo A;
        } else {
          !!!cp (156);
-         ## XML5: Unless EOF, switch to the bogus comment state.
+         ## XML5: Switch to the bogus comment state.
          !!!parse-error (type => 'no space before DOCTYPE name');
          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
          ## reconsume
-Line 2146 
 sub _get_next_token ($) {
+Line 2284 
 sub _get_next_token ($) {
          !!!emit ($self->{ct}); # DOCTYPE (quirks)
          redo A;
+       } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
+         !!!cp (158.1);
+         $self->{ct}->{name} # DOCTYPE
+             = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
+         delete $self->{ct}->{quirks};
+         $self->{state} = DOCTYPE_NAME_STATE;
+         !!!next-input-character;
+         redo A;
        } elsif ($self->{nc} == -1) {
          !!!cp (159);
          !!!parse-error (type => 'no DOCTYPE name');
-Line 2192 
 sub _get_next_token ($) {
+Line 2338 
 sub _get_next_token ($) {
          !!!emit ($self->{ct}); # DOCTYPE
          redo A;
+       } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
+         !!!cp (162.1);
+         $self->{ct}->{name} # DOCTYPE
+             .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
+         delete $self->{ct}->{quirks};
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
        } elsif ($self->{nc} == -1) {
          !!!cp (163);
          !!!parse-error (type => 'unclosed DOCTYPE');
-Line 2213 
 sub _get_next_token ($) {
+Line 2367 
 sub _get_next_token ($) {
          redo A;
        } else {
          !!!cp (164);
-         $self->{ct}->{name}
+         $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
-           .= chr ($self->{nc}); # DOCTYPE
+         ## Stay in the state.
-         ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-Line 3217 
 sub _get_next_token ($) {
+Line 3370 
 sub _get_next_token ($) {
        my $code = $self->{kwd};
        my $l = $self->{line_prev};
        my $c = $self->{column_prev};
-       if ($charref_map->{$code}) {
+       if ((not $self->{is_xml} and $charref_map->{$code}) or
+           ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
+           ($self->{is_xml} and $code == 0x0000)) {
          !!!cp (1015);
          !!!parse-error (type => 'invalid character reference',
                          text => (sprintf 'U+%04X', $code),
-Line 3330 
 sub _get_next_token ($) {
+Line 3485 
 sub _get_next_token ($) {
        my $code = $self->{kwd};
        my $l = $self->{line_prev};
        my $c = $self->{column_prev};
-       if ($charref_map->{$code}) {
+       if ((not $self->{is_xml} and $charref_map->{$code}) or
+           ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
+           ($self->{is_xml} and $code == 0x0000)) {
          !!!cp (1008);
          !!!parse-error (type => 'invalid character reference',
                          text => (sprintf 'U+%04X', $code),
-Line 3659 
 sub _get_next_token ($) {
+Line 3816 
 sub _get_next_token ($) {
          ## XML5: Not defined yet.
          ## TODO:
+         if (not $self->{stop_processing} and
+             not $self->{document}->xml_standalone) {
+           !!!parse-error (type => 'stop processing', ## TODO: type
+                           level => $self->{level}->{info});
+           $self->{stop_processing} = 1;
+         }
          !!!next-input-character;
          redo A;
        } elsif ($self->{nc} == 0x005D) { # ]
-Line 3893 
 sub _get_next_token ($) {
+Line 4058 
 sub _get_next_token ($) {
          }
          $self->{ct} = {type => ELEMENT_TOKEN, name => '',
                         line => $self->{line_prev},
-                        column => $self->{column_prev} - 6};
+                        column => $self->{column_prev} - 7};
          $self->{state} = DOCTYPE_MD_STATE;
          !!!next-input-character;
          redo A;
-Line 3941 
 sub _get_next_token ($) {
+Line 4106 
 sub _get_next_token ($) {
          $self->{ct} = {type => ATTLIST_TOKEN, name => '',
                         attrdefs => [],
                         line => $self->{line_prev},
-                        column => $self->{column_prev} - 6};
+                        column => $self->{column_prev} - 7};
          $self->{state} = DOCTYPE_MD_STATE;
          !!!next-input-character;
          redo A;
-Line 3990 
 sub _get_next_token ($) {
+Line 4155 
 sub _get_next_token ($) {
          }
          $self->{ct} = {type => NOTATION_TOKEN, name => '',
                         line => $self->{line_prev},
-                        column => $self->{column_prev} - 6};
+                        column => $self->{column_prev} - 8};
          $self->{state} = DOCTYPE_MD_STATE;
          !!!next-input-character;
          redo A;

 Legend:



Removed from v.1.22
 


changed lines


 
Added in v.1.34
 Legend:



Removed from v.1.22
 


changed lines


 
Added in v.1.34
-Removed from v.1.22
+Added in v.1.34

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24