/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.31 by wakaba, Sat Sep 5 09:26:55 2009 UTC revision 1.34 by wakaba, Sat Sep 5 11:31:58 2009 UTC
# Line 105  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108  sub COMMENT_END_BANG_STATE () { 102 } ## LAST  sub COMMENT_END_BANG_STATE () { 102 }
109    sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
111  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
112  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 205  sub FOREIGN_EL () { 0b1_00000000000 } Line 206  sub FOREIGN_EL () { 0b1_00000000000 }
206  ## Character reference mappings  ## Character reference mappings
207    
208  my $charref_map = {  my $charref_map = {
209      0x00 => 0xFFFD, # REPLACEMENT CHARACTER
210    0x0D => 0x000A,    0x0D => 0x000A,
211    0x80 => 0x20AC,    0x80 => 0x20AC,
212    0x81 => 0xFFFD,    0x81 => 0x0081,
213    0x82 => 0x201A,    0x82 => 0x201A,
214    0x83 => 0x0192,    0x83 => 0x0192,
215    0x84 => 0x201E,    0x84 => 0x201E,
# Line 219  my $charref_map = { Line 221  my $charref_map = {
221    0x8A => 0x0160,    0x8A => 0x0160,
222    0x8B => 0x2039,    0x8B => 0x2039,
223    0x8C => 0x0152,    0x8C => 0x0152,
224    0x8D => 0xFFFD,    0x8D => 0x008D,
225    0x8E => 0x017D,    0x8E => 0x017D,
226    0x8F => 0xFFFD,    0x8F => 0x008F,
227    0x90 => 0xFFFD,    0x90 => 0x0090,
228    0x91 => 0x2018,    0x91 => 0x2018,
229    0x92 => 0x2019,    0x92 => 0x2019,
230    0x93 => 0x201C,    0x93 => 0x201C,
# Line 235  my $charref_map = { Line 237  my $charref_map = {
237    0x9A => 0x0161,    0x9A => 0x0161,
238    0x9B => 0x203A,    0x9B => 0x203A,
239    0x9C => 0x0153,    0x9C => 0x0153,
240    0x9D => 0xFFFD,    0x9D => 0x009D,
241    0x9E => 0x017E,    0x9E => 0x017E,
242    0x9F => 0x0178,    0x9F => 0x0178,
243  }; # $charref_map  }; # $charref_map
244  $charref_map->{$_} = 0xFFFD  $charref_map->{$_} = $_
245      for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,      for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
246          0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF          0xD800..0xDFFF, 0xFDD0..0xFDEF,
247          0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,          0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
248          0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,          0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
249          0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,          0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
# Line 861  sub _get_next_token ($) { Line 863  sub _get_next_token ($) {
863          $self->{s_kwd} = '';          $self->{s_kwd} = '';
864          # reconsume          # reconsume
865    
866          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
867            #!!!emit ($self->{ct}); # start tag or end tag
868    
869          redo A;          redo A;
870        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
# Line 942  sub _get_next_token ($) { Line 945  sub _get_next_token ($) {
945          $self->{s_kwd} = '';          $self->{s_kwd} = '';
946          # reconsume          # reconsume
947    
948          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
949            #!!!emit ($self->{ct}); # start tag or end tag
950    
951          redo A;          redo A;
952        } else {        } else {
# Line 1068  sub _get_next_token ($) { Line 1072  sub _get_next_token ($) {
1072          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1073          # reconsume          # reconsume
1074    
1075          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
1076            #!!!emit ($self->{ct}); # start tag or end tag
1077    
1078          redo A;          redo A;
1079        } else {        } else {
# Line 1175  sub _get_next_token ($) { Line 1180  sub _get_next_token ($) {
1180          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1181          # reconsume          # reconsume
1182    
1183          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
1184            #!!!emit ($self->{ct}); # start tag or end tag
1185    
1186          redo A;          redo A;
1187        } else {        } else {
# Line 1274  sub _get_next_token ($) { Line 1280  sub _get_next_token ($) {
1280          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1281          ## reconsume          ## reconsume
1282    
1283          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
1284            #!!!emit ($self->{ct}); # start tag or end tag
1285    
1286          redo A;          redo A;
1287        } else {        } else {
# Line 1355  sub _get_next_token ($) { Line 1362  sub _get_next_token ($) {
1362            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1363            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1364            ## reconsume            ## reconsume
1365            !!!emit ($self->{ct}); # end tag  
1366              ## Discard the token.
1367              #!!!emit ($self->{ct}); # end tag
1368    
1369            redo A;            redo A;
1370          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1371            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
1372            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1373            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1374            ## Reconsume.            ## Reconsume.
1375            !!!emit ($self->{ct}); # ATTLIST  
1376              ## Discard the token.
1377              #!!!emit ($self->{ct}); # ATTLIST
1378    
1379            redo A;            redo A;
1380          } else {          } else {
1381            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1431  sub _get_next_token ($) { Line 1444  sub _get_next_token ($) {
1444            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1445            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1446            ## reconsume            ## reconsume
1447            !!!emit ($self->{ct}); # start tag  
1448              ## Discard the token.
1449              #!!!emit ($self->{ct}); # start tag
1450    
1451            redo A;            redo A;
1452          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1453            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
# Line 1446  sub _get_next_token ($) { Line 1462  sub _get_next_token ($) {
1462            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1463            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1464            ## reconsume            ## reconsume
1465            !!!emit ($self->{ct}); # end tag  
1466              ## Discard the token.
1467              #!!!emit ($self->{ct}); # end tag
1468    
1469            redo A;            redo A;
1470          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1471            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
1472            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1473            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1474            ## Reconsume.            ## Reconsume.
1475            !!!emit ($self->{ct}); # ATTLIST  
1476              ## Discard the token.
1477              #!!!emit ($self->{ct}); # ATTLIST
1478    
1479            redo A;            redo A;
1480          } else {          } else {
1481            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1548  sub _get_next_token ($) { Line 1570  sub _get_next_token ($) {
1570            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1571            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1572            ## reconsume            ## reconsume
1573            !!!emit ($self->{ct}); # start tag  
1574              ## Discard the token.
1575              #!!!emit ($self->{ct}); # start tag
1576              
1577            redo A;            redo A;
1578          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1579            !!!parse-error (type => 'unclosed tag');            !!!parse-error (type => 'unclosed tag');
# Line 1564  sub _get_next_token ($) { Line 1589  sub _get_next_token ($) {
1589            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1590            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1591            ## reconsume            ## reconsume
1592            !!!emit ($self->{ct}); # end tag  
1593              ## Discard the token.
1594              #!!!emit ($self->{ct}); # end tag
1595    
1596            redo A;            redo A;
1597          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1598            !!!parse-error (type => 'unclosed md'); ## TODO: type            !!!parse-error (type => 'unclosed md'); ## TODO: type
1599            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1600            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1601            ## Reconsume.            ## Reconsume.
1602            !!!emit ($self->{ct}); # ATTLIST  
1603              ## Discard the token.
1604              #!!!emit ($self->{ct}); # ATTLIST
1605    
1606            redo A;            redo A;
1607          } else {          } else {
1608            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1651  sub _get_next_token ($) { Line 1682  sub _get_next_token ($) {
1682          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1683          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1684          ## Reconsume.          ## Reconsume.
1685          !!!emit ($self->{ct}); # start tag or end tag  
1686            ## Discard the token.
1687            #!!!emit ($self->{ct}); # start tag or end tag
1688    
1689          redo A;          redo A;
1690        } else {        } else {
1691          !!!cp ('124.1');          !!!cp ('124.1');
# Line 1708  sub _get_next_token ($) { Line 1742  sub _get_next_token ($) {
1742          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1743          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1744          ## Reconsume.          ## Reconsume.
1745          !!!emit ($self->{ct}); # start tag or end tag  
1746            ## Discard the token.
1747            #!!!emit ($self->{ct}); # start tag or end tag
1748    
1749          redo A;          redo A;
1750        } else {        } else {
1751          !!!cp ('124.4');          !!!cp ('124.4');
# Line 2118  sub _get_next_token ($) { Line 2155  sub _get_next_token ($) {
2155          }          }
2156          !!!next-input-character;          !!!next-input-character;
2157          redo A;          redo A;
2158        } elsif ($self->{nc} == 0x0021 and # !        } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2159                 $self->{state} != COMMENT_END_BANG_STATE) {                 $is_space->{$self->{nc}}) {
2160            !!!cp (152.1);
2161            !!!parse-error (type => 'comment end space'); # XXX error type
2162            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2163            $self->{state} = COMMENT_END_SPACE_STATE;
2164            !!!next-input-character;
2165            redo A;
2166          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2167                   $self->{nc} == 0x0021) { # !
2168            !!!cp (152.2);
2169          !!!parse-error (type => 'comment end bang'); # XXX error type          !!!parse-error (type => 'comment end bang'); # XXX error type
2170          $self->{state} = COMMENT_END_BANG_STATE;          $self->{state} = COMMENT_END_BANG_STATE;
2171          !!!next-input-character;          !!!next-input-character;
# Line 2150  sub _get_next_token ($) { Line 2196  sub _get_next_token ($) {
2196          !!!next-input-character;          !!!next-input-character;
2197          redo A;          redo A;
2198        }        }
2199        } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
2200          ## XML5: Not exist.
2201    
2202          if ($self->{nc} == 0x003E) { # >
2203            if ($self->{in_subset}) {
2204              !!!cp (154.4);
2205              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2206            } else {
2207              !!!cp (154.5);
2208              $self->{state} = DATA_STATE;
2209              $self->{s_kwd} = '';
2210            }
2211            !!!next-input-character;
2212    
2213            !!!emit ($self->{ct}); # comment
2214    
2215            redo A;
2216          } elsif ($is_space->{$self->{nc}}) {
2217            !!!cp (154.6);
2218            $self->{ct}->{data} .= chr ($self->{nc}); # comment
2219            ## Stay in the state.
2220            !!!next-input-character;
2221            redo A;
2222          } elsif ($self->{nc} == -1) {
2223            !!!parse-error (type => 'unclosed comment');
2224            if ($self->{in_subset}) {
2225              !!!cp (154.7);
2226              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2227            } else {
2228              !!!cp (154.8);
2229              $self->{state} = DATA_STATE;
2230              $self->{s_kwd} = '';
2231            }
2232            ## Reconsume.
2233    
2234            !!!emit ($self->{ct}); # comment
2235    
2236            redo A;
2237          } else {
2238            !!!cp (154.9);
2239            $self->{ct}->{data} .= chr ($self->{nc}); # comment
2240            $self->{state} = COMMENT_STATE;
2241            !!!next-input-character;
2242            redo A;
2243          }
2244      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
2245        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2246          !!!cp (155);          !!!cp (155);

Legend:
Removed from v.1.31  
changed lines
  Added in v.1.34

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24