/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.30 by wakaba, Sun Aug 16 05:24:47 2009 UTC | revision 1.33 by wakaba, Sat Sep 5 10:41:07 2009 UTC
# Line 105  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108    sub COMMENT_END_BANG_STATE () { 102 }
109    sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
111  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
112  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 860  sub _get_next_token ($) { Line 862  sub _get_next_token ($) {
862          $self->{s_kwd} = '';          $self->{s_kwd} = '';
863          # reconsume          # reconsume
864    
865          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
866            #!!!emit ($self->{ct}); # start tag or end tag
867    
868          redo A;          redo A;
869        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
# Line 941  sub _get_next_token ($) { Line 944  sub _get_next_token ($) {
944          $self->{s_kwd} = '';          $self->{s_kwd} = '';
945          # reconsume          # reconsume
946    
947          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
948            #!!!emit ($self->{ct}); # start tag or end tag
949    
950          redo A;          redo A;
951        } else {        } else {
# Line 1067  sub _get_next_token ($) { Line 1071  sub _get_next_token ($) {
1071          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1072          # reconsume          # reconsume
1073    
1074          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
1075            #!!!emit ($self->{ct}); # start tag or end tag
1076    
1077          redo A;          redo A;
1078        } else {        } else {
# Line 1174  sub _get_next_token ($) { Line 1179  sub _get_next_token ($) {
1179          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1180          # reconsume          # reconsume
1181    
1182          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
1183            #!!!emit ($self->{ct}); # start tag or end tag
1184    
1185          redo A;          redo A;
1186        } else {        } else {
# Line 1273  sub _get_next_token ($) { Line 1279  sub _get_next_token ($) {
1279          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1280          ## reconsume          ## reconsume
1281    
1282          !!!emit ($self->{ct}); # start tag or end tag          ## Discard the token.
1283            #!!!emit ($self->{ct}); # start tag or end tag
1284    
1285          redo A;          redo A;
1286        } else {        } else {
# Line 1354  sub _get_next_token ($) { Line 1361  sub _get_next_token ($) {
1361            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1362            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1363            ## reconsume            ## reconsume
1364            !!!emit ($self->{ct}); # end tag  
1365              ## Discard the token.
1366              #!!!emit ($self->{ct}); # end tag
1367    
1368            redo A;            redo A;
1369          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1370            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
1371            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1372            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1373            ## Reconsume.            ## Reconsume.
1374            !!!emit ($self->{ct}); # ATTLIST  
1375              ## Discard the token.
1376              #!!!emit ($self->{ct}); # ATTLIST
1377    
1378            redo A;            redo A;
1379          } else {          } else {
1380            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1430  sub _get_next_token ($) { Line 1443  sub _get_next_token ($) {
1443            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1444            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1445            ## reconsume            ## reconsume
1446            !!!emit ($self->{ct}); # start tag  
1447              ## Discard the token.
1448              #!!!emit ($self->{ct}); # start tag
1449    
1450            redo A;            redo A;
1451          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1452            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
# Line 1445  sub _get_next_token ($) { Line 1461  sub _get_next_token ($) {
1461            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1462            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1463            ## reconsume            ## reconsume
1464            !!!emit ($self->{ct}); # end tag  
1465              ## Discard the token.
1466              #!!!emit ($self->{ct}); # end tag
1467    
1468            redo A;            redo A;
1469          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1470            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
1471            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1472            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1473            ## Reconsume.            ## Reconsume.
1474            !!!emit ($self->{ct}); # ATTLIST  
1475              ## Discard the token.
1476              #!!!emit ($self->{ct}); # ATTLIST
1477    
1478            redo A;            redo A;
1479          } else {          } else {
1480            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1547  sub _get_next_token ($) { Line 1569  sub _get_next_token ($) {
1569            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1570            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1571            ## reconsume            ## reconsume
1572            !!!emit ($self->{ct}); # start tag  
1573              ## Discard the token.
1574              #!!!emit ($self->{ct}); # start tag
1575              
1576            redo A;            redo A;
1577          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578            !!!parse-error (type => 'unclosed tag');            !!!parse-error (type => 'unclosed tag');
# Line 1563  sub _get_next_token ($) { Line 1588  sub _get_next_token ($) {
1588            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1589            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1590            ## reconsume            ## reconsume
1591            !!!emit ($self->{ct}); # end tag  
1592              ## Discard the token.
1593              #!!!emit ($self->{ct}); # end tag
1594    
1595            redo A;            redo A;
1596          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1597            !!!parse-error (type => 'unclosed md'); ## TODO: type            !!!parse-error (type => 'unclosed md'); ## TODO: type
1598            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1599            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1600            ## Reconsume.            ## Reconsume.
1601            !!!emit ($self->{ct}); # ATTLIST  
1602              ## Discard the token.
1603              #!!!emit ($self->{ct}); # ATTLIST
1604    
1605            redo A;            redo A;
1606          } else {          } else {
1607            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1650  sub _get_next_token ($) { Line 1681  sub _get_next_token ($) {
1681          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1682          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1683          ## Reconsume.          ## Reconsume.
1684          !!!emit ($self->{ct}); # start tag or end tag  
1685            ## Discard the token.
1686            #!!!emit ($self->{ct}); # start tag or end tag
1687    
1688          redo A;          redo A;
1689        } else {        } else {
1690          !!!cp ('124.1');          !!!cp ('124.1');
# Line 1707  sub _get_next_token ($) { Line 1741  sub _get_next_token ($) {
1741          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1742          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1743          ## Reconsume.          ## Reconsume.
1744          !!!emit ($self->{ct}); # start tag or end tag  
1745            ## Discard the token.
1746            #!!!emit ($self->{ct}); # start tag or end tag
1747    
1748          redo A;          redo A;
1749        } else {        } else {
1750          !!!cp ('124.4');          !!!cp ('124.4');
# Line 2082  sub _get_next_token ($) { Line 2119  sub _get_next_token ($) {
2119          !!!next-input-character;          !!!next-input-character;
2120          redo A;          redo A;
2121        }        }
2122      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE or
2123                 $self->{state} == COMMENT_END_BANG_STATE) {
2124        ## XML5: "Comment end state" and "DOCTYPE comment end state".        ## XML5: "Comment end state" and "DOCTYPE comment end state".
2125          ## (No comment end bang state.)
2126    
2127        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2128          if ($self->{in_subset}) {          if ($self->{in_subset}) {
# Line 2100  sub _get_next_token ($) { Line 2139  sub _get_next_token ($) {
2139    
2140          redo A;          redo A;
2141        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2142          !!!cp (152);          if ($self->{state} == COMMENT_END_BANG_STATE) {
2143          ## XML5: Not a parse error.            !!!cp (154.3);
2144          !!!parse-error (type => 'dash in comment',            $self->{ct}->{data} .= '--!'; # comment
2145                          line => $self->{line_prev},            $self->{state} = COMMENT_END_DASH_STATE;
2146                          column => $self->{column_prev});          } else {
2147          $self->{ct}->{data} .= '-'; # comment            !!!cp (152);
2148          ## Stay in the state            ## XML5: Not a parse error.
2149              !!!parse-error (type => 'dash in comment',
2150                              line => $self->{line_prev},
2151                              column => $self->{column_prev});
2152              $self->{ct}->{data} .= '-'; # comment
2153              ## Stay in the state
2154            }
2155            !!!next-input-character;
2156            redo A;
2157          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2158                   $is_space->{$self->{nc}}) {
2159            !!!cp (152.1);
2160            !!!parse-error (type => 'comment end space'); # XXX error type
2161            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2162            $self->{state} = COMMENT_END_SPACE_STATE;
2163            !!!next-input-character;
2164            redo A;
2165          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2166                   $self->{nc} == 0x0021) { # !
2167            !!!cp (152.2);
2168            !!!parse-error (type => 'comment end bang'); # XXX error type
2169            $self->{state} = COMMENT_END_BANG_STATE;
2170          !!!next-input-character;          !!!next-input-character;
2171          redo A;          redo A;
2172        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
# Line 2119  sub _get_next_token ($) { Line 2179  sub _get_next_token ($) {
2179            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2180            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2181          }          }
2182          ## reconsume          ## Reconsume.
2183    
2184          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
2185    
2186          redo A;          redo A;
2187        } else {        } else {
2188          !!!cp (154);          !!!cp (154);
2189          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          if ($self->{state} == COMMENT_END_BANG_STATE) {
2190              $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
2191            } else {
2192              $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2193            }
2194          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
2195          !!!next-input-character;          !!!next-input-character;
2196          redo A;          redo A;
2197        }        }
2198        } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
2199          ## XML5: Not exist.
2200    
2201          if ($self->{nc} == 0x003E) { # >
2202            if ($self->{in_subset}) {
2203              !!!cp (154.4);
2204              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2205            } else {
2206              !!!cp (154.5);
2207              $self->{state} = DATA_STATE;
2208              $self->{s_kwd} = '';
2209            }
2210            !!!next-input-character;
2211    
2212            !!!emit ($self->{ct}); # comment
2213    
2214            redo A;
2215          } elsif ($is_space->{$self->{nc}}) {
2216            !!!cp (154.6);
2217            $self->{ct}->{data} .= chr ($self->{nc}); # comment
2218            ## Stay in the state.
2219            !!!next-input-character;
2220            redo A;
2221          } elsif ($self->{nc} == -1) {
2222            !!!parse-error (type => 'unclosed comment');
2223            if ($self->{in_subset}) {
2224              !!!cp (154.7);
2225              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2226            } else {
2227              !!!cp (154.8);
2228              $self->{state} = DATA_STATE;
2229              $self->{s_kwd} = '';
2230            }
2231            ## Reconsume.
2232    
2233            !!!emit ($self->{ct}); # comment
2234    
2235            redo A;
2236          } else {
2237            !!!cp (154.9);
2238            $self->{ct}->{data} .= chr ($self->{nc}); # comment
2239            $self->{state} = COMMENT_STATE;
2240            !!!next-input-character;
2241            redo A;
2242          }
2243      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
2244        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2245          !!!cp (155);          !!!cp (155);

Legend:
Removed from v.1.30  
changed lines
  Added in v.1.33

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24