/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.31 by wakaba, Sat Sep 5 09:26:55 2009 UTC | revision 1.33 by wakaba, Sat Sep 5 10:41:07 2009 UTC
# Line 105  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108  sub COMMENT_END_BANG_STATE () { 102 } ## LAST  sub COMMENT_END_BANG_STATE () { 102 }
109    sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
111  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
112  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 1101  sub _get_next_token ($) { Line 1102  sub _get_next_token ($) {
1102          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1103          # reconsume          # reconsume
1104    
1105          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1106            #return  ($self->{ct}); # start tag or end tag
1107    
1108          redo A;          redo A;
1109        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
# Line 1242  sub _get_next_token ($) { Line 1244  sub _get_next_token ($) {
1244          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1245          # reconsume          # reconsume
1246    
1247          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1248            #return  ($self->{ct}); # start tag or end tag
1249    
1250          redo A;          redo A;
1251        } else {        } else {
# Line 1428  sub _get_next_token ($) { Line 1431  sub _get_next_token ($) {
1431          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1432          # reconsume          # reconsume
1433    
1434          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1435            #return  ($self->{ct}); # start tag or end tag
1436    
1437          redo A;          redo A;
1438        } else {        } else {
# Line 1595  sub _get_next_token ($) { Line 1599  sub _get_next_token ($) {
1599          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1600          # reconsume          # reconsume
1601    
1602          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1603            #return  ($self->{ct}); # start tag or end tag
1604    
1605          redo A;          redo A;
1606        } else {        } else {
# Line 1744  sub _get_next_token ($) { Line 1749  sub _get_next_token ($) {
1749          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1750          ## reconsume          ## reconsume
1751    
1752          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1753            #return  ($self->{ct}); # start tag or end tag
1754    
1755          redo A;          redo A;
1756        } else {        } else {
# Line 1865  sub _get_next_token ($) { Line 1871  sub _get_next_token ($) {
1871            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1872            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1873            ## reconsume            ## reconsume
1874            return  ($self->{ct}); # end tag  
1875              ## Discard the token.
1876              #return  ($self->{ct}); # end tag
1877    
1878            redo A;            redo A;
1879          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1880            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
1881            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1882            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1883            ## Reconsume.            ## Reconsume.
1884            return  ($self->{ct}); # ATTLIST  
1885              ## Discard the token.
1886              #return  ($self->{ct}); # ATTLIST
1887    
1888            redo A;            redo A;
1889          } else {          } else {
1890            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1981  sub _get_next_token ($) { Line 1993  sub _get_next_token ($) {
1993            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1994            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1995            ## reconsume            ## reconsume
1996            return  ($self->{ct}); # start tag  
1997              ## Discard the token.
1998              #return  ($self->{ct}); # start tag
1999    
2000            redo A;            redo A;
2001          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2002            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
# Line 1996  sub _get_next_token ($) { Line 2011  sub _get_next_token ($) {
2011            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2012            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2013            ## reconsume            ## reconsume
2014            return  ($self->{ct}); # end tag  
2015              ## Discard the token.
2016              #return  ($self->{ct}); # end tag
2017    
2018            redo A;            redo A;
2019          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2020            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
2021            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2022            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2023            ## Reconsume.            ## Reconsume.
2024            return  ($self->{ct}); # ATTLIST  
2025              ## Discard the token.
2026              #return  ($self->{ct}); # ATTLIST
2027    
2028            redo A;            redo A;
2029          } else {          } else {
2030            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 2158  sub _get_next_token ($) { Line 2179  sub _get_next_token ($) {
2179            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2180            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2181            ## reconsume            ## reconsume
2182            return  ($self->{ct}); # start tag  
2183              ## Discard the token.
2184              #return  ($self->{ct}); # start tag
2185              
2186            redo A;            redo A;
2187          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2188            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
# Line 2174  sub _get_next_token ($) { Line 2198  sub _get_next_token ($) {
2198            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2199            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2200            ## reconsume            ## reconsume
2201            return  ($self->{ct}); # end tag  
2202              ## Discard the token.
2203              #return  ($self->{ct}); # end tag
2204    
2205            redo A;            redo A;
2206          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2207            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2208            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2209            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2210            ## Reconsume.            ## Reconsume.
2211            return  ($self->{ct}); # ATTLIST  
2212              ## Discard the token.
2213              #return  ($self->{ct}); # ATTLIST
2214    
2215            redo A;            redo A;
2216          } else {          } else {
2217            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 2301  sub _get_next_token ($) { Line 2331  sub _get_next_token ($) {
2331          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2332          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2333          ## Reconsume.          ## Reconsume.
2334          return  ($self->{ct}); # start tag or end tag  
2335            ## Discard the token.
2336            #return  ($self->{ct}); # start tag or end tag
2337    
2338          redo A;          redo A;
2339        } else {        } else {
2340                    
# Line 2368  sub _get_next_token ($) { Line 2401  sub _get_next_token ($) {
2401          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2402          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2403          ## Reconsume.          ## Reconsume.
2404          return  ($self->{ct}); # start tag or end tag  
2405            ## Discard the token.
2406            #return  ($self->{ct}); # start tag or end tag
2407    
2408          redo A;          redo A;
2409        } else {        } else {
2410                    
# Line 2998  sub _get_next_token ($) { Line 3034  sub _get_next_token ($) {
3034      }      }
3035        
3036          redo A;          redo A;
3037        } elsif ($self->{nc} == 0x0021 and # !        } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3038                 $self->{state} != COMMENT_END_BANG_STATE) {                 $is_space->{$self->{nc}}) {
3039            
3040            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3041            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3042            $self->{state} = COMMENT_END_SPACE_STATE;
3043            
3044        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3045          $self->{line_prev} = $self->{line};
3046          $self->{column_prev} = $self->{column};
3047          $self->{column}++;
3048          $self->{nc}
3049              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3050        } else {
3051          $self->{set_nc}->($self);
3052        }
3053      
3054            redo A;
3055          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3056                   $self->{nc} == 0x0021) { # !
3057            
3058          $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3059          $self->{state} = COMMENT_END_BANG_STATE;          $self->{state} = COMMENT_END_BANG_STATE;
3060                    
# Line 3050  sub _get_next_token ($) { Line 3105  sub _get_next_token ($) {
3105        
3106          redo A;          redo A;
3107        }        }
3108        } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3109          ## XML5: Not exist.
3110    
3111          if ($self->{nc} == 0x003E) { # >
3112            if ($self->{in_subset}) {
3113              
3114              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115            } else {
3116              
3117              $self->{state} = DATA_STATE;
3118              $self->{s_kwd} = '';
3119            }
3120            
3121        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122          $self->{line_prev} = $self->{line};
3123          $self->{column_prev} = $self->{column};
3124          $self->{column}++;
3125          $self->{nc}
3126              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127        } else {
3128          $self->{set_nc}->($self);
3129        }
3130      
3131    
3132            return  ($self->{ct}); # comment
3133    
3134            redo A;
3135          } elsif ($is_space->{$self->{nc}}) {
3136            
3137            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3138            ## Stay in the state.
3139            
3140        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3141          $self->{line_prev} = $self->{line};
3142          $self->{column_prev} = $self->{column};
3143          $self->{column}++;
3144          $self->{nc}
3145              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3146        } else {
3147          $self->{set_nc}->($self);
3148        }
3149      
3150            redo A;
3151          } elsif ($self->{nc} == -1) {
3152            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3153            if ($self->{in_subset}) {
3154              
3155              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3156            } else {
3157              
3158              $self->{state} = DATA_STATE;
3159              $self->{s_kwd} = '';
3160            }
3161            ## Reconsume.
3162    
3163            return  ($self->{ct}); # comment
3164    
3165            redo A;
3166          } else {
3167            
3168            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3169            $self->{state} = COMMENT_STATE;
3170            
3171        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3172          $self->{line_prev} = $self->{line};
3173          $self->{column_prev} = $self->{column};
3174          $self->{column}++;
3175          $self->{nc}
3176              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3177        } else {
3178          $self->{set_nc}->($self);
3179        }
3180      
3181            redo A;
3182          }
3183      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
3184        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3185                    

Legend:
Removed from v.1.31  
changed lines
  Added in v.1.33

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24