/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.30 by wakaba, Sun Aug 16 05:24:47 2009 UTC revision 1.33 by wakaba, Sat Sep 5 10:41:07 2009 UTC
# Line 105  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108    sub COMMENT_END_BANG_STATE () { 102 }
109    sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
111  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
112  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 1100  sub _get_next_token ($) { Line 1102  sub _get_next_token ($) {
1102          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1103          # reconsume          # reconsume
1104    
1105          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1106            #return  ($self->{ct}); # start tag or end tag
1107    
1108          redo A;          redo A;
1109        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
# Line 1241  sub _get_next_token ($) { Line 1244  sub _get_next_token ($) {
1244          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1245          # reconsume          # reconsume
1246    
1247          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1248            #return  ($self->{ct}); # start tag or end tag
1249    
1250          redo A;          redo A;
1251        } else {        } else {
# Line 1427  sub _get_next_token ($) { Line 1431  sub _get_next_token ($) {
1431          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1432          # reconsume          # reconsume
1433    
1434          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1435            #return  ($self->{ct}); # start tag or end tag
1436    
1437          redo A;          redo A;
1438        } else {        } else {
# Line 1594  sub _get_next_token ($) { Line 1599  sub _get_next_token ($) {
1599          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1600          # reconsume          # reconsume
1601    
1602          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1603            #return  ($self->{ct}); # start tag or end tag
1604    
1605          redo A;          redo A;
1606        } else {        } else {
# Line 1743  sub _get_next_token ($) { Line 1749  sub _get_next_token ($) {
1749          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1750          ## reconsume          ## reconsume
1751    
1752          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1753            #return  ($self->{ct}); # start tag or end tag
1754    
1755          redo A;          redo A;
1756        } else {        } else {
# Line 1864  sub _get_next_token ($) { Line 1871  sub _get_next_token ($) {
1871            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1872            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1873            ## reconsume            ## reconsume
1874            return  ($self->{ct}); # end tag  
1875              ## Discard the token.
1876              #return  ($self->{ct}); # end tag
1877    
1878            redo A;            redo A;
1879          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1880            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
1881            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1882            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1883            ## Reconsume.            ## Reconsume.
1884            return  ($self->{ct}); # ATTLIST  
1885              ## Discard the token.
1886              #return  ($self->{ct}); # ATTLIST
1887    
1888            redo A;            redo A;
1889          } else {          } else {
1890            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 1980  sub _get_next_token ($) { Line 1993  sub _get_next_token ($) {
1993            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1994            $self->{s_kwd} = '';            $self->{s_kwd} = '';
1995            ## reconsume            ## reconsume
1996            return  ($self->{ct}); # start tag  
1997              ## Discard the token.
1998              #return  ($self->{ct}); # start tag
1999    
2000            redo A;            redo A;
2001          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2002            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
# Line 1995  sub _get_next_token ($) { Line 2011  sub _get_next_token ($) {
2011            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2012            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2013            ## reconsume            ## reconsume
2014            return  ($self->{ct}); # end tag  
2015              ## Discard the token.
2016              #return  ($self->{ct}); # end tag
2017    
2018            redo A;            redo A;
2019          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2020            ## XML5: No parse error above; not defined yet.            ## XML5: No parse error above; not defined yet.
2021            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2022            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2023            ## Reconsume.            ## Reconsume.
2024            return  ($self->{ct}); # ATTLIST  
2025              ## Discard the token.
2026              #return  ($self->{ct}); # ATTLIST
2027    
2028            redo A;            redo A;
2029          } else {          } else {
2030            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 2157  sub _get_next_token ($) { Line 2179  sub _get_next_token ($) {
2179            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2180            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2181            ## reconsume            ## reconsume
2182            return  ($self->{ct}); # start tag  
2183              ## Discard the token.
2184              #return  ($self->{ct}); # start tag
2185              
2186            redo A;            redo A;
2187          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2188            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
# Line 2173  sub _get_next_token ($) { Line 2198  sub _get_next_token ($) {
2198            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2199            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2200            ## reconsume            ## reconsume
2201            return  ($self->{ct}); # end tag  
2202              ## Discard the token.
2203              #return  ($self->{ct}); # end tag
2204    
2205            redo A;            redo A;
2206          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {          } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2207            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2208            push @{$self->{ct}->{attrdefs}}, $self->{ca};            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2209            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2210            ## Reconsume.            ## Reconsume.
2211            return  ($self->{ct}); # ATTLIST  
2212              ## Discard the token.
2213              #return  ($self->{ct}); # ATTLIST
2214    
2215            redo A;            redo A;
2216          } else {          } else {
2217            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
# Line 2300  sub _get_next_token ($) { Line 2331  sub _get_next_token ($) {
2331          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2332          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2333          ## Reconsume.          ## Reconsume.
2334          return  ($self->{ct}); # start tag or end tag  
2335            ## Discard the token.
2336            #return  ($self->{ct}); # start tag or end tag
2337    
2338          redo A;          redo A;
2339        } else {        } else {
2340                    
# Line 2367  sub _get_next_token ($) { Line 2401  sub _get_next_token ($) {
2401          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2402          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2403          ## Reconsume.          ## Reconsume.
2404          return  ($self->{ct}); # start tag or end tag  
2405            ## Discard the token.
2406            #return  ($self->{ct}); # start tag or end tag
2407    
2408          redo A;          redo A;
2409        } else {        } else {
2410                    
# Line 2942  sub _get_next_token ($) { Line 2979  sub _get_next_token ($) {
2979        
2980          redo A;          redo A;
2981        }        }
2982      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE or
2983                 $self->{state} == COMMENT_END_BANG_STATE) {
2984        ## XML5: "Comment end state" and "DOCTYPE comment end state".        ## XML5: "Comment end state" and "DOCTYPE comment end state".
2985          ## (No comment end bang state.)
2986    
2987        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2988          if ($self->{in_subset}) {          if ($self->{in_subset}) {
# Line 2970  sub _get_next_token ($) { Line 3009  sub _get_next_token ($) {
3009    
3010          redo A;          redo A;
3011        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
3012            if ($self->{state} == COMMENT_END_BANG_STATE) {
3013              
3014              $self->{ct}->{data} .= '--!'; # comment
3015              $self->{state} = COMMENT_END_DASH_STATE;
3016            } else {
3017              
3018              ## XML5: Not a parse error.
3019              $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3020                              line => $self->{line_prev},
3021                              column => $self->{column_prev});
3022              $self->{ct}->{data} .= '-'; # comment
3023              ## Stay in the state
3024            }
3025                    
3026          ## XML5: Not a parse error.      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3027          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',        $self->{line_prev} = $self->{line};
3028                          line => $self->{line_prev},        $self->{column_prev} = $self->{column};
3029                          column => $self->{column_prev});        $self->{column}++;
3030          $self->{ct}->{data} .= '-'; # comment        $self->{nc}
3031          ## Stay in the state            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3032        } else {
3033          $self->{set_nc}->($self);
3034        }
3035      
3036            redo A;
3037          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3038                   $is_space->{$self->{nc}}) {
3039            
3040            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3041            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3042            $self->{state} = COMMENT_END_SPACE_STATE;
3043            
3044        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3045          $self->{line_prev} = $self->{line};
3046          $self->{column_prev} = $self->{column};
3047          $self->{column}++;
3048          $self->{nc}
3049              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3050        } else {
3051          $self->{set_nc}->($self);
3052        }
3053      
3054            redo A;
3055          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3056                   $self->{nc} == 0x0021) { # !
3057            
3058            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3059            $self->{state} = COMMENT_END_BANG_STATE;
3060                    
3061      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3062        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2999  sub _get_next_token ($) { Line 3079  sub _get_next_token ($) {
3079            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
3080            $self->{s_kwd} = '';            $self->{s_kwd} = '';
3081          }          }
3082          ## reconsume          ## Reconsume.
3083    
3084          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
3085    
3086          redo A;          redo A;
3087        } else {        } else {
3088                    
3089          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          if ($self->{state} == COMMENT_END_BANG_STATE) {
3090              $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3091            } else {
3092              $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3093            }
3094          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3095                    
3096      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3021  sub _get_next_token ($) { Line 3105  sub _get_next_token ($) {
3105        
3106          redo A;          redo A;
3107        }        }
3108        } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3109          ## XML5: Not exist.
3110    
3111          if ($self->{nc} == 0x003E) { # >
3112            if ($self->{in_subset}) {
3113              
3114              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115            } else {
3116              
3117              $self->{state} = DATA_STATE;
3118              $self->{s_kwd} = '';
3119            }
3120            
3121        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122          $self->{line_prev} = $self->{line};
3123          $self->{column_prev} = $self->{column};
3124          $self->{column}++;
3125          $self->{nc}
3126              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127        } else {
3128          $self->{set_nc}->($self);
3129        }
3130      
3131    
3132            return  ($self->{ct}); # comment
3133    
3134            redo A;
3135          } elsif ($is_space->{$self->{nc}}) {
3136            
3137            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3138            ## Stay in the state.
3139            
3140        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3141          $self->{line_prev} = $self->{line};
3142          $self->{column_prev} = $self->{column};
3143          $self->{column}++;
3144          $self->{nc}
3145              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3146        } else {
3147          $self->{set_nc}->($self);
3148        }
3149      
3150            redo A;
3151          } elsif ($self->{nc} == -1) {
3152            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3153            if ($self->{in_subset}) {
3154              
3155              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3156            } else {
3157              
3158              $self->{state} = DATA_STATE;
3159              $self->{s_kwd} = '';
3160            }
3161            ## Reconsume.
3162    
3163            return  ($self->{ct}); # comment
3164    
3165            redo A;
3166          } else {
3167            
3168            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3169            $self->{state} = COMMENT_STATE;
3170            
3171        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3172          $self->{line_prev} = $self->{line};
3173          $self->{column_prev} = $self->{column};
3174          $self->{column}++;
3175          $self->{nc}
3176              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3177        } else {
3178          $self->{set_nc}->($self);
3179        }
3180      
3181            redo A;
3182          }
3183      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
3184        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3185                    

Legend:
Removed from v.1.30  
changed lines
  Added in v.1.33

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24