/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.28 by wakaba, Sun Jul 5 04:38:45 2009 UTC | revision 1.32 by wakaba, Sat Sep 5 09:57:55 2009 UTC
# Line 105  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108    sub COMMENT_END_BANG_STATE () { 102 }
109    sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
111  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
112  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 1248  sub _get_next_token ($) { Line 1250  sub _get_next_token ($) {
1250          if ({          if ({
1251               0x0022 => 1, # "               0x0022 => 1, # "
1252               0x0027 => 1, # '               0x0027 => 1, # '
1253                 0x003C => 1, # <
1254               0x003D => 1, # =               0x003D => 1, # =
1255              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1256                        
# Line 1430  sub _get_next_token ($) { Line 1433  sub _get_next_token ($) {
1433    
1434          redo A;          redo A;
1435        } else {        } else {
1436          if ($self->{nc} == 0x0022 or # "          if ({
1437              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1438                 0x0027 => 1, # '
1439                 0x003C => 1, # <
1440                }->{$self->{nc}}) {
1441                        
1442            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1443            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 1602  sub _get_next_token ($) { Line 1608  sub _get_next_token ($) {
1608                        
1609          }          }
1610    
1611          if ($self->{nc} == 0x0022 or # "          if ({
1612              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1613                 0x0027 => 1, # '
1614                 0x003C => 1, # <
1615                }->{$self->{nc}}) {
1616                        
1617            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1618            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 2935  sub _get_next_token ($) { Line 2944  sub _get_next_token ($) {
2944        
2945          redo A;          redo A;
2946        }        }
2947      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE or
2948                 $self->{state} == COMMENT_END_BANG_STATE) {
2949        ## XML5: "Comment end state" and "DOCTYPE comment end state".        ## XML5: "Comment end state" and "DOCTYPE comment end state".
2950          ## (No comment end bang state.)
2951    
2952        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2953          if ($self->{in_subset}) {          if ($self->{in_subset}) {
# Line 2963  sub _get_next_token ($) { Line 2974  sub _get_next_token ($) {
2974    
2975          redo A;          redo A;
2976        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2977            if ($self->{state} == COMMENT_END_BANG_STATE) {
2978              
2979              $self->{ct}->{data} .= '--!'; # comment
2980              $self->{state} = COMMENT_END_DASH_STATE;
2981            } else {
2982              
2983              ## XML5: Not a parse error.
2984              $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2985                              line => $self->{line_prev},
2986                              column => $self->{column_prev});
2987              $self->{ct}->{data} .= '-'; # comment
2988              ## Stay in the state
2989            }
2990                    
2991          ## XML5: Not a parse error.      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',        $self->{line_prev} = $self->{line};
2993                          line => $self->{line_prev},        $self->{column_prev} = $self->{column};
2994                          column => $self->{column_prev});        $self->{column}++;
2995          $self->{ct}->{data} .= '-'; # comment        $self->{nc}
2996          ## Stay in the state            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997        } else {
2998          $self->{set_nc}->($self);
2999        }
3000      
3001            redo A;
3002          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3003                   $is_space->{$self->{nc}}) {
3004            
3005            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3006            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3007            $self->{state} = COMMENT_END_SPACE_STATE;
3008            
3009        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3010          $self->{line_prev} = $self->{line};
3011          $self->{column_prev} = $self->{column};
3012          $self->{column}++;
3013          $self->{nc}
3014              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3015        } else {
3016          $self->{set_nc}->($self);
3017        }
3018      
3019            redo A;
3020          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3021                   $self->{nc} == 0x0021) { # !
3022            
3023            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3024            $self->{state} = COMMENT_END_BANG_STATE;
3025                    
3026      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3027        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2992  sub _get_next_token ($) { Line 3044  sub _get_next_token ($) {
3044            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
3045            $self->{s_kwd} = '';            $self->{s_kwd} = '';
3046          }          }
3047          ## reconsume          ## Reconsume.
3048    
3049          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
3050    
3051          redo A;          redo A;
3052        } else {        } else {
3053                    
3054          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          if ($self->{state} == COMMENT_END_BANG_STATE) {
3055              $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3056            } else {
3057              $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3058            }
3059          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3060                    
3061      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3014  sub _get_next_token ($) { Line 3070  sub _get_next_token ($) {
3070        
3071          redo A;          redo A;
3072        }        }
3073        } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3074          ## XML5: Not exist.
3075    
3076          if ($self->{nc} == 0x003E) { # >
3077            if ($self->{in_subset}) {
3078              
3079              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3080            } else {
3081              
3082              $self->{state} = DATA_STATE;
3083              $self->{s_kwd} = '';
3084            }
3085            
3086        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3087          $self->{line_prev} = $self->{line};
3088          $self->{column_prev} = $self->{column};
3089          $self->{column}++;
3090          $self->{nc}
3091              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3092        } else {
3093          $self->{set_nc}->($self);
3094        }
3095      
3096    
3097            return  ($self->{ct}); # comment
3098    
3099            redo A;
3100          } elsif ($is_space->{$self->{nc}}) {
3101            
3102            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3103            ## Stay in the state.
3104            
3105        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106          $self->{line_prev} = $self->{line};
3107          $self->{column_prev} = $self->{column};
3108          $self->{column}++;
3109          $self->{nc}
3110              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111        } else {
3112          $self->{set_nc}->($self);
3113        }
3114      
3115            redo A;
3116          } elsif ($self->{nc} == -1) {
3117            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3118            if ($self->{in_subset}) {
3119              
3120              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3121            } else {
3122              
3123              $self->{state} = DATA_STATE;
3124              $self->{s_kwd} = '';
3125            }
3126            ## Reconsume.
3127    
3128            return  ($self->{ct}); # comment
3129    
3130            redo A;
3131          } else {
3132            
3133            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3134            $self->{state} = COMMENT_STATE;
3135            
3136        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137          $self->{line_prev} = $self->{line};
3138          $self->{column_prev} = $self->{column};
3139          $self->{column}++;
3140          $self->{nc}
3141              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142        } else {
3143          $self->{set_nc}->($self);
3144        }
3145      
3146            redo A;
3147          }
3148      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
3149        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3150                    
# Line 3087  sub _get_next_token ($) { Line 3218  sub _get_next_token ($) {
3218          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3219    
3220          redo A;          redo A;
3221          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3222            
3223            $self->{ct}->{name} # DOCTYPE
3224                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3225            delete $self->{ct}->{quirks};
3226            $self->{state} = DOCTYPE_NAME_STATE;
3227            
3228        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3229          $self->{line_prev} = $self->{line};
3230          $self->{column_prev} = $self->{column};
3231          $self->{column}++;
3232          $self->{nc}
3233              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3234        } else {
3235          $self->{set_nc}->($self);
3236        }
3237      
3238            redo A;
3239        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3240                    
3241          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
# Line 3173  sub _get_next_token ($) { Line 3322  sub _get_next_token ($) {
3322          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3323    
3324          redo A;          redo A;
3325          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3326            
3327            $self->{ct}->{name} # DOCTYPE
3328                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3329            delete $self->{ct}->{quirks};
3330            ## Stay in the state.
3331            
3332        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3333          $self->{line_prev} = $self->{line};
3334          $self->{column_prev} = $self->{column};
3335          $self->{column}++;
3336          $self->{nc}
3337              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3338        } else {
3339          $self->{set_nc}->($self);
3340        }
3341      
3342            redo A;
3343        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3344                    
3345          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
# Line 3204  sub _get_next_token ($) { Line 3371  sub _get_next_token ($) {
3371          redo A;          redo A;
3372        } else {        } else {
3373                    
3374          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3375            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3376                    
3377      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3378        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};

Legend:
Removed from v.1.28  
changed lines
  Added in v.1.32

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24