/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.24 by wakaba, Sun Oct 19 14:05:20 2008 UTC revision 1.31 by wakaba, Sat Sep 5 09:26:55 2009 UTC
# Line 105  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108    sub COMMENT_END_BANG_STATE () { 102 } ## LAST
109  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
110  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
111  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 1248  sub _get_next_token ($) { Line 1249  sub _get_next_token ($) {
1249          if ({          if ({
1250               0x0022 => 1, # "               0x0022 => 1, # "
1251               0x0027 => 1, # '               0x0027 => 1, # '
1252                 0x003C => 1, # <
1253               0x003D => 1, # =               0x003D => 1, # =
1254              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1255                        
# Line 1430  sub _get_next_token ($) { Line 1432  sub _get_next_token ($) {
1432    
1433          redo A;          redo A;
1434        } else {        } else {
1435          if ($self->{nc} == 0x0022 or # "          if ({
1436              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1437                 0x0027 => 1, # '
1438                 0x003C => 1, # <
1439                }->{$self->{nc}}) {
1440                        
1441            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1442            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 1602  sub _get_next_token ($) { Line 1607  sub _get_next_token ($) {
1607                        
1608          }          }
1609    
1610          if ($self->{nc} == 0x0022 or # "          if ({
1611              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1612                 0x0027 => 1, # '
1613                 0x003C => 1, # <
1614                }->{$self->{nc}}) {
1615                        
1616            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1617            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 1740  sub _get_next_token ($) { Line 1748  sub _get_next_token ($) {
1748    
1749          redo A;          redo A;
1750        } else {        } else {
1751          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1752                        
1753            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1754            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
# Line 1816  sub _get_next_token ($) { Line 1824  sub _get_next_token ($) {
1824      }      }
1825        
1826          redo A;          redo A;
1827          } elsif ($self->{is_xml} and
1828                   $is_space->{$self->{nc}}) {
1829            
1830            $self->{ca}->{value} .= ' ';
1831            ## Stay in the state.
1832            
1833        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1834          $self->{line_prev} = $self->{line};
1835          $self->{column_prev} = $self->{column};
1836          $self->{column}++;
1837          $self->{nc}
1838              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1839        } else {
1840          $self->{set_nc}->($self);
1841        }
1842      
1843            redo A;
1844        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1845          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1846          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1863  sub _get_next_token ($) { Line 1888  sub _get_next_token ($) {
1888          }          }
1889          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1890          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1891                                q["&<],                                qq["&<\x09\x0C\x20],
1892                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1893    
1894          ## Stay in the state          ## Stay in the state
# Line 1930  sub _get_next_token ($) { Line 1955  sub _get_next_token ($) {
1955      }      }
1956        
1957          redo A;          redo A;
1958          } elsif ($self->{is_xml} and
1959                   $is_space->{$self->{nc}}) {
1960            
1961            $self->{ca}->{value} .= ' ';
1962            ## Stay in the state.
1963            
1964        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1965          $self->{line_prev} = $self->{line};
1966          $self->{column_prev} = $self->{column};
1967          $self->{column}++;
1968          $self->{nc}
1969              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1970        } else {
1971          $self->{set_nc}->($self);
1972        }
1973      
1974            redo A;
1975        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1976          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1977          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1977  sub _get_next_token ($) { Line 2019  sub _get_next_token ($) {
2019          }          }
2020          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2021          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2022                                q['&<],                                qq['&<\x09\x0C\x20],
2023                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2024    
2025          ## Stay in the state          ## Stay in the state
# Line 2149  sub _get_next_token ($) { Line 2191  sub _get_next_token ($) {
2191               0x0022 => 1, # "               0x0022 => 1, # "
2192               0x0027 => 1, # '               0x0027 => 1, # '
2193               0x003D => 1, # =               0x003D => 1, # =
2194                 0x003C => 1, # <
2195              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2196                        
2197            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 2158  sub _get_next_token ($) { Line 2201  sub _get_next_token ($) {
2201          }          }
2202          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2203          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2204                                q["'=& >],                                qq["'=& \x09\x0C>],
2205                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2206    
2207          ## Stay in the state          ## Stay in the state
# Line 2900  sub _get_next_token ($) { Line 2943  sub _get_next_token ($) {
2943        
2944          redo A;          redo A;
2945        }        }
2946      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE or
2947                 $self->{state} == COMMENT_END_BANG_STATE) {
2948        ## XML5: "Comment end state" and "DOCTYPE comment end state".        ## XML5: "Comment end state" and "DOCTYPE comment end state".
2949          ## (No comment end bang state.)
2950    
2951        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2952          if ($self->{in_subset}) {          if ($self->{in_subset}) {
# Line 2928  sub _get_next_token ($) { Line 2973  sub _get_next_token ($) {
2973    
2974          redo A;          redo A;
2975        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2976            if ($self->{state} == COMMENT_END_BANG_STATE) {
2977              
2978              $self->{ct}->{data} .= '--!'; # comment
2979              $self->{state} = COMMENT_END_DASH_STATE;
2980            } else {
2981              
2982              ## XML5: Not a parse error.
2983              $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2984                              line => $self->{line_prev},
2985                              column => $self->{column_prev});
2986              $self->{ct}->{data} .= '-'; # comment
2987              ## Stay in the state
2988            }
2989                    
2990          ## XML5: Not a parse error.      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2991          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',        $self->{line_prev} = $self->{line};
2992                          line => $self->{line_prev},        $self->{column_prev} = $self->{column};
2993                          column => $self->{column_prev});        $self->{column}++;
2994          $self->{ct}->{data} .= '-'; # comment        $self->{nc}
2995          ## Stay in the state            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2996        } else {
2997          $self->{set_nc}->($self);
2998        }
2999      
3000            redo A;
3001          } elsif ($self->{nc} == 0x0021 and # !
3002                   $self->{state} != COMMENT_END_BANG_STATE) {
3003            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3004            $self->{state} = COMMENT_END_BANG_STATE;
3005                    
3006      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3007        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2957  sub _get_next_token ($) { Line 3024  sub _get_next_token ($) {
3024            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
3025            $self->{s_kwd} = '';            $self->{s_kwd} = '';
3026          }          }
3027          ## reconsume          ## Reconsume.
3028    
3029          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
3030    
3031          redo A;          redo A;
3032        } else {        } else {
3033                    
3034          ## XML5: Not a parse error.          if ($self->{state} == COMMENT_END_BANG_STATE) {
3035          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',            $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3036                          line => $self->{line_prev},          } else {
3037                          column => $self->{column_prev});            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3038          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          }
3039          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3040                    
3041      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2999  sub _get_next_token ($) { Line 3066  sub _get_next_token ($) {
3066      }      }
3067        
3068          redo A;          redo A;
3069          } elsif ($self->{nc} == -1) {
3070            
3071            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3072            $self->{ct}->{quirks} = 1;
3073    
3074            $self->{state} = DATA_STATE;
3075            ## Reconsume.
3076            return  ($self->{ct}); # DOCTYPE (quirks)
3077    
3078            redo A;
3079        } else {        } else {
3080                    
3081          ## XML5: Unless EOF, swith to the bogus comment state.          ## XML5: Swith to the bogus comment state.
3082          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3083          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3084          ## reconsume          ## reconsume
# Line 3046  sub _get_next_token ($) { Line 3123  sub _get_next_token ($) {
3123          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3124    
3125          redo A;          redo A;
3126          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3127            
3128            $self->{ct}->{name} # DOCTYPE
3129                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3130            delete $self->{ct}->{quirks};
3131            $self->{state} = DOCTYPE_NAME_STATE;
3132            
3133        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3134          $self->{line_prev} = $self->{line};
3135          $self->{column_prev} = $self->{column};
3136          $self->{column}++;
3137          $self->{nc}
3138              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3139        } else {
3140          $self->{set_nc}->($self);
3141        }
3142      
3143            redo A;
3144        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3145                    
3146          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
# Line 3132  sub _get_next_token ($) { Line 3227  sub _get_next_token ($) {
3227          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3228    
3229          redo A;          redo A;
3230          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3231            
3232            $self->{ct}->{name} # DOCTYPE
3233                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3234            delete $self->{ct}->{quirks};
3235            ## Stay in the state.
3236            
3237        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238          $self->{line_prev} = $self->{line};
3239          $self->{column_prev} = $self->{column};
3240          $self->{column}++;
3241          $self->{nc}
3242              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243        } else {
3244          $self->{set_nc}->($self);
3245        }
3246      
3247            redo A;
3248        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3249                    
3250          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
# Line 3163  sub _get_next_token ($) { Line 3276  sub _get_next_token ($) {
3276          redo A;          redo A;
3277        } else {        } else {
3278                    
3279          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3280            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3281                    
3282      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4837  sub _get_next_token ($) { Line 4949  sub _get_next_token ($) {
4949        my $code = $self->{kwd};        my $code = $self->{kwd};
4950        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4951        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4952        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
4953              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4954              ($self->{is_xml} and $code == 0x0000)) {
4955                    
4956          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4957                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4990  sub _get_next_token ($) { Line 5104  sub _get_next_token ($) {
5104        my $code = $self->{kwd};        my $code = $self->{kwd};
5105        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5106        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5107        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5108              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5109              ($self->{is_xml} and $code == 0x0000)) {
5110                    
5111          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5112                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),

Legend:
Removed from v.1.24  
changed lines
  Added in v.1.31

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24