/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.24 by wakaba, Sun Oct 19 14:05:20 2008 UTC revision 1.30 by wakaba, Sun Aug 16 05:24:47 2009 UTC
# Line 1248  sub _get_next_token ($) { Line 1248  sub _get_next_token ($) {
1248          if ({          if ({
1249               0x0022 => 1, # "               0x0022 => 1, # "
1250               0x0027 => 1, # '               0x0027 => 1, # '
1251                 0x003C => 1, # <
1252               0x003D => 1, # =               0x003D => 1, # =
1253              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1254                        
# Line 1430  sub _get_next_token ($) { Line 1431  sub _get_next_token ($) {
1431    
1432          redo A;          redo A;
1433        } else {        } else {
1434          if ($self->{nc} == 0x0022 or # "          if ({
1435              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1436                 0x0027 => 1, # '
1437                 0x003C => 1, # <
1438                }->{$self->{nc}}) {
1439                        
1440            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1441            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 1602  sub _get_next_token ($) { Line 1606  sub _get_next_token ($) {
1606                        
1607          }          }
1608    
1609          if ($self->{nc} == 0x0022 or # "          if ({
1610              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1611                 0x0027 => 1, # '
1612                 0x003C => 1, # <
1613                }->{$self->{nc}}) {
1614                        
1615            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1616            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 1740  sub _get_next_token ($) { Line 1747  sub _get_next_token ($) {
1747    
1748          redo A;          redo A;
1749        } else {        } else {
1750          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1751                        
1752            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1753            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
# Line 1816  sub _get_next_token ($) { Line 1823  sub _get_next_token ($) {
1823      }      }
1824        
1825          redo A;          redo A;
1826          } elsif ($self->{is_xml} and
1827                   $is_space->{$self->{nc}}) {
1828            
1829            $self->{ca}->{value} .= ' ';
1830            ## Stay in the state.
1831            
1832        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1833          $self->{line_prev} = $self->{line};
1834          $self->{column_prev} = $self->{column};
1835          $self->{column}++;
1836          $self->{nc}
1837              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1838        } else {
1839          $self->{set_nc}->($self);
1840        }
1841      
1842            redo A;
1843        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1844          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1845          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1863  sub _get_next_token ($) { Line 1887  sub _get_next_token ($) {
1887          }          }
1888          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1889          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1890                                q["&<],                                qq["&<\x09\x0C\x20],
1891                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1892    
1893          ## Stay in the state          ## Stay in the state
# Line 1930  sub _get_next_token ($) { Line 1954  sub _get_next_token ($) {
1954      }      }
1955        
1956          redo A;          redo A;
1957          } elsif ($self->{is_xml} and
1958                   $is_space->{$self->{nc}}) {
1959            
1960            $self->{ca}->{value} .= ' ';
1961            ## Stay in the state.
1962            
1963        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1964          $self->{line_prev} = $self->{line};
1965          $self->{column_prev} = $self->{column};
1966          $self->{column}++;
1967          $self->{nc}
1968              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1969        } else {
1970          $self->{set_nc}->($self);
1971        }
1972      
1973            redo A;
1974        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1975          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1976          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1977  sub _get_next_token ($) { Line 2018  sub _get_next_token ($) {
2018          }          }
2019          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2020          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2021                                q['&<],                                qq['&<\x09\x0C\x20],
2022                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2023    
2024          ## Stay in the state          ## Stay in the state
# Line 2149  sub _get_next_token ($) { Line 2190  sub _get_next_token ($) {
2190               0x0022 => 1, # "               0x0022 => 1, # "
2191               0x0027 => 1, # '               0x0027 => 1, # '
2192               0x003D => 1, # =               0x003D => 1, # =
2193                 0x003C => 1, # <
2194              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2195                        
2196            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 2158  sub _get_next_token ($) { Line 2200  sub _get_next_token ($) {
2200          }          }
2201          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2202          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2203                                q["'=& >],                                qq["'=& \x09\x0C>],
2204                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2205    
2206          ## Stay in the state          ## Stay in the state
# Line 2964  sub _get_next_token ($) { Line 3006  sub _get_next_token ($) {
3006          redo A;          redo A;
3007        } else {        } else {
3008                    
         ## XML5: Not a parse error.  
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',  
                         line => $self->{line_prev},  
                         column => $self->{column_prev});  
3009          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3010          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3011                    
# Line 2999  sub _get_next_token ($) { Line 3037  sub _get_next_token ($) {
3037      }      }
3038        
3039          redo A;          redo A;
3040          } elsif ($self->{nc} == -1) {
3041            
3042            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3043            $self->{ct}->{quirks} = 1;
3044    
3045            $self->{state} = DATA_STATE;
3046            ## Reconsume.
3047            return  ($self->{ct}); # DOCTYPE (quirks)
3048    
3049            redo A;
3050        } else {        } else {
3051                    
3052          ## XML5: Unless EOF, swith to the bogus comment state.          ## XML5: Swith to the bogus comment state.
3053          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3054          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3055          ## reconsume          ## reconsume
# Line 3046  sub _get_next_token ($) { Line 3094  sub _get_next_token ($) {
3094          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3095    
3096          redo A;          redo A;
3097          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3098            
3099            $self->{ct}->{name} # DOCTYPE
3100                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3101            delete $self->{ct}->{quirks};
3102            $self->{state} = DOCTYPE_NAME_STATE;
3103            
3104        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3105          $self->{line_prev} = $self->{line};
3106          $self->{column_prev} = $self->{column};
3107          $self->{column}++;
3108          $self->{nc}
3109              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3110        } else {
3111          $self->{set_nc}->($self);
3112        }
3113      
3114            redo A;
3115        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3116                    
3117          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
# Line 3132  sub _get_next_token ($) { Line 3198  sub _get_next_token ($) {
3198          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3199    
3200          redo A;          redo A;
3201          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3202            
3203            $self->{ct}->{name} # DOCTYPE
3204                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3205            delete $self->{ct}->{quirks};
3206            ## Stay in the state.
3207            
3208        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3209          $self->{line_prev} = $self->{line};
3210          $self->{column_prev} = $self->{column};
3211          $self->{column}++;
3212          $self->{nc}
3213              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3214        } else {
3215          $self->{set_nc}->($self);
3216        }
3217      
3218            redo A;
3219        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3220                    
3221          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
# Line 3163  sub _get_next_token ($) { Line 3247  sub _get_next_token ($) {
3247          redo A;          redo A;
3248        } else {        } else {
3249                    
3250          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3251            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3252                    
3253      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3254        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4837  sub _get_next_token ($) { Line 4920  sub _get_next_token ($) {
4920        my $code = $self->{kwd};        my $code = $self->{kwd};
4921        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4922        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4923        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
4924              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4925              ($self->{is_xml} and $code == 0x0000)) {
4926                    
4927          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4928                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4990  sub _get_next_token ($) { Line 5075  sub _get_next_token ($) {
5075        my $code = $self->{kwd};        my $code = $self->{kwd};
5076        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5077        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5078        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5079              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5080              ($self->{is_xml} and $code == 0x0000)) {
5081                    
5082          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5083                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),

Legend:
Removed from v.1.24  
changed lines
  Added in v.1.30

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24