/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.22 by wakaba, Sun Oct 19 10:12:54 2008 UTC | revision 1.29 by wakaba, Sun Aug 16 04:06:34 2009 UTC
# Line 1740  sub _get_next_token ($) { Line 1740  sub _get_next_token ($) {
1740    
1741          redo A;          redo A;
1742        } else {        } else {
1743          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1744                        
1745            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1746            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
# Line 1816  sub _get_next_token ($) { Line 1816  sub _get_next_token ($) {
1816      }      }
1817        
1818          redo A;          redo A;
1819          } elsif ($self->{is_xml} and
1820                   $is_space->{$self->{nc}}) {
1821            
1822            $self->{ca}->{value} .= ' ';
1823            ## Stay in the state.
1824            
1825        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826          $self->{line_prev} = $self->{line};
1827          $self->{column_prev} = $self->{column};
1828          $self->{column}++;
1829          $self->{nc}
1830              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831        } else {
1832          $self->{set_nc}->($self);
1833        }
1834      
1835            redo A;
1836        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1837          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1838          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1863  sub _get_next_token ($) { Line 1880  sub _get_next_token ($) {
1880          }          }
1881          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1882          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1883                                q["&<],                                qq["&<\x09\x0C\x20],
1884                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1885    
1886          ## Stay in the state          ## Stay in the state
# Line 1930  sub _get_next_token ($) { Line 1947  sub _get_next_token ($) {
1947      }      }
1948        
1949          redo A;          redo A;
1950          } elsif ($self->{is_xml} and
1951                   $is_space->{$self->{nc}}) {
1952            
1953            $self->{ca}->{value} .= ' ';
1954            ## Stay in the state.
1955            
1956        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957          $self->{line_prev} = $self->{line};
1958          $self->{column_prev} = $self->{column};
1959          $self->{column}++;
1960          $self->{nc}
1961              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962        } else {
1963          $self->{set_nc}->($self);
1964        }
1965      
1966            redo A;
1967        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1968          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1969          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1977  sub _get_next_token ($) { Line 2011  sub _get_next_token ($) {
2011          }          }
2012          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2013          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2014                                q['&<],                                qq['&<\x09\x0C\x20],
2015                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2016    
2017          ## Stay in the state          ## Stay in the state
# Line 2149  sub _get_next_token ($) { Line 2183  sub _get_next_token ($) {
2183               0x0022 => 1, # "               0x0022 => 1, # "
2184               0x0027 => 1, # '               0x0027 => 1, # '
2185               0x003D => 1, # =               0x003D => 1, # =
2186                 0x003C => 1, # <
2187              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2188                        
2189            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 2158  sub _get_next_token ($) { Line 2193  sub _get_next_token ($) {
2193          }          }
2194          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2195          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2196                                q["'=& >],                                qq["'=& \x09\x0C>],
2197                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2198    
2199          ## Stay in the state          ## Stay in the state
# Line 2964  sub _get_next_token ($) { Line 2999  sub _get_next_token ($) {
2999          redo A;          redo A;
3000        } else {        } else {
3001                    
         ## XML5: Not a parse error.  
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',  
                         line => $self->{line_prev},  
                         column => $self->{column_prev});  
3002          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3003          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3004                    
# Line 2999  sub _get_next_token ($) { Line 3030  sub _get_next_token ($) {
3030      }      }
3031        
3032          redo A;          redo A;
3033          } elsif ($self->{nc} == -1) {
3034            
3035            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3036            $self->{ct}->{quirks} = 1;
3037    
3038            $self->{state} = DATA_STATE;
3039            ## Reconsume.
3040            return  ($self->{ct}); # DOCTYPE (quirks)
3041    
3042            redo A;
3043        } else {        } else {
3044                    
3045          ## XML5: Unless EOF, swith to the bogus comment state.          ## XML5: Swith to the bogus comment state.
3046          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3047          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3048          ## reconsume          ## reconsume
# Line 3046  sub _get_next_token ($) { Line 3087  sub _get_next_token ($) {
3087          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3088    
3089          redo A;          redo A;
3090          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3091            
3092            $self->{ct}->{name} # DOCTYPE
3093                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3094            delete $self->{ct}->{quirks};
3095            $self->{state} = DOCTYPE_NAME_STATE;
3096            
3097        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3098          $self->{line_prev} = $self->{line};
3099          $self->{column_prev} = $self->{column};
3100          $self->{column}++;
3101          $self->{nc}
3102              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3103        } else {
3104          $self->{set_nc}->($self);
3105        }
3106      
3107            redo A;
3108        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3109                    
3110          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
# Line 3132  sub _get_next_token ($) { Line 3191  sub _get_next_token ($) {
3191          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3192    
3193          redo A;          redo A;
3194          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3195            
3196            $self->{ct}->{name} # DOCTYPE
3197                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3198            delete $self->{ct}->{quirks};
3199            ## Stay in the state.
3200            
3201        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3202          $self->{line_prev} = $self->{line};
3203          $self->{column_prev} = $self->{column};
3204          $self->{column}++;
3205          $self->{nc}
3206              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3207        } else {
3208          $self->{set_nc}->($self);
3209        }
3210      
3211            redo A;
3212        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3213                    
3214          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
# Line 3163  sub _get_next_token ($) { Line 3240  sub _get_next_token ($) {
3240          redo A;          redo A;
3241        } else {        } else {
3242                    
3243          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3244            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3245                    
3246      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3247        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4837  sub _get_next_token ($) { Line 4913  sub _get_next_token ($) {
4913        my $code = $self->{kwd};        my $code = $self->{kwd};
4914        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4915        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4916        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
4917              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4918              ($self->{is_xml} and $code == 0x0000)) {
4919                    
4920          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4921                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4990  sub _get_next_token ($) { Line 5068  sub _get_next_token ($) {
5068        my $code = $self->{kwd};        my $code = $self->{kwd};
5069        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5070        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5071        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5072              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5073              ($self->{is_xml} and $code == 0x0000)) {
5074                    
5075          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5076                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 5469  sub _get_next_token ($) { Line 5549  sub _get_next_token ($) {
5549          ## XML5: Not defined yet.          ## XML5: Not defined yet.
5550    
5551          ## TODO:          ## TODO:
5552    
5553            if (not $self->{stop_processing} and
5554                not $self->{document}->xml_standalone) {
5555              $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5556                              level => $self->{level}->{info});
5557              $self->{stop_processing} = 1;
5558            }
5559    
5560                    
5561      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5562        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 5903  sub _get_next_token ($) { Line 5991  sub _get_next_token ($) {
5991          }          }
5992          $self->{ct} = {type => ELEMENT_TOKEN, name => '',          $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5993                         line => $self->{line_prev},                         line => $self->{line_prev},
5994                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
5995          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
5996                    
5997      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 5971  sub _get_next_token ($) { Line 6059  sub _get_next_token ($) {
6059          $self->{ct} = {type => ATTLIST_TOKEN, name => '',          $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6060                         attrdefs => [],                         attrdefs => [],
6061                         line => $self->{line_prev},                         line => $self->{line_prev},
6062                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
6063          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
6064                    
6065      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 6040  sub _get_next_token ($) { Line 6128  sub _get_next_token ($) {
6128          }          }
6129          $self->{ct} = {type => NOTATION_TOKEN, name => '',          $self->{ct} = {type => NOTATION_TOKEN, name => '',
6130                         line => $self->{line_prev},                         line => $self->{line_prev},
6131                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 8};
6132          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
6133                    
6134      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {

Legend:
Removed from v.1.22  
changed lines
  Added in v.1.29

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24