/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.21 by wakaba, Sun Oct 19 09:25:21 2008 UTC revision 1.30 by wakaba, Sun Aug 16 05:24:47 2009 UTC
# Line 1248  sub _get_next_token ($) { Line 1248  sub _get_next_token ($) {
1248          if ({          if ({
1249               0x0022 => 1, # "               0x0022 => 1, # "
1250               0x0027 => 1, # '               0x0027 => 1, # '
1251                 0x003C => 1, # <
1252               0x003D => 1, # =               0x003D => 1, # =
1253              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1254                        
# Line 1430  sub _get_next_token ($) { Line 1431  sub _get_next_token ($) {
1431    
1432          redo A;          redo A;
1433        } else {        } else {
1434          if ($self->{nc} == 0x0022 or # "          if ({
1435              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1436                 0x0027 => 1, # '
1437                 0x003C => 1, # <
1438                }->{$self->{nc}}) {
1439                        
1440            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1441            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 1602  sub _get_next_token ($) { Line 1606  sub _get_next_token ($) {
1606                        
1607          }          }
1608    
1609          if ($self->{nc} == 0x0022 or # "          if ({
1610              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1611                 0x0027 => 1, # '
1612                 0x003C => 1, # <
1613                }->{$self->{nc}}) {
1614                        
1615            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1616            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
# Line 1740  sub _get_next_token ($) { Line 1747  sub _get_next_token ($) {
1747    
1748          redo A;          redo A;
1749        } else {        } else {
1750          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1751                        
1752            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1753            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
# Line 1816  sub _get_next_token ($) { Line 1823  sub _get_next_token ($) {
1823      }      }
1824        
1825          redo A;          redo A;
1826          } elsif ($self->{is_xml} and
1827                   $is_space->{$self->{nc}}) {
1828            
1829            $self->{ca}->{value} .= ' ';
1830            ## Stay in the state.
1831            
1832        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1833          $self->{line_prev} = $self->{line};
1834          $self->{column_prev} = $self->{column};
1835          $self->{column}++;
1836          $self->{nc}
1837              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1838        } else {
1839          $self->{set_nc}->($self);
1840        }
1841      
1842            redo A;
1843        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1844          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1845          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1863  sub _get_next_token ($) { Line 1887  sub _get_next_token ($) {
1887          }          }
1888          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1889          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1890                                q["&<],                                qq["&<\x09\x0C\x20],
1891                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1892    
1893          ## Stay in the state          ## Stay in the state
# Line 1930  sub _get_next_token ($) { Line 1954  sub _get_next_token ($) {
1954      }      }
1955        
1956          redo A;          redo A;
1957          } elsif ($self->{is_xml} and
1958                   $is_space->{$self->{nc}}) {
1959            
1960            $self->{ca}->{value} .= ' ';
1961            ## Stay in the state.
1962            
1963        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1964          $self->{line_prev} = $self->{line};
1965          $self->{column_prev} = $self->{column};
1966          $self->{column}++;
1967          $self->{nc}
1968              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1969        } else {
1970          $self->{set_nc}->($self);
1971        }
1972      
1973            redo A;
1974        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1975          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1976          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1977  sub _get_next_token ($) { Line 2018  sub _get_next_token ($) {
2018          }          }
2019          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2020          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2021                                q['&<],                                qq['&<\x09\x0C\x20],
2022                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2023    
2024          ## Stay in the state          ## Stay in the state
# Line 2149  sub _get_next_token ($) { Line 2190  sub _get_next_token ($) {
2190               0x0022 => 1, # "               0x0022 => 1, # "
2191               0x0027 => 1, # '               0x0027 => 1, # '
2192               0x003D => 1, # =               0x003D => 1, # =
2193                 0x003C => 1, # <
2194              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2195                        
2196            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 2158  sub _get_next_token ($) { Line 2200  sub _get_next_token ($) {
2200          }          }
2201          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2202          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2203                                q["'=& >],                                qq["'=& \x09\x0C>],
2204                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2205    
2206          ## Stay in the state          ## Stay in the state
# Line 2964  sub _get_next_token ($) { Line 3006  sub _get_next_token ($) {
3006          redo A;          redo A;
3007        } else {        } else {
3008                    
         ## XML5: Not a parse error.  
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',  
                         line => $self->{line_prev},  
                         column => $self->{column_prev});  
3009          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3010          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3011                    
# Line 2999  sub _get_next_token ($) { Line 3037  sub _get_next_token ($) {
3037      }      }
3038        
3039          redo A;          redo A;
3040          } elsif ($self->{nc} == -1) {
3041            
3042            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3043            $self->{ct}->{quirks} = 1;
3044    
3045            $self->{state} = DATA_STATE;
3046            ## Reconsume.
3047            return  ($self->{ct}); # DOCTYPE (quirks)
3048    
3049            redo A;
3050        } else {        } else {
3051                    
3052          ## XML5: Unless EOF, swith to the bogus comment state.          ## XML5: Swith to the bogus comment state.
3053          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3054          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3055          ## reconsume          ## reconsume
# Line 3046  sub _get_next_token ($) { Line 3094  sub _get_next_token ($) {
3094          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3095    
3096          redo A;          redo A;
3097          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3098            
3099            $self->{ct}->{name} # DOCTYPE
3100                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3101            delete $self->{ct}->{quirks};
3102            $self->{state} = DOCTYPE_NAME_STATE;
3103            
3104        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3105          $self->{line_prev} = $self->{line};
3106          $self->{column_prev} = $self->{column};
3107          $self->{column}++;
3108          $self->{nc}
3109              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3110        } else {
3111          $self->{set_nc}->($self);
3112        }
3113      
3114            redo A;
3115        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3116                    
3117          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
# Line 3132  sub _get_next_token ($) { Line 3198  sub _get_next_token ($) {
3198          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3199    
3200          redo A;          redo A;
3201          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3202            
3203            $self->{ct}->{name} # DOCTYPE
3204                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3205            delete $self->{ct}->{quirks};
3206            ## Stay in the state.
3207            
3208        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3209          $self->{line_prev} = $self->{line};
3210          $self->{column_prev} = $self->{column};
3211          $self->{column}++;
3212          $self->{nc}
3213              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3214        } else {
3215          $self->{set_nc}->($self);
3216        }
3217      
3218            redo A;
3219        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3220                    
3221          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
# Line 3163  sub _get_next_token ($) { Line 3247  sub _get_next_token ($) {
3247          redo A;          redo A;
3248        } else {        } else {
3249                    
3250          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3251            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3252                    
3253      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3254        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4628  sub _get_next_token ($) { Line 4711  sub _get_next_token ($) {
4711              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4712              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4713            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4714                    if ($self->{is_xml}) {
4715              
4716              $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4717                              line => $self->{line_prev},
4718                              column => $self->{column_prev}
4719                                  + ($self->{nc} == -1 ? 1 : 0));
4720            } else {
4721              
4722              ## No error
4723            }
4724          ## Don't consume          ## Don't consume
         ## No error  
4725          ## Return nothing.          ## Return nothing.
4726          #          #
4727        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
# Line 4649  sub _get_next_token ($) { Line 4740  sub _get_next_token ($) {
4740      }      }
4741        
4742          redo A;          redo A;
4743        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
4744                   (0x0041 <= $self->{nc} and
4745                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
4746                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
4747                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
# Line 4828  sub _get_next_token ($) { Line 4920  sub _get_next_token ($) {
4920        my $code = $self->{kwd};        my $code = $self->{kwd};
4921        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4922        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4923        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
4924              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4925              ($self->{is_xml} and $code == 0x0000)) {
4926                    
4927          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4928                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4981  sub _get_next_token ($) { Line 5075  sub _get_next_token ($) {
5075        my $code = $self->{kwd};        my $code = $self->{kwd};
5076        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5077        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5078        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5079              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5080              ($self->{is_xml} and $code == 0x0000)) {
5081                    
5082          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5083                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 5021  sub _get_next_token ($) { Line 5117  sub _get_next_token ($) {
5117             $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x007A) or # z
5118            (0x0030 <= $self->{nc} and # 0            (0x0030 <= $self->{nc} and # 0
5119             $self->{nc} <= 0x0039) or # 9             $self->{nc} <= 0x0039) or # 9
5120            $self->{nc} == 0x003B) { # ;            $self->{nc} == 0x003B or # ;
5121              ($self->{is_xml} and
5122               not ($is_space->{$self->{nc}} or
5123                    {
5124                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5125                      $self->{entity_add} => 1,
5126                    }->{$self->{nc}}))) {
5127          our $EntityChar;          our $EntityChar;
5128          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5129          if (defined $EntityChar->{$self->{kwd}} or          if (defined $EntityChar->{$self->{kwd}} or
# Line 5454  sub _get_next_token ($) { Line 5556  sub _get_next_token ($) {
5556          ## XML5: Not defined yet.          ## XML5: Not defined yet.
5557    
5558          ## TODO:          ## TODO:
5559    
5560            if (not $self->{stop_processing} and
5561                not $self->{document}->xml_standalone) {
5562              $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5563                              level => $self->{level}->{info});
5564              $self->{stop_processing} = 1;
5565            }
5566    
5567                    
5568      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5569        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 5888  sub _get_next_token ($) { Line 5998  sub _get_next_token ($) {
5998          }          }
5999          $self->{ct} = {type => ELEMENT_TOKEN, name => '',          $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6000                         line => $self->{line_prev},                         line => $self->{line_prev},
6001                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
6002          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
6003                    
6004      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 5956  sub _get_next_token ($) { Line 6066  sub _get_next_token ($) {
6066          $self->{ct} = {type => ATTLIST_TOKEN, name => '',          $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6067                         attrdefs => [],                         attrdefs => [],
6068                         line => $self->{line_prev},                         line => $self->{line_prev},
6069                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
6070          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
6071                    
6072      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 6025  sub _get_next_token ($) { Line 6135  sub _get_next_token ($) {
6135          }          }
6136          $self->{ct} = {type => NOTATION_TOKEN, name => '',          $self->{ct} = {type => NOTATION_TOKEN, name => '',
6137                         line => $self->{line_prev},                         line => $self->{line_prev},
6138                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 8};
6139          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
6140                    
6141      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 7887  sub _get_next_token ($) { Line 7997  sub _get_next_token ($) {
7997          redo A;          redo A;
7998        }        }
7999      } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
       ## TODO: XMLize  
   
8000        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
8001            {            {
8002              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8003              $self->{entity_add} => 1,              $self->{entity_add} => 1,
8004            }->{$self->{nc}}) {            }->{$self->{nc}}) {
8005            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8006                            line => $self->{line_prev},
8007                            column => $self->{column_prev}
8008                                + ($self->{nc} == -1 ? 1 : 0));
8009          ## Don't consume          ## Don't consume
         ## No error  
8010          ## Return nothing.          ## Return nothing.
8011          #          #
8012        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
# Line 7914  sub _get_next_token ($) { Line 8025  sub _get_next_token ($) {
8025      }      }
8026        
8027          redo A;          redo A;
       } elsif ((0x0041 <= $self->{nc} and  
                 $self->{nc} <= 0x005A) or # A..Z  
                (0x0061 <= $self->{nc} and  
                 $self->{nc} <= 0x007A)) { # a..z  
         #  
8028        } else {        } else {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');  
         ## Return nothing.  
8029          #          #
8030        }        }
8031    

Legend:
Removed from v.1.21  
changed lines
  Added in v.1.30

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24