/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.10 by wakaba, Wed Oct 15 08:51:02 2008 UTC revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC
# Line 216  sub _initialize_tokenizer ($) { Line 216  sub _initialize_tokenizer ($) {
216    
217  ## A token has:  ## A token has:
218  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
219  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
220  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
221  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
222    ##   ->{target} (PI_TOKEN)
223  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
224  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
225  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 226  sub _initialize_tokenizer ($) { Line 227  sub _initialize_tokenizer ($) {
227  ##        ->{name}  ##        ->{name}
228  ##        ->{value}  ##        ->{value}
229  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
230  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
231    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
232  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
233    ##   ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Next attribute's index - 1.
234  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
235  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
236  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 1058  sub _get_next_token ($) { Line 1061  sub _get_next_token ($) {
1061          redo A;          redo A;
1062        }        }
1063      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1064          ## XML5: "Tag attribute name before state".
1065    
1066        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1067                    
1068          ## Stay in the state          ## Stay in the state
# Line 1170  sub _get_next_token ($) { Line 1175  sub _get_next_token ($) {
1175               0x003D => 1, # =               0x003D => 1, # =
1176              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1177                        
1178              ## XML5: Not a parse error.
1179            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1180          } else {          } else {
1181                        
1182              ## XML5: ":" raises a parse error and is ignored.
1183          }          }
1184          $self->{ca}          $self->{ca}
1185              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1193  sub _get_next_token ($) { Line 1200  sub _get_next_token ($) {
1200          redo A;          redo A;
1201        }        }
1202      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1203          ## XML5: "Tag attribute name state".
1204    
1205        my $before_leave = sub {        my $before_leave = sub {
1206          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1207              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1203  sub _get_next_token ($) { Line 1212  sub _get_next_token ($) {
1212                        
1213            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1214              = $self->{ca};              = $self->{ca};
1215              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1216          }          }
1217        }; # $before_leave        }; # $before_leave
1218    
# Line 1239  sub _get_next_token ($) { Line 1249  sub _get_next_token ($) {
1249        
1250          redo A;          redo A;
1251        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1252            if ($self->{is_xml}) {
1253              
1254              ## XML5: Not a parse error.
1255              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1256            } else {
1257              
1258            }
1259    
1260          $before_leave->();          $before_leave->();
1261          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1262                        
# Line 1288  sub _get_next_token ($) { Line 1306  sub _get_next_token ($) {
1306        
1307          redo A;          redo A;
1308        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1309            if ($self->{is_xml}) {
1310              
1311              ## XML5: Not a parse error.
1312              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1313            } else {
1314              
1315            }
1316                    
1317          $before_leave->();          $before_leave->();
1318          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1332  sub _get_next_token ($) { Line 1357  sub _get_next_token ($) {
1357          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1358              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1359                        
1360              ## XML5: Not a parse error.
1361            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1362          } else {          } else {
1363                        
# Line 1352  sub _get_next_token ($) { Line 1378  sub _get_next_token ($) {
1378          redo A;          redo A;
1379        }        }
1380      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1381          ## XML5: "Tag attribute name after state".
1382          
1383        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1384                    
1385          ## Stay in the state          ## Stay in the state
# Line 1383  sub _get_next_token ($) { Line 1411  sub _get_next_token ($) {
1411        
1412          redo A;          redo A;
1413        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1414            if ($self->{is_xml}) {
1415              
1416              ## XML5: Not a parse error.
1417              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1418            } else {
1419              
1420            }
1421    
1422          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1423                        
1424            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1436  sub _get_next_token ($) { Line 1472  sub _get_next_token ($) {
1472        
1473          redo A;          redo A;
1474        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1475            if ($self->{is_xml}) {
1476              
1477              ## XML5: Not a parse error.
1478              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1479            } else {
1480              
1481            }
1482                    
1483          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1484                    
# Line 1475  sub _get_next_token ($) { Line 1518  sub _get_next_token ($) {
1518    
1519          redo A;          redo A;
1520        } else {        } else {
1521            if ($self->{is_xml}) {
1522              
1523              ## XML5: Not a parse error.
1524              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1525            } else {
1526              
1527            }
1528    
1529          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1530              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1531                        
1532              ## XML5: Not a parse error.
1533            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1534          } else {          } else {
1535                        
# Line 1501  sub _get_next_token ($) { Line 1553  sub _get_next_token ($) {
1553          redo A;                  redo A;        
1554        }        }
1555      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1556          ## XML5: "Tag attribute value before state".
1557    
1558        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1559                    
1560          ## Stay in the state          ## Stay in the state
# Line 1612  sub _get_next_token ($) { Line 1666  sub _get_next_token ($) {
1666        } else {        } else {
1667          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1668                        
1669              ## XML5: Not a parse error.
1670            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1671            } elsif ($self->{is_xml}) {
1672              
1673              ## XML5: No parse error.
1674              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1675          } else {          } else {
1676                        
1677          }          }
# Line 1632  sub _get_next_token ($) { Line 1691  sub _get_next_token ($) {
1691          redo A;          redo A;
1692        }        }
1693      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1694          ## XML5: "Tag attribute value double quoted state".
1695          
1696        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1697                    
1698            ## XML5: "Tag attribute name before state".
1699          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1700                    
1701      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1649  sub _get_next_token ($) { Line 1711  sub _get_next_token ($) {
1711          redo A;          redo A;
1712        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1713                    
1714            ## XML5: Not defined yet.
1715    
1716          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1717          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1718          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1693  sub _get_next_token ($) { Line 1757  sub _get_next_token ($) {
1757    
1758          redo A;          redo A;
1759        } else {        } else {
1760                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1761              
1762              ## XML5: Not a parse error.
1763              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1764            } else {
1765              
1766            }
1767          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1768          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1769                                q["&],                                q["&<],
1770                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1771    
1772          ## Stay in the state          ## Stay in the state
# Line 1714  sub _get_next_token ($) { Line 1784  sub _get_next_token ($) {
1784          redo A;          redo A;
1785        }        }
1786      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1787          ## XML5: "Tag attribute value single quoted state".
1788    
1789        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1790                    
1791            ## XML5: "Before attribute name state" (sic).
1792          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1793                    
1794      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1731  sub _get_next_token ($) { Line 1804  sub _get_next_token ($) {
1804          redo A;          redo A;
1805        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1806                    
1807            ## XML5: Not defined yet.
1808    
1809          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1810          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1811          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1775  sub _get_next_token ($) { Line 1850  sub _get_next_token ($) {
1850    
1851          redo A;          redo A;
1852        } else {        } else {
1853                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1854              
1855              ## XML5: Not a parse error.
1856              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1857            } else {
1858              
1859            }
1860          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1861          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1862                                q['&],                                q['&<],
1863                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1864    
1865          ## Stay in the state          ## Stay in the state
# Line 1796  sub _get_next_token ($) { Line 1877  sub _get_next_token ($) {
1877          redo A;          redo A;
1878        }        }
1879      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1880          ## XML5: "Tag attribute value unquoted state".
1881    
1882        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1883                    
1884            ## XML5: "Tag attribute name before state".
1885          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1886                    
1887      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1813  sub _get_next_token ($) { Line 1897  sub _get_next_token ($) {
1897          redo A;          redo A;
1898        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1899                    
1900    
1901            ## XML5: Not defined yet.
1902    
1903          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1904          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1905          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1896  sub _get_next_token ($) { Line 1983  sub _get_next_token ($) {
1983               0x003D => 1, # =               0x003D => 1, # =
1984              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1985                        
1986              ## XML5: Not a parse error.
1987            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1988          } else {          } else {
1989                        
# Line 2012  sub _get_next_token ($) { Line 2100  sub _get_next_token ($) {
2100          redo A;          redo A;
2101        }        }
2102      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2103          ## XML5: "Empty tag state".
2104    
2105        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2106          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2107                        
# Line 2063  sub _get_next_token ($) { Line 2153  sub _get_next_token ($) {
2153          } else {          } else {
2154            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2155          }          }
2156            ## XML5: "Tag attribute name before state".
2157          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2158          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2159          ## Reconsume.          ## Reconsume.

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.11

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24