/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

revision 1.64 by wakaba, Sun Nov 11 08:39:42 2007 UTC | revision 1.69 by wakaba, Sun Feb 17 12:39:32 2008 UTC
# Line 97  sub parse_byte_string ($$$$;$) { Line 97  sub parse_byte_string ($$$$;$) {
97      $self->{input_encoding} = lc $charset; ## TODO: normalize name      $self->{input_encoding} = lc $charset; ## TODO: normalize name
98      $self->{confident} = 1;      $self->{confident} = 1;
99    } else {    } else {
100      $charset = 'windows-1252'; ## TODO: for now.      ## TODO: Implement HTML5 detection algorithm
101        require Whatpm::Charset::UniversalCharDet;
102        $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
103            (substr ($$bytes_s, 0, 1024));
104        $charset ||= 'windows-1252';
105      $s = \ (Encode::decode ($charset, $$bytes_s));      $s = \ (Encode::decode ($charset, $$bytes_s));
106      $self->{input_encoding} = $charset;      $self->{input_encoding} = $charset;
107      $self->{confident} = 0;      $self->{confident} = 0;
# Line 336  sub _initialize_tokenizer ($) { Line 340  sub _initialize_tokenizer ($) {
340  ##   ->{system_identifier} (DOCTYPE_TOKEN)  ##   ->{system_identifier} (DOCTYPE_TOKEN)
341  ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)  ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
342  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
343    ##        ->{name}
344    ##        ->{value}
345    ##        ->{has_reference} == 1 or 0
346  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
347    
348  ## Emitted token MUST immediately be handled by the tree construction state.  ## Emitted token MUST immediately be handled by the tree construction state.
# Line 1111  sub _get_next_token ($) { Line 1118  sub _get_next_token ($) {
1118          $self->{current_attribute}->{value} .= '&';          $self->{current_attribute}->{value} .= '&';
1119        } else {        } else {
1120          $self->{current_attribute}->{value} .= $token->{data};          $self->{current_attribute}->{value} .= $token->{data};
1121            $self->{current_attribute}->{has_reference} = $token->{has_reference};
1122          ## ISSUE: spec says "append the returned character token to the current attribute's value"          ## ISSUE: spec says "append the returned character token to the current attribute's value"
1123        }        }
1124    
# Line 1547  sub _get_next_token ($) { Line 1555  sub _get_next_token ($) {
1555          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1556          !!!next-input-character;          !!!next-input-character;
1557          redo A;          redo A;
1558          } elsif ($self->{next_input_character} == 0x003E) { # >
1559            !!!parse-error (type => 'unclosed PUBLIC literal');
1560    
1561            $self->{state} = DATA_STATE;
1562            !!!next-input-character;
1563    
1564            delete $self->{current_token}->{correct};
1565            !!!emit ($self->{current_token}); # DOCTYPE
1566    
1567            redo A;
1568        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1569          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
1570    
# Line 1569  sub _get_next_token ($) { Line 1587  sub _get_next_token ($) {
1587          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1588          !!!next-input-character;          !!!next-input-character;
1589          redo A;          redo A;
1590          } elsif ($self->{next_input_character} == 0x003E) { # >
1591            !!!parse-error (type => 'unclosed PUBLIC literal');
1592    
1593            $self->{state} = DATA_STATE;
1594            !!!next-input-character;
1595    
1596            delete $self->{current_token}->{correct};
1597            !!!emit ($self->{current_token}); # DOCTYPE
1598    
1599            redo A;
1600        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1601          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
1602    
# Line 1675  sub _get_next_token ($) { Line 1703  sub _get_next_token ($) {
1703          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1704          !!!next-input-character;          !!!next-input-character;
1705          redo A;          redo A;
1706          } elsif ($self->{next_input_character} == 0x003E) { # >
1707            !!!parse-error (type => 'unclosed PUBLIC literal');
1708    
1709            $self->{state} = DATA_STATE;
1710            !!!next-input-character;
1711    
1712            delete $self->{current_token}->{correct};
1713            !!!emit ($self->{current_token}); # DOCTYPE
1714    
1715            redo A;
1716        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1717          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
1718    
# Line 1697  sub _get_next_token ($) { Line 1735  sub _get_next_token ($) {
1735          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1736          !!!next-input-character;          !!!next-input-character;
1737          redo A;          redo A;
1738          } elsif ($self->{next_input_character} == 0x003E) { # >
1739            !!!parse-error (type => 'unclosed PUBLIC literal');
1740    
1741            $self->{state} = DATA_STATE;
1742            !!!next-input-character;
1743    
1744            delete $self->{current_token}->{correct};
1745            !!!emit ($self->{current_token}); # DOCTYPE
1746    
1747            redo A;
1748        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1749          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
1750    
# Line 1837  sub _tokenize_attempt_to_consume_an_enti Line 1885  sub _tokenize_attempt_to_consume_an_enti
1885            $code = $c1_entity_char->{$code};            $code = $c1_entity_char->{$code};
1886          }          }
1887    
1888          return {type => CHARACTER_TOKEN, data => chr $code};          return {type => CHARACTER_TOKEN, data => chr $code,
1889                    has_reference => 1};
1890        } # X        } # X
1891      } elsif (0x0030 <= $self->{next_input_character} and      } elsif (0x0030 <= $self->{next_input_character} and
1892               $self->{next_input_character} <= 0x0039) { # 0..9               $self->{next_input_character} <= 0x0039) { # 0..9
# Line 1872  sub _tokenize_attempt_to_consume_an_enti Line 1921  sub _tokenize_attempt_to_consume_an_enti
1921          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
1922        }        }
1923                
1924        return {type => CHARACTER_TOKEN, data => chr $code};        return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
1925      } else {      } else {
1926        !!!parse-error (type => 'bare nero');        !!!parse-error (type => 'bare nero');
1927        !!!back-next-input-character ($self->{next_input_character});        !!!back-next-input-character ($self->{next_input_character});
# Line 1920  sub _tokenize_attempt_to_consume_an_enti Line 1969  sub _tokenize_attempt_to_consume_an_enti
1969      }      }
1970            
1971      if ($match > 0) {      if ($match > 0) {
1972        return {type => CHARACTER_TOKEN, data => $value};        return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
1973      } elsif ($match < 0) {      } elsif ($match < 0) {
1974        !!!parse-error (type => 'no refc');        !!!parse-error (type => 'no refc');
1975        if ($in_attr and $match < -1) {        if ($in_attr and $match < -1) {
1976          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
1977        } else {        } else {
1978          return {type => CHARACTER_TOKEN, data => $value};          return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
1979        }        }
1980      } else {      } else {
1981        !!!parse-error (type => 'bare ero');        !!!parse-error (type => 'bare ero');
1982        ## NOTE: No characters are consumed in the spec.        ## NOTE: "No characters are consumed" in the spec.
1983        return {type => CHARACTER_TOKEN, data => '&'.$value};        return {type => CHARACTER_TOKEN, data => '&'.$value};
1984      }      }
1985    } else {    } else {
# Line 2184  sub _tree_construction_root_element ($) Line 2233  sub _tree_construction_root_element ($)
2233          #          #
2234        } elsif ($token->{type} == START_TAG_TOKEN) {        } elsif ($token->{type} == START_TAG_TOKEN) {
2235          if ($token->{tag_name} eq 'html' and          if ($token->{tag_name} eq 'html' and
2236              $token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application"              $token->{attributes}->{manifest}) {
2237            $self->{application_cache_selection}            $self->{application_cache_selection}
2238                 ->($token->{attributes}->{manifest}->{value});                 ->($token->{attributes}->{manifest}->{value});
2239            ## ISSUE: No relative reference resolution?            ## ISSUE: No relative reference resolution?
# Line 2862  sub _tree_construction_main ($) { Line 2911  sub _tree_construction_main ($) {
2911                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2912                }                }
2913                !!!insert-element ($token->{tag_name}, $token->{attributes});                !!!insert-element ($token->{tag_name}, $token->{attributes});
2914                pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2915    
2916                unless ($self->{confident}) {                unless ($self->{confident}) {
2917                  if ($token->{attributes}->{charset}) { ## TODO: And if supported                  if ($token->{attributes}->{charset}) { ## TODO: And if supported
2918                    $self->{change_encoding}                    $self->{change_encoding}
2919                        ->($self, $token->{attributes}->{charset}->{value});                        ->($self, $token->{attributes}->{charset}->{value});
2920                      
2921                      $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
2922                          ->set_user_data (manakai_has_reference =>
2923                                               $token->{attributes}->{charset}
2924                                                   ->{has_reference});
2925                  } elsif ($token->{attributes}->{content}) {                  } elsif ($token->{attributes}->{content}) {
2926                    ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.                    ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2927                    if ($token->{attributes}->{content}->{value}                    if ($token->{attributes}->{content}->{value}
# Line 2876  sub _tree_construction_main ($) { Line 2930  sub _tree_construction_main ($) {
2930                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2931                      $self->{change_encoding}                      $self->{change_encoding}
2932                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
2933                        $meta_el->[0]->get_attribute_node_ns (undef, 'content')
2934                            ->set_user_data (manakai_has_reference =>
2935                                                 $token->{attributes}->{content}
2936                                                       ->{has_reference});
2937                    }                    }
2938                  }                  }
2939                  } else {
2940                    if ($token->{attributes}->{charset}) {
2941                      $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
2942                          ->set_user_data (manakai_has_reference =>
2943                                               $token->{attributes}->{charset}
2944                                                   ->{has_reference});
2945                    }
2946                    if ($token->{attributes}->{content}) {
2947                      $meta_el->[0]->get_attribute_node_ns (undef, 'content')
2948                          ->set_user_data (manakai_has_reference =>
2949                                               $token->{attributes}->{content}
2950                                                   ->{has_reference});
2951                    }
2952                }                }
2953    
2954                pop @{$self->{open_elements}}                pop @{$self->{open_elements}}
# Line 4450  sub _tree_construction_main ($) { Line 4521  sub _tree_construction_main ($) {
4521        } elsif ($token->{tag_name} eq 'meta') {        } elsif ($token->{tag_name} eq 'meta') {
4522          ## NOTE: This is an "as if in head" code clone, only "-t" differs          ## NOTE: This is an "as if in head" code clone, only "-t" differs
4523          !!!insert-element-t ($token->{tag_name}, $token->{attributes});          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4524          pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4525    
4526          unless ($self->{confident}) {          unless ($self->{confident}) {
4527            if ($token->{attributes}->{charset}) { ## TODO: And if supported            if ($token->{attributes}->{charset}) { ## TODO: And if supported
4528              $self->{change_encoding}              $self->{change_encoding}
4529                  ->($self, $token->{attributes}->{charset}->{value});                  ->($self, $token->{attributes}->{charset}->{value});
4530                
4531                $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4532                    ->set_user_data (manakai_has_reference =>
4533                                         $token->{attributes}->{charset}
4534                                             ->{has_reference});
4535            } elsif ($token->{attributes}->{content}) {            } elsif ($token->{attributes}->{content}) {
4536              ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.              ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4537              if ($token->{attributes}->{content}->{value}              if ($token->{attributes}->{content}->{value}
# Line 4464  sub _tree_construction_main ($) { Line 4540  sub _tree_construction_main ($) {
4540                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4541                $self->{change_encoding}                $self->{change_encoding}
4542                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4543                  $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4544                      ->set_user_data (manakai_has_reference =>
4545                                           $token->{attributes}->{content}
4546                                                 ->{has_reference});
4547              }              }
4548            }            }
4549            } else {
4550              if ($token->{attributes}->{charset}) {
4551                $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4552                    ->set_user_data (manakai_has_reference =>
4553                                         $token->{attributes}->{charset}
4554                                             ->{has_reference});
4555              }
4556              if ($token->{attributes}->{content}) {
4557                $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4558                    ->set_user_data (manakai_has_reference =>
4559                                         $token->{attributes}->{content}
4560                                             ->{has_reference});
4561              }
4562          }          }
4563    
4564          !!!next-token;          !!!next-token;

Legend:
Removed from v.1.64  
changed lines
  Added in v.1.69

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24