/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.64 by wakaba, Sun Nov 11 08:39:42 2007 UTC revision 1.73 by wakaba, Sun Mar 2 23:38:37 2008 UTC
# Line 8  use Error qw(:try); Line 8  use Error qw(:try);
8  ## doc.write ('');  ## doc.write ('');
9  ## alert (doc.compatMode);  ## alert (doc.compatMode);
10    
11  ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT  ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12  ## strip BOM and the HTML layer MUST ignore it.  Whether we can do it  ## TODO: 1252 parse error (revision 1264)
13  ## is not yet clear.  ## TODO: 8859-11 = 874 (revision 1271)
 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?  
 ## "{U+FEFF}..." in GB18030?  
14    
15  my $permitted_slash_tag_name = {  my $permitted_slash_tag_name = {
16    base => 1,    base => 1,
# Line 20  my $permitted_slash_tag_name = { Line 18  my $permitted_slash_tag_name = {
18    meta => 1,    meta => 1,
19    hr => 1,    hr => 1,
20    br => 1,    br => 1,
21    img=> 1,    img => 1,
22    embed => 1,    embed => 1,
23    param => 1,    param => 1,
24    area => 1,    area => 1,
# Line 97  sub parse_byte_string ($$$$;$) { Line 95  sub parse_byte_string ($$$$;$) {
95      $self->{input_encoding} = lc $charset; ## TODO: normalize name      $self->{input_encoding} = lc $charset; ## TODO: normalize name
96      $self->{confident} = 1;      $self->{confident} = 1;
97    } else {    } else {
98      $charset = 'windows-1252'; ## TODO: for now.      ## TODO: Implement HTML5 detection algorithm
99        require Whatpm::Charset::UniversalCharDet;
100        $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
101            (substr ($$bytes_s, 0, 1024));
102        $charset ||= 'windows-1252';
103      $s = \ (Encode::decode ($charset, $$bytes_s));      $s = \ (Encode::decode ($charset, $$bytes_s));
104      $self->{input_encoding} = $charset;      $self->{input_encoding} = $charset;
105      $self->{confident} = 0;      $self->{confident} = 0;
# Line 151  sub parse_byte_string ($$$$;$) { Line 153  sub parse_byte_string ($$$$;$) {
153    return $return;    return $return;
154  } # parse_byte_string  } # parse_byte_string
155    
156    ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
157    ## and the HTML layer MUST ignore it.  However, we do strip BOM in
158    ## the encoding layer and the HTML layer does not ignore any U+FEFF,
159    ## because the core part of our HTML parser expects a string of characters,
160    ## not a string of bytes or code units or anything which might contain a BOM.
161    ## Therefore, any parser interface that accepts a string of bytes,
162    ## such as |parse_byte_string| in this module, must ensure that it does
163    ## strip the BOM and never strip any ZWNBSP.
164    
165  *parse_char_string = \&parse_string;  *parse_char_string = \&parse_string;
166    
167  sub parse_string ($$$;$) {  sub parse_string ($$$;$) {
# Line 275  sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUO Line 286  sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUO
286  sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }  sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
287  sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }  sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
288  sub BOGUS_DOCTYPE_STATE () { 32 }  sub BOGUS_DOCTYPE_STATE () { 32 }
289    sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
290    
291  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 }
292  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
# Line 336  sub _initialize_tokenizer ($) { Line 348  sub _initialize_tokenizer ($) {
348  ##   ->{system_identifier} (DOCTYPE_TOKEN)  ##   ->{system_identifier} (DOCTYPE_TOKEN)
349  ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)  ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
350  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
351    ##        ->{name}
352    ##        ->{value}
353    ##        ->{has_reference} == 1 or 0
354  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
355    
356  ## Emitted token MUST immediately be handled by the tree construction state.  ## Emitted token MUST immediately be handled by the tree construction state.
# Line 370  sub _get_next_token ($) { Line 385  sub _get_next_token ($) {
385    A: {    A: {
386      if ($self->{state} == DATA_STATE) {      if ($self->{state} == DATA_STATE) {
387        if ($self->{next_input_character} == 0x0026) { # &        if ($self->{next_input_character} == 0x0026) { # &
388          if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA          if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
389                not $self->{escape}) {
390            $self->{state} = ENTITY_DATA_STATE;            $self->{state} = ENTITY_DATA_STATE;
391            !!!next-input-character;            !!!next-input-character;
392            redo A;            redo A;
# Line 425  sub _get_next_token ($) { Line 441  sub _get_next_token ($) {
441      } elsif ($self->{state} == ENTITY_DATA_STATE) {      } elsif ($self->{state} == ENTITY_DATA_STATE) {
442        ## (cannot happen in CDATA state)        ## (cannot happen in CDATA state)
443                
444        my $token = $self->_tokenize_attempt_to_consume_an_entity (0);        my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
445    
446        $self->{state} = DATA_STATE;        $self->{state} = DATA_STATE;
447        # next-input-character is already done        # next-input-character is already done
# Line 728  sub _get_next_token ($) { Line 744  sub _get_next_token ($) {
744    
745          redo A;          redo A;
746        } else {        } else {
747            if ({
748                 0x0022 => 1, # "
749                 0x0027 => 1, # '
750                 0x003D => 1, # =
751                }->{$self->{next_input_character}}) {
752              !!!parse-error (type => 'bad attribute name');
753            }
754          $self->{current_attribute} = {name => chr ($self->{next_input_character}),          $self->{current_attribute} = {name => chr ($self->{next_input_character}),
755                                value => ''};                                value => ''};
756          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 822  sub _get_next_token ($) { Line 845  sub _get_next_token ($) {
845    
846          redo A;          redo A;
847        } else {        } else {
848            if ($self->{next_input_character} == 0x0022 or # "
849                $self->{next_input_character} == 0x0027) { # '
850              !!!parse-error (type => 'bad attribute name');
851            }
852          $self->{current_attribute}->{name} .= chr ($self->{next_input_character});          $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
853          ## Stay in the state          ## Stay in the state
854          !!!next-input-character;          !!!next-input-character;
# Line 968  sub _get_next_token ($) { Line 995  sub _get_next_token ($) {
995    
996          redo A;          redo A;
997        } else {        } else {
998            if ($self->{next_input_character} == 0x003D) { # =
999              !!!parse-error (type => 'bad attribute value');
1000            }
1001          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1002          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1003          !!!next-input-character;          !!!next-input-character;
# Line 975  sub _get_next_token ($) { Line 1005  sub _get_next_token ($) {
1005        }        }
1006      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1007        if ($self->{next_input_character} == 0x0022) { # "        if ($self->{next_input_character} == 0x0022) { # "
1008          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1009          !!!next-input-character;          !!!next-input-character;
1010          redo A;          redo A;
1011        } elsif ($self->{next_input_character} == 0x0026) { # &        } elsif ($self->{next_input_character} == 0x0026) { # &
# Line 1011  sub _get_next_token ($) { Line 1041  sub _get_next_token ($) {
1041        }        }
1042      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1043        if ($self->{next_input_character} == 0x0027) { # '        if ($self->{next_input_character} == 0x0027) { # '
1044          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1045          !!!next-input-character;          !!!next-input-character;
1046          redo A;          redo A;
1047        } elsif ($self->{next_input_character} == 0x0026) { # &        } elsif ($self->{next_input_character} == 0x0026) { # &
# Line 1099  sub _get_next_token ($) { Line 1129  sub _get_next_token ($) {
1129    
1130          redo A;          redo A;
1131        } else {        } else {
1132            if ({
1133                 0x0022 => 1, # "
1134                 0x0027 => 1, # '
1135                 0x003D => 1, # =
1136                }->{$self->{next_input_character}}) {
1137              !!!parse-error (type => 'bad attribute value');
1138            }
1139          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1140          ## Stay in the state          ## Stay in the state
1141          !!!next-input-character;          !!!next-input-character;
1142          redo A;          redo A;
1143        }        }
1144      } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1145        my $token = $self->_tokenize_attempt_to_consume_an_entity (1);        my $token = $self->_tokenize_attempt_to_consume_an_entity
1146              (1,
1147               $self->{last_attribute_value_state}
1148                 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1149               $self->{last_attribute_value_state}
1150                 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1151               -1);
1152    
1153        unless (defined $token) {        unless (defined $token) {
1154          $self->{current_attribute}->{value} .= '&';          $self->{current_attribute}->{value} .= '&';
1155        } else {        } else {
1156          $self->{current_attribute}->{value} .= $token->{data};          $self->{current_attribute}->{value} .= $token->{data};
1157            $self->{current_attribute}->{has_reference} = $token->{has_reference};
1158          ## ISSUE: spec says "append the returned character token to the current attribute's value"          ## ISSUE: spec says "append the returned character token to the current attribute's value"
1159        }        }
1160    
1161        $self->{state} = $self->{last_attribute_value_state};        $self->{state} = $self->{last_attribute_value_state};
1162        # next-input-character is already done        # next-input-character is already done
1163        redo A;        redo A;
1164        } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1165          if ($self->{next_input_character} == 0x0009 or # HT
1166              $self->{next_input_character} == 0x000A or # LF
1167              $self->{next_input_character} == 0x000B or # VT
1168              $self->{next_input_character} == 0x000C or # FF
1169              $self->{next_input_character} == 0x0020) { # SP
1170            $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1171            !!!next-input-character;
1172            redo A;
1173          } elsif ($self->{next_input_character} == 0x003E) { # >
1174            if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1175              $self->{current_token}->{first_start_tag}
1176                  = not defined $self->{last_emitted_start_tag_name};
1177              $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1178            } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1179              $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1180              if ($self->{current_token}->{attributes}) {
1181                !!!parse-error (type => 'end tag attribute');
1182              }
1183            } else {
1184              die "$0: $self->{current_token}->{type}: Unknown token type";
1185            }
1186            $self->{state} = DATA_STATE;
1187            !!!next-input-character;
1188    
1189            !!!emit ($self->{current_token}); # start tag or end tag
1190    
1191            redo A;
1192          } elsif ($self->{next_input_character} == 0x002F) { # /
1193            !!!next-input-character;
1194            if ($self->{next_input_character} == 0x003E and # >
1195                $self->{current_token}->{type} == START_TAG_TOKEN and
1196                $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1197              # permitted slash
1198              #
1199            } else {
1200              !!!parse-error (type => 'nestc');
1201            }
1202            $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1203            # next-input-character is already done
1204            redo A;
1205          } else {
1206            !!!parse-error (type => 'no space between attributes');
1207            $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1208            ## reconsume
1209            redo A;
1210          }
1211      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1212        ## (only happen if PCDATA state)        ## (only happen if PCDATA state)
1213                
# Line 1495  sub _get_next_token ($) { Line 1586  sub _get_next_token ($) {
1586        }        }
1587    
1588        !!!parse-error (type => 'string after DOCTYPE name');        !!!parse-error (type => 'string after DOCTYPE name');
1589          delete $self->{current_token}->{correct};
1590    
1591        $self->{state} = BOGUS_DOCTYPE_STATE;        $self->{state} = BOGUS_DOCTYPE_STATE;
1592        # next-input-character is already done        # next-input-character is already done
1593        redo A;        redo A;
# Line 1538  sub _get_next_token ($) { Line 1631  sub _get_next_token ($) {
1631          redo A;          redo A;
1632        } else {        } else {
1633          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
1634            delete $self->{current_token}->{correct};
1635    
1636          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
1637          !!!next-input-character;          !!!next-input-character;
1638          redo A;          redo A;
# Line 1547  sub _get_next_token ($) { Line 1642  sub _get_next_token ($) {
1642          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1643          !!!next-input-character;          !!!next-input-character;
1644          redo A;          redo A;
1645          } elsif ($self->{next_input_character} == 0x003E) { # >
1646            !!!parse-error (type => 'unclosed PUBLIC literal');
1647    
1648            $self->{state} = DATA_STATE;
1649            !!!next-input-character;
1650    
1651            delete $self->{current_token}->{correct};
1652            !!!emit ($self->{current_token}); # DOCTYPE
1653    
1654            redo A;
1655        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1656          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
1657    
# Line 1569  sub _get_next_token ($) { Line 1674  sub _get_next_token ($) {
1674          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1675          !!!next-input-character;          !!!next-input-character;
1676          redo A;          redo A;
1677          } elsif ($self->{next_input_character} == 0x003E) { # >
1678            !!!parse-error (type => 'unclosed PUBLIC literal');
1679    
1680            $self->{state} = DATA_STATE;
1681            !!!next-input-character;
1682    
1683            delete $self->{current_token}->{correct};
1684            !!!emit ($self->{current_token}); # DOCTYPE
1685    
1686            redo A;
1687        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1688          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
1689    
# Line 1623  sub _get_next_token ($) { Line 1738  sub _get_next_token ($) {
1738          redo A;          redo A;
1739        } else {        } else {
1740          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
1741            delete $self->{current_token}->{correct};
1742    
1743          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
1744          !!!next-input-character;          !!!next-input-character;
1745          redo A;          redo A;
# Line 1666  sub _get_next_token ($) { Line 1783  sub _get_next_token ($) {
1783          redo A;          redo A;
1784        } else {        } else {
1785          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
1786            delete $self->{current_token}->{correct};
1787    
1788          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
1789          !!!next-input-character;          !!!next-input-character;
1790          redo A;          redo A;
# Line 1675  sub _get_next_token ($) { Line 1794  sub _get_next_token ($) {
1794          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1795          !!!next-input-character;          !!!next-input-character;
1796          redo A;          redo A;
1797          } elsif ($self->{next_input_character} == 0x003E) { # >
1798            !!!parse-error (type => 'unclosed PUBLIC literal');
1799    
1800            $self->{state} = DATA_STATE;
1801            !!!next-input-character;
1802    
1803            delete $self->{current_token}->{correct};
1804            !!!emit ($self->{current_token}); # DOCTYPE
1805    
1806            redo A;
1807        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1808          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
1809    
# Line 1697  sub _get_next_token ($) { Line 1826  sub _get_next_token ($) {
1826          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1827          !!!next-input-character;          !!!next-input-character;
1828          redo A;          redo A;
1829          } elsif ($self->{next_input_character} == 0x003E) { # >
1830            !!!parse-error (type => 'unclosed PUBLIC literal');
1831    
1832            $self->{state} = DATA_STATE;
1833            !!!next-input-character;
1834    
1835            delete $self->{current_token}->{correct};
1836            !!!emit ($self->{current_token}); # DOCTYPE
1837    
1838            redo A;
1839        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1840          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
1841    
# Line 1741  sub _get_next_token ($) { Line 1880  sub _get_next_token ($) {
1880          redo A;          redo A;
1881        } else {        } else {
1882          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
1883            delete $self->{current_token}->{correct};
1884    
1885          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
1886          !!!next-input-character;          !!!next-input-character;
1887          redo A;          redo A;
# Line 1750  sub _get_next_token ($) { Line 1891  sub _get_next_token ($) {
1891          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1892          !!!next-input-character;          !!!next-input-character;
1893    
         delete $self->{current_token}->{correct};  
1894          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
1895    
1896          redo A;          redo A;
# Line 1759  sub _get_next_token ($) { Line 1899  sub _get_next_token ($) {
1899          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1900          ## reconsume          ## reconsume
1901    
         delete $self->{current_token}->{correct};  
1902          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
1903    
1904          redo A;          redo A;
# Line 1776  sub _get_next_token ($) { Line 1915  sub _get_next_token ($) {
1915    die "$0: _get_next_token: unexpected case";    die "$0: _get_next_token: unexpected case";
1916  } # _get_next_token  } # _get_next_token
1917    
1918  sub _tokenize_attempt_to_consume_an_entity ($$) {  sub _tokenize_attempt_to_consume_an_entity ($$$) {
1919    my ($self, $in_attr) = @_;    my ($self, $in_attr, $additional) = @_;
1920    
1921    if ({    if ({
1922         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1923         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
1924           $additional => 1,
1925        }->{$self->{next_input_character}}) {        }->{$self->{next_input_character}}) {
1926      ## Don't consume      ## Don't consume
1927      ## No error      ## No error
# Line 1837  sub _tokenize_attempt_to_consume_an_enti Line 1977  sub _tokenize_attempt_to_consume_an_enti
1977            $code = $c1_entity_char->{$code};            $code = $c1_entity_char->{$code};
1978          }          }
1979    
1980          return {type => CHARACTER_TOKEN, data => chr $code};          return {type => CHARACTER_TOKEN, data => chr $code,
1981                    has_reference => 1};
1982        } # X        } # X
1983      } elsif (0x0030 <= $self->{next_input_character} and      } elsif (0x0030 <= $self->{next_input_character} and
1984               $self->{next_input_character} <= 0x0039) { # 0..9               $self->{next_input_character} <= 0x0039) { # 0..9
# Line 1872  sub _tokenize_attempt_to_consume_an_enti Line 2013  sub _tokenize_attempt_to_consume_an_enti
2013          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
2014        }        }
2015                
2016        return {type => CHARACTER_TOKEN, data => chr $code};        return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
2017      } else {      } else {
2018        !!!parse-error (type => 'bare nero');        !!!parse-error (type => 'bare nero');
2019        !!!back-next-input-character ($self->{next_input_character});        !!!back-next-input-character ($self->{next_input_character});
# Line 1920  sub _tokenize_attempt_to_consume_an_enti Line 2061  sub _tokenize_attempt_to_consume_an_enti
2061      }      }
2062            
2063      if ($match > 0) {      if ($match > 0) {
2064        return {type => CHARACTER_TOKEN, data => $value};        return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
2065      } elsif ($match < 0) {      } elsif ($match < 0) {
2066        !!!parse-error (type => 'no refc');        !!!parse-error (type => 'no refc');
2067        if ($in_attr and $match < -1) {        if ($in_attr and $match < -1) {
2068          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
2069        } else {        } else {
2070          return {type => CHARACTER_TOKEN, data => $value};          return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
2071        }        }
2072      } else {      } else {
2073        !!!parse-error (type => 'bare ero');        !!!parse-error (type => 'bare ero');
2074        ## NOTE: No characters are consumed in the spec.        ## NOTE: "No characters are consumed" in the spec.
2075        return {type => CHARACTER_TOKEN, data => '&'.$value};        return {type => CHARACTER_TOKEN, data => '&'.$value};
2076      }      }
2077    } else {    } else {
# Line 2067  sub _tree_construction_initial ($) { Line 2208  sub _tree_construction_initial ($) {
2208            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
2209            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
2210            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
2211              "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
2212              "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
2213              "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
2214            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
2215            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
2216            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
# Line 2184  sub _tree_construction_root_element ($) Line 2328  sub _tree_construction_root_element ($)
2328          #          #
2329        } elsif ($token->{type} == START_TAG_TOKEN) {        } elsif ($token->{type} == START_TAG_TOKEN) {
2330          if ($token->{tag_name} eq 'html' and          if ($token->{tag_name} eq 'html' and
2331              $token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application"              $token->{attributes}->{manifest}) {
2332            $self->{application_cache_selection}            $self->{application_cache_selection}
2333                 ->($token->{attributes}->{manifest}->{value});                 ->($token->{attributes}->{manifest}->{value});
2334            ## ISSUE: No relative reference resolution?            ## ISSUE: No relative reference resolution?
# Line 2862  sub _tree_construction_main ($) { Line 3006  sub _tree_construction_main ($) {
3006                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3007                }                }
3008                !!!insert-element ($token->{tag_name}, $token->{attributes});                !!!insert-element ($token->{tag_name}, $token->{attributes});
3009                pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3010    
3011                unless ($self->{confident}) {                unless ($self->{confident}) {
3012                  if ($token->{attributes}->{charset}) { ## TODO: And if supported                  if ($token->{attributes}->{charset}) { ## TODO: And if supported
3013                    $self->{change_encoding}                    $self->{change_encoding}
3014                        ->($self, $token->{attributes}->{charset}->{value});                        ->($self, $token->{attributes}->{charset}->{value});
3015                      
3016                      $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3017                          ->set_user_data (manakai_has_reference =>
3018                                               $token->{attributes}->{charset}
3019                                                   ->{has_reference});
3020                  } elsif ($token->{attributes}->{content}) {                  } elsif ($token->{attributes}->{content}) {
3021                    ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.                    ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3022                    if ($token->{attributes}->{content}->{value}                    if ($token->{attributes}->{content}->{value}
3023                        =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=                        =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3024                              [\x09-\x0D\x20]*=
3025                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3026                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3027                      $self->{change_encoding}                      $self->{change_encoding}
3028                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
3029                        $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3030                            ->set_user_data (manakai_has_reference =>
3031                                                 $token->{attributes}->{content}
3032                                                       ->{has_reference});
3033                    }                    }
3034                  }                  }
3035                  } else {
3036                    if ($token->{attributes}->{charset}) {
3037                      $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3038                          ->set_user_data (manakai_has_reference =>
3039                                               $token->{attributes}->{charset}
3040                                                   ->{has_reference});
3041                    }
3042                    if ($token->{attributes}->{content}) {
3043                      $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3044                          ->set_user_data (manakai_has_reference =>
3045                                               $token->{attributes}->{content}
3046                                                   ->{has_reference});
3047                    }
3048                }                }
3049    
3050                pop @{$self->{open_elements}}                pop @{$self->{open_elements}}
# Line 4450  sub _tree_construction_main ($) { Line 4617  sub _tree_construction_main ($) {
4617        } elsif ($token->{tag_name} eq 'meta') {        } elsif ($token->{tag_name} eq 'meta') {
4618          ## NOTE: This is an "as if in head" code clone, only "-t" differs          ## NOTE: This is an "as if in head" code clone, only "-t" differs
4619          !!!insert-element-t ($token->{tag_name}, $token->{attributes});          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4620          pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4621    
4622          unless ($self->{confident}) {          unless ($self->{confident}) {
4623            if ($token->{attributes}->{charset}) { ## TODO: And if supported            if ($token->{attributes}->{charset}) { ## TODO: And if supported
4624              $self->{change_encoding}              $self->{change_encoding}
4625                  ->($self, $token->{attributes}->{charset}->{value});                  ->($self, $token->{attributes}->{charset}->{value});
4626                
4627                $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4628                    ->set_user_data (manakai_has_reference =>
4629                                         $token->{attributes}->{charset}
4630                                             ->{has_reference});
4631            } elsif ($token->{attributes}->{content}) {            } elsif ($token->{attributes}->{content}) {
4632              ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.              ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4633              if ($token->{attributes}->{content}->{value}              if ($token->{attributes}->{content}->{value}
4634                  =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=                  =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4635                        [\x09-\x0D\x20]*=
4636                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4637                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4638                $self->{change_encoding}                $self->{change_encoding}
4639                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4640                  $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4641                      ->set_user_data (manakai_has_reference =>
4642                                           $token->{attributes}->{content}
4643                                                 ->{has_reference});
4644              }              }
4645            }            }
4646            } else {
4647              if ($token->{attributes}->{charset}) {
4648                $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4649                    ->set_user_data (manakai_has_reference =>
4650                                         $token->{attributes}->{charset}
4651                                             ->{has_reference});
4652              }
4653              if ($token->{attributes}->{content}) {
4654                $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4655                    ->set_user_data (manakai_has_reference =>
4656                                         $token->{attributes}->{content}
4657                                             ->{has_reference});
4658              }
4659          }          }
4660    
4661          !!!next-token;          !!!next-token;
# Line 5365  sub set_inner_html ($$$) { Line 5555  sub set_inner_html ($$$) {
5555      $p->_initialize_tree_constructor;      $p->_initialize_tree_constructor;
5556    
5557      ## Step 2      ## Step 2
5558      my $node_ln = $node->local_name;      my $node_ln = $node->manakai_local_name;
5559      $p->{content_model} = {      $p->{content_model} = {
5560        title => RCDATA_CONTENT_MODEL,        title => RCDATA_CONTENT_MODEL,
5561        textarea => RCDATA_CONTENT_MODEL,        textarea => RCDATA_CONTENT_MODEL,
# Line 5405  sub set_inner_html ($$$) { Line 5595  sub set_inner_html ($$$) {
5595        if ($anode->node_type == 1) {        if ($anode->node_type == 1) {
5596          my $nsuri = $anode->namespace_uri;          my $nsuri = $anode->namespace_uri;
5597          if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {          if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
5598            if ($anode->local_name eq 'form') { ## TODO: case?            if ($anode->manakai_local_name eq 'form') {
5599              $p->{form_element} = $anode;              $p->{form_element} = $anode;
5600              last AN;              last AN;
5601            }            }

Legend:
Removed from v.1.64  
changed lines
  Added in v.1.73

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24