/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.61 by wakaba, Sun Nov 4 04:15:06 2007 UTC revision 1.72 by wakaba, Sun Mar 2 14:32:26 2008 UTC
# Line 1  Line 1 
1  package Whatpm::HTML;  package Whatpm::HTML;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    use Error qw(:try);
5    
6  ## ISSUE:  ## ISSUE:
7  ## var doc = implementation.createDocument (null, null, null);  ## var doc = implementation.createDocument (null, null, null);
8  ## doc.write ('');  ## doc.write ('');
9  ## alert (doc.compatMode);  ## alert (doc.compatMode);
10    
11  ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT  ## TODO: Control charcters and noncharacters are not allowed (HTML5 revision 1263)
12  ## strip BOM and the HTML layer MUST ignore it.  Whether we can do it  ## TODO: 1252 parse error (revision 1264)
13  ## is not yet clear.  ## TODO: 8859-11 = 874 (revision 1271)
 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?  
 ## "{U+FEFF}..." in GB18030?  
14    
15  my $permitted_slash_tag_name = {  my $permitted_slash_tag_name = {
16    base => 1,    base => 1,
# Line 19  my $permitted_slash_tag_name = { Line 18  my $permitted_slash_tag_name = {
18    meta => 1,    meta => 1,
19    hr => 1,    hr => 1,
20    br => 1,    br => 1,
21    img=> 1,    img => 1,
22    embed => 1,    embed => 1,
23    param => 1,    param => 1,
24    area => 1,    area => 1,
# Line 84  my $formatting_category = { Line 83  my $formatting_category = {
83  };  };
84  # $phrasing_category: all other elements  # $phrasing_category: all other elements
85    
86    sub parse_byte_string ($$$$;$) {
87      my $self = ref $_[0] ? shift : shift->new;
88      my $charset = shift;
89      my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
90      my $s;
91      
92      if (defined $charset) {
93        require Encode; ## TODO: decode(utf8) don't delete BOM
94        $s = \ (Encode::decode ($charset, $$bytes_s));
95        $self->{input_encoding} = lc $charset; ## TODO: normalize name
96        $self->{confident} = 1;
97      } else {
98        ## TODO: Implement HTML5 detection algorithm
99        require Whatpm::Charset::UniversalCharDet;
100        $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
101            (substr ($$bytes_s, 0, 1024));
102        $charset ||= 'windows-1252';
103        $s = \ (Encode::decode ($charset, $$bytes_s));
104        $self->{input_encoding} = $charset;
105        $self->{confident} = 0;
106      }
107    
108      $self->{change_encoding} = sub {
109        my $self = shift;
110        my $charset = lc shift;
111        ## TODO: if $charset is supported
112        ## TODO: normalize charset name
113    
114        ## "Change the encoding" algorithm:
115    
116        ## Step 1    
117        if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
118          $charset = 'utf-8';
119        }
120    
121        ## Step 2
122        if (defined $self->{input_encoding} and
123            $self->{input_encoding} eq $charset) {
124          $self->{confident} = 1;
125          return;
126        }
127    
128        !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
129            ':'.$charset, level => 'w');
130    
131        ## Step 3
132        # if (can) {
133          ## change the encoding on the fly.
134          #$self->{confident} = 1;
135          #return;
136        # }
137    
138        ## Step 4
139        throw Whatpm::HTML::RestartParser (charset => $charset);
140      }; # $self->{change_encoding}
141    
142      my @args = @_; shift @args; # $s
143      my $return;
144      try {
145        $return = $self->parse_char_string ($s, @args);  
146      } catch Whatpm::HTML::RestartParser with {
147        my $charset = shift->{charset};
148        $s = \ (Encode::decode ($charset, $$bytes_s));    
149        $self->{input_encoding} = $charset; ## TODO: normalize
150        $self->{confident} = 1;
151        $return = $self->parse_char_string ($s, @args);
152      };
153      return $return;
154    } # parse_byte_string
155    
156    ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
157    ## and the HTML layer MUST ignore it.  However, we does strip BOM in
158    ## the encoding layer and the HTML layer does not ignore any U+FEFF,
159    ## because the core part of our HTML parser expects a string of character,
160    ## not a string of bytes or code units or anything which might contain a BOM.
161    ## Therefore, any parser interface that accepts a string of bytes,
162    ## such as |parse_byte_string| in this module, must ensure that it does
163    ## strip the BOM and never strip any ZWNBSP.
164    
165    *parse_char_string = \&parse_string;
166    
167  sub parse_string ($$$;$) {  sub parse_string ($$$;$) {
168    my $self = shift->new;    my $self = ref $_[0] ? shift : shift->new;
169    my $s = \$_[0];    my $s = ref $_[0] ? $_[0] : \($_[0]);
170    $self->{document} = $_[1];    $self->{document} = $_[1];
171      @{$self->{document}->child_nodes} = ();
172    
173    ## NOTE: |set_inner_html| copies most of this method's code    ## NOTE: |set_inner_html| copies most of this method's code
174    
175      $self->{confident} = 1 unless exists $self->{confident};
176      $self->{document}->input_encoding ($self->{input_encoding})
177          if defined $self->{input_encoding};
178    
179    my $i = 0;    my $i = 0;
180    my $line = 1;    my $line = 1;
181    my $column = 0;    my $column = 0;
# Line 147  sub new ($) { Line 232  sub new ($) {
232    $self->{parse_error} = sub {    $self->{parse_error} = sub {
233      #      #
234    };    };
235      $self->{change_encoding} = sub {
236        # if ($_[0] is a supported encoding) {
237        #   run "change the encoding" algorithm;
238        #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
239        # }
240      };
241    $self->{application_cache_selection} = sub {    $self->{application_cache_selection} = sub {
242      #      #
243    };    };
# Line 195  sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUO Line 286  sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUO
286  sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }  sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
287  sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }  sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
288  sub BOGUS_DOCTYPE_STATE () { 32 }  sub BOGUS_DOCTYPE_STATE () { 32 }
289    sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
290    
291  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 }
292  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
# Line 256  sub _initialize_tokenizer ($) { Line 348  sub _initialize_tokenizer ($) {
348  ##   ->{system_identifier} (DOCTYPE_TOKEN)  ##   ->{system_identifier} (DOCTYPE_TOKEN)
349  ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)  ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
350  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
351    ##        ->{name}
352    ##        ->{value}
353    ##        ->{has_reference} == 1 or 0
354  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
355    
356  ## Emitted token MUST immediately be handled by the tree construction state.  ## Emitted token MUST immediately be handled by the tree construction state.
# Line 290  sub _get_next_token ($) { Line 385  sub _get_next_token ($) {
385    A: {    A: {
386      if ($self->{state} == DATA_STATE) {      if ($self->{state} == DATA_STATE) {
387        if ($self->{next_input_character} == 0x0026) { # &        if ($self->{next_input_character} == 0x0026) { # &
388          if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA          if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
389                not $self->{escape}) {
390            $self->{state} = ENTITY_DATA_STATE;            $self->{state} = ENTITY_DATA_STATE;
391            !!!next-input-character;            !!!next-input-character;
392            redo A;            redo A;
# Line 345  sub _get_next_token ($) { Line 441  sub _get_next_token ($) {
441      } elsif ($self->{state} == ENTITY_DATA_STATE) {      } elsif ($self->{state} == ENTITY_DATA_STATE) {
442        ## (cannot happen in CDATA state)        ## (cannot happen in CDATA state)
443                
444        my $token = $self->_tokenize_attempt_to_consume_an_entity (0);        my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
445    
446        $self->{state} = DATA_STATE;        $self->{state} = DATA_STATE;
447        # next-input-character is already done        # next-input-character is already done
# Line 648  sub _get_next_token ($) { Line 744  sub _get_next_token ($) {
744    
745          redo A;          redo A;
746        } else {        } else {
747            if ({
748                 0x0022 => 1, # "
749                 0x0027 => 1, # '
750                 0x003D => 1, # =
751                }->{$self->{next_input_character}}) {
752              !!!parse-error (type => 'bad attribute name');
753            }
754          $self->{current_attribute} = {name => chr ($self->{next_input_character}),          $self->{current_attribute} = {name => chr ($self->{next_input_character}),
755                                value => ''};                                value => ''};
756          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 742  sub _get_next_token ($) { Line 845  sub _get_next_token ($) {
845    
846          redo A;          redo A;
847        } else {        } else {
848            if ($self->{next_input_character} == 0x0022 or # "
849                $self->{next_input_character} == 0x0027) { # '
850              !!!parse-error (type => 'bad attribute name');
851            }
852          $self->{current_attribute}->{name} .= chr ($self->{next_input_character});          $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
853          ## Stay in the state          ## Stay in the state
854          !!!next-input-character;          !!!next-input-character;
# Line 888  sub _get_next_token ($) { Line 995  sub _get_next_token ($) {
995    
996          redo A;          redo A;
997        } else {        } else {
998            if ($self->{next_input_character} == 0x003D) { # =
999              !!!parse-error (type => 'bad attribute value');
1000            }
1001          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1002          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1003          !!!next-input-character;          !!!next-input-character;
# Line 895  sub _get_next_token ($) { Line 1005  sub _get_next_token ($) {
1005        }        }
1006      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1007        if ($self->{next_input_character} == 0x0022) { # "        if ($self->{next_input_character} == 0x0022) { # "
1008          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1009          !!!next-input-character;          !!!next-input-character;
1010          redo A;          redo A;
1011        } elsif ($self->{next_input_character} == 0x0026) { # &        } elsif ($self->{next_input_character} == 0x0026) { # &
# Line 931  sub _get_next_token ($) { Line 1041  sub _get_next_token ($) {
1041        }        }
1042      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1043        if ($self->{next_input_character} == 0x0027) { # '        if ($self->{next_input_character} == 0x0027) { # '
1044          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1045          !!!next-input-character;          !!!next-input-character;
1046          redo A;          redo A;
1047        } elsif ($self->{next_input_character} == 0x0026) { # &        } elsif ($self->{next_input_character} == 0x0026) { # &
# Line 1019  sub _get_next_token ($) { Line 1129  sub _get_next_token ($) {
1129    
1130          redo A;          redo A;
1131        } else {        } else {
1132            if ({
1133                 0x0022 => 1, # "
1134                 0x0027 => 1, # '
1135                 0x003D => 1, # =
1136                }->{$self->{next_input_character}}) {
1137              !!!parse-error (type => 'bad attribute value');
1138            }
1139          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1140          ## Stay in the state          ## Stay in the state
1141          !!!next-input-character;          !!!next-input-character;
1142          redo A;          redo A;
1143        }        }
1144      } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1145        my $token = $self->_tokenize_attempt_to_consume_an_entity (1);        my $token = $self->_tokenize_attempt_to_consume_an_entity
1146              (1,
1147               $self->{last_attribute_value_state}
1148                 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1149               $self->{last_attribute_value_state}
1150                 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1151               -1);
1152    
1153        unless (defined $token) {        unless (defined $token) {
1154          $self->{current_attribute}->{value} .= '&';          $self->{current_attribute}->{value} .= '&';
1155        } else {        } else {
1156          $self->{current_attribute}->{value} .= $token->{data};          $self->{current_attribute}->{value} .= $token->{data};
1157            $self->{current_attribute}->{has_reference} = $token->{has_reference};
1158          ## ISSUE: spec says "append the returned character token to the current attribute's value"          ## ISSUE: spec says "append the returned character token to the current attribute's value"
1159        }        }
1160    
1161        $self->{state} = $self->{last_attribute_value_state};        $self->{state} = $self->{last_attribute_value_state};
1162        # next-input-character is already done        # next-input-character is already done
1163        redo A;        redo A;
1164        } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1165          if ($self->{next_input_character} == 0x0009 or # HT
1166              $self->{next_input_character} == 0x000A or # LF
1167              $self->{next_input_character} == 0x000B or # VT
1168              $self->{next_input_character} == 0x000C or # FF
1169              $self->{next_input_character} == 0x0020) { # SP
1170            $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1171            !!!next-input-character;
1172            redo A;
1173          } elsif ($self->{next_input_character} == 0x003E) { # >
1174            if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1175              $self->{current_token}->{first_start_tag}
1176                  = not defined $self->{last_emitted_start_tag_name};
1177              $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1178            } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1179              $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1180              if ($self->{current_token}->{attributes}) {
1181                !!!parse-error (type => 'end tag attribute');
1182              }
1183            } else {
1184              die "$0: $self->{current_token}->{type}: Unknown token type";
1185            }
1186            $self->{state} = DATA_STATE;
1187            !!!next-input-character;
1188    
1189            !!!emit ($self->{current_token}); # start tag or end tag
1190    
1191            redo A;
1192          } elsif ($self->{next_input_character} == 0x002F) { # /
1193            !!!next-input-character;
1194            if ($self->{next_input_character} == 0x003E and # >
1195                $self->{current_token}->{type} == START_TAG_TOKEN and
1196                $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1197              # permitted slash
1198              #
1199            } else {
1200              !!!parse-error (type => 'nestc');
1201            }
1202            $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1203            # next-input-character is already done
1204            redo A;
1205          } else {
1206            !!!parse-error (type => 'no space between attributes');
1207            $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1208            ## reconsume
1209            redo A;
1210          }
1211      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1212        ## (only happen if PCDATA state)        ## (only happen if PCDATA state)
1213                
# Line 1467  sub _get_next_token ($) { Line 1638  sub _get_next_token ($) {
1638          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1639          !!!next-input-character;          !!!next-input-character;
1640          redo A;          redo A;
1641          } elsif ($self->{next_input_character} == 0x003E) { # >
1642            !!!parse-error (type => 'unclosed PUBLIC literal');
1643    
1644            $self->{state} = DATA_STATE;
1645            !!!next-input-character;
1646    
1647            delete $self->{current_token}->{correct};
1648            !!!emit ($self->{current_token}); # DOCTYPE
1649    
1650            redo A;
1651        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1652          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
1653    
# Line 1489  sub _get_next_token ($) { Line 1670  sub _get_next_token ($) {
1670          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1671          !!!next-input-character;          !!!next-input-character;
1672          redo A;          redo A;
1673          } elsif ($self->{next_input_character} == 0x003E) { # >
1674            !!!parse-error (type => 'unclosed PUBLIC literal');
1675    
1676            $self->{state} = DATA_STATE;
1677            !!!next-input-character;
1678    
1679            delete $self->{current_token}->{correct};
1680            !!!emit ($self->{current_token}); # DOCTYPE
1681    
1682            redo A;
1683        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1684          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
1685    
# Line 1595  sub _get_next_token ($) { Line 1786  sub _get_next_token ($) {
1786          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1787          !!!next-input-character;          !!!next-input-character;
1788          redo A;          redo A;
1789          } elsif ($self->{next_input_character} == 0x003E) { # >
1790            !!!parse-error (type => 'unclosed PUBLIC literal');
1791    
1792            $self->{state} = DATA_STATE;
1793            !!!next-input-character;
1794    
1795            delete $self->{current_token}->{correct};
1796            !!!emit ($self->{current_token}); # DOCTYPE
1797    
1798            redo A;
1799        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1800          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
1801    
# Line 1617  sub _get_next_token ($) { Line 1818  sub _get_next_token ($) {
1818          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1819          !!!next-input-character;          !!!next-input-character;
1820          redo A;          redo A;
1821          } elsif ($self->{next_input_character} == 0x003E) { # >
1822            !!!parse-error (type => 'unclosed PUBLIC literal');
1823    
1824            $self->{state} = DATA_STATE;
1825            !!!next-input-character;
1826    
1827            delete $self->{current_token}->{correct};
1828            !!!emit ($self->{current_token}); # DOCTYPE
1829    
1830            redo A;
1831        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1832          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
1833    
# Line 1696  sub _get_next_token ($) { Line 1907  sub _get_next_token ($) {
1907    die "$0: _get_next_token: unexpected case";    die "$0: _get_next_token: unexpected case";
1908  } # _get_next_token  } # _get_next_token
1909    
1910  sub _tokenize_attempt_to_consume_an_entity ($$) {  sub _tokenize_attempt_to_consume_an_entity ($$$) {
1911    my ($self, $in_attr) = @_;    my ($self, $in_attr, $additional) = @_;
1912    
1913    if ({    if ({
1914         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1915         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
1916           $additional => 1,
1917        }->{$self->{next_input_character}}) {        }->{$self->{next_input_character}}) {
1918      ## Don't consume      ## Don't consume
1919      ## No error      ## No error
# Line 1757  sub _tokenize_attempt_to_consume_an_enti Line 1969  sub _tokenize_attempt_to_consume_an_enti
1969            $code = $c1_entity_char->{$code};            $code = $c1_entity_char->{$code};
1970          }          }
1971    
1972          return {type => CHARACTER_TOKEN, data => chr $code};          return {type => CHARACTER_TOKEN, data => chr $code,
1973                    has_reference => 1};
1974        } # X        } # X
1975      } elsif (0x0030 <= $self->{next_input_character} and      } elsif (0x0030 <= $self->{next_input_character} and
1976               $self->{next_input_character} <= 0x0039) { # 0..9               $self->{next_input_character} <= 0x0039) { # 0..9
# Line 1792  sub _tokenize_attempt_to_consume_an_enti Line 2005  sub _tokenize_attempt_to_consume_an_enti
2005          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
2006        }        }
2007                
2008        return {type => CHARACTER_TOKEN, data => chr $code};        return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
2009      } else {      } else {
2010        !!!parse-error (type => 'bare nero');        !!!parse-error (type => 'bare nero');
2011        !!!back-next-input-character ($self->{next_input_character});        !!!back-next-input-character ($self->{next_input_character});
# Line 1840  sub _tokenize_attempt_to_consume_an_enti Line 2053  sub _tokenize_attempt_to_consume_an_enti
2053      }      }
2054            
2055      if ($match > 0) {      if ($match > 0) {
2056        return {type => CHARACTER_TOKEN, data => $value};        return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
2057      } elsif ($match < 0) {      } elsif ($match < 0) {
2058        !!!parse-error (type => 'no refc');        !!!parse-error (type => 'no refc');
2059        if ($in_attr and $match < -1) {        if ($in_attr and $match < -1) {
2060          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
2061        } else {        } else {
2062          return {type => CHARACTER_TOKEN, data => $value};          return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
2063        }        }
2064      } else {      } else {
2065        !!!parse-error (type => 'bare ero');        !!!parse-error (type => 'bare ero');
2066        ## NOTE: No characters are consumed in the spec.        ## NOTE: "No characters are consumed" in the spec.
2067        return {type => CHARACTER_TOKEN, data => '&'.$value};        return {type => CHARACTER_TOKEN, data => '&'.$value};
2068      }      }
2069    } else {    } else {
# Line 1987  sub _tree_construction_initial ($) { Line 2200  sub _tree_construction_initial ($) {
2200            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
2201            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
2202            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
2203              "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
2204              "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
2205              "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
2206            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
2207            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
2208            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
# Line 2104  sub _tree_construction_root_element ($) Line 2320  sub _tree_construction_root_element ($)
2320          #          #
2321        } elsif ($token->{type} == START_TAG_TOKEN) {        } elsif ($token->{type} == START_TAG_TOKEN) {
2322          if ($token->{tag_name} eq 'html' and          if ($token->{tag_name} eq 'html' and
2323              $token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application"              $token->{attributes}->{manifest}) {
2324            $self->{application_cache_selection}            $self->{application_cache_selection}
2325                 ->($token->{attributes}->{manifest}->{value});                 ->($token->{attributes}->{manifest}->{value});
2326            ## ISSUE: No relative reference resolution?            ## ISSUE: No relative reference resolution?
# Line 2782  sub _tree_construction_main ($) { Line 2998  sub _tree_construction_main ($) {
2998                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2999                }                }
3000                !!!insert-element ($token->{tag_name}, $token->{attributes});                !!!insert-element ($token->{tag_name}, $token->{attributes});
3001                pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3002    
3003                unless ($self->{confident}) {                unless ($self->{confident}) {
                 my $charset;  
3004                  if ($token->{attributes}->{charset}) { ## TODO: And if supported                  if ($token->{attributes}->{charset}) { ## TODO: And if supported
3005                    $charset = $token->{attributes}->{charset}->{value};                    $self->{change_encoding}
3006                  }                        ->($self, $token->{attributes}->{charset}->{value});
3007                  if ($token->{attributes}->{'http-equiv'}) {                    
3008                      $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3009                          ->set_user_data (manakai_has_reference =>
3010                                               $token->{attributes}->{charset}
3011                                                   ->{has_reference});
3012                    } elsif ($token->{attributes}->{content}) {
3013                    ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.                    ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3014                    if ($token->{attributes}->{'http-equiv'}->{value}                    if ($token->{attributes}->{content}->{value}
3015                        =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=                        =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3016                              [\x09-\x0D\x20]*=
3017                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3018                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3019                      $charset = defined $1 ? $1 : defined $2 ? $2 : $3;                      $self->{change_encoding}
3020                    } ## TODO: And if supported                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
3021                        $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3022                            ->set_user_data (manakai_has_reference =>
3023                                                 $token->{attributes}->{content}
3024                                                       ->{has_reference});
3025                      }
3026                    }
3027                  } else {
3028                    if ($token->{attributes}->{charset}) {
3029                      $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3030                          ->set_user_data (manakai_has_reference =>
3031                                               $token->{attributes}->{charset}
3032                                                   ->{has_reference});
3033                    }
3034                    if ($token->{attributes}->{content}) {
3035                      $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3036                          ->set_user_data (manakai_has_reference =>
3037                                               $token->{attributes}->{content}
3038                                                   ->{has_reference});
3039                  }                  }
                 ## TODO: Change the encoding  
3040                }                }
3041    
               ## TODO: Extracting |charset| from |meta|.  
3042                pop @{$self->{open_elements}}                pop @{$self->{open_elements}}
3043                    if $self->{insertion_mode} == AFTER_HEAD_IM;                    if $self->{insertion_mode} == AFTER_HEAD_IM;
3044                !!!next-token;                !!!next-token;
# Line 4372  sub _tree_construction_main ($) { Line 4609  sub _tree_construction_main ($) {
4609        } elsif ($token->{tag_name} eq 'meta') {        } elsif ($token->{tag_name} eq 'meta') {
4610          ## NOTE: This is an "as if in head" code clone, only "-t" differs          ## NOTE: This is an "as if in head" code clone, only "-t" differs
4611          !!!insert-element-t ($token->{tag_name}, $token->{attributes});          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4612          pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4613    
4614          unless ($self->{confident}) {          unless ($self->{confident}) {
           my $charset;  
4615            if ($token->{attributes}->{charset}) { ## TODO: And if supported            if ($token->{attributes}->{charset}) { ## TODO: And if supported
4616              $charset = $token->{attributes}->{charset}->{value};              $self->{change_encoding}
4617            }                  ->($self, $token->{attributes}->{charset}->{value});
4618            if ($token->{attributes}->{'http-equiv'}) {              
4619                $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4620                    ->set_user_data (manakai_has_reference =>
4621                                         $token->{attributes}->{charset}
4622                                             ->{has_reference});
4623              } elsif ($token->{attributes}->{content}) {
4624              ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.              ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4625              if ($token->{attributes}->{'http-equiv'}->{value}              if ($token->{attributes}->{content}->{value}
4626                  =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=                  =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4627                        [\x09-\x0D\x20]*=
4628                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4629                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4630                $charset = defined $1 ? $1 : defined $2 ? $2 : $3;                $self->{change_encoding}
4631              } ## TODO: And if supported                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4632                  $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4633                      ->set_user_data (manakai_has_reference =>
4634                                           $token->{attributes}->{content}
4635                                                 ->{has_reference});
4636                }
4637              }
4638            } else {
4639              if ($token->{attributes}->{charset}) {
4640                $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4641                    ->set_user_data (manakai_has_reference =>
4642                                         $token->{attributes}->{charset}
4643                                             ->{has_reference});
4644              }
4645              if ($token->{attributes}->{content}) {
4646                $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4647                    ->set_user_data (manakai_has_reference =>
4648                                         $token->{attributes}->{content}
4649                                             ->{has_reference});
4650            }            }
           ## TODO: Change the encoding  
4651          }          }
4652    
4653          !!!next-token;          !!!next-token;
# Line 5214  sub set_inner_html ($$$) { Line 5473  sub set_inner_html ($$$) {
5473    my $s = \$_[0];    my $s = \$_[0];
5474    my $onerror = $_[1];    my $onerror = $_[1];
5475    
5476      ## ISSUE: Should {confident} be true?
5477    
5478    my $nt = $node->node_type;    my $nt = $node->node_type;
5479    if ($nt == 9) {    if ($nt == 9) {
5480      # MUST      # MUST
# Line 5286  sub set_inner_html ($$$) { Line 5547  sub set_inner_html ($$$) {
5547      $p->_initialize_tree_constructor;      $p->_initialize_tree_constructor;
5548    
5549      ## Step 2      ## Step 2
5550      my $node_ln = $node->local_name;      my $node_ln = $node->manakai_local_name;
5551      $p->{content_model} = {      $p->{content_model} = {
5552        title => RCDATA_CONTENT_MODEL,        title => RCDATA_CONTENT_MODEL,
5553        textarea => RCDATA_CONTENT_MODEL,        textarea => RCDATA_CONTENT_MODEL,
# Line 5326  sub set_inner_html ($$$) { Line 5587  sub set_inner_html ($$$) {
5587        if ($anode->node_type == 1) {        if ($anode->node_type == 1) {
5588          my $nsuri = $anode->namespace_uri;          my $nsuri = $anode->namespace_uri;
5589          if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {          if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
5590            if ($anode->local_name eq 'form') { ## TODO: case?            if ($anode->manakai_local_name eq 'form') {
5591              $p->{form_element} = $anode;              $p->{form_element} = $anode;
5592              last AN;              last AN;
5593            }            }
# Line 5366  sub set_inner_html ($$$) { Line 5627  sub set_inner_html ($$$) {
5627    
5628  } # tree construction stage  } # tree construction stage
5629    
5630  sub get_inner_html ($$$) {  package Whatpm::HTML::RestartParser;
5631    my (undef, $node, $on_error) = @_;  push our @ISA, 'Error';
   
   ## Step 1  
   my $s = '';  
   
   my $in_cdata;  
   my $parent = $node;  
   while (defined $parent) {  
     if ($parent->node_type == 1 and  
         $parent->namespace_uri eq 'http://www.w3.org/1999/xhtml' and  
         {  
           style => 1, script => 1, xmp => 1, iframe => 1,  
           noembed => 1, noframes => 1, noscript => 1,  
         }->{$parent->local_name}) { ## TODO: case thingy  
       $in_cdata = 1;  
     }  
     $parent = $parent->parent_node;  
   }  
   
   ## Step 2  
   my @node = @{$node->child_nodes};  
   C: while (@node) {  
     my $child = shift @node;  
     unless (ref $child) {  
       if ($child eq 'cdata-out') {  
         $in_cdata = 0;  
       } else {  
         $s .= $child; # end tag  
       }  
       next C;  
     }  
       
     my $nt = $child->node_type;  
     if ($nt == 1) { # Element  
       my $tag_name = $child->tag_name; ## TODO: manakai_tag_name  
       $s .= '<' . $tag_name;  
       ## NOTE: Non-HTML case:  
       ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>  
   
       my @attrs = @{$child->attributes}; # sort order MUST be stable  
       for my $attr (@attrs) { # order is implementation dependent  
         my $attr_name = $attr->name; ## TODO: manakai_name  
         $s .= ' ' . $attr_name . '="';  
         my $attr_value = $attr->value;  
         ## escape  
         $attr_value =~ s/&/&amp;/g;  
         $attr_value =~ s/</&lt;/g;  
         $attr_value =~ s/>/&gt;/g;  
         $attr_value =~ s/"/&quot;/g;  
         $s .= $attr_value . '"';  
       }  
       $s .= '>';  
         
       next C if {  
         area => 1, base => 1, basefont => 1, bgsound => 1,  
         br => 1, col => 1, embed => 1, frame => 1, hr => 1,  
         img => 1, input => 1, link => 1, meta => 1, param => 1,  
         spacer => 1, wbr => 1,  
       }->{$tag_name};  
   
       $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';  
   
       if (not $in_cdata and {  
         style => 1, script => 1, xmp => 1, iframe => 1,  
         noembed => 1, noframes => 1, noscript => 1,  
         plaintext => 1,  
       }->{$tag_name}) {  
         unshift @node, 'cdata-out';  
         $in_cdata = 1;  
       }  
   
       unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';  
     } elsif ($nt == 3 or $nt == 4) {  
       if ($in_cdata) {  
         $s .= $child->data;  
       } else {  
         my $value = $child->data;  
         $value =~ s/&/&amp;/g;  
         $value =~ s/</&lt;/g;  
         $value =~ s/>/&gt;/g;  
         $value =~ s/"/&quot;/g;  
         $s .= $value;  
       }  
     } elsif ($nt == 8) {  
       $s .= '<!--' . $child->data . '-->';  
     } elsif ($nt == 10) {  
       $s .= '<!DOCTYPE ' . $child->name . '>';  
     } elsif ($nt == 5) { # entrefs  
       push @node, @{$child->child_nodes};  
     } else {  
       $on_error->($child) if defined $on_error;  
     }  
     ## ISSUE: This code does not support PIs.  
   } # C  
     
   ## Step 3  
   return \$s;  
 } # get_inner_html  
5632    
5633  1;  1;
5634  # $Date$  # $Date$

Legend:
Removed from v.1.61  
changed lines
  Added in v.1.72

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24