/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

-revision 1.61 by wakaba,
Sun Nov  4 04:15:06 2007 UTC
+revision 1.67 by wakaba,
Sat Feb 16 03:47:33 2008 UTC
 Line 1
  package Whatpm::HTML;
  use strict;
  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
+ use Error qw(:try);
  ## ISSUE:
  ## var doc = implementation.createDocument (null, null, null);
-Line 84 
 my $formatting_category = {
+Line 85 
 my $formatting_category = {
  };
  # $phrasing_category: all other elements
+ ## Parse a byte string as an HTML document.
+ ##
+ ## Invocant: a parser object, or the class name (in which case a new
+ ## parser is created by |new|).
+ ## Arguments:
+ ##   $charset - Encoding label used to decode the bytes.  If undefined,
+ ##              the encoding is guessed from the first 1024 bytes
+ ##              (falling back to windows-1252) and the parser is marked
+ ##              as not confident.
+ ##   $bytes   - The byte string, or a reference to it.
+ ##   ...      - Any remaining arguments are passed to
+ ##              |parse_char_string| after the decoded character string.
+ ## Returns whatever |parse_char_string| returns.
+ ##
+ ## If the |change_encoding| callback installed here is invoked with a
+ ## different encoding, it throws |Whatpm::HTML::RestartParser|; the
+ ## input is then decoded again with that encoding and parsed once more.
+ sub parse_byte_string ($$$$;$) {
+   my $self = ref $_[0] ? shift : shift->new;
+   my $charset = shift;
+   my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
+   my $s;
+   
+   ## Both branches below and the restart handler call |Encode::decode|,
+   ## so load the module unconditionally.
+   require Encode; ## TODO: decode (utf8) does not strip the BOM
+   
+   if (defined $charset) {
+     $s = \ (Encode::decode ($charset, $$bytes_s));
+     $self->{input_encoding} = lc $charset; ## TODO: normalize name
+     $self->{confident} = 1;
+   } else {
+     ## TODO: Implement HTML5 detection algorithm
+     require Whatpm::Charset::UniversalCharDet;
+     $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
+         (substr ($$bytes_s, 0, 1024));
+     $charset ||= 'windows-1252';
+     $s = \ (Encode::decode ($charset, $$bytes_s));
+     ## Lowercased so that the |eq| comparison in |change_encoding|
+     ## (which lowercases its argument) is not defeated by letter case.
+     $self->{input_encoding} = lc $charset; ## TODO: normalize name
+     $self->{confident} = 0;
+   }
+ 
+   $self->{change_encoding} = sub {
+     my $self = shift;
+     my $charset = lc shift;
+     ## TODO: if $charset is supported
+     ## TODO: normalize charset name
+ 
+     ## "Change the encoding" algorithm:
+ 
+     ## Step 1
+     if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
+       $charset = 'utf-8';
+     }
+ 
+     ## Step 2
+     if (defined $self->{input_encoding} and
+         $self->{input_encoding} eq $charset) {
+       ## The declared encoding agrees with the one in use.
+       $self->{confident} = 1;
+       return;
+     }
+ 
+     !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
+         ':'.$charset, level => 'w');
+ 
+     ## Step 3
+     # if (can) {
+       ## change the encoding on the fly.
+       #$self->{confident} = 1;
+       #return;
+     # }
+ 
+     ## Step 4
+     throw Whatpm::HTML::RestartParser (charset => $charset);
+   }; # $self->{change_encoding}
+ 
+   ## Drop the byte string; what remains is forwarded to the parser.
+   my @args = @_; shift @args; # $s
+   my $return;
+   try {
+     $return = $self->parse_char_string ($s, @args);
+   } catch Whatpm::HTML::RestartParser with {
+     ## Restart: decode the original bytes with the newly declared
+     ## encoding and parse again, this time confidently.
+     my $charset = shift->{charset};
+     $s = \ (Encode::decode ($charset, $$bytes_s));
+     $self->{input_encoding} = $charset; ## TODO: normalize
+     $self->{confident} = 1;
+     $return = $self->parse_char_string ($s, @args);
+   };
+   return $return;
+ } # parse_byte_string
+ *parse_char_string = \&parse_string;
  sub parse_string ($$$;$) {
-   my $self = shift->new;
+   my $self = ref $_[0] ? shift : shift->new;
-   my $s = \$_[0];
+   my $s = ref $_[0] ? $_[0] : \($_[0]);
    $self->{document} = $_[1];
+   @{$self->{document}->child_nodes} = ();
    ## NOTE: |set_inner_html| copies most of this method's code
+   $self->{confident} = 1 unless exists $self->{confident};
+   $self->{document}->input_encoding ($self->{input_encoding})
+       if defined $self->{input_encoding};
    my $i = 0;
    my $line = 1;
    my $column = 0;
-Line 147 
 sub new ($) {
+Line 225 
 sub new ($) {
    $self->{parse_error} = sub {
      #
    };
+   $self->{change_encoding} = sub {
+     # if ($_[0] is a supported encoding) {
+     #   run "change the encoding" algorithm;
+     #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
+     # }
+   };
    $self->{application_cache_selection} = sub {
      #
    };
-Line 256 
 sub _initialize_tokenizer ($) {
+Line 340 
 sub _initialize_tokenizer ($) {
  ##   ->{system_identifier} (DOCTYPE_TOKEN)
  ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
+ ##        ->{name}
+ ##        ->{value}
+ ##        ->{has_reference} == 1 or 0
  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
  ## Emitted token MUST immediately be handled by the tree construction state.
-Line 1031 
 sub _get_next_token ($) {
+Line 1118 
 sub _get_next_token ($) {
          $self->{current_attribute}->{value} .= '&';
        } else {
          $self->{current_attribute}->{value} .= $token->{data};
+         $self->{current_attribute}->{has_reference} = $token->{has_reference};
          ## ISSUE: spec says "append the returned character token to the current attribute's value"
        }
-Line 1757 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1845 
 sub _tokenize_attempt_to_consume_an_enti
            $code = $c1_entity_char->{$code};
          }
-         return {type => CHARACTER_TOKEN, data => chr $code};
+         return {type => CHARACTER_TOKEN, data => chr $code,
+                 has_reference => 1};
        } # X
      } elsif (0x0030 <= $self->{next_input_character} and
               $self->{next_input_character} <= 0x0039) { # 0..9
-Line 1792 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1881 
 sub _tokenize_attempt_to_consume_an_enti
          $code = $c1_entity_char->{$code};
        }
-       return {type => CHARACTER_TOKEN, data => chr $code};
+       return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
      } else {
        !!!parse-error (type => 'bare nero');
        !!!back-next-input-character ($self->{next_input_character});
-Line 1840 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1929 
 sub _tokenize_attempt_to_consume_an_enti
      }
      if ($match > 0) {
-       return {type => CHARACTER_TOKEN, data => $value};
+       return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
      } elsif ($match < 0) {
        !!!parse-error (type => 'no refc');
        if ($in_attr and $match < -1) {
          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
        } else {
-         return {type => CHARACTER_TOKEN, data => $value};
+         return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
        }
      } else {
        !!!parse-error (type => 'bare ero');
-       ## NOTE: No characters are consumed in the spec.
+       ## NOTE: "No characters are consumed" in the spec.
        return {type => CHARACTER_TOKEN, data => '&'.$value};
      }
    } else {
-Line 2104 
 sub _tree_construction_root_element ($)
+Line 2193 
 sub _tree_construction_root_element ($)
          #
        } elsif ($token->{type} == START_TAG_TOKEN) {
          if ($token->{tag_name} eq 'html' and
-             $token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application"
+             $token->{attributes}->{manifest}) {
            $self->{application_cache_selection}
                 ->($token->{attributes}->{manifest}->{value});
            ## ISSUE: No relative reference resolution?
-Line 2782 
 sub _tree_construction_main ($) {
+Line 2871 
 sub _tree_construction_main ($) {
                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
                }
                !!!insert-element ($token->{tag_name}, $token->{attributes});
-               pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+               my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
                unless ($self->{confident}) {
-                 my $charset;
                  if ($token->{attributes}->{charset}) { ## TODO: And if supported
-                   $charset = $token->{attributes}->{charset}->{value};
+                   $self->{change_encoding}
-                 }
+                       ->($self, $token->{attributes}->{charset}->{value});
-                 if ($token->{attributes}->{'http-equiv'}) {
+                   $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                       ->set_user_data (manakai_has_reference =>
+                                            $token->{attributes}->{charset}
+                                                ->{has_reference});
+                 } elsif ($token->{attributes}->{content}) {
                    ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
-                   if ($token->{attributes}->{'http-equiv'}->{value}
+                   if ($token->{attributes}->{content}->{value}
                        =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
-                     $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+                     $self->{change_encoding}
-                   } ## TODO: And if supported
+                         ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
+                   }
+                 }
+               } else {
+                 if ($token->{attributes}->{charset}) {
+                   $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                       ->set_user_data (manakai_has_reference =>
+                                            $token->{attributes}->{charset}
+                                                ->{has_reference});
                  }
-                 ## TODO: Change the encoding
                }
-               ## TODO: Extracting |charset| from |meta|.
                pop @{$self->{open_elements}}
                    if $self->{insertion_mode} == AFTER_HEAD_IM;
                !!!next-token;
-Line 4372 
 sub _tree_construction_main ($) {
+Line 4471 
 sub _tree_construction_main ($) {
        } elsif ($token->{tag_name} eq 'meta') {
          ## NOTE: This is an "as if in head" code clone, only "-t" differs
          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+         my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
          unless ($self->{confident}) {
-           my $charset;
            if ($token->{attributes}->{charset}) { ## TODO: And if supported
-             $charset = $token->{attributes}->{charset}->{value};
+             $self->{change_encoding}
-           }
+                 ->($self, $token->{attributes}->{charset}->{value});
-           if ($token->{attributes}->{'http-equiv'}) {
+             $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                 ->set_user_data (manakai_has_reference =>
+                                      $token->{attributes}->{charset}
+                                          ->{has_reference});
+           } elsif ($token->{attributes}->{content}) {
              ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
-             if ($token->{attributes}->{'http-equiv'}->{value}
+             if ($token->{attributes}->{content}->{value}
                  =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
-               $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+               $self->{change_encoding}
-             } ## TODO: And if supported
+                   ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
+             }
+           }
+         } else {
+           if ($token->{attributes}->{charset}) {
+             $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                 ->set_user_data (manakai_has_reference =>
+                                      $token->{attributes}->{charset}
+                                          ->{has_reference});
            }
-           ## TODO: Change the encoding
          }
          !!!next-token;
-Line 5214 
 sub set_inner_html ($$$) {
+Line 5324 
 sub set_inner_html ($$$) {
    my $s = \$_[0];
    my $onerror = $_[1];
+   ## ISSUE: Should {confident} be true?
    my $nt = $node->node_type;
    if ($nt == 9) {
      # MUST
-Line 5366 
 sub set_inner_html ($$$) {
+Line 5478 
 sub set_inner_html ($$$) {
  } # tree construction stage
- sub get_inner_html ($$$) {
+ package Whatpm::HTML::RestartParser;
-   my (undef, $node, $on_error) = @_;
+ push our @ISA, 'Error';
-   ## Step 1
-   my $s = '';
-   my $in_cdata;
-   my $parent = $node;
-   while (defined $parent) {
-     if ($parent->node_type == 1 and
-         $parent->namespace_uri eq 'http://www.w3.org/1999/xhtml' and
-         {
-           style => 1, script => 1, xmp => 1, iframe => 1,
-           noembed => 1, noframes => 1, noscript => 1,
-         }->{$parent->local_name}) { ## TODO: case thingy
-       $in_cdata = 1;
-     }
-     $parent = $parent->parent_node;
-   }
-   ## Step 2
-   my @node = @{$node->child_nodes};
-   C: while (@node) {
-     my $child = shift @node;
-     unless (ref $child) {
-       if ($child eq 'cdata-out') {
-         $in_cdata = 0;
-       } else {
-         $s .= $child; # end tag
-       }
-       next C;
-     }
-     my $nt = $child->node_type;
-     if ($nt == 1) { # Element
-       my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
-       $s .= '<' . $tag_name;
-       ## NOTE: Non-HTML case:
-       ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>
-       my @attrs = @{$child->attributes}; # sort order MUST be stable
-       for my $attr (@attrs) { # order is implementation dependent
-         my $attr_name = $attr->name; ## TODO: manakai_name
-         $s .= ' ' . $attr_name . '="';
-         my $attr_value = $attr->value;
-         ## escape
-         $attr_value =~ s/&/&amp;/g;
-         $attr_value =~ s/</&lt;/g;
-         $attr_value =~ s/>/&gt;/g;
-         $attr_value =~ s/"/&quot;/g;
-         $s .= $attr_value . '"';
-       }
-       $s .= '>';
-       next C if {
-         area => 1, base => 1, basefont => 1, bgsound => 1,
-         br => 1, col => 1, embed => 1, frame => 1, hr => 1,
-         img => 1, input => 1, link => 1, meta => 1, param => 1,
-         spacer => 1, wbr => 1,
-       }->{$tag_name};
-       $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';
-       if (not $in_cdata and {
-         style => 1, script => 1, xmp => 1, iframe => 1,
-         noembed => 1, noframes => 1, noscript => 1,
-         plaintext => 1,
-       }->{$tag_name}) {
-         unshift @node, 'cdata-out';
-         $in_cdata = 1;
-       }
-       unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
-     } elsif ($nt == 3 or $nt == 4) {
-       if ($in_cdata) {
-         $s .= $child->data;
-       } else {
-         my $value = $child->data;
-         $value =~ s/&/&amp;/g;
-         $value =~ s/</&lt;/g;
-         $value =~ s/>/&gt;/g;
-         $value =~ s/"/&quot;/g;
-         $s .= $value;
-       }
-     } elsif ($nt == 8) {
-       $s .= '<!--' . $child->data . '-->';
-     } elsif ($nt == 10) {
-       $s .= '<!DOCTYPE ' . $child->name . '>';
-     } elsif ($nt == 5) { # entrefs
-       push @node, @{$child->child_nodes};
-     } else {
-       $on_error->($child) if defined $on_error;
-     }
-     ## ISSUE: This code does not support PIs.
-   } # C
-   ## Step 3
-   return \$s;
- } # get_inner_html
 ;
  # $Date$

 Legend:



Removed from v.1.61
 


changed lines


 
Added in v.1.67
 Legend:



Removed from v.1.61
 


changed lines


 
Added in v.1.67
-Removed from v.1.61
+Added in v.1.67

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24