/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

-revision 1.168 by wakaba,
Sat Sep 13 10:49:21 2008 UTC
+revision 1.169 by wakaba,
Sat Sep 13 11:31:09 2008 UTC
 Line 874 
 sub _initialize_tokenizer ($) {
    my $self = shift;
    $self->{state} = DATA_STATE; # MUST
    #$self->{state_keyword}; # initialized when used
+   #$self->{entity__value}; # initialized when used
+   #$self->{entity__match}; # initialized when used
    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
    undef $self->{current_token};
    undef $self->{current_attribute};
    undef $self->{last_emitted_start_tag_name};
-   undef $self->{last_attribute_value_state};
+   #$self->{prev_state}; # initialized when used
    delete $self->{self_closing};
-   $self->{char} = [];
    # $self->{next_char}
    !!!next-input-character;
    $self->{token} = [];
-Line 912 
 sub _initialize_tokenizer ($) {
+Line 913 
 sub _initialize_tokenizer ($) {
  ## has completed loading.  If one has, then it MUST be executed
  ## and removed from the list.
- ## NOTE: HTML5 "Writing HTML documents" section, applied to
+ ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
- ## documents and not to user agents and conformance checkers,
+ ## (This requirement was dropped from HTML5 spec, unfortunately.)
- ## contains some requirements that are not detected by the
- ## parsing algorithm:
- ## - Some requirements on character encoding declarations. ## TODO
- ## - "Elements MUST NOT contain content that their content model disallows."
- ##   ... Some are parse error, some are not (will be reported by c.c.).
- ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
- ## - Text (in elements, attributes, and comments) SHOULD NOT contain
- ##   control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL?  Unicode control character?)
- ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
- ## be detected by the HTML5 parsing algorithm:
- ## - Text,
  sub _get_next_token ($) {
    my $self = shift;
-Line 953 
 sub _get_next_token ($) {
+Line 942 
 sub _get_next_token ($) {
            ## "entity data state".  In this implementation, the tokenizer
            ## is switched to the |ENTITY_STATE|, which is an implementation
            ## of the "consume a character reference" algorithm.
-           $self->{entity_in_attr} = 0;
            $self->{entity_additional} = -1;
+           $self->{prev_state} = DATA_STATE;
            $self->{state} = ENTITY_STATE;
            !!!next-input-character;
            redo A;
-Line 1698 
 sub _get_next_token ($) {
+Line 1687 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_char} == 0x0026) { # &
          !!!cp (96);
-         $self->{last_attribute_value_state} = $self->{state};
          ## NOTE: In the spec, the tokenizer is switched to the
          ## "entity in attribute value state".  In this implementation, the
          ## tokenizer is switched to the |ENTITY_STATE|, which is an
          ## implementation of the "consume a character reference" algorithm.
-         $self->{entity_in_attr} = 1;
+         $self->{prev_state} = $self->{state};
          $self->{entity_additional} = 0x0022; # "
          $self->{state} = ENTITY_STATE;
          !!!next-input-character;
-Line 1746 
 sub _get_next_token ($) {
+Line 1734 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_char} == 0x0026) { # &
          !!!cp (102);
-         $self->{last_attribute_value_state} = $self->{state};
          ## NOTE: In the spec, the tokenizer is switched to the
          ## "entity in attribute value state".  In this implementation, the
          ## tokenizer is switched to the |ENTITY_STATE|, which is an
          ## implementation of the "consume a character reference" algorithm.
-         $self->{entity_in_attr} = 1;
          $self->{entity_additional} = 0x0027; # '
+         $self->{prev_state} = $self->{state};
          $self->{state} = ENTITY_STATE;
          !!!next-input-character;
          redo A;
-Line 1798 
 sub _get_next_token ($) {
+Line 1785 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_char} == 0x0026) { # &
          !!!cp (108);
-         $self->{last_attribute_value_state} = $self->{state};
          ## NOTE: In the spec, the tokenizer is switched to the
          ## "entity in attribute value state".  In this implementation, the
          ## tokenizer is switched to the |ENTITY_STATE|, which is an
          ## implementation of the "consume a character reference" algorithm.
-         $self->{entity_in_attr} = 1;
          $self->{entity_additional} = -1;
+         $self->{prev_state} = $self->{state};
          $self->{state} = ENTITY_STATE;
          !!!next-input-character;
          redo A;
-Line 2988 
 sub _get_next_token ($) {
+Line 2974 
 sub _get_next_token ($) {
        ## appended to the parent element or the attribute value in later
        ## process of the tokenizer.
-       if ($self->{entity_in_attr}) {
+       if ($self->{prev_state} == DATA_STATE) {
-         $self->{current_attribute}->{value} .= '&';
+         $self->{state} = $self->{prev_state};
-         $self->{state} = $self->{last_attribute_value_state};
-         ## Reconsume.
-         redo A;
-       } else {
-         $self->{state} = DATA_STATE;
          ## Reconsume.
          !!!emit ({type => CHARACTER_TOKEN, data => '&',
                    line => $self->{line_prev},
                    column => $self->{column_prev},
                   });
          redo A;
+       } else {
+         $self->{current_attribute}->{value} .= '&';
+         $self->{state} = $self->{prev_state};
+         ## Reconsume.
+         redo A;
        }
      } elsif ($self->{state} == ENTITY_HASH_STATE) {
        if ($self->{next_char} == 0x0078 or # x
-Line 3025 
 sub _get_next_token ($) {
+Line 3011 
 sub _get_next_token ($) {
          ## and then "&#" is appended to the parent element or the attribute
          ## value in the later processing.
-         if ($self->{entity_in_attr}) {
+         if ($self->{prev_state} == DATA_STATE) {
-           $self->{current_attribute}->{value} .= '&#';
+           $self->{state} = $self->{prev_state};
-           $self->{state} = $self->{last_attribute_value_state};
-           ## Reconsume.
-           redo A;
-         } else {
-           $self->{state} = DATA_STATE;
            ## Reconsume.
            !!!emit ({type => CHARACTER_TOKEN,
                      data => '&#',
-Line 3039 
 sub _get_next_token ($) {
+Line 3020 
 sub _get_next_token ($) {
                      column => $self->{column_prev} - 1,
                     });
            redo A;
+         } else {
+           $self->{current_attribute}->{value} .= '&#';
+           $self->{state} = $self->{prev_state};
+           ## Reconsume.
+           redo A;
          }
        }
      } elsif ($self->{state} == NCR_NUM_STATE) {
-Line 3090 
 sub _get_next_token ($) {
+Line 3076 
 sub _get_next_token ($) {
          $code = $c1_entity_char->{$code};
        }
-       if ($self->{entity_in_attr}) {
+       if ($self->{prev_state} == DATA_STATE) {
-         $self->{current_attribute}->{value} .= chr $code;
+         $self->{state} = $self->{prev_state};
-         $self->{current_attribute}->{has_reference} = 1;
-         $self->{state} = $self->{last_attribute_value_state};
-         ## Reconsume.
-         redo A;
-       } else {
-         $self->{state} = DATA_STATE;
          ## Reconsume.
          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
-                   has_reference => 1,
                    line => $l, column => $c,
                   });
          redo A;
+       } else {
+         $self->{current_attribute}->{value} .= chr $code;
+         $self->{current_attribute}->{has_reference} = 1;
+         $self->{state} = $self->{prev_state};
+         ## Reconsume.
+         redo A;
        }
      } elsif ($self->{state} == HEXREF_X_STATE) {
        if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or
-Line 3124 
 sub _get_next_token ($) {
+Line 3109 
 sub _get_next_token ($) {
          ## and then "&#" followed by "X" or "x" is appended to the parent
          ## element or the attribute value in the later processing.
-         if ($self->{entity_in_attr}) {
+         if ($self->{prev_state} == DATA_STATE) {
-           $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
+           $self->{state} = $self->{prev_state};
-           $self->{state} = $self->{last_attribute_value_state};
-           ## Reconsume.
-           redo A;
-         } else {
-           $self->{state} = DATA_STATE;
            ## Reconsume.
            !!!emit ({type => CHARACTER_TOKEN,
                      data => '&' . $self->{state_keyword},
-Line 3138 
 sub _get_next_token ($) {
+Line 3118 
 sub _get_next_token ($) {
                      column => $self->{column_prev} - length $self->{state_keyword},
                     });
            redo A;
+         } else {
+           $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
+           $self->{state} = $self->{prev_state};
+           ## Reconsume.
+           redo A;
          }
        }
      } elsif ($self->{state} == HEXREF_HEX_STATE) {
-Line 3203 
 sub _get_next_token ($) {
+Line 3188 
 sub _get_next_token ($) {
          $code = $c1_entity_char->{$code};
        }
-       if ($self->{entity_in_attr}) {
+       if ($self->{prev_state} == DATA_STATE) {
-         $self->{current_attribute}->{value} .= chr $code;
+         $self->{state} = $self->{prev_state};
-         $self->{current_attribute}->{has_reference} = 1;
-         $self->{state} = $self->{last_attribute_value_state};
-         ## Reconsume.
-         redo A;
-       } else {
-         $self->{state} = DATA_STATE;
          ## Reconsume.
          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
-                   has_reference => 1,
                    line => $l, column => $c,
                   });
          redo A;
+       } else {
+         $self->{current_attribute}->{value} .= chr $code;
+         $self->{current_attribute}->{has_reference} = 1;
+         $self->{state} = $self->{prev_state};
+         ## Reconsume.
+         redo A;
        }
      } elsif ($self->{state} == ENTITY_NAME_STATE) {
        if (length $self->{state_keyword} < 30 and
-Line 3264 
 sub _get_next_token ($) {
+Line 3248 
 sub _get_next_token ($) {
          #
        } elsif ($self->{entity__match} < 0) {
          !!!parse-error (type => 'no refc');
-         if ($self->{entity_in_attr} and $self->{entity__match} < -1) {
+         if ($self->{prev_state} != DATA_STATE and # in attribute
+             $self->{entity__match} < -1) {
            !!!cp (1024);
            $data = '&' . $self->{state_keyword};
            #
-Line 3293 
 sub _get_next_token ($) {
+Line 3278 
 sub _get_next_token ($) {
        ## that would not be consumed are appended in the data state or in an
        ## appropriate attribute value state anyway.
-       if ($self->{entity_in_attr}) {
+       if ($self->{prev_state} == DATA_STATE) {
-         $self->{current_attribute}->{value} .= $data;
+         $self->{state} = $self->{prev_state};
-         $self->{current_attribute}->{has_reference} = 1 if $has_ref;
-         $self->{state} = $self->{last_attribute_value_state};
-         ## Reconsume.
-         redo A;
-       } else {
-         $self->{state} = DATA_STATE;
          ## Reconsume.
          !!!emit ({type => CHARACTER_TOKEN,
-                   data => $data, has_reference => $has_ref,
+                   data => $data,
                    line => $self->{line_prev},
                    column => $self->{column_prev} + 1 - length $self->{state_keyword},
                   });
          redo A;
+       } else {
+         $self->{current_attribute}->{value} .= $data;
+         $self->{current_attribute}->{has_reference} = 1 if $has_ref;
+         $self->{state} = $self->{prev_state};
+         ## Reconsume.
+         redo A;
        }
      } else {
        die "$0: $self->{state}: Unknown state";

 Legend:
-Removed from v.1.168
 changed lines
+Added in v.1.169

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24