/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.167 by wakaba,
Sat Sep 13 09:02:28 2008 UTC
+revision 1.168 by wakaba,
Sat Sep 13 10:49:21 2008 UTC
 Line 769 
 sub RCDATA_CONTENT_MODEL () { CM_ENTITY
  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
  sub DATA_STATE () { 0 }
- sub ENTITY_DATA_STATE () { 1 }
+ #sub ENTITY_DATA_STATE () { 1 }
  sub TAG_OPEN_STATE () { 2 }
  sub CLOSE_TAG_OPEN_STATE () { 3 }
  sub TAG_NAME_STATE () { 4 }
 Line 780 
 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8
  sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
  sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
  sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
- sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
+ #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
  sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
  sub COMMENT_START_STATE () { 14 }
  sub COMMENT_START_DASH_STATE () { 15 }
 Line 812 
 sub CDATA_SECTION_MSE1_STATE () { 40 } #
  sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
  sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
  sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
- sub ENTITY_STATE () { 44 } # "consume a character reference" in the spec
+ ## NOTE: "Entity data state", "entity in attribute value state", and
+ ## "consume a character reference" algorithm are jointly implemented
+ ## using the following six states:
+ sub ENTITY_STATE () { 44 }
+ sub ENTITY_HASH_STATE () { 45 }
+ sub NCR_NUM_STATE () { 46 }
+ sub HEXREF_X_STATE () { 47 }
+ sub HEXREF_HEX_STATE () { 48 }
+ sub ENTITY_NAME_STATE () { 49 }
  sub DOCTYPE_TOKEN () { 1 }
  sub COMMENT_TOKEN () { 2 }
-Line 945 
 sub _get_next_token ($) {
+Line 953 
 sub _get_next_token ($) {
            ## "entity data state".  In this implementation, the tokenizer
            ## is switched to the |ENTITY_STATE|, which is an implementation
            ## of the "consume a character reference" algorithm.
-           #$self->{state} = ENTITY_DATA_STATE;
            $self->{entity_in_attr} = 0;
            $self->{entity_additional} = -1;
            $self->{state} = ENTITY_STATE;
-Line 1018 
 sub _get_next_token ($) {
+Line 1025 
 sub _get_next_token ($) {
        !!!emit ($token);
        redo A;
-     } elsif ($self->{state} == ENTITY_DATA_STATE) {
-       my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
-       my $token = $self->{entity_return};
-       $self->{state} = DATA_STATE;
-       # next-input-character is already done
-       unless (defined $token) {
-         !!!cp (13);
-         !!!emit ({type => CHARACTER_TOKEN, data => '&',
-                   line => $l, column => $c,
-                  });
-       } else {
-         !!!cp (14);
-         !!!emit ($token);
-       }
-       redo A;
      } elsif ($self->{state} == TAG_OPEN_STATE) {
        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
          if ($self->{next_char} == 0x002F) { # /
-Line 1715 
 sub _get_next_token ($) {
+Line 1703 
 sub _get_next_token ($) {
          ## "entity in attribute value state".  In this implementation, the
          ## tokenizer is switched to the |ENTITY_STATE|, which is an
          ## implementation of the "consume a character reference" algorithm.
-         #$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
          $self->{entity_in_attr} = 1;
          $self->{entity_additional} = 0x0022; # "
          $self->{state} = ENTITY_STATE;
-Line 1764 
 sub _get_next_token ($) {
+Line 1751 
 sub _get_next_token ($) {
          ## "entity in attribute value state".  In this implementation, the
          ## tokenizer is switched to the |ENTITY_STATE|, which is an
          ## implementation of the "consume a character reference" algorithm.
-         #$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
          $self->{entity_in_attr} = 1;
          $self->{entity_additional} = 0x0027; # '
          $self->{state} = ENTITY_STATE;
-Line 1817 
 sub _get_next_token ($) {
+Line 1803 
 sub _get_next_token ($) {
          ## "entity in attribute value state".  In this implementation, the
          ## tokenizer is switched to the |ENTITY_STATE|, which is an
          ## implementation of the "consume a character reference" algorithm.
-         #$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
          $self->{entity_in_attr} = 1;
          $self->{entity_additional} = -1;
          $self->{state} = ENTITY_STATE;
-Line 1884 
 sub _get_next_token ($) {
+Line 1869 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
-       my $token = $self->{entity_return};
-       unless (defined $token) {
-         !!!cp (117);
-         $self->{current_attribute}->{value} .= '&';
-       } else {
-         !!!cp (118);
-         $self->{current_attribute}->{value} .= $token->{data};
-         $self->{current_attribute}->{has_reference} = $token->{has_reference};
-         ## ISSUE: spec says "append the returned character token to the current attribute's value"
-       }
-       $self->{state} = $self->{last_attribute_value_state};
-       # next-input-character is already done
-       redo A;
      } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
        if ($self->{next_char} == 0x0009 or # HT
            $self->{next_char} == 0x000A or # LF
-Line 2979 
 sub _get_next_token ($) {
+Line 2948 
 sub _get_next_token ($) {
          ## Reconsume.
          redo A;
        }
      } elsif ($self->{state} == ENTITY_STATE) {
-       my $in_attr = $self->{entity_in_attr};
+       if ({
-       my $additional = $self->{entity_additional};
+         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
+         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &
+         $self->{entity_additional} => 1,
+       }->{$self->{next_char}}) {
+         !!!cp (1001);
+         ## Don't consume
+         ## No error
+         ## Return nothing.
+         #
+       } elsif ($self->{next_char} == 0x0023) { # #
+         $self->{state} = ENTITY_HASH_STATE;
+         $self->{state_keyword} = '#';
+         !!!next-input-character;
+         redo A;
+       } elsif ((0x0041 <= $self->{next_char} and
+                 $self->{next_char} <= 0x005A) or # A..Z
+                (0x0061 <= $self->{next_char} and
+                 $self->{next_char} <= 0x007A)) { # a..z
+         require Whatpm::_NamedEntityList;
+         $self->{state} = ENTITY_NAME_STATE;
+         $self->{state_keyword} = chr $self->{next_char};
+         $self->{entity__value} = $self->{state_keyword};
+         $self->{entity__match} = 0;
+         !!!next-input-character;
+         redo A;
+       } else {
+         !!!cp (1027);
+         !!!parse-error (type => 'bare ero');
+         ## Return nothing.
+         #
+       }
-   my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
+       ## NOTE: No character is consumed by the "consume a character
+       ## reference" algorithm.  In other words, there is an "&" character
+       ## that does not introduce a character reference, which would be
+       ## appended to the parent element or the attribute value by the
+       ## later processing of the tokenizer.
-   if ({
+       if ($self->{entity_in_attr}) {
-        0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
+         $self->{current_attribute}->{value} .= '&';
-        0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
+         $self->{state} = $self->{last_attribute_value_state};
-        $additional => 1,
+         ## Reconsume.
-       }->{$self->{next_char}}) {
+         redo A;
-     !!!cp (1001);
+       } else {
-     ## Don't consume
+         $self->{state} = DATA_STATE;
-     ## No error
+         ## Reconsume.
-     $self->{entity_return} = undef;
+         !!!emit ({type => CHARACTER_TOKEN, data => '&',
-     $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
+                   line => $self->{line_prev},
-     redo A;
+                   column => $self->{column_prev},
-   } elsif ($self->{next_char} == 0x0023) { # #
+                  });
-     !!!next-input-character;
+         redo A;
-     if ($self->{next_char} == 0x0078 or # x
+       }
-         $self->{next_char} == 0x0058) { # X
+     } elsif ($self->{state} == ENTITY_HASH_STATE) {
-       my $code;
+       if ($self->{next_char} == 0x0078 or # x
-       X: {
+           $self->{next_char} == 0x0058) { # X
-         my $x_char = $self->{next_char};
+         $self->{state} = HEXREF_X_STATE;
-         !!!next-input-character;
+         $self->{state_keyword} .= chr $self->{next_char};
-         if (0x0030 <= $self->{next_char} and
+         !!!next-input-character;
-             $self->{next_char} <= 0x0039) { # 0..9
+         redo A;
-           !!!cp (1002);
+       } elsif (0x0030 <= $self->{next_char} and
-           $code ||= 0;
+                $self->{next_char} <= 0x0039) { # 0..9
-           $code *= 0x10;
+         $self->{state} = NCR_NUM_STATE;
-           $code += $self->{next_char} - 0x0030;
+         $self->{state_keyword} = $self->{next_char} - 0x0030;
-           redo X;
+         !!!next-input-character;
-         } elsif (0x0061 <= $self->{next_char} and
+         redo A;
-                  $self->{next_char} <= 0x0066) { # a..f
+       } else {
-           !!!cp (1003);
+         !!!cp (1019);
-           $code ||= 0;
+         !!!parse-error (type => 'bare nero',
-           $code *= 0x10;
+                         line => $self->{line_prev},
-           $code += $self->{next_char} - 0x0060 + 9;
+                         column => $self->{column_prev} - 1);
-           redo X;
-         } elsif (0x0041 <= $self->{next_char} and
+         ## NOTE: According to the spec algorithm, nothing is returned,
-                  $self->{next_char} <= 0x0046) { # A..F
+         ## and then "&#" is appended to the parent element or the attribute
-           !!!cp (1004);
+         ## value in the later processing.
-           $code ||= 0;
-           $code *= 0x10;
+         if ($self->{entity_in_attr}) {
-           $code += $self->{next_char} - 0x0040 + 9;
+           $self->{current_attribute}->{value} .= '&#';
-           redo X;
+           $self->{state} = $self->{last_attribute_value_state};
-         } elsif (not defined $code) { # no hexadecimal digit
+           ## Reconsume.
-           !!!cp (1005);
-           !!!parse-error (type => 'bare hcro', line => $l, column => $c);
-           !!!back-next-input-character ($x_char, $self->{next_char});
-           $self->{next_char} = 0x0023; # #
-           $self->{entity_return} = undef;
-           $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
            redo A;
-         } elsif ($self->{next_char} == 0x003B) { # ;
-           !!!cp (1006);
-           !!!next-input-character;
          } else {
-           !!!cp (1007);
+           $self->{state} = DATA_STATE;
-           !!!parse-error (type => 'no refc', line => $l, column => $c);
+           ## Reconsume.
+           !!!emit ({type => CHARACTER_TOKEN,
+                     data => '&#',
+                     line => $self->{line_prev},
+                     column => $self->{column_prev} - 1,
+                    });
+           redo A;
          }
+       }
-         if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
+     } elsif ($self->{state} == NCR_NUM_STATE) {
-           !!!cp (1008);
+       if (0x0030 <= $self->{next_char} and
-           !!!parse-error (type => 'invalid character reference',
+           $self->{next_char} <= 0x0039) { # 0..9
-                           text => (sprintf 'U+%04X', $code),
-                           line => $l, column => $c);
-           $code = 0xFFFD;
-         } elsif ($code > 0x10FFFF) {
-           !!!cp (1009);
-           !!!parse-error (type => 'invalid character reference',
-                           text => (sprintf 'U-%08X', $code),
-                           line => $l, column => $c);
-           $code = 0xFFFD;
-         } elsif ($code == 0x000D) {
-           !!!cp (1010);
-           !!!parse-error (type => 'CR character reference', line => $l, column => $c);
-           $code = 0x000A;
-         } elsif (0x80 <= $code and $code <= 0x9F) {
-           !!!cp (1011);
-           !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
-           $code = $c1_entity_char->{$code};
-         }
-         $self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code,
-                 has_reference => 1,
-                 line => $l, column => $c,
-                };
-         $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
-         redo A;
-       } # X
-     } elsif (0x0030 <= $self->{next_char} and
-              $self->{next_char} <= 0x0039) { # 0..9
-       my $code = $self->{next_char} - 0x0030;
-       !!!next-input-character;
-       while (0x0030 <= $self->{next_char} and
-                 $self->{next_char} <= 0x0039) { # 0..9
          !!!cp (1012);
-         $code *= 10;
+         $self->{state_keyword} *= 10;
-         $code += $self->{next_char} - 0x0030;
+         $self->{state_keyword} += $self->{next_char} - 0x0030;
+         ## Stay in the state.
          !!!next-input-character;
-       }
+         redo A;
+       } elsif ($self->{next_char} == 0x003B) { # ;
-       if ($self->{next_char} == 0x003B) { # ;
          !!!cp (1013);
          !!!next-input-character;
+         #
        } else {
          !!!cp (1014);
-         !!!parse-error (type => 'no refc', line => $l, column => $c);
+         !!!parse-error (type => 'no refc');
+         ## Reconsume.
+         #
        }
+       my $code = $self->{state_keyword};
+       my $l = $self->{line_prev};
+       my $c = $self->{column_prev};
        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
          !!!cp (1015);
          !!!parse-error (type => 'invalid character reference',
-Line 3117 
 sub _get_next_token ($) {
+Line 3089 
 sub _get_next_token ($) {
                          line => $l, column => $c);
          $code = $c1_entity_char->{$code};
        }
-       $self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
+       if ($self->{entity_in_attr}) {
-               line => $l, column => $c,
+         $self->{current_attribute}->{value} .= chr $code;
-              };
+         $self->{current_attribute}->{has_reference} = 1;
-       $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
+         $self->{state} = $self->{last_attribute_value_state};
-       redo A;
+         ## Reconsume.
-     } else {
+         redo A;
-       !!!cp (1019);
+       } else {
-       !!!parse-error (type => 'bare nero', line => $l, column => $c);
+         $self->{state} = DATA_STATE;
-       !!!back-next-input-character ($self->{next_char});
+         ## Reconsume.
-       $self->{next_char} = 0x0023; # #
+         !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
-       $self->{entity_return} = undef;
+                   has_reference => 1,
-       $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
+                   line => $l, column => $c,
-       redo A;
+                  });
-     }
+         redo A;
-   } elsif ((0x0041 <= $self->{next_char} and
+       }
-             $self->{next_char} <= 0x005A) or
+     } elsif ($self->{state} == HEXREF_X_STATE) {
-            (0x0061 <= $self->{next_char} and
+       if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or
-             $self->{next_char} <= 0x007A)) {
+           (0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or
-     my $entity_name = chr $self->{next_char};
+           (0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) {
-     !!!next-input-character;
+         # 0..9, A..F, a..f
+         $self->{state} = HEXREF_HEX_STATE;
-     my $value = $entity_name;
+         $self->{state_keyword} = 0;
-     my $match = 0;
+         ## Reconsume.
-     require Whatpm::_NamedEntityList;
+         redo A;
-     our $EntityChar;
+       } else {
+         !!!cp (1005);
-     while (length $entity_name < 30 and
+         !!!parse-error (type => 'bare hcro',
-            ## NOTE: Some number greater than the maximum length of entity name
+                         line => $self->{line_prev},
-            ((0x0041 <= $self->{next_char} and # a
+                         column => $self->{column_prev} - 2);
-              $self->{next_char} <= 0x005A) or # x
-             (0x0061 <= $self->{next_char} and # a
+         ## NOTE: According to the spec algorithm, nothing is returned,
-              $self->{next_char} <= 0x007A) or # z
+         ## and then "&#" followed by "X" or "x" is appended to the parent
-             (0x0030 <= $self->{next_char} and # 0
+         ## element or the attribute value in the later processing.
-              $self->{next_char} <= 0x0039) or # 9
-             $self->{next_char} == 0x003B)) { # ;
+         if ($self->{entity_in_attr}) {
-       $entity_name .= chr $self->{next_char};
+           $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
-       if (defined $EntityChar->{$entity_name}) {
+           $self->{state} = $self->{last_attribute_value_state};
-         if ($self->{next_char} == 0x003B) { # ;
+           ## Reconsume.
-           !!!cp (1020);
+           redo A;
-           $value = $EntityChar->{$entity_name};
-           $match = 1;
-           !!!next-input-character;
-           last;
          } else {
-           !!!cp (1021);
+           $self->{state} = DATA_STATE;
-           $value = $EntityChar->{$entity_name};
+           ## Reconsume.
-           $match = -1;
+           !!!emit ({type => CHARACTER_TOKEN,
-           !!!next-input-character;
+                     data => '&' . $self->{state_keyword},
+                     line => $self->{line_prev},
+                     column => $self->{column_prev} - length $self->{state_keyword},
+                    });
+           redo A;
          }
-       } else {
+       }
-         !!!cp (1022);
+     } elsif ($self->{state} == HEXREF_HEX_STATE) {
-         $value .= chr $self->{next_char};
+       if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) {
-         $match *= 2;
+         # 0..9
+         !!!cp (1002);
+         $self->{state_keyword} *= 0x10;
+         $self->{state_keyword} += $self->{next_char} - 0x0030;
+         ## Stay in the state.
          !!!next-input-character;
+         redo A;
+       } elsif (0x0061 <= $self->{next_char} and
+                $self->{next_char} <= 0x0066) { # a..f
+         !!!cp (1003);
+         $self->{state_keyword} *= 0x10;
+         $self->{state_keyword} += $self->{next_char} - 0x0060 + 9;
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
+       } elsif (0x0041 <= $self->{next_char} and
+                $self->{next_char} <= 0x0046) { # A..F
+         !!!cp (1004);
+         $self->{state_keyword} *= 0x10;
+         $self->{state_keyword} += $self->{next_char} - 0x0040 + 9;
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_char} == 0x003B) { # ;
+         !!!cp (1006);
+         !!!next-input-character;
+         #
+       } else {
+         !!!cp (1007);
+         !!!parse-error (type => 'no refc',
+                         line => $self->{line},
+                         column => $self->{column});
+         ## Reconsume.
+         #
        }
-     }
+       my $code = $self->{state_keyword};
-     if ($match > 0) {
+       my $l = $self->{line_prev};
-       !!!cp (1023);
+       my $c = $self->{column_prev};
-       $self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
+       if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
-               line => $l, column => $c,
+         !!!cp (1008);
-              };
+         !!!parse-error (type => 'invalid character reference',
-       $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
+                         text => (sprintf 'U+%04X', $code),
-       redo A;
+                         line => $l, column => $c);
-     } elsif ($match < 0) {
+         $code = 0xFFFD;
-       !!!parse-error (type => 'no refc', line => $l, column => $c);
+       } elsif ($code > 0x10FFFF) {
-       if ($in_attr and $match < -1) {
+         !!!cp (1009);
-         !!!cp (1024);
+         !!!parse-error (type => 'invalid character reference',
-         $self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$entity_name,
+                         text => (sprintf 'U-%08X', $code),
-                 line => $l, column => $c,
+                         line => $l, column => $c);
-                };
+         $code = 0xFFFD;
-         $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
+       } elsif ($code == 0x000D) {
+         !!!cp (1010);
+         !!!parse-error (type => 'CR character reference', line => $l, column => $c);
+         $code = 0x000A;
+       } elsif (0x80 <= $code and $code <= 0x9F) {
+         !!!cp (1011);
+         !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
+         $code = $c1_entity_char->{$code};
+       }
+       if ($self->{entity_in_attr}) {
+         $self->{current_attribute}->{value} .= chr $code;
+         $self->{current_attribute}->{has_reference} = 1;
+         $self->{state} = $self->{last_attribute_value_state};
+         ## Reconsume.
          redo A;
        } else {
-         !!!cp (1025);
+         $self->{state} = DATA_STATE;
-         $self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
+         ## Reconsume.
-                 line => $l, column => $c,
+         !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
-                };
+                   has_reference => 1,
-         $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
+                   line => $l, column => $c,
+                  });
          redo A;
        }
-     } else {
+     } elsif ($self->{state} == ENTITY_NAME_STATE) {
-       !!!cp (1026);
+       if (length $self->{state_keyword} < 30 and
-       !!!parse-error (type => 'bare ero', line => $l, column => $c);
+           ## NOTE: Some number greater than the maximum length of entity name
-       ## NOTE: "No characters are consumed" in the spec.
+           ((0x0041 <= $self->{next_char} and # a
-       $self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$value,
+             $self->{next_char} <= 0x005A) or # x
-               line => $l, column => $c,
+            (0x0061 <= $self->{next_char} and # a
-              };
+             $self->{next_char} <= 0x007A) or # z
-       $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
+            (0x0030 <= $self->{next_char} and # 0
-       redo A;
+             $self->{next_char} <= 0x0039) or # 9
-     }
+            $self->{next_char} == 0x003B)) { # ;
-   } else {
+         our $EntityChar;
-     !!!cp (1027);
+         $self->{state_keyword} .= chr $self->{next_char};
-     ## no characters are consumed
+         if (defined $EntityChar->{$self->{state_keyword}}) {
-     !!!parse-error (type => 'bare ero', line => $l, column => $c);
+           if ($self->{next_char} == 0x003B) { # ;
-     $self->{entity_return} = undef;
+             !!!cp (1020);
-     $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
+             $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
-     redo A;
+             $self->{entity__match} = 1;
-   }
+             !!!next-input-character;
+             #
+           } else {
+             !!!cp (1021);
+             $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
+             $self->{entity__match} = -1;
+             ## Stay in the state.
+             !!!next-input-character;
+             redo A;
+           }
+         } else {
+           !!!cp (1022);
+           $self->{entity__value} .= chr $self->{next_char};
+           $self->{entity__match} *= 2;
+           ## Stay in the state.
+           !!!next-input-character;
+           redo A;
+         }
+       }
+       my $data;
+       my $has_ref;
+       if ($self->{entity__match} > 0) {
+         !!!cp (1023);
+         $data = $self->{entity__value};
+         $has_ref = 1;
+         #
+       } elsif ($self->{entity__match} < 0) {
+         !!!parse-error (type => 'no refc');
+         if ($self->{entity_in_attr} and $self->{entity__match} < -1) {
+           !!!cp (1024);
+           $data = '&' . $self->{state_keyword};
+           #
+         } else {
+           !!!cp (1025);
+           $data = $self->{entity__value};
+           $has_ref = 1;
+           #
+         }
+       } else {
+         !!!cp (1026);
+         !!!parse-error (type => 'bare ero',
+                         line => $self->{line_prev},
+                         column => $self->{column_prev});
+         $data = '&' . $self->{state_keyword};
+         #
+       }
+       ## NOTE: In these cases, according to the spec algorithm, when a
+       ## character reference is found, it is consumed and a character token
+       ## is returned; otherwise, nothing is consumed and nothing is returned.
+       ## In this implementation, everything that has been examined by the
+       ## tokenizer is appended at this stage to the parent element or the
+       ## attribute value as a string (the literal string when there is no
+       ## character reference, or the entity-replaced string otherwise),
+       ## since any characters that would not be consumed are appended in
+       ## the data state or in an appropriate attribute value state anyway.
+       if ($self->{entity_in_attr}) {
+         $self->{current_attribute}->{value} .= $data;
+         $self->{current_attribute}->{has_reference} = 1 if $has_ref;
+         $self->{state} = $self->{last_attribute_value_state};
+         ## Reconsume.
+         redo A;
+       } else {
+         $self->{state} = DATA_STATE;
+         ## Reconsume.
+         !!!emit ({type => CHARACTER_TOKEN,
+                   data => $data, has_reference => $has_ref,
+                   line => $self->{line_prev},
+                   column => $self->{column_prev} + 1 - length $self->{state_keyword},
+                  });
+         redo A;
+       }
      } else {
        die "$0: $self->{state}: Unknown state";
      }

 Legend:



Removed from v.1.167
 


changed lines


 
Added in v.1.168
 Legend:



Removed from v.1.167
 


changed lines


 
Added in v.1.168
-Removed from v.1.167
+Added in v.1.168

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24