/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.166 by wakaba,
Sat Sep 13 08:21:35 2008 UTC
+revision 1.170 by wakaba,
Sat Sep 13 12:25:44 2008 UTC
 Line 669 
 sub parse_char_stream ($$$;$) {
        $self->{column} = 0;
      } elsif ($self->{next_char} == 0x000D) { # CR
        !!!cp ('j2');
+ ## TODO: support for abort/streaming
        my $next = $input->getc;
        if (defined $next and $next ne "\x0A") {
          $self->{next_next_char} = $next;
-Line 769 
 sub RCDATA_CONTENT_MODEL () { CM_ENTITY
+Line 770 
 sub RCDATA_CONTENT_MODEL () { CM_ENTITY
  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
  sub DATA_STATE () { 0 }
- sub ENTITY_DATA_STATE () { 1 }
+ #sub ENTITY_DATA_STATE () { 1 }
  sub TAG_OPEN_STATE () { 2 }
  sub CLOSE_TAG_OPEN_STATE () { 3 }
  sub TAG_NAME_STATE () { 4 }
-Line 780 
 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8
+Line 781 
 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8
  sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
  sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
  sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
- sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
+ #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
  sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
  sub COMMENT_START_STATE () { 14 }
  sub COMMENT_START_DASH_STATE () { 15 }
-Line 812 
 sub CDATA_SECTION_MSE1_STATE () { 40 } #
+Line 813 
 sub CDATA_SECTION_MSE1_STATE () { 40 } #
  sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
  sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
  sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
+ ## NOTE: "Entity data state", "entity in attribute value state", and
+ ## "consume a character reference" algorithm are jointly implemented
+ ## using the following six states:
+ sub ENTITY_STATE () { 44 }
+ sub ENTITY_HASH_STATE () { 45 }
+ sub NCR_NUM_STATE () { 46 }
+ sub HEXREF_X_STATE () { 47 }
+ sub HEXREF_HEX_STATE () { 48 }
+ sub ENTITY_NAME_STATE () { 49 }
  sub DOCTYPE_TOKEN () { 1 }
  sub COMMENT_TOKEN () { 2 }
-Line 865 
 sub _initialize_tokenizer ($) {
+Line 875 
 sub _initialize_tokenizer ($) {
    my $self = shift;
    $self->{state} = DATA_STATE; # MUST
    #$self->{state_keyword}; # initialized when used
+   #$self->{entity__value}; # initialized when used
+   #$self->{entity__match}; # initialized when used
    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
    undef $self->{current_token};
    undef $self->{current_attribute};
    undef $self->{last_emitted_start_tag_name};
-   undef $self->{last_attribute_value_state};
+   #$self->{prev_state}; # initialized when used
    delete $self->{self_closing};
-   $self->{char} = [];
    # $self->{next_char}
    !!!next-input-character;
    $self->{token} = [];
-Line 903 
 sub _initialize_tokenizer ($) {
+Line 914 
 sub _initialize_tokenizer ($) {
  ## has completed loading.  If one has, then it MUST be executed
  ## and removed from the list.
- ## NOTE: HTML5 "Writing HTML documents" section, applied to
+ ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
- ## documents and not to user agents and conformance checkers,
+ ## (This requirement was dropped from HTML5 spec, unfortunately.)
- ## contains some requirements that are not detected by the
- ## parsing algorithm:
- ## - Some requirements on character encoding declarations. ## TODO
- ## - "Elements MUST NOT contain content that their content model disallows."
- ##   ... Some are parse error, some are not (will be reported by c.c.).
- ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
- ## - Text (in elements, attributes, and comments) SHOULD NOT contain
- ##   control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL?  Unicode control character?)
- ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
- ## be detected by the HTML5 parsing algorithm:
- ## - Text,
  sub _get_next_token ($) {
    my $self = shift;
-Line 940 
 sub _get_next_token ($) {
+Line 939 
 sub _get_next_token ($) {
          if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
              not $self->{escape}) {
            !!!cp (1);
-           $self->{state} = ENTITY_DATA_STATE;
+           ## NOTE: In the spec, the tokenizer is switched to the
+           ## "entity data state".  In this implementation, the tokenizer
+           ## is switched to the |ENTITY_STATE|, which is an implementation
+           ## of the "consume a character reference" algorithm.
+           $self->{entity_additional} = -1;
+           $self->{prev_state} = DATA_STATE;
+           $self->{state} = ENTITY_STATE;
            !!!next-input-character;
            redo A;
          } else {
-Line 1010 
 sub _get_next_token ($) {
+Line 1015 
 sub _get_next_token ($) {
        !!!emit ($token);
        redo A;
-     } elsif ($self->{state} == ENTITY_DATA_STATE) {
-       ## (cannot happen in CDATA state)
-       my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
-       my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
-       $self->{state} = DATA_STATE;
-       # next-input-character is already done
-       unless (defined $token) {
-         !!!cp (13);
-         !!!emit ({type => CHARACTER_TOKEN, data => '&',
-                   line => $l, column => $c,
-                  });
-       } else {
-         !!!cp (14);
-         !!!emit ($token);
-       }
-       redo A;
      } elsif ($self->{state} == TAG_OPEN_STATE) {
        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
          if ($self->{next_char} == 0x002F) { # /
-Line 1704 
 sub _get_next_token ($) {
+Line 1688 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_char} == 0x0026) { # &
          !!!cp (96);
-         $self->{last_attribute_value_state} = $self->{state};
+         ## NOTE: In the spec, the tokenizer is switched to the
-         $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
+         ## "entity in attribute value state".  In this implementation, the
+         ## tokenizer is switched to the |ENTITY_STATE|, which is an
+         ## implementation of the "consume a character reference" algorithm.
+         $self->{prev_state} = $self->{state};
+         $self->{entity_additional} = 0x0022; # "
+         $self->{state} = ENTITY_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_char} == -1) {
-Line 1746 
 sub _get_next_token ($) {
+Line 1735 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_char} == 0x0026) { # &
          !!!cp (102);
-         $self->{last_attribute_value_state} = $self->{state};
+         ## NOTE: In the spec, the tokenizer is switched to the
-         $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
+         ## "entity in attribute value state".  In this implementation, the
+         ## tokenizer is switched to the |ENTITY_STATE|, which is an
+         ## implementation of the "consume a character reference" algorithm.
+         $self->{entity_additional} = 0x0027; # '
+         $self->{prev_state} = $self->{state};
+         $self->{state} = ENTITY_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_char} == -1) {
-Line 1792 
 sub _get_next_token ($) {
+Line 1786 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_char} == 0x0026) { # &
          !!!cp (108);
-         $self->{last_attribute_value_state} = $self->{state};
+         ## NOTE: In the spec, the tokenizer is switched to the
-         $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
+         ## "entity in attribute value state".  In this implementation, the
+         ## tokenizer is switched to the |ENTITY_STATE|, which is an
+         ## implementation of the "consume a character reference" algorithm.
+         $self->{entity_additional} = -1;
+         $self->{prev_state} = $self->{state};
+         $self->{state} = ENTITY_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_char} == 0x003E) { # >
-Line 1857 
 sub _get_next_token ($) {
+Line 1856 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
-       my $token = $self->_tokenize_attempt_to_consume_an_entity
-           (1,
-            $self->{last_attribute_value_state}
-              == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
-            $self->{last_attribute_value_state}
-              == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
-            -1);
-       unless (defined $token) {
-         !!!cp (117);
-         $self->{current_attribute}->{value} .= '&';
-       } else {
-         !!!cp (118);
-         $self->{current_attribute}->{value} .= $token->{data};
-         $self->{current_attribute}->{has_reference} = $token->{has_reference};
-         ## ISSUE: spec says "append the returned character token to the current attribute's value"
-       }
-       $self->{state} = $self->{last_attribute_value_state};
-       # next-input-character is already done
-       redo A;
      } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
        if ($self->{next_char} == 0x0009 or # HT
            $self->{next_char} == 0x000A or # LF
-Line 1998 
 sub _get_next_token ($) {
+Line 1975 
 sub _get_next_token ($) {
        }
      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
        ## (only happen if PCDATA state)
-       ## NOTE: Set by the previous state
-       #my $token = {type => COMMENT_TOKEN, data => ''};
-       BC: {
-         if ($self->{next_char} == 0x003E) { # >
-           !!!cp (124);
-           $self->{state} = DATA_STATE;
-           !!!next-input-character;
-           !!!emit ($self->{current_token}); # comment
-           redo A;
-         } elsif ($self->{next_char} == -1) {
-           !!!cp (125);
-           $self->{state} = DATA_STATE;
-           ## reconsume
-           !!!emit ($self->{current_token}); # comment
+       ## NOTE: Unlike spec's "bogus comment state", this implementation
+       ## consumes characters on a one-by-one basis.
+       if ($self->{next_char} == 0x003E) { # >
+         !!!cp (124);
+         $self->{state} = DATA_STATE;
+         !!!next-input-character;
-           redo A;
+         !!!emit ($self->{current_token}); # comment
-         } else {
+         redo A;
-           !!!cp (126);
+       } elsif ($self->{next_char} == -1) {
-           $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
+         !!!cp (125);
-           !!!next-input-character;
+         $self->{state} = DATA_STATE;
-           redo BC;
+         ## reconsume
-         }
-       } # BC
-       die "$0: _get_next_token: unexpected case [BC]";
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } else {
+         !!!cp (126);
+         $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
+       }
      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
        ## (only happen if PCDATA state)
-Line 2963 
 sub _get_next_token ($) {
+Line 2935 
 sub _get_next_token ($) {
          ## Reconsume.
          redo A;
        }
-     } else {
+     } elsif ($self->{state} == ENTITY_STATE) {
-       die "$0: $self->{state}: Unknown state";
+       if ({
-     }
+         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
-   } # A
+         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &
+         $self->{entity_additional} => 1,
-   die "$0: _get_next_token: unexpected case";
+       }->{$self->{next_char}}) {
- } # _get_next_token
+         !!!cp (1001);
+         ## Don't consume
- sub _tokenize_attempt_to_consume_an_entity ($$$) {
+         ## No error
-   my ($self, $in_attr, $additional) = @_;
+         ## Return nothing.
+         #
+       } elsif ($self->{next_char} == 0x0023) { # #
+         !!!cp (999);
+         $self->{state} = ENTITY_HASH_STATE;
+         $self->{state_keyword} = '#';
+         !!!next-input-character;
+         redo A;
+       } elsif ((0x0041 <= $self->{next_char} and
+                 $self->{next_char} <= 0x005A) or # A..Z
+                (0x0061 <= $self->{next_char} and
+                 $self->{next_char} <= 0x007A)) { # a..z
+         !!!cp (998);
+         require Whatpm::_NamedEntityList;
+         $self->{state} = ENTITY_NAME_STATE;
+         $self->{state_keyword} = chr $self->{next_char};
+         $self->{entity__value} = $self->{state_keyword};
+         $self->{entity__match} = 0;
+         !!!next-input-character;
+         redo A;
+       } else {
+         !!!cp (1027);
+         !!!parse-error (type => 'bare ero');
+         ## Return nothing.
+         #
+       }
-   my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
+       ## NOTE: No character is consumed by the "consume a character
+       ## reference" algorithm.  In other words, there is an "&" character
+       ## that does not introduce a character reference, which would be
+       ## appended to the parent element or the attribute value in later
+       ## process of the tokenizer.
+       if ($self->{prev_state} == DATA_STATE) {
+         !!!cp (997);
+         $self->{state} = $self->{prev_state};
+         ## Reconsume.
+         !!!emit ({type => CHARACTER_TOKEN, data => '&',
+                   line => $self->{line_prev},
+                   column => $self->{column_prev},
+                  });
+         redo A;
+       } else {
+         !!!cp (996);
+         $self->{current_attribute}->{value} .= '&';
+         $self->{state} = $self->{prev_state};
+         ## Reconsume.
+         redo A;
+       }
+     } elsif ($self->{state} == ENTITY_HASH_STATE) {
+       if ($self->{next_char} == 0x0078 or # x
+           $self->{next_char} == 0x0058) { # X
+         !!!cp (995);
+         $self->{state} = HEXREF_X_STATE;
+         $self->{state_keyword} .= chr $self->{next_char};
+         !!!next-input-character;
+         redo A;
+       } elsif (0x0030 <= $self->{next_char} and
+                $self->{next_char} <= 0x0039) { # 0..9
+         !!!cp (994);
+         $self->{state} = NCR_NUM_STATE;
+         $self->{state_keyword} = $self->{next_char} - 0x0030;
+         !!!next-input-character;
+         redo A;
+       } else {
+         !!!parse-error (type => 'bare nero',
+                         line => $self->{line_prev},
+                         column => $self->{column_prev} - 1);
-   if ({
+         ## NOTE: According to the spec algorithm, nothing is returned,
-        0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
+         ## and then "&#" is appended to the parent element or the attribute
-        0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
+         ## value in the later processing.
-        $additional => 1,
-       }->{$self->{next_char}}) {
+         if ($self->{prev_state} == DATA_STATE) {
-     !!!cp (1001);
+           !!!cp (1019);
-     ## Don't consume
+           $self->{state} = $self->{prev_state};
-     ## No error
+           ## Reconsume.
-     return undef;
+           !!!emit ({type => CHARACTER_TOKEN,
-   } elsif ($self->{next_char} == 0x0023) { # #
+                     data => '&#',
-     !!!next-input-character;
+                     line => $self->{line_prev},
-     if ($self->{next_char} == 0x0078 or # x
+                     column => $self->{column_prev} - 1,
-         $self->{next_char} == 0x0058) { # X
+                    });
-       my $code;
+           redo A;
-       X: {
-         my $x_char = $self->{next_char};
-         !!!next-input-character;
-         if (0x0030 <= $self->{next_char} and
-             $self->{next_char} <= 0x0039) { # 0..9
-           !!!cp (1002);
-           $code ||= 0;
-           $code *= 0x10;
-           $code += $self->{next_char} - 0x0030;
-           redo X;
-         } elsif (0x0061 <= $self->{next_char} and
-                  $self->{next_char} <= 0x0066) { # a..f
-           !!!cp (1003);
-           $code ||= 0;
-           $code *= 0x10;
-           $code += $self->{next_char} - 0x0060 + 9;
-           redo X;
-         } elsif (0x0041 <= $self->{next_char} and
-                  $self->{next_char} <= 0x0046) { # A..F
-           !!!cp (1004);
-           $code ||= 0;
-           $code *= 0x10;
-           $code += $self->{next_char} - 0x0040 + 9;
-           redo X;
-         } elsif (not defined $code) { # no hexadecimal digit
-           !!!cp (1005);
-           !!!parse-error (type => 'bare hcro', line => $l, column => $c);
-           !!!back-next-input-character ($x_char, $self->{next_char});
-           $self->{next_char} = 0x0023; # #
-           return undef;
-         } elsif ($self->{next_char} == 0x003B) { # ;
-           !!!cp (1006);
-           !!!next-input-character;
          } else {
-           !!!cp (1007);
+           !!!cp (993);
-           !!!parse-error (type => 'no refc', line => $l, column => $c);
+           $self->{current_attribute}->{value} .= '&#';
+           $self->{state} = $self->{prev_state};
+           ## Reconsume.
+           redo A;
          }
+       }
-         if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
+     } elsif ($self->{state} == NCR_NUM_STATE) {
-           !!!cp (1008);
+       if (0x0030 <= $self->{next_char} and
-           !!!parse-error (type => 'invalid character reference',
+           $self->{next_char} <= 0x0039) { # 0..9
-                           text => (sprintf 'U+%04X', $code),
-                           line => $l, column => $c);
-           $code = 0xFFFD;
-         } elsif ($code > 0x10FFFF) {
-           !!!cp (1009);
-           !!!parse-error (type => 'invalid character reference',
-                           text => (sprintf 'U-%08X', $code),
-                           line => $l, column => $c);
-           $code = 0xFFFD;
-         } elsif ($code == 0x000D) {
-           !!!cp (1010);
-           !!!parse-error (type => 'CR character reference', line => $l, column => $c);
-           $code = 0x000A;
-         } elsif (0x80 <= $code and $code <= 0x9F) {
-           !!!cp (1011);
-           !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
-           $code = $c1_entity_char->{$code};
-         }
-         return {type => CHARACTER_TOKEN, data => chr $code,
-                 has_reference => 1,
-                 line => $l, column => $c,
-                };
-       } # X
-     } elsif (0x0030 <= $self->{next_char} and
-              $self->{next_char} <= 0x0039) { # 0..9
-       my $code = $self->{next_char} - 0x0030;
-       !!!next-input-character;
-       while (0x0030 <= $self->{next_char} and
-                 $self->{next_char} <= 0x0039) { # 0..9
          !!!cp (1012);
-         $code *= 10;
+         $self->{state_keyword} *= 10;
-         $code += $self->{next_char} - 0x0030;
+         $self->{state_keyword} += $self->{next_char} - 0x0030;
+         ## Stay in the state.
          !!!next-input-character;
-       }
+         redo A;
+       } elsif ($self->{next_char} == 0x003B) { # ;
-       if ($self->{next_char} == 0x003B) { # ;
          !!!cp (1013);
          !!!next-input-character;
+         #
        } else {
          !!!cp (1014);
-         !!!parse-error (type => 'no refc', line => $l, column => $c);
+         !!!parse-error (type => 'no refc');
+         ## Reconsume.
+         #
        }
+       my $code = $self->{state_keyword};
+       my $l = $self->{line_prev};
+       my $c = $self->{column_prev};
        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
          !!!cp (1015);
          !!!parse-error (type => 'invalid character reference',
-Line 3101 
 sub _tokenize_attempt_to_consume_an_enti
+Line 3083 
 sub _tokenize_attempt_to_consume_an_enti
                          line => $l, column => $c);
          $code = $c1_entity_char->{$code};
        }
-       return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
+       if ($self->{prev_state} == DATA_STATE) {
-               line => $l, column => $c,
+         !!!cp (992);
-              };
+         $self->{state} = $self->{prev_state};
-     } else {
+         ## Reconsume.
-       !!!cp (1019);
+         !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
-       !!!parse-error (type => 'bare nero', line => $l, column => $c);
+                   line => $l, column => $c,
-       !!!back-next-input-character ($self->{next_char});
+                  });
-       $self->{next_char} = 0x0023; # #
+         redo A;
-       return undef;
+       } else {
-     }
+         !!!cp (991);
-   } elsif ((0x0041 <= $self->{next_char} and
+         $self->{current_attribute}->{value} .= chr $code;
-             $self->{next_char} <= 0x005A) or
+         $self->{current_attribute}->{has_reference} = 1;
-            (0x0061 <= $self->{next_char} and
+         $self->{state} = $self->{prev_state};
-             $self->{next_char} <= 0x007A)) {
+         ## Reconsume.
-     my $entity_name = chr $self->{next_char};
+         redo A;
-     !!!next-input-character;
+       }
+     } elsif ($self->{state} == HEXREF_X_STATE) {
-     my $value = $entity_name;
+       if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or
-     my $match = 0;
+           (0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or
-     require Whatpm::_NamedEntityList;
+           (0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) {
-     our $EntityChar;
+         # 0..9, A..F, a..f
+         !!!cp (990);
-     while (length $entity_name < 30 and
+         $self->{state} = HEXREF_HEX_STATE;
-            ## NOTE: Some number greater than the maximum length of entity name
+         $self->{state_keyword} = 0;
-            ((0x0041 <= $self->{next_char} and # a
+         ## Reconsume.
-              $self->{next_char} <= 0x005A) or # x
+         redo A;
-             (0x0061 <= $self->{next_char} and # a
+       } else {
-              $self->{next_char} <= 0x007A) or # z
+         !!!parse-error (type => 'bare hcro',
-             (0x0030 <= $self->{next_char} and # 0
+                         line => $self->{line_prev},
-              $self->{next_char} <= 0x0039) or # 9
+                         column => $self->{column_prev} - 2);
-             $self->{next_char} == 0x003B)) { # ;
-       $entity_name .= chr $self->{next_char};
+         ## NOTE: According to the spec algorithm, nothing is returned,
-       if (defined $EntityChar->{$entity_name}) {
+         ## and then "&#" followed by "X" or "x" is appended to the parent
-         if ($self->{next_char} == 0x003B) { # ;
+         ## element or the attribute value in the later processing.
-           !!!cp (1020);
-           $value = $EntityChar->{$entity_name};
+         if ($self->{prev_state} == DATA_STATE) {
-           $match = 1;
+           !!!cp (1005);
-           !!!next-input-character;
+           $self->{state} = $self->{prev_state};
-           last;
+           ## Reconsume.
+           !!!emit ({type => CHARACTER_TOKEN,
+                     data => '&' . $self->{state_keyword},
+                     line => $self->{line_prev},
+                     column => $self->{column_prev} - length $self->{state_keyword},
+                    });
+           redo A;
+         } else {
+           !!!cp (989);
+           $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
+           $self->{state} = $self->{prev_state};
+           ## Reconsume.
+           redo A;
+         }
+       }
+     } elsif ($self->{state} == HEXREF_HEX_STATE) {
+       if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) {
+         # 0..9
+         !!!cp (1002);
+         $self->{state_keyword} *= 0x10;
+         $self->{state_keyword} += $self->{next_char} - 0x0030;
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
+       } elsif (0x0061 <= $self->{next_char} and
+                $self->{next_char} <= 0x0066) { # a..f
+         !!!cp (1003);
+         $self->{state_keyword} *= 0x10;
+         $self->{state_keyword} += $self->{next_char} - 0x0060 + 9;
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
+       } elsif (0x0041 <= $self->{next_char} and
+                $self->{next_char} <= 0x0046) { # A..F
+         !!!cp (1004);
+         $self->{state_keyword} *= 0x10;
+         $self->{state_keyword} += $self->{next_char} - 0x0040 + 9;
+         ## Stay in the state.
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_char} == 0x003B) { # ;
+         !!!cp (1006);
+         !!!next-input-character;
+         #
+       } else {
+         !!!cp (1007);
+         !!!parse-error (type => 'no refc',
+                         line => $self->{line},
+                         column => $self->{column});
+         ## Reconsume.
+         #
+       }
+       my $code = $self->{state_keyword};
+       my $l = $self->{line_prev};
+       my $c = $self->{column_prev};
+       if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
+         !!!cp (1008);
+         !!!parse-error (type => 'invalid character reference',
+                         text => (sprintf 'U+%04X', $code),
+                         line => $l, column => $c);
+         $code = 0xFFFD;
+       } elsif ($code > 0x10FFFF) {
+         !!!cp (1009);
+         !!!parse-error (type => 'invalid character reference',
+                         text => (sprintf 'U-%08X', $code),
+                         line => $l, column => $c);
+         $code = 0xFFFD;
+       } elsif ($code == 0x000D) {
+         !!!cp (1010);
+         !!!parse-error (type => 'CR character reference', line => $l, column => $c);
+         $code = 0x000A;
+       } elsif (0x80 <= $code and $code <= 0x9F) {
+         !!!cp (1011);
+         !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
+         $code = $c1_entity_char->{$code};
+       }
+       if ($self->{prev_state} == DATA_STATE) {
+         !!!cp (988);
+         $self->{state} = $self->{prev_state};
+         ## Reconsume.
+         !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
+                   line => $l, column => $c,
+                  });
+         redo A;
+       } else {
+         !!!cp (987);
+         $self->{current_attribute}->{value} .= chr $code;
+         $self->{current_attribute}->{has_reference} = 1;
+         $self->{state} = $self->{prev_state};
+         ## Reconsume.
+         redo A;
+       }
+     } elsif ($self->{state} == ENTITY_NAME_STATE) {
+       if (length $self->{state_keyword} < 30 and
+           ## NOTE: Some number greater than the maximum length of entity name
+           ((0x0041 <= $self->{next_char} and # A
+             $self->{next_char} <= 0x005A) or # Z
+            (0x0061 <= $self->{next_char} and # a
+             $self->{next_char} <= 0x007A) or # z
+            (0x0030 <= $self->{next_char} and # 0
+             $self->{next_char} <= 0x0039) or # 9
+            $self->{next_char} == 0x003B)) { # ;
+         our $EntityChar;
+         $self->{state_keyword} .= chr $self->{next_char};
+         if (defined $EntityChar->{$self->{state_keyword}}) {
+           if ($self->{next_char} == 0x003B) { # ;
+             !!!cp (1020);
+             $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
+             $self->{entity__match} = 1;
+             !!!next-input-character;
+             #
+           } else {
+             !!!cp (1021);
+             $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
+             $self->{entity__match} = -1;
+             ## Stay in the state.
+             !!!next-input-character;
+             redo A;
+           }
          } else {
-           !!!cp (1021);
+           !!!cp (1022);
-           $value = $EntityChar->{$entity_name};
+           $self->{entity__value} .= chr $self->{next_char};
-           $match = -1;
+           $self->{entity__match} *= 2;
+           ## Stay in the state.
            !!!next-input-character;
+           redo A;
+         }
+       }
+       my $data;
+       my $has_ref;
+       if ($self->{entity__match} > 0) {
+         !!!cp (1023);
+         $data = $self->{entity__value};
+         $has_ref = 1;
+         #
+       } elsif ($self->{entity__match} < 0) {
+         !!!parse-error (type => 'no refc');
+         if ($self->{prev_state} != DATA_STATE and # in attribute
+             $self->{entity__match} < -1) {
+           !!!cp (1024);
+           $data = '&' . $self->{state_keyword};
+           #
+         } else {
+           !!!cp (1025);
+           $data = $self->{entity__value};
+           $has_ref = 1;
+           #
          }
        } else {
-         !!!cp (1022);
+         !!!cp (1026);
-         $value .= chr $self->{next_char};
+         !!!parse-error (type => 'bare ero',
-         $match *= 2;
+                         line => $self->{line_prev},
-         !!!next-input-character;
+                         column => $self->{column_prev});
+         $data = '&' . $self->{state_keyword};
+         #
        }
-     }
+       ## NOTE: In these cases, when a character reference is found,
-     if ($match > 0) {
+       ## it is consumed and a character token is returned, or, otherwise,
-       !!!cp (1023);
+       ## nothing is consumed and returned, according to the spec algorithm.
-       return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
+       ## In this implementation, anything that has been examined by the
-               line => $l, column => $c,
+       ## tokenizer is appended to the parent element or the attribute value
-              };
+       ## as string, either literal string when no character reference or
-     } elsif ($match < 0) {
+       ## entity-replaced string otherwise, in this stage, since any characters
-       !!!parse-error (type => 'no refc', line => $l, column => $c);
+       ## that would not be consumed are appended in the data state or in an
-       if ($in_attr and $match < -1) {
+       ## appropriate attribute value state anyway.
-         !!!cp (1024);
-         return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
+       if ($self->{prev_state} == DATA_STATE) {
-                 line => $l, column => $c,
+         !!!cp (986);
-                };
+         $self->{state} = $self->{prev_state};
-       } else {
+         ## Reconsume.
-         !!!cp (1025);
+         !!!emit ({type => CHARACTER_TOKEN,
-         return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
+                   data => $data,
-                 line => $l, column => $c,
+                   line => $self->{line_prev},
-                };
+                   column => $self->{column_prev} + 1 - length $self->{state_keyword},
+                  });
+         redo A;
+       } else {
+         !!!cp (985);
+         $self->{current_attribute}->{value} .= $data;
+         $self->{current_attribute}->{has_reference} = 1 if $has_ref;
+         $self->{state} = $self->{prev_state};
+         ## Reconsume.
+         redo A;
        }
      } else {
-       !!!cp (1026);
+       die "$0: $self->{state}: Unknown state";
-       !!!parse-error (type => 'bare ero', line => $l, column => $c);
-       ## NOTE: "No characters are consumed" in the spec.
-       return {type => CHARACTER_TOKEN, data => '&'.$value,
-               line => $l, column => $c,
-              };
      }
-   } else {
+   } # A
-     !!!cp (1027);
-     ## no characters are consumed
+   die "$0: _get_next_token: unexpected case";
-     !!!parse-error (type => 'bare ero', line => $l, column => $c);
+ } # _get_next_token
-     return undef;
-   }
- } # _tokenize_attempt_to_consume_an_entity
  sub _initialize_tree_constructor ($) {
    my $self = shift;

 Legend:



Removed from v.1.166
 


changed lines


 
Added in v.1.170
 Legend:



Removed from v.1.166
 


changed lines


 
Added in v.1.170
-Removed from v.1.166
+Added in v.1.170

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24