/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | Patch

revision 1.167 by wakaba, Sat Sep 13 09:02:28 2008 UTC revision 1.168 by wakaba, Sat Sep 13 10:49:21 2008 UTC
# Line 769  sub RCDATA_CONTENT_MODEL () { CM_ENTITY Line 769  sub RCDATA_CONTENT_MODEL () { CM_ENTITY
769  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
770    
771  sub DATA_STATE () { 0 }  sub DATA_STATE () { 0 }
772  sub ENTITY_DATA_STATE () { 1 }  #sub ENTITY_DATA_STATE () { 1 }
773  sub TAG_OPEN_STATE () { 2 }  sub TAG_OPEN_STATE () { 2 }
774  sub CLOSE_TAG_OPEN_STATE () { 3 }  sub CLOSE_TAG_OPEN_STATE () { 3 }
775  sub TAG_NAME_STATE () { 4 }  sub TAG_NAME_STATE () { 4 }
# Line 780  sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 Line 780  sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8
780  sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }  sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
781  sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }  sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
782  sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }  sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
783  sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }  #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
784  sub MARKUP_DECLARATION_OPEN_STATE () { 13 }  sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
785  sub COMMENT_START_STATE () { 14 }  sub COMMENT_START_STATE () { 14 }
786  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
# Line 812  sub CDATA_SECTION_MSE1_STATE () { 40 } # Line 812  sub CDATA_SECTION_MSE1_STATE () { 40 } #
812  sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec  sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
813  sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec  sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
814  sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec  sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
815  sub ENTITY_STATE () { 44 } # "consume a character reference" in the spec  ## NOTE: "Entity data state", "entity in attribute value state", and
816    ## "consume a character reference" algorithm are jointly implemented
817    ## using the following six states:
818    sub ENTITY_STATE () { 44 }
819    sub ENTITY_HASH_STATE () { 45 }
820    sub NCR_NUM_STATE () { 46 }
821    sub HEXREF_X_STATE () { 47 }
822    sub HEXREF_HEX_STATE () { 48 }
823    sub ENTITY_NAME_STATE () { 49 }
824    
825  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 }
826  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
# Line 945  sub _get_next_token ($) { Line 953  sub _get_next_token ($) {
953            ## "entity data state".  In this implementation, the tokenizer            ## "entity data state".  In this implementation, the tokenizer
954            ## is switched to the |ENTITY_STATE|, which is an implementation            ## is switched to the |ENTITY_STATE|, which is an implementation
955            ## of the "consume a character reference" algorithm.            ## of the "consume a character reference" algorithm.
           #$self->{state} = ENTITY_DATA_STATE;  
956            $self->{entity_in_attr} = 0;            $self->{entity_in_attr} = 0;
957            $self->{entity_additional} = -1;            $self->{entity_additional} = -1;
958            $self->{state} = ENTITY_STATE;            $self->{state} = ENTITY_STATE;
# Line 1018  sub _get_next_token ($) { Line 1025  sub _get_next_token ($) {
1025        !!!emit ($token);        !!!emit ($token);
1026    
1027        redo A;        redo A;
     } elsif ($self->{state} == ENTITY_DATA_STATE) {  
       my ($l, $c) = ($self->{line_prev}, $self->{column_prev});  
   
       my $token = $self->{entity_return};  
   
       $self->{state} = DATA_STATE;  
       # next-input-character is already done  
   
       unless (defined $token) {  
         !!!cp (13);  
         !!!emit ({type => CHARACTER_TOKEN, data => '&',  
                   line => $l, column => $c,  
                  });  
       } else {  
         !!!cp (14);  
         !!!emit ($token);  
       }  
   
       redo A;  
1028      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
1029        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1030          if ($self->{next_char} == 0x002F) { # /          if ($self->{next_char} == 0x002F) { # /
# Line 1715  sub _get_next_token ($) { Line 1703  sub _get_next_token ($) {
1703          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1704          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
1705          ## implementation of the "consume a character reference" algorithm.          ## implementation of the "consume a character reference" algorithm.
         #$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;  
1706          $self->{entity_in_attr} = 1;          $self->{entity_in_attr} = 1;
1707          $self->{entity_additional} = 0x0022; # "          $self->{entity_additional} = 0x0022; # "
1708          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
# Line 1764  sub _get_next_token ($) { Line 1751  sub _get_next_token ($) {
1751          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1752          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
1753          ## implementation of the "consume a character reference" algorithm.          ## implementation of the "consume a character reference" algorithm.
         #$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;  
1754          $self->{entity_in_attr} = 1;          $self->{entity_in_attr} = 1;
1755          $self->{entity_additional} = 0x0027; # '          $self->{entity_additional} = 0x0027; # '
1756          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
# Line 1817  sub _get_next_token ($) { Line 1803  sub _get_next_token ($) {
1803          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1804          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
1805          ## implementation of the "consume a character reference" algorithm.          ## implementation of the "consume a character reference" algorithm.
         #$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;  
1806          $self->{entity_in_attr} = 1;          $self->{entity_in_attr} = 1;
1807          $self->{entity_additional} = -1;          $self->{entity_additional} = -1;
1808          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
# Line 1884  sub _get_next_token ($) { Line 1869  sub _get_next_token ($) {
1869          !!!next-input-character;          !!!next-input-character;
1870          redo A;          redo A;
1871        }        }
     } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {  
       my $token = $self->{entity_return};  
   
       unless (defined $token) {  
         !!!cp (117);  
         $self->{current_attribute}->{value} .= '&';  
       } else {  
         !!!cp (118);  
         $self->{current_attribute}->{value} .= $token->{data};  
         $self->{current_attribute}->{has_reference} = $token->{has_reference};  
         ## ISSUE: spec says "append the returned character token to the current attribute's value"  
       }  
   
       $self->{state} = $self->{last_attribute_value_state};  
       # next-input-character is already done  
       redo A;  
1872      } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1873        if ($self->{next_char} == 0x0009 or # HT        if ($self->{next_char} == 0x0009 or # HT
1874            $self->{next_char} == 0x000A or # LF            $self->{next_char} == 0x000A or # LF
# Line 2979  sub _get_next_token ($) { Line 2948  sub _get_next_token ($) {
2948          ## Reconsume.          ## Reconsume.
2949          redo A;          redo A;
2950        }        }
   
2951      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
2952        my $in_attr = $self->{entity_in_attr};        if ({
2953        my $additional = $self->{entity_additional};          0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2954            0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &
2955            $self->{entity_additional} => 1,
2956          }->{$self->{next_char}}) {
2957            !!!cp (1001);
2958            ## Don't consume
2959            ## No error
2960            ## Return nothing.
2961            #
2962          } elsif ($self->{next_char} == 0x0023) { # #
2963            $self->{state} = ENTITY_HASH_STATE;
2964            $self->{state_keyword} = '#';
2965            !!!next-input-character;
2966            redo A;
2967          } elsif ((0x0041 <= $self->{next_char} and
2968                    $self->{next_char} <= 0x005A) or # A..Z
2969                   (0x0061 <= $self->{next_char} and
2970                    $self->{next_char} <= 0x007A)) { # a..z
2971            require Whatpm::_NamedEntityList;
2972            $self->{state} = ENTITY_NAME_STATE;
2973            $self->{state_keyword} = chr $self->{next_char};
2974            $self->{entity__value} = $self->{state_keyword};
2975            $self->{entity__match} = 0;
2976            !!!next-input-character;
2977            redo A;
2978          } else {
2979            !!!cp (1027);
2980            !!!parse-error (type => 'bare ero');
2981            ## Return nothing.
2982            #
2983          }
2984    
2985    my ($l, $c) = ($self->{line_prev}, $self->{column_prev});        ## NOTE: No character is consumed by the "consume a character
2986          ## reference" algorithm.  In other words, there is an "&" character
2987          ## that does not introduce a character reference, which would be
2988          ## appended to the parent element or the attribute value in later
2989          ## process of the tokenizer.
2990    
2991    if ({        if ($self->{entity_in_attr}) {
2992         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,          $self->{current_attribute}->{value} .= '&';
2993         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR          $self->{state} = $self->{last_attribute_value_state};
2994         $additional => 1,          ## Reconsume.
2995        }->{$self->{next_char}}) {          redo A;
2996      !!!cp (1001);        } else {
2997      ## Don't consume          $self->{state} = DATA_STATE;
2998      ## No error          ## Reconsume.
2999      $self->{entity_return} = undef;          !!!emit ({type => CHARACTER_TOKEN, data => '&',
3000      $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;                    line => $self->{line_prev},
3001      redo A;                    column => $self->{column_prev},
3002    } elsif ($self->{next_char} == 0x0023) { # #                   });
3003      !!!next-input-character;          redo A;
3004      if ($self->{next_char} == 0x0078 or # x        }
3005          $self->{next_char} == 0x0058) { # X      } elsif ($self->{state} == ENTITY_HASH_STATE) {
3006        my $code;        if ($self->{next_char} == 0x0078 or # x
3007        X: {            $self->{next_char} == 0x0058) { # X
3008          my $x_char = $self->{next_char};          $self->{state} = HEXREF_X_STATE;
3009          !!!next-input-character;          $self->{state_keyword} .= chr $self->{next_char};
3010          if (0x0030 <= $self->{next_char} and          !!!next-input-character;
3011              $self->{next_char} <= 0x0039) { # 0..9          redo A;
3012            !!!cp (1002);        } elsif (0x0030 <= $self->{next_char} and
3013            $code ||= 0;                 $self->{next_char} <= 0x0039) { # 0..9
3014            $code *= 0x10;          $self->{state} = NCR_NUM_STATE;
3015            $code += $self->{next_char} - 0x0030;          $self->{state_keyword} = $self->{next_char} - 0x0030;
3016            redo X;          !!!next-input-character;
3017          } elsif (0x0061 <= $self->{next_char} and          redo A;
3018                   $self->{next_char} <= 0x0066) { # a..f        } else {
3019            !!!cp (1003);          !!!cp (1019);
3020            $code ||= 0;          !!!parse-error (type => 'bare nero',
3021            $code *= 0x10;                          line => $self->{line_prev},
3022            $code += $self->{next_char} - 0x0060 + 9;                          column => $self->{column_prev} - 1);
3023            redo X;  
3024          } elsif (0x0041 <= $self->{next_char} and          ## NOTE: According to the spec algorithm, nothing is returned,
3025                   $self->{next_char} <= 0x0046) { # A..F          ## and then "&#" is appended to the parent element or the attribute
3026            !!!cp (1004);          ## value in the later processing.
3027            $code ||= 0;  
3028            $code *= 0x10;          if ($self->{entity_in_attr}) {
3029            $code += $self->{next_char} - 0x0040 + 9;            $self->{current_attribute}->{value} .= '&#';
3030            redo X;            $self->{state} = $self->{last_attribute_value_state};
3031          } elsif (not defined $code) { # no hexadecimal digit            ## Reconsume.
           !!!cp (1005);  
           !!!parse-error (type => 'bare hcro', line => $l, column => $c);  
           !!!back-next-input-character ($x_char, $self->{next_char});  
           $self->{next_char} = 0x0023; # #  
           $self->{entity_return} = undef;  
           $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;  
3032            redo A;            redo A;
         } elsif ($self->{next_char} == 0x003B) { # ;  
           !!!cp (1006);  
           !!!next-input-character;  
3033          } else {          } else {
3034            !!!cp (1007);            $self->{state} = DATA_STATE;
3035            !!!parse-error (type => 'no refc', line => $l, column => $c);            ## Reconsume.
3036              !!!emit ({type => CHARACTER_TOKEN,
3037                        data => '&#',
3038                        line => $self->{line_prev},
3039                        column => $self->{column_prev} - 1,
3040                       });
3041              redo A;
3042          }          }
3043          }
3044          if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {      } elsif ($self->{state} == NCR_NUM_STATE) {
3045            !!!cp (1008);        if (0x0030 <= $self->{next_char} and
3046            !!!parse-error (type => 'invalid character reference',            $self->{next_char} <= 0x0039) { # 0..9
                           text => (sprintf 'U+%04X', $code),  
                           line => $l, column => $c);  
           $code = 0xFFFD;  
         } elsif ($code > 0x10FFFF) {  
           !!!cp (1009);  
           !!!parse-error (type => 'invalid character reference',  
                           text => (sprintf 'U-%08X', $code),  
                           line => $l, column => $c);  
           $code = 0xFFFD;  
         } elsif ($code == 0x000D) {  
           !!!cp (1010);  
           !!!parse-error (type => 'CR character reference', line => $l, column => $c);  
           $code = 0x000A;  
         } elsif (0x80 <= $code and $code <= 0x9F) {  
           !!!cp (1011);  
           !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);  
           $code = $c1_entity_char->{$code};  
         }  
   
         $self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code,  
                 has_reference => 1,  
                 line => $l, column => $c,  
                };  
         $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;  
         redo A;  
       } # X  
     } elsif (0x0030 <= $self->{next_char} and  
              $self->{next_char} <= 0x0039) { # 0..9  
       my $code = $self->{next_char} - 0x0030;  
       !!!next-input-character;  
         
       while (0x0030 <= $self->{next_char} and  
                 $self->{next_char} <= 0x0039) { # 0..9  
3047          !!!cp (1012);          !!!cp (1012);
3048          $code *= 10;          $self->{state_keyword} *= 10;
3049          $code += $self->{next_char} - 0x0030;          $self->{state_keyword} += $self->{next_char} - 0x0030;
3050                    
3051            ## Stay in the state.
3052          !!!next-input-character;          !!!next-input-character;
3053        }          redo A;
3054          } elsif ($self->{next_char} == 0x003B) { # ;
       if ($self->{next_char} == 0x003B) { # ;  
3055          !!!cp (1013);          !!!cp (1013);
3056          !!!next-input-character;          !!!next-input-character;
3057            #
3058        } else {        } else {
3059          !!!cp (1014);          !!!cp (1014);
3060          !!!parse-error (type => 'no refc', line => $l, column => $c);          !!!parse-error (type => 'no refc');
3061            ## Reconsume.
3062            #
3063        }        }
3064    
3065          my $code = $self->{state_keyword};
3066          my $l = $self->{line_prev};
3067          my $c = $self->{column_prev};
3068        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3069          !!!cp (1015);          !!!cp (1015);
3070          !!!parse-error (type => 'invalid character reference',          !!!parse-error (type => 'invalid character reference',
# Line 3117  sub _get_next_token ($) { Line 3089  sub _get_next_token ($) {
3089                          line => $l, column => $c);                          line => $l, column => $c);
3090          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
3091        }        }
3092          
3093        $self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,        if ($self->{entity_in_attr}) {
3094                line => $l, column => $c,          $self->{current_attribute}->{value} .= chr $code;
3095               };          $self->{current_attribute}->{has_reference} = 1;
3096        $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;          $self->{state} = $self->{last_attribute_value_state};
3097        redo A;          ## Reconsume.
3098      } else {          redo A;
3099        !!!cp (1019);        } else {
3100        !!!parse-error (type => 'bare nero', line => $l, column => $c);          $self->{state} = DATA_STATE;
3101        !!!back-next-input-character ($self->{next_char});          ## Reconsume.
3102        $self->{next_char} = 0x0023; # #          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3103        $self->{entity_return} = undef;                    has_reference => 1,
3104        $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;                    line => $l, column => $c,
3105        redo A;                   });
3106      }          redo A;
3107    } elsif ((0x0041 <= $self->{next_char} and        }
3108              $self->{next_char} <= 0x005A) or      } elsif ($self->{state} == HEXREF_X_STATE) {
3109             (0x0061 <= $self->{next_char} and        if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or
3110              $self->{next_char} <= 0x007A)) {            (0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or
3111      my $entity_name = chr $self->{next_char};            (0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) {
3112      !!!next-input-character;          # 0..9, A..F, a..f
3113            $self->{state} = HEXREF_HEX_STATE;
3114      my $value = $entity_name;          $self->{state_keyword} = 0;
3115      my $match = 0;          ## Reconsume.
3116      require Whatpm::_NamedEntityList;          redo A;
3117      our $EntityChar;        } else {
3118            !!!cp (1005);
3119      while (length $entity_name < 30 and          !!!parse-error (type => 'bare hcro',
3120             ## NOTE: Some number greater than the maximum length of entity name                          line => $self->{line_prev},
3121             ((0x0041 <= $self->{next_char} and # a                          column => $self->{column_prev} - 2);
3122               $self->{next_char} <= 0x005A) or # x  
3123              (0x0061 <= $self->{next_char} and # a          ## NOTE: According to the spec algorithm, nothing is returned,
3124               $self->{next_char} <= 0x007A) or # z          ## and then "&#" followed by "X" or "x" is appended to the parent
3125              (0x0030 <= $self->{next_char} and # 0          ## element or the attribute value in the later processing.
3126               $self->{next_char} <= 0x0039) or # 9  
3127              $self->{next_char} == 0x003B)) { # ;          if ($self->{entity_in_attr}) {
3128        $entity_name .= chr $self->{next_char};            $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
3129        if (defined $EntityChar->{$entity_name}) {            $self->{state} = $self->{last_attribute_value_state};
3130          if ($self->{next_char} == 0x003B) { # ;            ## Reconsume.
3131            !!!cp (1020);            redo A;
           $value = $EntityChar->{$entity_name};  
           $match = 1;  
           !!!next-input-character;  
           last;  
3132          } else {          } else {
3133            !!!cp (1021);            $self->{state} = DATA_STATE;
3134            $value = $EntityChar->{$entity_name};            ## Reconsume.
3135            $match = -1;            !!!emit ({type => CHARACTER_TOKEN,
3136            !!!next-input-character;                      data => '&' . $self->{state_keyword},
3137                        line => $self->{line_prev},
3138                        column => $self->{column_prev} - length $self->{state_keyword},
3139                       });
3140              redo A;
3141          }          }
3142        } else {        }
3143          !!!cp (1022);      } elsif ($self->{state} == HEXREF_HEX_STATE) {
3144          $value .= chr $self->{next_char};        if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) {
3145          $match *= 2;          # 0..9
3146            !!!cp (1002);
3147            $self->{state_keyword} *= 0x10;
3148            $self->{state_keyword} += $self->{next_char} - 0x0030;
3149            ## Stay in the state.
3150          !!!next-input-character;          !!!next-input-character;
3151            redo A;
3152          } elsif (0x0061 <= $self->{next_char} and
3153                   $self->{next_char} <= 0x0066) { # a..f
3154            !!!cp (1003);
3155            $self->{state_keyword} *= 0x10;
3156            $self->{state_keyword} += $self->{next_char} - 0x0060 + 9;
3157            ## Stay in the state.
3158            !!!next-input-character;
3159            redo A;
3160          } elsif (0x0041 <= $self->{next_char} and
3161                   $self->{next_char} <= 0x0046) { # A..F
3162            !!!cp (1004);
3163            $self->{state_keyword} *= 0x10;
3164            $self->{state_keyword} += $self->{next_char} - 0x0040 + 9;
3165            ## Stay in the state.
3166            !!!next-input-character;
3167            redo A;
3168          } elsif ($self->{next_char} == 0x003B) { # ;
3169            !!!cp (1006);
3170            !!!next-input-character;
3171            #
3172          } else {
3173            !!!cp (1007);
3174            !!!parse-error (type => 'no refc',
3175                            line => $self->{line},
3176                            column => $self->{column});
3177            ## Reconsume.
3178            #
3179        }        }
3180      }  
3181              my $code = $self->{state_keyword};
3182      if ($match > 0) {        my $l = $self->{line_prev};
3183        !!!cp (1023);        my $c = $self->{column_prev};
3184        $self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1,        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3185                line => $l, column => $c,          !!!cp (1008);
3186               };          !!!parse-error (type => 'invalid character reference',
3187        $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;                          text => (sprintf 'U+%04X', $code),
3188        redo A;                          line => $l, column => $c);
3189      } elsif ($match < 0) {          $code = 0xFFFD;
3190        !!!parse-error (type => 'no refc', line => $l, column => $c);        } elsif ($code > 0x10FFFF) {
3191        if ($in_attr and $match < -1) {          !!!cp (1009);
3192          !!!cp (1024);          !!!parse-error (type => 'invalid character reference',
3193          $self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$entity_name,                          text => (sprintf 'U-%08X', $code),
3194                  line => $l, column => $c,                          line => $l, column => $c);
3195                 };          $code = 0xFFFD;
3196          $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;        } elsif ($code == 0x000D) {
3197            !!!cp (1010);
3198            !!!parse-error (type => 'CR character reference', line => $l, column => $c);
3199            $code = 0x000A;
3200          } elsif (0x80 <= $code and $code <= 0x9F) {
3201            !!!cp (1011);
3202            !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
3203            $code = $c1_entity_char->{$code};
3204          }
3205    
3206          if ($self->{entity_in_attr}) {
3207            $self->{current_attribute}->{value} .= chr $code;
3208            $self->{current_attribute}->{has_reference} = 1;
3209            $self->{state} = $self->{last_attribute_value_state};
3210            ## Reconsume.
3211          redo A;          redo A;
3212        } else {        } else {
3213          !!!cp (1025);          $self->{state} = DATA_STATE;
3214          $self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1,          ## Reconsume.
3215                  line => $l, column => $c,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3216                 };                    has_reference => 1,
3217          $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;                    line => $l, column => $c,
3218                     });
3219          redo A;          redo A;
3220        }        }
3221      } else {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3222        !!!cp (1026);        if (length $self->{state_keyword} < 30 and
3223        !!!parse-error (type => 'bare ero', line => $l, column => $c);            ## NOTE: Some number greater than the maximum length of entity name
3224        ## NOTE: "No characters are consumed" in the spec.            ((0x0041 <= $self->{next_char} and # a
3225        $self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$value,              $self->{next_char} <= 0x005A) or # x
3226                line => $l, column => $c,             (0x0061 <= $self->{next_char} and # a
3227               };              $self->{next_char} <= 0x007A) or # z
3228        $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;             (0x0030 <= $self->{next_char} and # 0
3229        redo A;              $self->{next_char} <= 0x0039) or # 9
3230      }             $self->{next_char} == 0x003B)) { # ;
3231    } else {          our $EntityChar;
3232      !!!cp (1027);          $self->{state_keyword} .= chr $self->{next_char};
3233      ## no characters are consumed          if (defined $EntityChar->{$self->{state_keyword}}) {
3234      !!!parse-error (type => 'bare ero', line => $l, column => $c);            if ($self->{next_char} == 0x003B) { # ;
3235      $self->{entity_return} = undef;              !!!cp (1020);
3236      $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;              $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3237      redo A;              $self->{entity__match} = 1;
3238    }              !!!next-input-character;
3239                #
3240              } else {
3241                !!!cp (1021);
3242                $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3243                $self->{entity__match} = -1;
3244                ## Stay in the state.
3245                !!!next-input-character;
3246                redo A;
3247              }
3248            } else {
3249              !!!cp (1022);
3250              $self->{entity__value} .= chr $self->{next_char};
3251              $self->{entity__match} *= 2;
3252              ## Stay in the state.
3253              !!!next-input-character;
3254              redo A;
3255            }
3256          }
3257    
3258          my $data;
3259          my $has_ref;
3260          if ($self->{entity__match} > 0) {
3261            !!!cp (1023);
3262            $data = $self->{entity__value};
3263            $has_ref = 1;
3264            #
3265          } elsif ($self->{entity__match} < 0) {
3266            !!!parse-error (type => 'no refc');
3267            if ($self->{entity_in_attr} and $self->{entity__match} < -1) {
3268              !!!cp (1024);
3269              $data = '&' . $self->{state_keyword};
3270              #
3271            } else {
3272              !!!cp (1025);
3273              $data = $self->{entity__value};
3274              $has_ref = 1;
3275              #
3276            }
3277          } else {
3278            !!!cp (1026);
3279            !!!parse-error (type => 'bare ero',
3280                            line => $self->{line_prev},
3281                            column => $self->{column_prev});
3282            $data = '&' . $self->{state_keyword};
3283            #
3284          }
3285      
3286          ## NOTE: In these cases, when a character reference is found,
3287          ## it is consumed and a character token is returned, or, otherwise,
3288          ## nothing is consumed and returned, according to the spec algorithm.
3289          ## In this implementation, anything that has been examined by the
3290          ## tokenizer is appended to the parent element or the attribute value
3291          ## as string, either literal string when no character reference or
3292          ## entity-replaced string otherwise, in this stage, since any characters
3293          ## that would not be consumed are appended in the data state or in an
3294          ## appropriate attribute value state anyway.
3295    
3296          if ($self->{entity_in_attr}) {
3297            $self->{current_attribute}->{value} .= $data;
3298            $self->{current_attribute}->{has_reference} = 1 if $has_ref;
3299            $self->{state} = $self->{last_attribute_value_state};
3300            ## Reconsume.
3301            redo A;
3302          } else {
3303            $self->{state} = DATA_STATE;
3304            ## Reconsume.
3305            !!!emit ({type => CHARACTER_TOKEN,
3306                      data => $data, has_reference => $has_ref,
3307                      line => $self->{line_prev},
3308                      column => $self->{column_prev} + 1 - length $self->{state_keyword},
3309                     });
3310            redo A;
3311          }
3312      } else {      } else {
3313        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
3314      }      }

Legend:
Removed from v.1.167  
changed lines
  Added in v.1.168

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24