/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.166 by wakaba, Sat Sep 13 08:21:35 2008 UTC revision 1.170 by wakaba, Sat Sep 13 12:25:44 2008 UTC
# Line 669  sub parse_char_stream ($$$;$) { Line 669  sub parse_char_stream ($$$;$) {
669        $self->{column} = 0;        $self->{column} = 0;
670      } elsif ($self->{next_char} == 0x000D) { # CR      } elsif ($self->{next_char} == 0x000D) { # CR
671        !!!cp ('j2');        !!!cp ('j2');
672    ## TODO: support for abort/streaming
673        my $next = $input->getc;        my $next = $input->getc;
674        if (defined $next and $next ne "\x0A") {        if (defined $next and $next ne "\x0A") {
675          $self->{next_next_char} = $next;          $self->{next_next_char} = $next;
# Line 769  sub RCDATA_CONTENT_MODEL () { CM_ENTITY Line 770  sub RCDATA_CONTENT_MODEL () { CM_ENTITY
770  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
771    
772  sub DATA_STATE () { 0 }  sub DATA_STATE () { 0 }
773  sub ENTITY_DATA_STATE () { 1 }  #sub ENTITY_DATA_STATE () { 1 }
774  sub TAG_OPEN_STATE () { 2 }  sub TAG_OPEN_STATE () { 2 }
775  sub CLOSE_TAG_OPEN_STATE () { 3 }  sub CLOSE_TAG_OPEN_STATE () { 3 }
776  sub TAG_NAME_STATE () { 4 }  sub TAG_NAME_STATE () { 4 }
# Line 780  sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 Line 781  sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8
781  sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }  sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
782  sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }  sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
783  sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }  sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
784  sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }  #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
785  sub MARKUP_DECLARATION_OPEN_STATE () { 13 }  sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
786  sub COMMENT_START_STATE () { 14 }  sub COMMENT_START_STATE () { 14 }
787  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
# Line 812  sub CDATA_SECTION_MSE1_STATE () { 40 } # Line 813  sub CDATA_SECTION_MSE1_STATE () { 40 } #
813  sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec  sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
814  sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec  sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
815  sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec  sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
816    ## NOTE: "Entity data state", "entity in attribute value state", and
817    ## "consume a character reference" algorithm are jointly implemented
818    ## using the following six states:
819    sub ENTITY_STATE () { 44 }
820    sub ENTITY_HASH_STATE () { 45 }
821    sub NCR_NUM_STATE () { 46 }
822    sub HEXREF_X_STATE () { 47 }
823    sub HEXREF_HEX_STATE () { 48 }
824    sub ENTITY_NAME_STATE () { 49 }
825    
826  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 }
827  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
# Line 865  sub _initialize_tokenizer ($) { Line 875  sub _initialize_tokenizer ($) {
875    my $self = shift;    my $self = shift;
876    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
877    #$self->{state_keyword}; # initialized when used    #$self->{state_keyword}; # initialized when used
878      #$self->{entity__value}; # initialized when used
879      #$self->{entity__match}; # initialized when used
880    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
881    undef $self->{current_token};    undef $self->{current_token};
882    undef $self->{current_attribute};    undef $self->{current_attribute};
883    undef $self->{last_emitted_start_tag_name};    undef $self->{last_emitted_start_tag_name};
884    undef $self->{last_attribute_value_state};    #$self->{prev_state}; # initialized when used
885    delete $self->{self_closing};    delete $self->{self_closing};
   $self->{char} = [];  
886    # $self->{next_char}    # $self->{next_char}
887    !!!next-input-character;    !!!next-input-character;
888    $self->{token} = [];    $self->{token} = [];
# Line 903  sub _initialize_tokenizer ($) { Line 914  sub _initialize_tokenizer ($) {
914  ## has completed loading.  If one has, then it MUST be executed  ## has completed loading.  If one has, then it MUST be executed
915  ## and removed from the list.  ## and removed from the list.
916    
917  ## NOTE: HTML5 "Writing HTML documents" section, applied to  ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
918  ## documents and not to user agents and conformance checkers,  ## (This requirement was dropped from HTML5 spec, unfortunately.)
 ## contains some requirements that are not detected by the  
 ## parsing algorithm:  
 ## - Some requirements on character encoding declarations. ## TODO  
 ## - "Elements MUST NOT contain content that their content model disallows."  
 ##   ... Some are parse error, some are not (will be reported by c.c.).  
 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO  
 ## - Text (in elements, attributes, and comments) SHOULD NOT contain  
 ##   control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL?  Unicode control character?)  
   
 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot  
 ## be detected by the HTML5 parsing algorithm:  
 ## - Text,  
919    
920  sub _get_next_token ($) {  sub _get_next_token ($) {
921    my $self = shift;    my $self = shift;
# Line 940  sub _get_next_token ($) { Line 939  sub _get_next_token ($) {
939          if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA          if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
940              not $self->{escape}) {              not $self->{escape}) {
941            !!!cp (1);            !!!cp (1);
942            $self->{state} = ENTITY_DATA_STATE;            ## NOTE: In the spec, the tokenizer is switched to the
943              ## "entity data state".  In this implementation, the tokenizer
944              ## is switched to the |ENTITY_STATE|, which is an implementation
945              ## of the "consume a character reference" algorithm.
946              $self->{entity_additional} = -1;
947              $self->{prev_state} = DATA_STATE;
948              $self->{state} = ENTITY_STATE;
949            !!!next-input-character;            !!!next-input-character;
950            redo A;            redo A;
951          } else {          } else {
# Line 1010  sub _get_next_token ($) { Line 1015  sub _get_next_token ($) {
1015        !!!emit ($token);        !!!emit ($token);
1016    
1017        redo A;        redo A;
     } elsif ($self->{state} == ENTITY_DATA_STATE) {  
       ## (cannot happen in CDATA state)  
   
       my ($l, $c) = ($self->{line_prev}, $self->{column_prev});  
         
       my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);  
   
       $self->{state} = DATA_STATE;  
       # next-input-character is already done  
   
       unless (defined $token) {  
         !!!cp (13);  
         !!!emit ({type => CHARACTER_TOKEN, data => '&',  
                   line => $l, column => $c,  
                  });  
       } else {  
         !!!cp (14);  
         !!!emit ($token);  
       }  
   
       redo A;  
1018      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
1019        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1020          if ($self->{next_char} == 0x002F) { # /          if ($self->{next_char} == 0x002F) { # /
# Line 1704  sub _get_next_token ($) { Line 1688  sub _get_next_token ($) {
1688          redo A;          redo A;
1689        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{next_char} == 0x0026) { # &
1690          !!!cp (96);          !!!cp (96);
1691          $self->{last_attribute_value_state} = $self->{state};          ## NOTE: In the spec, the tokenizer is switched to the
1692          $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;          ## "entity in attribute value state".  In this implementation, the
1693            ## tokenizer is switched to the |ENTITY_STATE|, which is an
1694            ## implementation of the "consume a character reference" algorithm.
1695            $self->{prev_state} = $self->{state};
1696            $self->{entity_additional} = 0x0022; # "
1697            $self->{state} = ENTITY_STATE;
1698          !!!next-input-character;          !!!next-input-character;
1699          redo A;          redo A;
1700        } elsif ($self->{next_char} == -1) {        } elsif ($self->{next_char} == -1) {
# Line 1746  sub _get_next_token ($) { Line 1735  sub _get_next_token ($) {
1735          redo A;          redo A;
1736        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{next_char} == 0x0026) { # &
1737          !!!cp (102);          !!!cp (102);
1738          $self->{last_attribute_value_state} = $self->{state};          ## NOTE: In the spec, the tokenizer is switched to the
1739          $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;          ## "entity in attribute value state".  In this implementation, the
1740            ## tokenizer is switched to the |ENTITY_STATE|, which is an
1741            ## implementation of the "consume a character reference" algorithm.
1742            $self->{entity_additional} = 0x0027; # '
1743            $self->{prev_state} = $self->{state};
1744            $self->{state} = ENTITY_STATE;
1745          !!!next-input-character;          !!!next-input-character;
1746          redo A;          redo A;
1747        } elsif ($self->{next_char} == -1) {        } elsif ($self->{next_char} == -1) {
# Line 1792  sub _get_next_token ($) { Line 1786  sub _get_next_token ($) {
1786          redo A;          redo A;
1787        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{next_char} == 0x0026) { # &
1788          !!!cp (108);          !!!cp (108);
1789          $self->{last_attribute_value_state} = $self->{state};          ## NOTE: In the spec, the tokenizer is switched to the
1790          $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;          ## "entity in attribute value state".  In this implementation, the
1791            ## tokenizer is switched to the |ENTITY_STATE|, which is an
1792            ## implementation of the "consume a character reference" algorithm.
1793            $self->{entity_additional} = -1;
1794            $self->{prev_state} = $self->{state};
1795            $self->{state} = ENTITY_STATE;
1796          !!!next-input-character;          !!!next-input-character;
1797          redo A;          redo A;
1798        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{next_char} == 0x003E) { # >
# Line 1857  sub _get_next_token ($) { Line 1856  sub _get_next_token ($) {
1856          !!!next-input-character;          !!!next-input-character;
1857          redo A;          redo A;
1858        }        }
     } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {  
       my $token = $self->_tokenize_attempt_to_consume_an_entity  
           (1,  
            $self->{last_attribute_value_state}  
              == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "  
            $self->{last_attribute_value_state}  
              == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '  
            -1);  
   
       unless (defined $token) {  
         !!!cp (117);  
         $self->{current_attribute}->{value} .= '&';  
       } else {  
         !!!cp (118);  
         $self->{current_attribute}->{value} .= $token->{data};  
         $self->{current_attribute}->{has_reference} = $token->{has_reference};  
         ## ISSUE: spec says "append the returned character token to the current attribute's value"  
       }  
   
       $self->{state} = $self->{last_attribute_value_state};  
       # next-input-character is already done  
       redo A;  
1859      } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1860        if ($self->{next_char} == 0x0009 or # HT        if ($self->{next_char} == 0x0009 or # HT
1861            $self->{next_char} == 0x000A or # LF            $self->{next_char} == 0x000A or # LF
# Line 1998  sub _get_next_token ($) { Line 1975  sub _get_next_token ($) {
1975        }        }
1976      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1977        ## (only happen if PCDATA state)        ## (only happen if PCDATA state)
         
       ## NOTE: Set by the previous state  
       #my $token = {type => COMMENT_TOKEN, data => ''};  
   
       BC: {  
         if ($self->{next_char} == 0x003E) { # >  
           !!!cp (124);  
           $self->{state} = DATA_STATE;  
           !!!next-input-character;  
   
           !!!emit ($self->{current_token}); # comment  
   
           redo A;  
         } elsif ($self->{next_char} == -1) {  
           !!!cp (125);  
           $self->{state} = DATA_STATE;  
           ## reconsume  
1978    
1979            !!!emit ($self->{current_token}); # comment        ## NOTE: Unlike spec's "bogus comment state", this implementation
1980          ## consumes characters on a one-by-one basis.
1981          
1982          if ($self->{next_char} == 0x003E) { # >
1983            !!!cp (124);
1984            $self->{state} = DATA_STATE;
1985            !!!next-input-character;
1986    
1987            redo A;          !!!emit ($self->{current_token}); # comment
1988          } else {          redo A;
1989            !!!cp (126);        } elsif ($self->{next_char} == -1) {
1990            $self->{current_token}->{data} .= chr ($self->{next_char}); # comment          !!!cp (125);
1991            !!!next-input-character;          $self->{state} = DATA_STATE;
1992            redo BC;          ## reconsume
         }  
       } # BC  
1993    
1994        die "$0: _get_next_token: unexpected case [BC]";          !!!emit ($self->{current_token}); # comment
1995            redo A;
1996          } else {
1997            !!!cp (126);
1998            $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1999            ## Stay in the state.
2000            !!!next-input-character;
2001            redo A;
2002          }
2003      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2004        ## (only happen if PCDATA state)        ## (only happen if PCDATA state)
2005                
# Line 2963  sub _get_next_token ($) { Line 2935  sub _get_next_token ($) {
2935          ## Reconsume.          ## Reconsume.
2936          redo A;          redo A;
2937        }        }
2938      } else {      } elsif ($self->{state} == ENTITY_STATE) {
2939        die "$0: $self->{state}: Unknown state";        if ({
2940      }          0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2941    } # A            0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &
2942            $self->{entity_additional} => 1,
2943    die "$0: _get_next_token: unexpected case";        }->{$self->{next_char}}) {
2944  } # _get_next_token          !!!cp (1001);
2945            ## Don't consume
2946  sub _tokenize_attempt_to_consume_an_entity ($$$) {          ## No error
2947    my ($self, $in_attr, $additional) = @_;          ## Return nothing.
2948            #
2949          } elsif ($self->{next_char} == 0x0023) { # #
2950            !!!cp (999);
2951            $self->{state} = ENTITY_HASH_STATE;
2952            $self->{state_keyword} = '#';
2953            !!!next-input-character;
2954            redo A;
2955          } elsif ((0x0041 <= $self->{next_char} and
2956                    $self->{next_char} <= 0x005A) or # A..Z
2957                   (0x0061 <= $self->{next_char} and
2958                    $self->{next_char} <= 0x007A)) { # a..z
2959            !!!cp (998);
2960            require Whatpm::_NamedEntityList;
2961            $self->{state} = ENTITY_NAME_STATE;
2962            $self->{state_keyword} = chr $self->{next_char};
2963            $self->{entity__value} = $self->{state_keyword};
2964            $self->{entity__match} = 0;
2965            !!!next-input-character;
2966            redo A;
2967          } else {
2968            !!!cp (1027);
2969            !!!parse-error (type => 'bare ero');
2970            ## Return nothing.
2971            #
2972          }
2973    
2974    my ($l, $c) = ($self->{line_prev}, $self->{column_prev});        ## NOTE: No character is consumed by the "consume a character
2975          ## reference" algorithm.  In other words, there is an "&" character
2976          ## that does not introduce a character reference, which would be
2977          ## appended to the parent element or the attribute value in the later
2978          ## processing of the tokenizer.
2979    
2980          if ($self->{prev_state} == DATA_STATE) {
2981            !!!cp (997);
2982            $self->{state} = $self->{prev_state};
2983            ## Reconsume.
2984            !!!emit ({type => CHARACTER_TOKEN, data => '&',
2985                      line => $self->{line_prev},
2986                      column => $self->{column_prev},
2987                     });
2988            redo A;
2989          } else {
2990            !!!cp (996);
2991            $self->{current_attribute}->{value} .= '&';
2992            $self->{state} = $self->{prev_state};
2993            ## Reconsume.
2994            redo A;
2995          }
2996        } elsif ($self->{state} == ENTITY_HASH_STATE) {
2997          if ($self->{next_char} == 0x0078 or # x
2998              $self->{next_char} == 0x0058) { # X
2999            !!!cp (995);
3000            $self->{state} = HEXREF_X_STATE;
3001            $self->{state_keyword} .= chr $self->{next_char};
3002            !!!next-input-character;
3003            redo A;
3004          } elsif (0x0030 <= $self->{next_char} and
3005                   $self->{next_char} <= 0x0039) { # 0..9
3006            !!!cp (994);
3007            $self->{state} = NCR_NUM_STATE;
3008            $self->{state_keyword} = $self->{next_char} - 0x0030;
3009            !!!next-input-character;
3010            redo A;
3011          } else {
3012            !!!parse-error (type => 'bare nero',
3013                            line => $self->{line_prev},
3014                            column => $self->{column_prev} - 1);
3015    
3016    if ({          ## NOTE: According to the spec algorithm, nothing is returned,
3017         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,          ## and then "&#" is appended to the parent element or the attribute
3018         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR          ## value in the later processing.
3019         $additional => 1,  
3020        }->{$self->{next_char}}) {          if ($self->{prev_state} == DATA_STATE) {
3021      !!!cp (1001);            !!!cp (1019);
3022      ## Don't consume            $self->{state} = $self->{prev_state};
3023      ## No error            ## Reconsume.
3024      return undef;            !!!emit ({type => CHARACTER_TOKEN,
3025    } elsif ($self->{next_char} == 0x0023) { # #                      data => '&#',
3026      !!!next-input-character;                      line => $self->{line_prev},
3027      if ($self->{next_char} == 0x0078 or # x                      column => $self->{column_prev} - 1,
3028          $self->{next_char} == 0x0058) { # X                     });
3029        my $code;            redo A;
       X: {  
         my $x_char = $self->{next_char};  
         !!!next-input-character;  
         if (0x0030 <= $self->{next_char} and  
             $self->{next_char} <= 0x0039) { # 0..9  
           !!!cp (1002);  
           $code ||= 0;  
           $code *= 0x10;  
           $code += $self->{next_char} - 0x0030;  
           redo X;  
         } elsif (0x0061 <= $self->{next_char} and  
                  $self->{next_char} <= 0x0066) { # a..f  
           !!!cp (1003);  
           $code ||= 0;  
           $code *= 0x10;  
           $code += $self->{next_char} - 0x0060 + 9;  
           redo X;  
         } elsif (0x0041 <= $self->{next_char} and  
                  $self->{next_char} <= 0x0046) { # A..F  
           !!!cp (1004);  
           $code ||= 0;  
           $code *= 0x10;  
           $code += $self->{next_char} - 0x0040 + 9;  
           redo X;  
         } elsif (not defined $code) { # no hexadecimal digit  
           !!!cp (1005);  
           !!!parse-error (type => 'bare hcro', line => $l, column => $c);  
           !!!back-next-input-character ($x_char, $self->{next_char});  
           $self->{next_char} = 0x0023; # #  
           return undef;  
         } elsif ($self->{next_char} == 0x003B) { # ;  
           !!!cp (1006);  
           !!!next-input-character;  
3030          } else {          } else {
3031            !!!cp (1007);            !!!cp (993);
3032            !!!parse-error (type => 'no refc', line => $l, column => $c);            $self->{current_attribute}->{value} .= '&#';
3033              $self->{state} = $self->{prev_state};
3034              ## Reconsume.
3035              redo A;
3036          }          }
3037          }
3038          if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {      } elsif ($self->{state} == NCR_NUM_STATE) {
3039            !!!cp (1008);        if (0x0030 <= $self->{next_char} and
3040            !!!parse-error (type => 'invalid character reference',            $self->{next_char} <= 0x0039) { # 0..9
                           text => (sprintf 'U+%04X', $code),  
                           line => $l, column => $c);  
           $code = 0xFFFD;  
         } elsif ($code > 0x10FFFF) {  
           !!!cp (1009);  
           !!!parse-error (type => 'invalid character reference',  
                           text => (sprintf 'U-%08X', $code),  
                           line => $l, column => $c);  
           $code = 0xFFFD;  
         } elsif ($code == 0x000D) {  
           !!!cp (1010);  
           !!!parse-error (type => 'CR character reference', line => $l, column => $c);  
           $code = 0x000A;  
         } elsif (0x80 <= $code and $code <= 0x9F) {  
           !!!cp (1011);  
           !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);  
           $code = $c1_entity_char->{$code};  
         }  
   
         return {type => CHARACTER_TOKEN, data => chr $code,  
                 has_reference => 1,  
                 line => $l, column => $c,  
                };  
       } # X  
     } elsif (0x0030 <= $self->{next_char} and  
              $self->{next_char} <= 0x0039) { # 0..9  
       my $code = $self->{next_char} - 0x0030;  
       !!!next-input-character;  
         
       while (0x0030 <= $self->{next_char} and  
                 $self->{next_char} <= 0x0039) { # 0..9  
3041          !!!cp (1012);          !!!cp (1012);
3042          $code *= 10;          $self->{state_keyword} *= 10;
3043          $code += $self->{next_char} - 0x0030;          $self->{state_keyword} += $self->{next_char} - 0x0030;
3044                    
3045            ## Stay in the state.
3046          !!!next-input-character;          !!!next-input-character;
3047        }          redo A;
3048          } elsif ($self->{next_char} == 0x003B) { # ;
       if ($self->{next_char} == 0x003B) { # ;  
3049          !!!cp (1013);          !!!cp (1013);
3050          !!!next-input-character;          !!!next-input-character;
3051            #
3052        } else {        } else {
3053          !!!cp (1014);          !!!cp (1014);
3054          !!!parse-error (type => 'no refc', line => $l, column => $c);          !!!parse-error (type => 'no refc');
3055            ## Reconsume.
3056            #
3057        }        }
3058    
3059          my $code = $self->{state_keyword};
3060          my $l = $self->{line_prev};
3061          my $c = $self->{column_prev};
3062        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3063          !!!cp (1015);          !!!cp (1015);
3064          !!!parse-error (type => 'invalid character reference',          !!!parse-error (type => 'invalid character reference',
# Line 3101  sub _tokenize_attempt_to_consume_an_enti Line 3083  sub _tokenize_attempt_to_consume_an_enti
3083                          line => $l, column => $c);                          line => $l, column => $c);
3084          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
3085        }        }
3086          
3087        return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,        if ($self->{prev_state} == DATA_STATE) {
3088                line => $l, column => $c,          !!!cp (992);
3089               };          $self->{state} = $self->{prev_state};
3090      } else {          ## Reconsume.
3091        !!!cp (1019);          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3092        !!!parse-error (type => 'bare nero', line => $l, column => $c);                    line => $l, column => $c,
3093        !!!back-next-input-character ($self->{next_char});                   });
3094        $self->{next_char} = 0x0023; # #          redo A;
3095        return undef;        } else {
3096      }          !!!cp (991);
3097    } elsif ((0x0041 <= $self->{next_char} and          $self->{current_attribute}->{value} .= chr $code;
3098              $self->{next_char} <= 0x005A) or          $self->{current_attribute}->{has_reference} = 1;
3099             (0x0061 <= $self->{next_char} and          $self->{state} = $self->{prev_state};
3100              $self->{next_char} <= 0x007A)) {          ## Reconsume.
3101      my $entity_name = chr $self->{next_char};          redo A;
3102      !!!next-input-character;        }
3103        } elsif ($self->{state} == HEXREF_X_STATE) {
3104      my $value = $entity_name;        if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or
3105      my $match = 0;            (0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or
3106      require Whatpm::_NamedEntityList;            (0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) {
3107      our $EntityChar;          # 0..9, A..F, a..f
3108            !!!cp (990);
3109      while (length $entity_name < 30 and          $self->{state} = HEXREF_HEX_STATE;
3110             ## NOTE: Some number greater than the maximum length of entity name          $self->{state_keyword} = 0;
3111             ((0x0041 <= $self->{next_char} and # a          ## Reconsume.
3112               $self->{next_char} <= 0x005A) or # x          redo A;
3113              (0x0061 <= $self->{next_char} and # a        } else {
3114               $self->{next_char} <= 0x007A) or # z          !!!parse-error (type => 'bare hcro',
3115              (0x0030 <= $self->{next_char} and # 0                          line => $self->{line_prev},
3116               $self->{next_char} <= 0x0039) or # 9                          column => $self->{column_prev} - 2);
3117              $self->{next_char} == 0x003B)) { # ;  
3118        $entity_name .= chr $self->{next_char};          ## NOTE: According to the spec algorithm, nothing is returned,
3119        if (defined $EntityChar->{$entity_name}) {          ## and then "&#" followed by "X" or "x" is appended to the parent
3120          if ($self->{next_char} == 0x003B) { # ;          ## element or the attribute value in the later processing.
3121            !!!cp (1020);  
3122            $value = $EntityChar->{$entity_name};          if ($self->{prev_state} == DATA_STATE) {
3123            $match = 1;            !!!cp (1005);
3124            !!!next-input-character;            $self->{state} = $self->{prev_state};
3125            last;            ## Reconsume.
3126              !!!emit ({type => CHARACTER_TOKEN,
3127                        data => '&' . $self->{state_keyword},
3128                        line => $self->{line_prev},
3129                        column => $self->{column_prev} - length $self->{state_keyword},
3130                       });
3131              redo A;
3132            } else {
3133              !!!cp (989);
3134              $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
3135              $self->{state} = $self->{prev_state};
3136              ## Reconsume.
3137              redo A;
3138            }
3139          }
3140        } elsif ($self->{state} == HEXREF_HEX_STATE) {
3141          if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) {
3142            # 0..9
3143            !!!cp (1002);
3144            $self->{state_keyword} *= 0x10;
3145            $self->{state_keyword} += $self->{next_char} - 0x0030;
3146            ## Stay in the state.
3147            !!!next-input-character;
3148            redo A;
3149          } elsif (0x0061 <= $self->{next_char} and
3150                   $self->{next_char} <= 0x0066) { # a..f
3151            !!!cp (1003);
3152            $self->{state_keyword} *= 0x10;
3153            $self->{state_keyword} += $self->{next_char} - 0x0060 + 9;
3154            ## Stay in the state.
3155            !!!next-input-character;
3156            redo A;
3157          } elsif (0x0041 <= $self->{next_char} and
3158                   $self->{next_char} <= 0x0046) { # A..F
3159            !!!cp (1004);
3160            $self->{state_keyword} *= 0x10;
3161            $self->{state_keyword} += $self->{next_char} - 0x0040 + 9;
3162            ## Stay in the state.
3163            !!!next-input-character;
3164            redo A;
3165          } elsif ($self->{next_char} == 0x003B) { # ;
3166            !!!cp (1006);
3167            !!!next-input-character;
3168            #
3169          } else {
3170            !!!cp (1007);
3171            !!!parse-error (type => 'no refc',
3172                            line => $self->{line},
3173                            column => $self->{column});
3174            ## Reconsume.
3175            #
3176          }
3177    
3178          my $code = $self->{state_keyword};
3179          my $l = $self->{line_prev};
3180          my $c = $self->{column_prev};
3181          if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3182            !!!cp (1008);
3183            !!!parse-error (type => 'invalid character reference',
3184                            text => (sprintf 'U+%04X', $code),
3185                            line => $l, column => $c);
3186            $code = 0xFFFD;
3187          } elsif ($code > 0x10FFFF) {
3188            !!!cp (1009);
3189            !!!parse-error (type => 'invalid character reference',
3190                            text => (sprintf 'U-%08X', $code),
3191                            line => $l, column => $c);
3192            $code = 0xFFFD;
3193          } elsif ($code == 0x000D) {
3194            !!!cp (1010);
3195            !!!parse-error (type => 'CR character reference', line => $l, column => $c);
3196            $code = 0x000A;
3197          } elsif (0x80 <= $code and $code <= 0x9F) {
3198            !!!cp (1011);
3199            !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
3200            $code = $c1_entity_char->{$code};
3201          }
3202    
3203          if ($self->{prev_state} == DATA_STATE) {
3204            !!!cp (988);
3205            $self->{state} = $self->{prev_state};
3206            ## Reconsume.
3207            !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3208                      line => $l, column => $c,
3209                     });
3210            redo A;
3211          } else {
3212            !!!cp (987);
3213            $self->{current_attribute}->{value} .= chr $code;
3214            $self->{current_attribute}->{has_reference} = 1;
3215            $self->{state} = $self->{prev_state};
3216            ## Reconsume.
3217            redo A;
3218          }
3219        } elsif ($self->{state} == ENTITY_NAME_STATE) {
3220          if (length $self->{state_keyword} < 30 and
3221              ## NOTE: Some number greater than the maximum length of entity name
3222              ((0x0041 <= $self->{next_char} and # A
3223                $self->{next_char} <= 0x005A) or # Z
3224               (0x0061 <= $self->{next_char} and # a
3225                $self->{next_char} <= 0x007A) or # z
3226               (0x0030 <= $self->{next_char} and # 0
3227                $self->{next_char} <= 0x0039) or # 9
3228               $self->{next_char} == 0x003B)) { # ;
3229            our $EntityChar;
3230            $self->{state_keyword} .= chr $self->{next_char};
3231            if (defined $EntityChar->{$self->{state_keyword}}) {
3232              if ($self->{next_char} == 0x003B) { # ;
3233                !!!cp (1020);
3234                $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3235                $self->{entity__match} = 1;
3236                !!!next-input-character;
3237                #
3238              } else {
3239                !!!cp (1021);
3240                $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3241                $self->{entity__match} = -1;
3242                ## Stay in the state.
3243                !!!next-input-character;
3244                redo A;
3245              }
3246          } else {          } else {
3247            !!!cp (1021);            !!!cp (1022);
3248            $value = $EntityChar->{$entity_name};            $self->{entity__value} .= chr $self->{next_char};
3249            $match = -1;            $self->{entity__match} *= 2;
3250              ## Stay in the state.
3251            !!!next-input-character;            !!!next-input-character;
3252              redo A;
3253            }
3254          }
3255    
3256          my $data;
3257          my $has_ref;
3258          if ($self->{entity__match} > 0) {
3259            !!!cp (1023);
3260            $data = $self->{entity__value};
3261            $has_ref = 1;
3262            #
3263          } elsif ($self->{entity__match} < 0) {
3264            !!!parse-error (type => 'no refc');
3265            if ($self->{prev_state} != DATA_STATE and # in attribute
3266                $self->{entity__match} < -1) {
3267              !!!cp (1024);
3268              $data = '&' . $self->{state_keyword};
3269              #
3270            } else {
3271              !!!cp (1025);
3272              $data = $self->{entity__value};
3273              $has_ref = 1;
3274              #
3275          }          }
3276        } else {        } else {
3277          !!!cp (1022);          !!!cp (1026);
3278          $value .= chr $self->{next_char};          !!!parse-error (type => 'bare ero',
3279          $match *= 2;                          line => $self->{line_prev},
3280          !!!next-input-character;                          column => $self->{column_prev});
3281            $data = '&' . $self->{state_keyword};
3282            #
3283        }        }
3284      }    
3285              ## NOTE: In these cases, when a character reference is found,
3286      if ($match > 0) {        ## it is consumed and a character token is returned, or, otherwise,
3287        !!!cp (1023);        ## nothing is consumed and returned, according to the spec algorithm.
3288        return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,        ## In this implementation, anything that has been examined by the
3289                line => $l, column => $c,        ## tokenizer is appended to the parent element or the attribute value
3290               };        ## as string, either literal string when no character reference or
3291      } elsif ($match < 0) {        ## entity-replaced string otherwise, in this stage, since any characters
3292        !!!parse-error (type => 'no refc', line => $l, column => $c);        ## that would not be consumed are appended in the data state or in an
3293        if ($in_attr and $match < -1) {        ## appropriate attribute value state anyway.
3294          !!!cp (1024);  
3295          return {type => CHARACTER_TOKEN, data => '&'.$entity_name,        if ($self->{prev_state} == DATA_STATE) {
3296                  line => $l, column => $c,          !!!cp (986);
3297                 };          $self->{state} = $self->{prev_state};
3298        } else {          ## Reconsume.
3299          !!!cp (1025);          !!!emit ({type => CHARACTER_TOKEN,
3300          return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,                    data => $data,
3301                  line => $l, column => $c,                    line => $self->{line_prev},
3302                 };                    column => $self->{column_prev} + 1 - length $self->{state_keyword},
3303                     });
3304            redo A;
3305          } else {
3306            !!!cp (985);
3307            $self->{current_attribute}->{value} .= $data;
3308            $self->{current_attribute}->{has_reference} = 1 if $has_ref;
3309            $self->{state} = $self->{prev_state};
3310            ## Reconsume.
3311            redo A;
3312        }        }
3313      } else {      } else {
3314        !!!cp (1026);        die "$0: $self->{state}: Unknown state";
       !!!parse-error (type => 'bare ero', line => $l, column => $c);  
       ## NOTE: "No characters are consumed" in the spec.  
       return {type => CHARACTER_TOKEN, data => '&'.$value,  
               line => $l, column => $c,  
              };  
3315      }      }
3316    } else {    } # A  
3317      !!!cp (1027);  
3318      ## no characters are consumed    die "$0: _get_next_token: unexpected case";
3319      !!!parse-error (type => 'bare ero', line => $l, column => $c);  } # _get_next_token
     return undef;  
   }  
 } # _tokenize_attempt_to_consume_an_entity  
3320    
3321  sub _initialize_tree_constructor ($) {  sub _initialize_tree_constructor ($) {
3322    my $self = shift;    my $self = shift;

Legend:
Removed from v.1.166  
changed lines
  Added in v.1.170

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24