/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

Left column: revision 1.168 by wakaba, Sat Sep 13 10:49:21 2008 UTC | Right column: revision 1.169 by wakaba, Sat Sep 13 11:31:09 2008 UTC
# Line 874  sub _initialize_tokenizer ($) { Line 874  sub _initialize_tokenizer ($) {
874    my $self = shift;    my $self = shift;
875    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
876    #$self->{state_keyword}; # initialized when used    #$self->{state_keyword}; # initialized when used
877      #$self->{entity__value}; # initialized when used
878      #$self->{entity__match}; # initialized when used
879    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
880    undef $self->{current_token};    undef $self->{current_token};
881    undef $self->{current_attribute};    undef $self->{current_attribute};
882    undef $self->{last_emitted_start_tag_name};    undef $self->{last_emitted_start_tag_name};
883    undef $self->{last_attribute_value_state};    #$self->{prev_state}; # initialized when used
884    delete $self->{self_closing};    delete $self->{self_closing};
   $self->{char} = [];  
885    # $self->{next_char}    # $self->{next_char}
886    !!!next-input-character;    !!!next-input-character;
887    $self->{token} = [];    $self->{token} = [];
# Line 912  sub _initialize_tokenizer ($) { Line 913  sub _initialize_tokenizer ($) {
913  ## has completed loading.  If one has, then it MUST be executed  ## has completed loading.  If one has, then it MUST be executed
914  ## and removed from the list.  ## and removed from the list.
915    
916  ## NOTE: HTML5 "Writing HTML documents" section, applied to  ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
917  ## documents and not to user agents and conformance checkers,  ## (This requirement was dropped from HTML5 spec, unfortunately.)
 ## contains some requirements that are not detected by the  
 ## parsing algorithm:  
 ## - Some requirements on character encoding declarations. ## TODO  
 ## - "Elements MUST NOT contain content that their content model disallows."  
 ##   ... Some are parse error, some are not (will be reported by c.c.).  
 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO  
 ## - Text (in elements, attributes, and comments) SHOULD NOT contain  
 ##   control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL?  Unicode control character?)  
   
 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot  
 ## be detected by the HTML5 parsing algorithm:  
 ## - Text,  
918    
919  sub _get_next_token ($) {  sub _get_next_token ($) {
920    my $self = shift;    my $self = shift;
# Line 953  sub _get_next_token ($) { Line 942  sub _get_next_token ($) {
942            ## "entity data state".  In this implementation, the tokenizer            ## "entity data state".  In this implementation, the tokenizer
943            ## is switched to the |ENTITY_STATE|, which is an implementation            ## is switched to the |ENTITY_STATE|, which is an implementation
944            ## of the "consume a character reference" algorithm.            ## of the "consume a character reference" algorithm.
           $self->{entity_in_attr} = 0;  
945            $self->{entity_additional} = -1;            $self->{entity_additional} = -1;
946              $self->{prev_state} = DATA_STATE;
947            $self->{state} = ENTITY_STATE;            $self->{state} = ENTITY_STATE;
948            !!!next-input-character;            !!!next-input-character;
949            redo A;            redo A;
# Line 1698  sub _get_next_token ($) { Line 1687  sub _get_next_token ($) {
1687          redo A;          redo A;
1688        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{next_char} == 0x0026) { # &
1689          !!!cp (96);          !!!cp (96);
         $self->{last_attribute_value_state} = $self->{state};  
1690          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1691          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1692          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
1693          ## implementation of the "consume a character reference" algorithm.          ## implementation of the "consume a character reference" algorithm.
1694          $self->{entity_in_attr} = 1;          $self->{prev_state} = $self->{state};
1695          $self->{entity_additional} = 0x0022; # "          $self->{entity_additional} = 0x0022; # "
1696          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
1697          !!!next-input-character;          !!!next-input-character;
# Line 1746  sub _get_next_token ($) { Line 1734  sub _get_next_token ($) {
1734          redo A;          redo A;
1735        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{next_char} == 0x0026) { # &
1736          !!!cp (102);          !!!cp (102);
         $self->{last_attribute_value_state} = $self->{state};  
1737          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1738          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1739          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
1740          ## implementation of the "consume a character reference" algorithm.          ## implementation of the "consume a character reference" algorithm.
         $self->{entity_in_attr} = 1;  
1741          $self->{entity_additional} = 0x0027; # '          $self->{entity_additional} = 0x0027; # '
1742            $self->{prev_state} = $self->{state};
1743          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
1744          !!!next-input-character;          !!!next-input-character;
1745          redo A;          redo A;
# Line 1798  sub _get_next_token ($) { Line 1785  sub _get_next_token ($) {
1785          redo A;          redo A;
1786        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{next_char} == 0x0026) { # &
1787          !!!cp (108);          !!!cp (108);
         $self->{last_attribute_value_state} = $self->{state};  
1788          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1789          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1790          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
1791          ## implementation of the "consume a character reference" algorithm.          ## implementation of the "consume a character reference" algorithm.
         $self->{entity_in_attr} = 1;  
1792          $self->{entity_additional} = -1;          $self->{entity_additional} = -1;
1793            $self->{prev_state} = $self->{state};
1794          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
1795          !!!next-input-character;          !!!next-input-character;
1796          redo A;          redo A;
# Line 2988  sub _get_next_token ($) { Line 2974  sub _get_next_token ($) {
2974        ## appended to the parent element or the attribute value in later        ## appended to the parent element or the attribute value in later
2975        ## process of the tokenizer.        ## process of the tokenizer.
2976    
2977        if ($self->{entity_in_attr}) {        if ($self->{prev_state} == DATA_STATE) {
2978          $self->{current_attribute}->{value} .= '&';          $self->{state} = $self->{prev_state};
         $self->{state} = $self->{last_attribute_value_state};  
         ## Reconsume.  
         redo A;  
       } else {  
         $self->{state} = DATA_STATE;  
2979          ## Reconsume.          ## Reconsume.
2980          !!!emit ({type => CHARACTER_TOKEN, data => '&',          !!!emit ({type => CHARACTER_TOKEN, data => '&',
2981                    line => $self->{line_prev},                    line => $self->{line_prev},
2982                    column => $self->{column_prev},                    column => $self->{column_prev},
2983                   });                   });
2984          redo A;          redo A;
2985          } else {
2986            $self->{current_attribute}->{value} .= '&';
2987            $self->{state} = $self->{prev_state};
2988            ## Reconsume.
2989            redo A;
2990        }        }
2991      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
2992        if ($self->{next_char} == 0x0078 or # x        if ($self->{next_char} == 0x0078 or # x
# Line 3025  sub _get_next_token ($) { Line 3011  sub _get_next_token ($) {
3011          ## and then "&#" is appended to the parent element or the attribute          ## and then "&#" is appended to the parent element or the attribute
3012          ## value in the later processing.          ## value in the later processing.
3013    
3014          if ($self->{entity_in_attr}) {          if ($self->{prev_state} == DATA_STATE) {
3015            $self->{current_attribute}->{value} .= '&#';            $self->{state} = $self->{prev_state};
           $self->{state} = $self->{last_attribute_value_state};  
           ## Reconsume.  
           redo A;  
         } else {  
           $self->{state} = DATA_STATE;  
3016            ## Reconsume.            ## Reconsume.
3017            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
3018                      data => '&#',                      data => '&#',
# Line 3039  sub _get_next_token ($) { Line 3020  sub _get_next_token ($) {
3020                      column => $self->{column_prev} - 1,                      column => $self->{column_prev} - 1,
3021                     });                     });
3022            redo A;            redo A;
3023            } else {
3024              $self->{current_attribute}->{value} .= '&#';
3025              $self->{state} = $self->{prev_state};
3026              ## Reconsume.
3027              redo A;
3028          }          }
3029        }        }
3030      } elsif ($self->{state} == NCR_NUM_STATE) {      } elsif ($self->{state} == NCR_NUM_STATE) {
# Line 3090  sub _get_next_token ($) { Line 3076  sub _get_next_token ($) {
3076          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
3077        }        }
3078    
3079        if ($self->{entity_in_attr}) {        if ($self->{prev_state} == DATA_STATE) {
3080          $self->{current_attribute}->{value} .= chr $code;          $self->{state} = $self->{prev_state};
         $self->{current_attribute}->{has_reference} = 1;  
         $self->{state} = $self->{last_attribute_value_state};  
         ## Reconsume.  
         redo A;  
       } else {  
         $self->{state} = DATA_STATE;  
3081          ## Reconsume.          ## Reconsume.
3082          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
                   has_reference => 1,  
3083                    line => $l, column => $c,                    line => $l, column => $c,
3084                   });                   });
3085          redo A;          redo A;
3086          } else {
3087            $self->{current_attribute}->{value} .= chr $code;
3088            $self->{current_attribute}->{has_reference} = 1;
3089            $self->{state} = $self->{prev_state};
3090            ## Reconsume.
3091            redo A;
3092        }        }
3093      } elsif ($self->{state} == HEXREF_X_STATE) {      } elsif ($self->{state} == HEXREF_X_STATE) {
3094        if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or        if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or
# Line 3124  sub _get_next_token ($) { Line 3109  sub _get_next_token ($) {
3109          ## and then "&#" followed by "X" or "x" is appended to the parent          ## and then "&#" followed by "X" or "x" is appended to the parent
3110          ## element or the attribute value in the later processing.          ## element or the attribute value in the later processing.
3111    
3112          if ($self->{entity_in_attr}) {          if ($self->{prev_state} == DATA_STATE) {
3113            $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};            $self->{state} = $self->{prev_state};
           $self->{state} = $self->{last_attribute_value_state};  
           ## Reconsume.  
           redo A;  
         } else {  
           $self->{state} = DATA_STATE;  
3114            ## Reconsume.            ## Reconsume.
3115            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
3116                      data => '&' . $self->{state_keyword},                      data => '&' . $self->{state_keyword},
# Line 3138  sub _get_next_token ($) { Line 3118  sub _get_next_token ($) {
3118                      column => $self->{column_prev} - length $self->{state_keyword},                      column => $self->{column_prev} - length $self->{state_keyword},
3119                     });                     });
3120            redo A;            redo A;
3121            } else {
3122              $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
3123              $self->{state} = $self->{prev_state};
3124              ## Reconsume.
3125              redo A;
3126          }          }
3127        }        }
3128      } elsif ($self->{state} == HEXREF_HEX_STATE) {      } elsif ($self->{state} == HEXREF_HEX_STATE) {
# Line 3203  sub _get_next_token ($) { Line 3188  sub _get_next_token ($) {
3188          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
3189        }        }
3190    
3191        if ($self->{entity_in_attr}) {        if ($self->{prev_state} == DATA_STATE) {
3192          $self->{current_attribute}->{value} .= chr $code;          $self->{state} = $self->{prev_state};
         $self->{current_attribute}->{has_reference} = 1;  
         $self->{state} = $self->{last_attribute_value_state};  
         ## Reconsume.  
         redo A;  
       } else {  
         $self->{state} = DATA_STATE;  
3193          ## Reconsume.          ## Reconsume.
3194          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
                   has_reference => 1,  
3195                    line => $l, column => $c,                    line => $l, column => $c,
3196                   });                   });
3197          redo A;          redo A;
3198          } else {
3199            $self->{current_attribute}->{value} .= chr $code;
3200            $self->{current_attribute}->{has_reference} = 1;
3201            $self->{state} = $self->{prev_state};
3202            ## Reconsume.
3203            redo A;
3204        }        }
3205      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3206        if (length $self->{state_keyword} < 30 and        if (length $self->{state_keyword} < 30 and
# Line 3264  sub _get_next_token ($) { Line 3248  sub _get_next_token ($) {
3248          #          #
3249        } elsif ($self->{entity__match} < 0) {        } elsif ($self->{entity__match} < 0) {
3250          !!!parse-error (type => 'no refc');          !!!parse-error (type => 'no refc');
3251          if ($self->{entity_in_attr} and $self->{entity__match} < -1) {          if ($self->{prev_state} != DATA_STATE and # in attribute
3252                $self->{entity__match} < -1) {
3253            !!!cp (1024);            !!!cp (1024);
3254            $data = '&' . $self->{state_keyword};            $data = '&' . $self->{state_keyword};
3255            #            #
# Line 3293  sub _get_next_token ($) { Line 3278  sub _get_next_token ($) {
3278        ## that would not be consumed are appended in the data state or in an        ## that would not be consumed are appended in the data state or in an
3279        ## appropriate attribute value state anyway.        ## appropriate attribute value state anyway.
3280    
3281        if ($self->{entity_in_attr}) {        if ($self->{prev_state} == DATA_STATE) {
3282          $self->{current_attribute}->{value} .= $data;          $self->{state} = $self->{prev_state};
         $self->{current_attribute}->{has_reference} = 1 if $has_ref;  
         $self->{state} = $self->{last_attribute_value_state};  
         ## Reconsume.  
         redo A;  
       } else {  
         $self->{state} = DATA_STATE;  
3283          ## Reconsume.          ## Reconsume.
3284          !!!emit ({type => CHARACTER_TOKEN,          !!!emit ({type => CHARACTER_TOKEN,
3285                    data => $data, has_reference => $has_ref,                    data => $data,
3286                    line => $self->{line_prev},                    line => $self->{line_prev},
3287                    column => $self->{column_prev} + 1 - length $self->{state_keyword},                    column => $self->{column_prev} + 1 - length $self->{state_keyword},
3288                   });                   });
3289          redo A;          redo A;
3290          } else {
3291            $self->{current_attribute}->{value} .= $data;
3292            $self->{current_attribute}->{has_reference} = 1 if $has_ref;
3293            $self->{state} = $self->{prev_state};
3294            ## Reconsume.
3295            redo A;
3296        }        }
3297      } else {      } else {
3298        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";

Legend:
Removed from v.1.168
Changed lines
Added in v.1.169

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24