/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.9 by wakaba, Wed Oct 15 08:05:47 2008 UTC | revision 1.10 by wakaba, Wed Oct 15 08:51:02 2008 UTC
# Line 507  sub _get_next_token ($) { Line 507  sub _get_next_token ($) {
507        return  ($token);        return  ($token);
508        redo A;        redo A;
509      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
510          ## XML5: "tag state".
511    
512        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
513          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
514                        
# Line 709  sub _get_next_token ($) { Line 711  sub _get_next_token ($) {
711        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
712        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
713    
714          ## XML5: "end tag state".
715    
716        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
717        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
718          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
# Line 770  sub _get_next_token ($) { Line 774  sub _get_next_token ($) {
774        
775          redo A;          redo A;
776        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
777          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
778                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
779                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
780          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
781          $self->{s_kwd} = '';          $self->{s_kwd} = '';
782                    if ($self->{is_xml}) {
783              
784              ## XML5: No parse error.
785              
786              ## NOTE: This parser raises a parse error, since it supports
787              ## XML1, not XML5.
788    
789              ## NOTE: A short end tag token.
790              my $ct = {type => END_TAG_TOKEN,
791                        tag_name => '',
792                        line => $self->{line_prev},
793                        column => $self->{column_prev} - 1,
794                       };
795              
796      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
797        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
798        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 787  sub _get_next_token ($) { Line 803  sub _get_next_token ($) {
803        $self->{set_nc}->($self);        $self->{set_nc}->($self);
804      }      }
805        
806              return  ($ct);
807            } else {
808              
809              
810        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
811          $self->{line_prev} = $self->{line};
812          $self->{column_prev} = $self->{column};
813          $self->{column}++;
814          $self->{nc}
815              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
816        } else {
817          $self->{set_nc}->($self);
818        }
819      
820            }
821          redo A;          redo A;
822        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
823                    
# Line 800  sub _get_next_token ($) { Line 831  sub _get_next_token ($) {
831                   });                   });
832    
833          redo A;          redo A;
834        } else {        } elsif (not $self->{is_xml} or
835                   $is_space->{$self->{nc}}) {
836                    
837          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
838                            line => $self->{line_prev}, # "<" of "</"
839                            column => $self->{column_prev} - 1);
840          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
841          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
842                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 815  sub _get_next_token ($) { Line 849  sub _get_next_token ($) {
849          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
850          ## "bogus comment state" entry.          ## "bogus comment state" entry.
851          redo A;          redo A;
852          } else {
853            ## XML5: "</:" is a parse error.
854            
855            $self->{ct} = {type => END_TAG_TOKEN,
856                           tag_name => chr ($self->{nc}),
857                           line => $l, column => $c};
858            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
859            
860        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
861          $self->{line_prev} = $self->{line};
862          $self->{column_prev} = $self->{column};
863          $self->{column}++;
864          $self->{nc}
865              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
866        } else {
867          $self->{set_nc}->($self);
868        }
869      
870            redo A;
871        }        }
872      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
873        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
# Line 2152  sub _get_next_token ($) { Line 2205  sub _get_next_token ($) {
2205                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2206                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2207                                   };                                   };
2208          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2209                    
2210      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2211        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2215  sub _get_next_token ($) { Line 2268  sub _get_next_token ($) {
2268        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{s_kwd}) == 6 and
2269                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2270                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2271                    if ($self->{s_kwd} ne 'DOCTYP') {
2272              
2273              ## XML5: case-sensitive.
2274              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2275                              text => 'DOCTYPE',
2276                              line => $self->{line_prev},
2277                              column => $self->{column_prev} - 5);
2278            } else {
2279              
2280            }
2281          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2282          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2283                                    quirks => 1,                                    quirks => 1,
# Line 2492  sub _get_next_token ($) { Line 2554  sub _get_next_token ($) {
2554          redo A;          redo A;
2555        }        }
2556      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2557          ## XML5: "comment dash state".
2558    
2559        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2560                    
2561          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2557  sub _get_next_token ($) { Line 2621  sub _get_next_token ($) {
2621          redo A;          redo A;
2622        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2623                    
2624            ## XML5: Not a parse error.
2625          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2626                          line => $self->{line_prev},                          line => $self->{line_prev},
2627                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2586  sub _get_next_token ($) { Line 2651  sub _get_next_token ($) {
2651          redo A;          redo A;
2652        } else {        } else {
2653                    
2654            ## XML5: Not a parse error.
2655          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2656                          line => $self->{line_prev},                          line => $self->{line_prev},
2657                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 3671  sub _get_next_token ($) { Line 3737  sub _get_next_token ($) {
3737        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
3738        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3739        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
3740    
3741          ## XML5: "CDATA state".
3742                
3743        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
3744                    
# Line 3697  sub _get_next_token ($) { Line 3765  sub _get_next_token ($) {
3765    
3766          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3767          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3768                    ## Reconsume.
     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {  
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
3769          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
3770                        
3771            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3740  sub _get_next_token ($) { Line 3798  sub _get_next_token ($) {
3798    
3799        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
3800      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3801          ## XML5: "CDATA bracket state".
3802    
3803        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
3804                    
3805          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3757  sub _get_next_token ($) { Line 3817  sub _get_next_token ($) {
3817          redo A;          redo A;
3818        } else {        } else {
3819                    
3820            ## XML5: If EOF, "]" is not appended and changed to the data state.
3821          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
3822          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3823          ## Reconsume.          ## Reconsume.
3824          redo A;          redo A;
3825        }        }
3826      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3827          ## XML5: "CDATA end state".
3828    
3829        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3830          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3831          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 3805  sub _get_next_token ($) { Line 3868  sub _get_next_token ($) {
3868                    
3869          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
3870          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
3871          ## Reconsume.          ## Reconsume. ## XML5: Emit.
3872          redo A;          redo A;
3873        }        }
3874      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.10

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24