/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.12 by wakaba, Wed Oct 15 12:49:49 2008 UTC | revision 1.13 by wakaba, Thu Oct 16 03:39:57 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19    );    );
20        
21    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 28  BEGIN {
28        CHARACTER_TOKEN        CHARACTER_TOKEN
29        PI_TOKEN        PI_TOKEN
30        ABORT_TOKEN        ABORT_TOKEN
31          END_OF_DOCTYPE_TOKEN
32      )],      )],
33    );    );
34  }  }
# Line 43  sub END_OF_FILE_TOKEN () { 5 } Line 45  sub END_OF_FILE_TOKEN () { 5 }
45  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
46  sub PI_TOKEN () { 7 } ## NOTE: XML only.  sub PI_TOKEN () { 7 } ## NOTE: XML only.
47  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
48    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only
49    
50  ## XML5: XML5 has "empty tag token".  In this implementation, it is  ## XML5: XML5 has "empty tag token".  In this implementation, it is
51  ## represented as a start tag token with $self->{self_closing} flag  ## represented as a start tag token with $self->{self_closing} flag
# Line 133  sub PI_AFTER_STATE () { 55 } Line 136  sub PI_AFTER_STATE () { 55 }
136  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
137  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
138  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
139    sub DOCTYPE_TAG_STATE () { 59 }
140    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 }
141    
142  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
143  ## list and descriptions)  ## list and descriptions)
# Line 2183  sub _get_next_token ($) { Line 2188  sub _get_next_token ($) {
2188          redo A;          redo A;
2189        }        }
2190      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
       ## (only happen if PCDATA state)  
   
2191        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2192        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
2193                
2194        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2195                    if ($self->{in_subset}) {
2196          $self->{state} = DATA_STATE;            
2197          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2198            } else {
2199              
2200              $self->{state} = DATA_STATE;
2201              $self->{s_kwd} = '';
2202            }
2203                    
2204      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2205        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2207  sub _get_next_token ($) { Line 2215  sub _get_next_token ($) {
2215          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2216          redo A;          redo A;
2217        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2218                    if ($self->{in_subset}) {
2219          $self->{state} = DATA_STATE;            
2220          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2221            } else {
2222              
2223              $self->{state} = DATA_STATE;
2224              $self->{s_kwd} = '';
2225            }
2226          ## reconsume          ## reconsume
2227    
2228          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2236  sub _get_next_token ($) { Line 2249  sub _get_next_token ($) {
2249          redo A;          redo A;
2250        }        }
2251      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2252        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state" and "DOCTYPE markup
2253          ## declaration state".
2254                
2255        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2256                    
# Line 2502  sub _get_next_token ($) { Line 2516  sub _get_next_token ($) {
2516        
2517          redo A;          redo A;
2518        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2519          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2520          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2521          $self->{s_kwd} = '';            
2522              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2523            } else {
2524              
2525              $self->{state} = DATA_STATE;
2526              $self->{s_kwd} = '';
2527            }
2528                    
2529      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2530        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2522  sub _get_next_token ($) { Line 2541  sub _get_next_token ($) {
2541    
2542          redo A;          redo A;
2543        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2544          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2545          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2546          $self->{s_kwd} = '';            
2547              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2548            } else {
2549              
2550              $self->{state} = DATA_STATE;
2551              $self->{s_kwd} = '';
2552            }
2553          ## reconsume          ## reconsume
2554    
2555          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2566  sub _get_next_token ($) { Line 2590  sub _get_next_token ($) {
2590        
2591          redo A;          redo A;
2592        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2593          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2594          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2595          $self->{s_kwd} = '';            
2596              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597            } else {
2598              
2599              $self->{state} = DATA_STATE;
2600              $self->{s_kwd} = '';
2601            }
2602                    
2603      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2604        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2586  sub _get_next_token ($) { Line 2615  sub _get_next_token ($) {
2615    
2616          redo A;          redo A;
2617        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2618          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2619          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2620          $self->{s_kwd} = '';            
2621              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2622            } else {
2623              
2624              $self->{state} = DATA_STATE;
2625              $self->{s_kwd} = '';
2626            }
2627          ## reconsume          ## reconsume
2628    
2629          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2630  sub _get_next_token ($) { Line 2664  sub _get_next_token ($) {
2664        
2665          redo A;          redo A;
2666        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2667          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2668          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2669          $self->{s_kwd} = '';            
2670              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2671            } else {
2672              
2673              $self->{state} = DATA_STATE;
2674              $self->{s_kwd} = '';
2675            }
2676          ## reconsume          ## reconsume
2677    
2678          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2679  sub _get_next_token ($) { Line 2718  sub _get_next_token ($) {
2718        
2719          redo A;          redo A;
2720        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2721          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2722          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2723          $self->{s_kwd} = '';            
2724              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2725            } else {
2726              
2727              $self->{state} = DATA_STATE;
2728              $self->{s_kwd} = '';
2729            }
2730          ## reconsume          ## reconsume
2731    
2732          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2707  sub _get_next_token ($) { Line 2751  sub _get_next_token ($) {
2751        }        }
2752      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2753        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2754                    if ($self->{in_subset}) {
2755          $self->{state} = DATA_STATE;            
2756          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2757            } else {
2758              
2759              $self->{state} = DATA_STATE;
2760              $self->{s_kwd} = '';
2761            }
2762                    
2763      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2764        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2746  sub _get_next_token ($) { Line 2795  sub _get_next_token ($) {
2795        
2796          redo A;          redo A;
2797        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2798          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2799          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2800          $self->{s_kwd} = '';            
2801              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2802            } else {
2803              
2804              $self->{state} = DATA_STATE;
2805              $self->{s_kwd} = '';
2806            }
2807          ## reconsume          ## reconsume
2808    
2809          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2853  sub _get_next_token ($) { Line 2907  sub _get_next_token ($) {
2907                    
2908          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2909          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2911            $self->{in_subset} = 1;
2912                    
2913      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2914        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2864  sub _get_next_token ($) { Line 2920  sub _get_next_token ($) {
2920        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2921      }      }
2922        
2923            return  ($self->{ct}); # DOCTYPE
2924          redo A;          redo A;
2925        } else {        } else {
2926                    
# Line 2936  sub _get_next_token ($) { Line 2993  sub _get_next_token ($) {
2993        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2994                    
2995          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2996            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2997            $self->{in_subset} = 1;
2998                    
2999      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3000        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2947  sub _get_next_token ($) { Line 3006  sub _get_next_token ($) {
3006        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3007      }      }
3008        
3009            return  ($self->{ct}); # DOCTYPE
3010          redo A;          redo A;
3011        } else {        } else {
3012                    
# Line 3053  sub _get_next_token ($) { Line 3113  sub _get_next_token ($) {
3113                    
3114          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3116            $self->{in_subset} = 1;
3117                    
3118      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3119        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3064  sub _get_next_token ($) { Line 3125  sub _get_next_token ($) {
3125        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3126      }      }
3127        
3128            return  ($self->{ct}); # DOCTYPE
3129          redo A;          redo A;
3130        } else {        } else {
3131                    
# Line 3307  sub _get_next_token ($) { Line 3369  sub _get_next_token ($) {
3369          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3370          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3371          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3372            $self->{in_subset} = 1;
3373                    
3374      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3375        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3318  sub _get_next_token ($) { Line 3381  sub _get_next_token ($) {
3381        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3382      }      }
3383        
3384            return  ($self->{ct}); # DOCTYPE
3385          redo A;          redo A;
3386        } else {        } else {
3387                    
# Line 3569  sub _get_next_token ($) { Line 3633  sub _get_next_token ($) {
3633          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3634          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3635          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3636            $self->{in_subset} = 1;
3637                    
3638      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3639        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3580  sub _get_next_token ($) { Line 3645  sub _get_next_token ($) {
3645        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3646      }      }
3647        
3648            return  ($self->{ct}); # DOCTYPE
3649          redo A;          redo A;
3650        } else {        } else {
3651                    
# Line 3687  sub _get_next_token ($) { Line 3753  sub _get_next_token ($) {
3753    
3754          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3755          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3756            $self->{in_subset} = 1;
3757                    
3758      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3759        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3698  sub _get_next_token ($) { Line 3765  sub _get_next_token ($) {
3765        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3766      }      }
3767        
3768            return  ($self->{ct}); # DOCTYPE
3769          redo A;          redo A;
3770        } else {        } else {
3771                    
# Line 3910  sub _get_next_token ($) { Line 3978  sub _get_next_token ($) {
3978                    
3979          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3980          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3981            $self->{in_subset} = 1;
3982                    
3983      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3984        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3921  sub _get_next_token ($) { Line 3990  sub _get_next_token ($) {
3990        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3991      }      }
3992        
3993            return  ($self->{ct}); # DOCTYPE
3994          redo A;          redo A;
3995        } else {        } else {
3996                    
# Line 3962  sub _get_next_token ($) { Line 4032  sub _get_next_token ($) {
4032    
4033          redo A;          redo A;
4034        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4035          if ($self->{ct}->{has_internal_subset}) { # DOCTYPE          
4036                      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4037            ## Stay in the state.          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4038                      $self->{in_subset} = 1;
4039      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {          
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
           redo A;  
         } else {  
             
           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;  
           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE  
             
4040      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4041        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
4042        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 3992  sub _get_next_token ($) { Line 4047  sub _get_next_token ($) {
4047        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4048      }      }
4049        
4050            redo A;          return  ($self->{ct}); # DOCTYPE
4051          }          redo A;
4052        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4053                    
4054          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 4719  sub _get_next_token ($) { Line 4774  sub _get_next_token ($) {
4774          redo A;          redo A;
4775        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4776          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4777          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4778          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4779            } else {
4780              $self->{state} = DATA_STATE;
4781              $self->{s_kwd} = '';
4782            }
4783          ## Reconsume.          ## Reconsume.
4784          return  ($self->{ct}); # pi          return  ($self->{ct}); # pi
4785          redo A;          redo A;
# Line 4791  sub _get_next_token ($) { Line 4850  sub _get_next_token ($) {
4850          redo A;          redo A;
4851        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4852          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4853          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4854          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4855            } else {
4856              $self->{state} = DATA_STATE;
4857              $self->{s_kwd} = '';
4858            }
4859          ## Reprocess.          ## Reprocess.
4860          return  ($self->{ct}); # pi          return  ($self->{ct}); # pi
4861          redo A;          redo A;
# Line 4817  sub _get_next_token ($) { Line 4880  sub _get_next_token ($) {
4880        }        }
4881      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
4882        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4883          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4884          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4885            } else {
4886              $self->{state} = DATA_STATE;
4887              $self->{s_kwd} = '';
4888            }
4889                    
4890      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4891        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4863  sub _get_next_token ($) { Line 4930  sub _get_next_token ($) {
4930      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4931        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" in XML5
4932        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4933          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
4934          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4935            } else {
4936              $self->{state} = DATA_STATE;
4937              $self->{s_kwd} = '';
4938            }
4939                    
4940      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4941        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4902  sub _get_next_token ($) { Line 4973  sub _get_next_token ($) {
4973    
4974      } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {      } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
4975        if ($self->{nc} == 0x003C) { # <        if ($self->{nc} == 0x003C) { # <
4976          ## TODO:          $self->{state} = DOCTYPE_TAG_STATE;
4977                    
4978      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4979        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4932  sub _get_next_token ($) { Line 5003  sub _get_next_token ($) {
5003        
5004          redo A;          redo A;
5005        } elsif ($self->{nc} == 0x005D) { # ]        } elsif ($self->{nc} == 0x005D) { # ]
5006            delete $self->{in_subset};
5007          $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5008                    
5009      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4961  sub _get_next_token ($) { Line 5033  sub _get_next_token ($) {
5033          redo A;          redo A;
5034        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
5035          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5036            delete $self->{in_subset};
5037          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
5038          $self->{s_kwd} = '';          $self->{s_kwd} = '';
5039          ## Reconsume.          ## Reconsume.
5040          return  ($self->{ct}); # DOCTYPE          return  ({type => END_OF_DOCTYPE_TOKEN});
5041          redo A;          redo A;
5042        } else {        } else {
5043          unless ($self->{internal_subset_tainted}) {          unless ($self->{internal_subset_tainted}) {
# Line 5001  sub _get_next_token ($) { Line 5074  sub _get_next_token ($) {
5074        $self->{set_nc}->($self);        $self->{set_nc}->($self);
5075      }      }
5076        
5077          return  ($self->{ct}); # DOCTYPE          return  ({type => END_OF_DOCTYPE_TOKEN});
5078          redo A;          redo A;
5079        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
5080          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5081          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
5082          $self->{s_kwd} = '';          $self->{s_kwd} = '';
5083          ## Reconsume.          ## Reconsume.
5084          return  ($self->{ct}); # DOCTYPE          return  ({type => END_OF_DOCTYPE_TOKEN});
5085          redo A;          redo A;
5086        } else {        } else {
5087          ## XML5: No parse error and stay in the state.          ## XML5: No parse error and stay in the state.
5088          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5089    
5090          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5091            
5092        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5093          $self->{line_prev} = $self->{line};
5094          $self->{column_prev} = $self->{column};
5095          $self->{column}++;
5096          $self->{nc}
5097              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5098        } else {
5099          $self->{set_nc}->($self);
5100        }
5101      
5102            redo A;
5103          }
5104        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5105          if ($self->{nc} == 0x003E) { # >
5106            $self->{state} = DATA_STATE;
5107            $self->{s_kwd} = '';
5108            
5109        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5110          $self->{line_prev} = $self->{line};
5111          $self->{column_prev} = $self->{column};
5112          $self->{column}++;
5113          $self->{nc}
5114              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5115        } else {
5116          $self->{set_nc}->($self);
5117        }
5118      
5119            return  ({type => END_OF_DOCTYPE_TOKEN});
5120            redo A;
5121          } elsif ($self->{nc} == -1) {
5122            $self->{state} = DATA_STATE;
5123            $self->{s_kwd} = '';
5124            ## Reconsume.
5125            return  ({type => END_OF_DOCTYPE_TOKEN});
5126            redo A;
5127          } else {
5128            ## Stay in the state.
5129            
5130        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5131          $self->{line_prev} = $self->{line};
5132          $self->{column_prev} = $self->{column};
5133          $self->{column}++;
5134          $self->{nc}
5135              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5136        } else {
5137          $self->{set_nc}->($self);
5138        }
5139      
5140            redo A;
5141          }
5142        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5143          if ($self->{nc} == 0x0021) { # !
5144            $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
5145            
5146        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5147          $self->{line_prev} = $self->{line};
5148          $self->{column_prev} = $self->{column};
5149          $self->{column}++;
5150          $self->{nc}
5151              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5152        } else {
5153          $self->{set_nc}->($self);
5154        }
5155      
5156            redo A;
5157          } elsif ($self->{nc} == 0x003F) { # ?
5158            $self->{state} = PI_STATE;
5159            
5160        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5161          $self->{line_prev} = $self->{line};
5162          $self->{column_prev} = $self->{column};
5163          $self->{column}++;
5164          $self->{nc}
5165              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5166        } else {
5167          $self->{set_nc}->($self);
5168        }
5169      
5170            redo A;
5171          } elsif ($self->{nc} == -1) {
5172            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5173            $self->{state} = DATA_STATE;
5174            $self->{s_kwd} = '';
5175            ## Reconsume.
5176            redo A;
5177          } else {
5178            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5179                            line => $self->{line_prev},
5180                            column => $self->{column_prev});
5181            $self->{state} = BOGUS_COMMENT_STATE;
5182            $self->{ct} = {type => COMMENT_TOKEN,
5183                           data => '',
5184                          }; ## NOTE: Will be discarded.
5185                    
5186      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5187        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};

Legend:
Removed from v.1.12
Changed lines
Added in v.1.13

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24