/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.15 by wakaba, Sat Oct 18 08:05:29 2008 UTC revision 1.16 by wakaba, Sat Oct 18 11:34:49 2008 UTC
# Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO
177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BOGUS_MD_STATE () { 85 }
181    
182  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
183  ## list and descriptions)  ## list and descriptions)
# Line 3182  sub _get_next_token ($) { Line 3183  sub _get_next_token ($) {
3183        
3184          redo A;          redo A;
3185        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3186            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3187              
3188              $self->{state} = DATA_STATE;
3189              $self->{s_kwd} = '';
3190            } else {
3191              
3192              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3193              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3194            }
3195                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3196                    
3197      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3198        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3196  sub _get_next_token ($) { Line 3204  sub _get_next_token ($) {
3204        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3205      }      }
3206        
3207            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3208          redo A;          redo A;
3209        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3210            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3211              
3212              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3213              $self->{state} = DATA_STATE;
3214              $self->{s_kwd} = '';
3215              $self->{ct}->{quirks} = 1;
3216            } else {
3217              
3218              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3219              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3220            }
3221                    
3222          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3223          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{s_kwd} = '';  
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3224          redo A;          redo A;
3225        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3226                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
# Line 3245  sub _get_next_token ($) { Line 3256  sub _get_next_token ($) {
3256      }      }
3257        
3258          redo A;          redo A;
3259        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [  ## TODO: " and ' for ENTITY
3260          } elsif ($self->{is_xml} and
3261                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3262                   $self->{nc} == 0x005B) { # [
3263                    
3264          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3265          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
# Line 3264  sub _get_next_token ($) { Line 3278  sub _get_next_token ($) {
3278          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3279          redo A;          redo A;
3280        } else {        } else {
3281                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3282          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');  
3283          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3284              
3285              $self->{ct}->{quirks} = 1;
3286              $self->{state} = BOGUS_DOCTYPE_STATE;
3287            } else {
3288              
3289              $self->{state} = BOGUS_MD_STATE;
3290            }
3291    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3292                    
3293      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3294        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3340  sub _get_next_token ($) { Line 3360  sub _get_next_token ($) {
3360        
3361          redo A;          redo A;
3362        } else {        } else {
3363                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3364                          line => $self->{line_prev},                          line => $self->{line_prev},
3365                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3366          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3367              
3368          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3369              $self->{state} = BOGUS_DOCTYPE_STATE;
3370            } else {
3371              
3372              $self->{state} = BOGUS_MD_STATE;
3373            }
3374          ## Reconsume.          ## Reconsume.
3375          redo A;          redo A;
3376        }        }
# Line 3408  sub _get_next_token ($) { Line 3432  sub _get_next_token ($) {
3432        
3433          redo A;          redo A;
3434        } else {        } else {
3435                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3436                          line => $self->{line_prev},                          line => $self->{line_prev},
3437                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3438          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3439              
3440          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3441              $self->{state} = BOGUS_DOCTYPE_STATE;
3442            } else {
3443              
3444              $self->{state} = BOGUS_MD_STATE;
3445            }
3446          ## Reconsume.          ## Reconsume.
3447          redo A;          redo A;
3448        }        }
# Line 3467  sub _get_next_token ($) { Line 3495  sub _get_next_token ($) {
3495        
3496          redo A;          redo A;
3497        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3498          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3499            
3500          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3501          $self->{s_kwd} = '';            
3502              $self->{state} = DATA_STATE;
3503              $self->{s_kwd} = '';
3504              $self->{ct}->{quirks} = 1;
3505            } else {
3506              
3507              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3508            }
3509            
3510                    
3511      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3512        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3483  sub _get_next_token ($) { Line 3518  sub _get_next_token ($) {
3518        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3519      }      }
3520        
3521            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3522          redo A;          redo A;
3523        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3524            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3525              
3526              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3527              $self->{state} = DATA_STATE;
3528              $self->{s_kwd} = '';
3529              $self->{ct}->{quirks} = 1;
3530            } else {
3531              
3532              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3533              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3534            }
3535                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3536          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3537          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3538          redo A;          redo A;
3539        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
3540                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3541                   $self->{nc} == 0x005B) { # [
3542                    
3543          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3544          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
# Line 3520  sub _get_next_token ($) { Line 3558  sub _get_next_token ($) {
3558          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3559          redo A;          redo A;
3560        } else {        } else {
           
3561          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3562    
3563          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3564              
3565              $self->{ct}->{quirks} = 1;
3566              $self->{state} = BOGUS_DOCTYPE_STATE;
3567            } else {
3568              
3569              $self->{state} = BOGUS_MD_STATE;
3570            }
3571    
3572                    
3573      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3574        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3555  sub _get_next_token ($) { Line 3599  sub _get_next_token ($) {
3599        
3600          redo A;          redo A;
3601        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3602          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3603    
3604          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3605          $self->{s_kwd} = '';            
3606              $self->{state} = DATA_STATE;
3607              $self->{s_kwd} = '';
3608              $self->{ct}->{quirks} = 1;
3609            } else {
3610              
3611              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3612            }
3613    
3614                    
3615      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3616        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3571  sub _get_next_token ($) { Line 3622  sub _get_next_token ($) {
3622        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3623      }      }
3624        
3625            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3626          redo A;          redo A;
3627        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3628          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3629    
3630          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3631          $self->{s_kwd} = '';            
3632          ## reconsume            $self->{state} = DATA_STATE;
3633              $self->{s_kwd} = '';
3634          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
3635            } else {
3636              
3637              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3638            }
3639            
3640            ## Reconsume.
3641          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3642          redo A;          redo A;
3643        } else {        } else {
3644                    
3645          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3646          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3647                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3648    
# Line 3626  sub _get_next_token ($) { Line 3677  sub _get_next_token ($) {
3677        
3678          redo A;          redo A;
3679        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3680          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3681    
3682          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3683          $self->{s_kwd} = '';            
3684              $self->{state} = DATA_STATE;
3685              $self->{s_kwd} = '';
3686              $self->{ct}->{quirks} = 1;
3687            } else {
3688              
3689              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3690            }
3691    
3692                    
3693      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3694        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3642  sub _get_next_token ($) { Line 3700  sub _get_next_token ($) {
3700        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3701      }      }
3702        
3703            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3704          redo A;          redo A;
3705        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3706          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3707    
3708          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3709          $self->{s_kwd} = '';            
3710              $self->{state} = DATA_STATE;
3711              $self->{s_kwd} = '';
3712              $self->{ct}->{quirks} = 1;
3713            } else {
3714              
3715              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3716            }
3717          
3718          ## reconsume          ## reconsume
3719            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3720          redo A;          redo A;
3721        } else {        } else {
3722                    
3723          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3724          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3725                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3726    
# Line 3698  sub _get_next_token ($) { Line 3756  sub _get_next_token ($) {
3756          redo A;          redo A;
3757        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
3758                    
3759          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3760          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3761                    
3762      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3714  sub _get_next_token ($) { Line 3772  sub _get_next_token ($) {
3772          redo A;          redo A;
3773        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
3774                    
3775          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3776          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3777                    
3778      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3729  sub _get_next_token ($) { Line 3787  sub _get_next_token ($) {
3787        
3788          redo A;          redo A;
3789        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3790          if ($self->{is_xml}) {          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3791                        if ($self->{is_xml}) {
3792            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');              
3793                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3794              } else {
3795                
3796              }
3797              $self->{state} = DATA_STATE;
3798              $self->{s_kwd} = '';
3799          } else {          } else {
3800                        if ($self->{ct}->{type} == NOTATION_TOKEN) {
3801                
3802              } else {
3803                
3804                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
3805              }
3806              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3807          }          }
3808          $self->{state} = DATA_STATE;          
         $self->{s_kwd} = '';  
3809                    
3810      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3811        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3748  sub _get_next_token ($) { Line 3817  sub _get_next_token ($) {
3817        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3818      }      }
3819        
3820            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3821          redo A;          redo A;
3822        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3823            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3824              
3825              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3826              
3827              $self->{state} = DATA_STATE;
3828              $self->{s_kwd} = '';
3829              $self->{ct}->{quirks} = 1;
3830            } else {
3831              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3832              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3833            }
3834                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3835          ## reconsume          ## reconsume
3836            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3837          redo A;          redo A;
3838        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
3839                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3840                   $self->{nc} == 0x005B) { # [
3841                    
3842          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3843          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
# Line 3784  sub _get_next_token ($) { Line 3857  sub _get_next_token ($) {
3857          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3858          redo A;          redo A;
3859        } else {        } else {
           
3860          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
3861    
3862          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3863              
3864              $self->{ct}->{quirks} = 1;
3865              $self->{state} = BOGUS_DOCTYPE_STATE;
3866            } else {
3867              
3868              $self->{state} = BOGUS_MD_STATE;
3869            }
3870    
3871                    
3872      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3873        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3851  sub _get_next_token ($) { Line 3930  sub _get_next_token ($) {
3930        
3931          redo A;          redo A;
3932        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3933          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3934                    
3935      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3936        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3867  sub _get_next_token ($) { Line 3943  sub _get_next_token ($) {
3943      }      }
3944        
3945    
3946          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3947          return  ($self->{ct}); # DOCTYPE            
3948              $self->{state} = DATA_STATE;
3949              $self->{s_kwd} = '';
3950              $self->{ct}->{quirks} = 1;
3951            } else {
3952              
3953              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3954            }
3955    
3956            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3957          redo A;          redo A;
3958        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3959            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3960              
3961              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3962              $self->{state} = DATA_STATE;
3963              $self->{s_kwd} = '';
3964              $self->{ct}->{quirks} = 1;
3965            } else {
3966              
3967              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3968              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3969            }
3970                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3971          ## reconsume          ## reconsume
3972            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3973          redo A;          redo A;
3974        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
3975                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3976                   $self->{nc} == 0x005B) { # [
3977                    
3978          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3979    
# Line 3904  sub _get_next_token ($) { Line 3994  sub _get_next_token ($) {
3994          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3995          redo A;          redo A;
3996        } else {        } else {
           
3997          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
3998    
3999          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4000                        
4001              $self->{ct}->{quirks} = 1;
4002              $self->{state} = BOGUS_DOCTYPE_STATE;
4003            } else {
4004              
4005              $self->{state} = BOGUS_MD_STATE;
4006            }
4007    
4008                    
4009      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4010        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3939  sub _get_next_token ($) { Line 4035  sub _get_next_token ($) {
4035        
4036          redo A;          redo A;
4037        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4038          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4039    
4040          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4041          $self->{s_kwd} = '';            
4042              $self->{state} = DATA_STATE;
4043              $self->{s_kwd} = '';
4044              $self->{ct}->{quirks} = 1;
4045            } else {
4046              
4047              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4048            }
4049            
4050                    
4051      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4052        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3955  sub _get_next_token ($) { Line 4058  sub _get_next_token ($) {
4058        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4059      }      }
4060        
4061            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4062          redo A;          redo A;
4063        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4064          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4065    
4066          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4067          $self->{s_kwd} = '';            
4068              $self->{state} = DATA_STATE;
4069              $self->{s_kwd} = '';
4070              $self->{ct}->{quirks} = 1;
4071            } else {
4072              
4073              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4074            }
4075            
4076          ## reconsume          ## reconsume
4077            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4078          redo A;          redo A;
4079        } else {        } else {
4080                    
4081          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4082          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4083                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4084    
# Line 4032  sub _get_next_token ($) { Line 4135  sub _get_next_token ($) {
4135    
4136          redo A;          redo A;
4137        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4138          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4139    
4140          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4141          $self->{s_kwd} = '';            
4142          ## reconsume            $self->{state} = DATA_STATE;
4143              $self->{s_kwd} = '';
4144          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
4145          return  ($self->{ct}); # DOCTYPE          } else {
4146              
4147              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4148            }
4149    
4150            ## reconsume
4151            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4152          redo A;          redo A;
4153        } else {        } else {
4154                    
4155          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4156          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4157                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4158    
# Line 4081  sub _get_next_token ($) { Line 4187  sub _get_next_token ($) {
4187        
4188          redo A;          redo A;
4189        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4190                    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4191          $self->{state} = DATA_STATE;            
4192          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
4193              $self->{s_kwd} = '';
4194            } else {
4195              
4196              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4197            }
4198    
4199                    
4200      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4201        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4095  sub _get_next_token ($) { Line 4207  sub _get_next_token ($) {
4207        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4208      }      }
4209        
4210            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
4211          redo A;          redo A;
4212    ## TODO: "NDATA"
4213        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4214                    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4215          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');            
4216          $self->{state} = DATA_STATE;            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4217          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
4218          ## reconsume            $self->{s_kwd} = '';
4219              $self->{ct}->{quirks} = 1;
4220          $self->{ct}->{quirks} = 1;          } else {
4221          return  ($self->{ct}); # DOCTYPE            
4222              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4223              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4224            }
4225    
4226            ## reconsume
4227            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4228          redo A;          redo A;
4229        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
4230                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4231                   $self->{nc} == 0x005B) { # [
4232                    
4233          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4234          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
# Line 4129  sub _get_next_token ($) { Line 4247  sub _get_next_token ($) {
4247          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4248          redo A;          redo A;
4249        } else {        } else {
           
4250          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
         #$self->{ct}->{quirks} = 1;  
4251    
4252          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4253              
4254              #$self->{ct}->{quirks} = 1;
4255              $self->{state} = BOGUS_DOCTYPE_STATE;
4256            } else {
4257              
4258              $self->{state} = BOGUS_MD_STATE;
4259            }
4260    
4261                    
4262      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4263        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 5845  sub _get_next_token ($) { Line 5969  sub _get_next_token ($) {
5969        ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".        ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
5970                
5971        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
5972          ## TODO:          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
5973          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
5974            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
5975              ## TODO: ...
5976              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
5977            } else { # ENTITY/NOTATION
5978              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
5979            }
5980                    
5981      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5982        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 5863  sub _get_next_token ($) { Line 5993  sub _get_next_token ($) {
5993          if ($self->{ct}->{type} == ATTLIST_TOKEN) {          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
5994            #            #
5995          } else {          } else {
5996            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md body'); ## TODO: type            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
5997          }          }
5998          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5999                    
# Line 6853  sub _get_next_token ($) { Line 6983  sub _get_next_token ($) {
6983        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
6984          ## XML5: No parse error.          ## XML5: No parse error.
6985          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
6986          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_MD_STATE;
         $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded  
6987          ## Reconsume.          ## Reconsume.
6988          redo A;          redo A;
6989        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
# Line 7141  sub _get_next_token ($) { Line 7270  sub _get_next_token ($) {
7270          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7271          ## Reconsume.          ## Reconsume.
7272          redo A;          redo A;
7273        }              }
7274    
7275        } elsif ($self->{state} == BOGUS_MD_STATE) {
7276          if ($self->{nc} == 0x003E) { # >
7277            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7278            
7279        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7280          $self->{line_prev} = $self->{line};
7281          $self->{column_prev} = $self->{column};
7282          $self->{column}++;
7283          $self->{nc}
7284              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7285        } else {
7286          $self->{set_nc}->($self);
7287        }
7288      
7289            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7290            redo A;
7291          } elsif ($self->{nc} == -1) {
7292            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7293            ## Reconsume.
7294            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7295            redo A;
7296          } else {
7297            ## Stay in the state.
7298            
7299        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7300          $self->{line_prev} = $self->{line};
7301          $self->{column_prev} = $self->{column};
7302          $self->{column}++;
7303          $self->{nc}
7304              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7305        } else {
7306          $self->{set_nc}->($self);
7307        }
7308      
7309            redo A;
7310          }
7311      } else {      } else {
7312        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
7313      }      }

Legend:
Removed from v.1.15  
changed lines
  Added in v.1.16

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24