/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.15 by wakaba, Sat Oct 18 08:05:29 2008 UTC | revision 1.18 by wakaba, Sun Oct 19 06:14:57 2008 UTC
# Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO
177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub BOGUS_MD_STATE () { 91 }
187    
188  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
189  ## list and descriptions)  ## list and descriptions)
# Line 3182  sub _get_next_token ($) { Line 3189  sub _get_next_token ($) {
3189        
3190          redo A;          redo A;
3191        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3192            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3193              
3194              $self->{state} = DATA_STATE;
3195              $self->{s_kwd} = '';
3196            } else {
3197              
3198              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3199              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3200            }
3201                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3202                    
3203      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3204        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3196  sub _get_next_token ($) { Line 3210  sub _get_next_token ($) {
3210        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3211      }      }
3212        
3213            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3214          redo A;          redo A;
3215        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3216            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3217              
3218              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3219              $self->{state} = DATA_STATE;
3220              $self->{s_kwd} = '';
3221              $self->{ct}->{quirks} = 1;
3222            } else {
3223              
3224              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3225              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3226            }
3227                    
3228          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3229          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{s_kwd} = '';  
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3230          redo A;          redo A;
3231        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3232                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
# Line 3245  sub _get_next_token ($) { Line 3262  sub _get_next_token ($) {
3262      }      }
3263        
3264          redo A;          redo A;
3265        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [  ## TODO: " and ' for ENTITY
3266          } elsif ($self->{is_xml} and
3267                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3268                   $self->{nc} == 0x005B) { # [
3269                    
3270          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3271          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
# Line 3264  sub _get_next_token ($) { Line 3284  sub _get_next_token ($) {
3284          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3285          redo A;          redo A;
3286        } else {        } else {
3287                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3288          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');  
3289          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3290              
3291              $self->{ct}->{quirks} = 1;
3292              $self->{state} = BOGUS_DOCTYPE_STATE;
3293            } else {
3294              
3295              $self->{state} = BOGUS_MD_STATE;
3296            }
3297    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3298                    
3299      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3300        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3340  sub _get_next_token ($) { Line 3366  sub _get_next_token ($) {
3366        
3367          redo A;          redo A;
3368        } else {        } else {
3369                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3370                          line => $self->{line_prev},                          line => $self->{line_prev},
3371                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3372          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3373              
3374          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3375              $self->{state} = BOGUS_DOCTYPE_STATE;
3376            } else {
3377              
3378              $self->{state} = BOGUS_MD_STATE;
3379            }
3380          ## Reconsume.          ## Reconsume.
3381          redo A;          redo A;
3382        }        }
# Line 3408  sub _get_next_token ($) { Line 3438  sub _get_next_token ($) {
3438        
3439          redo A;          redo A;
3440        } else {        } else {
3441                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3442                          line => $self->{line_prev},                          line => $self->{line_prev},
3443                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3444          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3445              
3446          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3447              $self->{state} = BOGUS_DOCTYPE_STATE;
3448            } else {
3449              
3450              $self->{state} = BOGUS_MD_STATE;
3451            }
3452          ## Reconsume.          ## Reconsume.
3453          redo A;          redo A;
3454        }        }
# Line 3467  sub _get_next_token ($) { Line 3501  sub _get_next_token ($) {
3501        
3502          redo A;          redo A;
3503        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3504          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3505            
3506          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3507          $self->{s_kwd} = '';            
3508              $self->{state} = DATA_STATE;
3509              $self->{s_kwd} = '';
3510              $self->{ct}->{quirks} = 1;
3511            } else {
3512              
3513              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3514            }
3515            
3516                    
3517      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3518        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3483  sub _get_next_token ($) { Line 3524  sub _get_next_token ($) {
3524        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3525      }      }
3526        
3527            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3528          redo A;          redo A;
3529        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3530            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3531              
3532              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3533              $self->{state} = DATA_STATE;
3534              $self->{s_kwd} = '';
3535              $self->{ct}->{quirks} = 1;
3536            } else {
3537              
3538              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3539              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3540            }
3541                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3542          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3543          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3544          redo A;          redo A;
3545        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
3546                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3547                   $self->{nc} == 0x005B) { # [
3548                    
3549          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
# Line 3520  sub _get_next_token ($) { Line 3564  sub _get_next_token ($) {
3564          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3565          redo A;          redo A;
3566        } else {        } else {
           
3567          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3568    
3569          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3570              
3571              $self->{ct}->{quirks} = 1;
3572              $self->{state} = BOGUS_DOCTYPE_STATE;
3573            } else {
3574              
3575              $self->{state} = BOGUS_MD_STATE;
3576            }
3577    
3578                    
3579      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3580        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3555  sub _get_next_token ($) { Line 3605  sub _get_next_token ($) {
3605        
3606          redo A;          redo A;
3607        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3608          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3609    
3610          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3611          $self->{s_kwd} = '';            
3612              $self->{state} = DATA_STATE;
3613              $self->{s_kwd} = '';
3614              $self->{ct}->{quirks} = 1;
3615            } else {
3616              
3617              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3618            }
3619    
3620                    
3621      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3622        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3571  sub _get_next_token ($) { Line 3628  sub _get_next_token ($) {
3628        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3629      }      }
3630        
3631            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3632          redo A;          redo A;
3633        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3634          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3635    
3636          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3637          $self->{s_kwd} = '';            
3638          ## reconsume            $self->{state} = DATA_STATE;
3639              $self->{s_kwd} = '';
3640          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
3641            } else {
3642              
3643              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3644            }
3645            
3646            ## Reconsume.
3647          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3648          redo A;          redo A;
3649        } else {        } else {
3650                    
3651          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3652          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3653                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3654    
# Line 3626  sub _get_next_token ($) { Line 3683  sub _get_next_token ($) {
3683        
3684          redo A;          redo A;
3685        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3686          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3687    
3688          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3689          $self->{s_kwd} = '';            
3690              $self->{state} = DATA_STATE;
3691              $self->{s_kwd} = '';
3692              $self->{ct}->{quirks} = 1;
3693            } else {
3694              
3695              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3696            }
3697    
3698                    
3699      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3700        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3642  sub _get_next_token ($) { Line 3706  sub _get_next_token ($) {
3706        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3707      }      }
3708        
3709            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3710          redo A;          redo A;
3711        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3712          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3713    
3714          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3715          $self->{s_kwd} = '';            
3716              $self->{state} = DATA_STATE;
3717              $self->{s_kwd} = '';
3718              $self->{ct}->{quirks} = 1;
3719            } else {
3720              
3721              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3722            }
3723          
3724          ## reconsume          ## reconsume
3725            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3726          redo A;          redo A;
3727        } else {        } else {
3728                    
3729          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3730          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3731                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3732    
# Line 3698  sub _get_next_token ($) { Line 3762  sub _get_next_token ($) {
3762          redo A;          redo A;
3763        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
3764                    
3765          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3766          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3767                    
3768      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3714  sub _get_next_token ($) { Line 3778  sub _get_next_token ($) {
3778          redo A;          redo A;
3779        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
3780                    
3781          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3782          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3783                    
3784      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3729  sub _get_next_token ($) { Line 3793  sub _get_next_token ($) {
3793        
3794          redo A;          redo A;
3795        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3796          if ($self->{is_xml}) {          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3797                        if ($self->{is_xml}) {
3798            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');              
3799                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3800              } else {
3801                
3802              }
3803              $self->{state} = DATA_STATE;
3804              $self->{s_kwd} = '';
3805          } else {          } else {
3806                        if ($self->{ct}->{type} == NOTATION_TOKEN) {
3807                
3808              } else {
3809                
3810                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
3811              }
3812              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3813          }          }
3814          $self->{state} = DATA_STATE;          
         $self->{s_kwd} = '';  
3815                    
3816      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3817        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3748  sub _get_next_token ($) { Line 3823  sub _get_next_token ($) {
3823        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3824      }      }
3825        
3826            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3827          redo A;          redo A;
3828        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3829            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3830              
3831              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3832              
3833              $self->{state} = DATA_STATE;
3834              $self->{s_kwd} = '';
3835              $self->{ct}->{quirks} = 1;
3836            } else {
3837              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3838              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3839            }
3840                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3841          ## reconsume          ## reconsume
3842            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3843          redo A;          redo A;
3844        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
3845                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3846                   $self->{nc} == 0x005B) { # [
3847                    
3848          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3849          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
# Line 3784  sub _get_next_token ($) { Line 3863  sub _get_next_token ($) {
3863          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3864          redo A;          redo A;
3865        } else {        } else {
           
3866          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
3867    
3868          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3869              
3870              $self->{ct}->{quirks} = 1;
3871              $self->{state} = BOGUS_DOCTYPE_STATE;
3872            } else {
3873              
3874              $self->{state} = BOGUS_MD_STATE;
3875            }
3876    
3877                    
3878      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3879        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3851  sub _get_next_token ($) { Line 3936  sub _get_next_token ($) {
3936        
3937          redo A;          redo A;
3938        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3939          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3940                    
3941      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3942        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3867  sub _get_next_token ($) { Line 3949  sub _get_next_token ($) {
3949      }      }
3950        
3951    
3952          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3953          return  ($self->{ct}); # DOCTYPE            
3954              $self->{state} = DATA_STATE;
3955              $self->{s_kwd} = '';
3956              $self->{ct}->{quirks} = 1;
3957            } else {
3958              
3959              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3960            }
3961    
3962            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3963          redo A;          redo A;
3964        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3965            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3966              
3967              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3968              $self->{state} = DATA_STATE;
3969              $self->{s_kwd} = '';
3970              $self->{ct}->{quirks} = 1;
3971            } else {
3972              
3973              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3974              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3975            }
3976                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3977          ## reconsume          ## reconsume
3978            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3979          redo A;          redo A;
3980        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
3981                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3982                   $self->{nc} == 0x005B) { # [
3983                    
3984          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985    
# Line 3904  sub _get_next_token ($) { Line 4000  sub _get_next_token ($) {
4000          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4001          redo A;          redo A;
4002        } else {        } else {
           
4003          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4004    
4005          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4006                        
4007              $self->{ct}->{quirks} = 1;
4008              $self->{state} = BOGUS_DOCTYPE_STATE;
4009            } else {
4010              
4011              $self->{state} = BOGUS_MD_STATE;
4012            }
4013    
4014                    
4015      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4016        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3939  sub _get_next_token ($) { Line 4041  sub _get_next_token ($) {
4041        
4042          redo A;          redo A;
4043        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4044          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4045    
4046          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4047          $self->{s_kwd} = '';            
4048              $self->{state} = DATA_STATE;
4049              $self->{s_kwd} = '';
4050              $self->{ct}->{quirks} = 1;
4051            } else {
4052              
4053              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4054            }
4055            
4056                    
4057      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4058        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3955  sub _get_next_token ($) { Line 4064  sub _get_next_token ($) {
4064        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4065      }      }
4066        
4067            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4068          redo A;          redo A;
4069        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4070          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4071    
4072          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4073          $self->{s_kwd} = '';            
4074              $self->{state} = DATA_STATE;
4075              $self->{s_kwd} = '';
4076              $self->{ct}->{quirks} = 1;
4077            } else {
4078              
4079              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4080            }
4081            
4082          ## reconsume          ## reconsume
4083            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4084          redo A;          redo A;
4085        } else {        } else {
4086                    
4087          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4088          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4089                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4090    
# Line 4032  sub _get_next_token ($) { Line 4141  sub _get_next_token ($) {
4141    
4142          redo A;          redo A;
4143        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4144          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4145    
4146          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4147          $self->{s_kwd} = '';            
4148          ## reconsume            $self->{state} = DATA_STATE;
4149              $self->{s_kwd} = '';
4150          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
4151          return  ($self->{ct}); # DOCTYPE          } else {
4152              
4153              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4154            }
4155    
4156            ## reconsume
4157            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4158          redo A;          redo A;
4159        } else {        } else {
4160                    
4161          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4162          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4163                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4164    
# Line 4066  sub _get_next_token ($) { Line 4178  sub _get_next_token ($) {
4178        }        }
4179      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4180        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4181                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4182          ## Stay in the state            
4183              $self->{state} = BEFORE_NDATA_STATE;
4184            } else {
4185              
4186              ## Stay in the state
4187            }
4188                    
4189      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4190        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4081  sub _get_next_token ($) { Line 4198  sub _get_next_token ($) {
4198        
4199          redo A;          redo A;
4200        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4201            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4202              
4203              $self->{state} = DATA_STATE;
4204              $self->{s_kwd} = '';
4205            } else {
4206              
4207              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4208            }
4209    
4210                    
4211          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4212          $self->{s_kwd} = '';        $self->{line_prev} = $self->{line};
4213          $self->{column_prev} = $self->{column};
4214          $self->{column}++;
4215          $self->{nc}
4216              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4217        } else {
4218          $self->{set_nc}->($self);
4219        }
4220      
4221            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4222            redo A;
4223          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4224                   ($self->{nc} == 0x004E or # N
4225                    $self->{nc} == 0x006E)) { # n
4226            
4227            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4228            $self->{state} = NDATA_STATE;
4229            $self->{kwd} = chr $self->{nc};
4230                    
4231      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4232        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4095  sub _get_next_token ($) { Line 4238  sub _get_next_token ($) {
4238        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4239      }      }
4240        
   
         return  ($self->{ct}); # DOCTYPE  
   
4241          redo A;          redo A;
4242        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4243                    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4244          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');            
4245          $self->{state} = DATA_STATE;            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4246          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
4247          ## reconsume            $self->{s_kwd} = '';
4248              $self->{ct}->{quirks} = 1;
4249          $self->{ct}->{quirks} = 1;          } else {
4250          return  ($self->{ct}); # DOCTYPE            
4251              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4252              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253            }
4254    
4255            ## reconsume
4256            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4257          redo A;          redo A;
4258        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
4259                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4260                   $self->{nc} == 0x005B) { # [
4261                    
4262          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4263          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
# Line 4129  sub _get_next_token ($) { Line 4276  sub _get_next_token ($) {
4276          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4277          redo A;          redo A;
4278        } else {        } else {
           
4279          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
         #$self->{ct}->{quirks} = 1;  
4280    
4281          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4282              
4283              #$self->{ct}->{quirks} = 1;
4284              $self->{state} = BOGUS_DOCTYPE_STATE;
4285            } else {
4286              
4287              $self->{state} = BOGUS_MD_STATE;
4288            }
4289    
4290            
4291        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4292          $self->{line_prev} = $self->{line};
4293          $self->{column_prev} = $self->{column};
4294          $self->{column}++;
4295          $self->{nc}
4296              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4297        } else {
4298          $self->{set_nc}->($self);
4299        }
4300      
4301            redo A;
4302          }
4303        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4304          if ($is_space->{$self->{nc}}) {
4305            
4306            ## Stay in the state.
4307            
4308        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4309          $self->{line_prev} = $self->{line};
4310          $self->{column_prev} = $self->{column};
4311          $self->{column}++;
4312          $self->{nc}
4313              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4314        } else {
4315          $self->{set_nc}->($self);
4316        }
4317      
4318            redo A;
4319          } elsif ($self->{nc} == 0x003E) { # >
4320            
4321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4322            
4323        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4324          $self->{line_prev} = $self->{line};
4325          $self->{column_prev} = $self->{column};
4326          $self->{column}++;
4327          $self->{nc}
4328              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4329        } else {
4330          $self->{set_nc}->($self);
4331        }
4332      
4333            return  ($self->{ct}); # ENTITY
4334            redo A;
4335          } elsif ($self->{nc} == 0x004E or # N
4336                   $self->{nc} == 0x006E) { # n
4337            
4338            $self->{state} = NDATA_STATE;
4339            $self->{kwd} = chr $self->{nc};
4340            
4341        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4342          $self->{line_prev} = $self->{line};
4343          $self->{column_prev} = $self->{column};
4344          $self->{column}++;
4345          $self->{nc}
4346              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4347        } else {
4348          $self->{set_nc}->($self);
4349        }
4350      
4351            redo A;
4352          } elsif ($self->{nc} == -1) {
4353            
4354            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4355            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4356            ## reconsume
4357            return  ($self->{ct}); # ENTITY
4358            redo A;
4359          } else {
4360            
4361            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4362            $self->{state} = BOGUS_MD_STATE;
4363                    
4364      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4365        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 5358  sub _get_next_token ($) { Line 5584  sub _get_next_token ($) {
5584      }      }
5585        
5586          redo A;          redo A;
5587        } elsif ($self->{nc} == 0x0045) { # E        } elsif ($self->{nc} == 0x0045 or # E
5588                   $self->{nc} == 0x0065) { # e
5589          $self->{state} = MD_E_STATE;          $self->{state} = MD_E_STATE;
5590          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
5591                    
# Line 5373  sub _get_next_token ($) { Line 5600  sub _get_next_token ($) {
5600      }      }
5601        
5602          redo A;          redo A;
5603        } elsif ($self->{nc} == 0x0041) { # A        } elsif ($self->{nc} == 0x0041 or # A
5604                   $self->{nc} == 0x0061) { # a
5605          $self->{state} = MD_ATTLIST_STATE;          $self->{state} = MD_ATTLIST_STATE;
5606          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
5607                    
# Line 5388  sub _get_next_token ($) { Line 5616  sub _get_next_token ($) {
5616      }      }
5617        
5618          redo A;          redo A;
5619        } elsif ($self->{nc} == 0x004E) { # N        } elsif ($self->{nc} == 0x004E or # N
5620                   $self->{nc} == 0x006E) { # n
5621          $self->{state} = MD_NOTATION_STATE;          $self->{state} = MD_NOTATION_STATE;
5622          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
5623                    
# Line 5416  sub _get_next_token ($) { Line 5645  sub _get_next_token ($) {
5645        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5646        redo A;        redo A;
5647      } elsif ($self->{state} == MD_E_STATE) {      } elsif ($self->{state} == MD_E_STATE) {
5648        if ($self->{nc} == 0x004E) { # N        if ($self->{nc} == 0x004E or # N
5649              $self->{nc} == 0x006E) { # n
5650          $self->{state} = MD_ENTITY_STATE;          $self->{state} = MD_ENTITY_STATE;
5651          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5652                    
# Line 5431  sub _get_next_token ($) { Line 5661  sub _get_next_token ($) {
5661      }      }
5662        
5663          redo A;          redo A;
5664        } elsif ($self->{nc} == 0x004C) { # L        } elsif ($self->{nc} == 0x004C or # L
5665                   $self->{nc} == 0x006C) { # l
5666          ## XML5: <!ELEMENT> not supported.          ## XML5: <!ELEMENT> not supported.
5667          $self->{state} = MD_ELEMENT_STATE;          $self->{state} = MD_ELEMENT_STATE;
5668          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
# Line 5459  sub _get_next_token ($) { Line 5690  sub _get_next_token ($) {
5690          redo A;          redo A;
5691        }        }
5692      } elsif ($self->{state} == MD_ENTITY_STATE) {      } elsif ($self->{state} == MD_ENTITY_STATE) {
5693        if ($self->{nc} == {        if ($self->{nc} == [
5694              'EN' => 0x0054, # T              undef,
5695              'ENT' => 0x0049, # I              undef,
5696              'ENTI' => 0x0054, # T              0x0054, # T
5697            }->{$self->{kwd}}) {              0x0049, # I
5698                0x0054, # T
5699              ]->[length $self->{kwd}] or
5700              $self->{nc} == [
5701                undef,
5702                undef,
5703                0x0074, # t
5704                0x0069, # i
5705                0x0074, # t
5706              ]->[length $self->{kwd}]) {
5707          ## Stay in the state.          ## Stay in the state.
5708          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5709                    
# Line 5478  sub _get_next_token ($) { Line 5718  sub _get_next_token ($) {
5718      }      }
5719        
5720          redo A;          redo A;
5721        } elsif ($self->{kwd} eq 'ENTIT' and        } elsif ((length $self->{kwd}) == 5 and
5722                 $self->{nc} == 0x0059) { # Y                 ($self->{nc} == 0x0059 or # Y
5723          $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',                  $self->{nc} == 0x0079)) { # y
5724            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5725              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5726                              text => 'ENTITY',
5727                              line => $self->{line_prev},
5728                              column => $self->{column_prev} - 4);
5729            }
5730            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5731                         line => $self->{line_prev},                         line => $self->{line_prev},
5732                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
5733          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
# Line 5508  sub _get_next_token ($) { Line 5755  sub _get_next_token ($) {
5755          redo A;          redo A;
5756        }        }
5757      } elsif ($self->{state} == MD_ELEMENT_STATE) {      } elsif ($self->{state} == MD_ELEMENT_STATE) {
5758        if ($self->{nc} == {        if ($self->{nc} == [
5759              'EL' => 0x0045, # E             undef,
5760              'ELE' => 0x004D, # M             undef,
5761              'ELEM' => 0x0045, # E             0x0045, # E
5762              'ELEME' => 0x004E, # N             0x004D, # M
5763            }->{$self->{kwd}}) {             0x0045, # E
5764               0x004E, # N
5765              ]->[length $self->{kwd}] or
5766              $self->{nc} == [
5767               undef,
5768               undef,
5769               0x0065, # e
5770               0x006D, # m
5771               0x0065, # e
5772               0x006E, # n
5773              ]->[length $self->{kwd}]) {
5774          ## Stay in the state.          ## Stay in the state.
5775          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5776                    
# Line 5528  sub _get_next_token ($) { Line 5785  sub _get_next_token ($) {
5785      }      }
5786        
5787          redo A;          redo A;
5788        } elsif ($self->{kwd} eq 'ELEMEN' and        } elsif ((length $self->{kwd}) == 6 and
5789                 $self->{nc} == 0x0054) { # T                 ($self->{nc} == 0x0054 or # T
5790                    $self->{nc} == 0x0074)) { # t
5791            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5792              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5793                              text => 'ELEMENT',
5794                              line => $self->{line_prev},
5795                              column => $self->{column_prev} - 5);
5796            }
5797          $self->{ct} = {type => ELEMENT_TOKEN, name => '',          $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5798                         line => $self->{line_prev},                         line => $self->{line_prev},
5799                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
# Line 5558  sub _get_next_token ($) { Line 5822  sub _get_next_token ($) {
5822          redo A;          redo A;
5823        }        }
5824      } elsif ($self->{state} == MD_ATTLIST_STATE) {      } elsif ($self->{state} == MD_ATTLIST_STATE) {
5825        if ($self->{nc} == {        if ($self->{nc} == [
5826              'A' => 0x0054, # T             undef,
5827              'AT' => 0x0054, # T             0x0054, # T
5828              'ATT' => 0x004C, # L             0x0054, # T
5829              'ATTL' => 0x0049, # I             0x004C, # L
5830              'ATTLI' => 0x0053, # S             0x0049, # I
5831            }->{$self->{kwd}}) {             0x0053, # S
5832              ]->[length $self->{kwd}] or
5833              $self->{nc} == [
5834               undef,
5835               0x0074, # t
5836               0x0074, # t
5837               0x006C, # l
5838               0x0069, # i
5839               0x0073, # s
5840              ]->[length $self->{kwd}]) {
5841          ## Stay in the state.          ## Stay in the state.
5842          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5843                    
# Line 5579  sub _get_next_token ($) { Line 5852  sub _get_next_token ($) {
5852      }      }
5853        
5854          redo A;          redo A;
5855        } elsif ($self->{kwd} eq 'ATTLIS' and        } elsif ((length $self->{kwd}) == 6 and
5856                 $self->{nc} == 0x0054) { # T                 ($self->{nc} == 0x0054 or # T
5857                    $self->{nc} == 0x0074)) { # t
5858            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5859              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5860                              text => 'ATTLIST',
5861                              line => $self->{line_prev},
5862                              column => $self->{column_prev} - 5);
5863            }
5864          $self->{ct} = {type => ATTLIST_TOKEN, name => '',          $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5865                         attrdefs => [],                         attrdefs => [],
5866                         line => $self->{line_prev},                         line => $self->{line_prev},
# Line 5610  sub _get_next_token ($) { Line 5890  sub _get_next_token ($) {
5890          redo A;          redo A;
5891        }        }
5892      } elsif ($self->{state} == MD_NOTATION_STATE) {      } elsif ($self->{state} == MD_NOTATION_STATE) {
5893        if ($self->{nc} == {        if ($self->{nc} == [
5894              'N' => 0x004F, # O             undef,
5895              'NO' => 0x0054, # T             0x004F, # O
5896              'NOT' => 0x0041, # A             0x0054, # T
5897              'NOTA' => 0x0054, # T             0x0041, # A
5898              'NOTAT' => 0x0049, # I             0x0054, # T
5899              'NOTATI' => 0x004F, # O             0x0049, # I
5900            }->{$self->{kwd}}) {             0x004F, # O
5901              ]->[length $self->{kwd}] or
5902              $self->{nc} == [
5903               undef,
5904               0x006F, # o
5905               0x0074, # t
5906               0x0061, # a
5907               0x0074, # t
5908               0x0069, # i
5909               0x006F, # o
5910              ]->[length $self->{kwd}]) {
5911          ## Stay in the state.          ## Stay in the state.
5912          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5913                    
# Line 5632  sub _get_next_token ($) { Line 5922  sub _get_next_token ($) {
5922      }      }
5923        
5924          redo A;          redo A;
5925        } elsif ($self->{kwd} eq 'NOTATIO' and        } elsif ((length $self->{kwd}) == 7 and
5926                 $self->{nc} == 0x004E) { # N                 ($self->{nc} == 0x004E or # N
5927                    $self->{nc} == 0x006E)) { # n
5928            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
5929              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5930                              text => 'NOTATION',
5931                              line => $self->{line_prev},
5932                              column => $self->{column_prev} - 6);
5933            }
5934          $self->{ct} = {type => NOTATION_TOKEN, name => '',          $self->{ct} = {type => NOTATION_TOKEN, name => '',
5935                         line => $self->{line_prev},                         line => $self->{line_prev},
5936                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
# Line 5845  sub _get_next_token ($) { Line 6142  sub _get_next_token ($) {
6142        ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".        ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6143                
6144        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
6145          ## TODO:          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6146          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6147            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6148              ## TODO: ...
6149              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6150            } else { # ENTITY/NOTATION
6151              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6152            }
6153                    
6154      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6155        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 5863  sub _get_next_token ($) { Line 6166  sub _get_next_token ($) {
6166          if ($self->{ct}->{type} == ATTLIST_TOKEN) {          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6167            #            #
6168          } else {          } else {
6169            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md body'); ## TODO: type            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6170          }          }
6171          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6172                    
# Line 6853  sub _get_next_token ($) { Line 7156  sub _get_next_token ($) {
7156        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
7157          ## XML5: No parse error.          ## XML5: No parse error.
7158          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7159          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_MD_STATE;
         $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded  
7160          ## Reconsume.          ## Reconsume.
7161          redo A;          redo A;
7162        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
# Line 7141  sub _get_next_token ($) { Line 7443  sub _get_next_token ($) {
7443          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7444          ## Reconsume.          ## Reconsume.
7445          redo A;          redo A;
7446        }              }
7447        } elsif ($self->{state} == NDATA_STATE) {
7448          ## ASCII case-insensitive
7449          if ($self->{nc} == [
7450                undef,
7451                0x0044, # D
7452                0x0041, # A
7453                0x0054, # T
7454              ]->[length $self->{kwd}] or
7455              $self->{nc} == [
7456                undef,
7457                0x0064, # d
7458                0x0061, # a
7459                0x0074, # t
7460              ]->[length $self->{kwd}]) {
7461            
7462            ## Stay in the state.
7463            $self->{kwd} .= chr $self->{nc};
7464            
7465        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7466          $self->{line_prev} = $self->{line};
7467          $self->{column_prev} = $self->{column};
7468          $self->{column}++;
7469          $self->{nc}
7470              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7471        } else {
7472          $self->{set_nc}->($self);
7473        }
7474      
7475            redo A;
7476          } elsif ((length $self->{kwd}) == 4 and
7477                   ($self->{nc} == 0x0041 or # A
7478                    $self->{nc} == 0x0061)) { # a
7479            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7480              
7481              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7482                              text => 'NDATA',
7483                              line => $self->{line_prev},
7484                              column => $self->{column_prev} - 4);
7485            } else {
7486              
7487            }
7488            $self->{state} = AFTER_NDATA_STATE;
7489            
7490        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7491          $self->{line_prev} = $self->{line};
7492          $self->{column_prev} = $self->{column};
7493          $self->{column}++;
7494          $self->{nc}
7495              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7496        } else {
7497          $self->{set_nc}->($self);
7498        }
7499      
7500            redo A;
7501          } else {
7502            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7503                            line => $self->{line_prev},
7504                            column => $self->{column_prev} + 1
7505                                - length $self->{kwd});
7506            
7507            $self->{state} = BOGUS_MD_STATE;
7508            ## Reconsume.
7509            redo A;
7510          }
7511        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7512          if ($is_space->{$self->{nc}}) {
7513            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7514            
7515        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7516          $self->{line_prev} = $self->{line};
7517          $self->{column_prev} = $self->{column};
7518          $self->{column}++;
7519          $self->{nc}
7520              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7521        } else {
7522          $self->{set_nc}->($self);
7523        }
7524      
7525            redo A;
7526          } elsif ($self->{nc} == 0x003E) { # >
7527            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7528            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7529            
7530        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7531          $self->{line_prev} = $self->{line};
7532          $self->{column_prev} = $self->{column};
7533          $self->{column}++;
7534          $self->{nc}
7535              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7536        } else {
7537          $self->{set_nc}->($self);
7538        }
7539      
7540            return  ($self->{ct}); # ENTITY
7541            redo A;
7542          } elsif ($self->{nc} == -1) {
7543            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7544            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7545            
7546        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7547          $self->{line_prev} = $self->{line};
7548          $self->{column_prev} = $self->{column};
7549          $self->{column}++;
7550          $self->{nc}
7551              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7552        } else {
7553          $self->{set_nc}->($self);
7554        }
7555      
7556            return  ($self->{ct}); # ENTITY
7557            redo A;
7558          } else {
7559            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7560                            line => $self->{line_prev},
7561                            column => $self->{column_prev} + 1
7562                                - length $self->{kwd});
7563            $self->{state} = BOGUS_MD_STATE;
7564            ## Reconsume.
7565            redo A;
7566          }
7567        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7568          if ($is_space->{$self->{nc}}) {
7569            ## Stay in the state.
7570            
7571        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7572          $self->{line_prev} = $self->{line};
7573          $self->{column_prev} = $self->{column};
7574          $self->{column}++;
7575          $self->{nc}
7576              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7577        } else {
7578          $self->{set_nc}->($self);
7579        }
7580      
7581            redo A;
7582          } elsif ($self->{nc} == 0x003E) { # >
7583            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7584            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7585            
7586        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7587          $self->{line_prev} = $self->{line};
7588          $self->{column_prev} = $self->{column};
7589          $self->{column}++;
7590          $self->{nc}
7591              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7592        } else {
7593          $self->{set_nc}->($self);
7594        }
7595      
7596            return  ($self->{ct}); # ENTITY
7597            redo A;
7598          } elsif ($self->{nc} == -1) {
7599            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7600            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7601            
7602        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7603          $self->{line_prev} = $self->{line};
7604          $self->{column_prev} = $self->{column};
7605          $self->{column}++;
7606          $self->{nc}
7607              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7608        } else {
7609          $self->{set_nc}->($self);
7610        }
7611      
7612            return  ($self->{ct}); # ENTITY
7613            redo A;
7614          } else {
7615            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7616            $self->{state} = NOTATION_NAME_STATE;
7617            
7618        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7619          $self->{line_prev} = $self->{line};
7620          $self->{column_prev} = $self->{column};
7621          $self->{column}++;
7622          $self->{nc}
7623              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7624        } else {
7625          $self->{set_nc}->($self);
7626        }
7627      
7628            redo A;
7629          }
7630        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7631          if ($is_space->{$self->{nc}}) {
7632            $self->{state} = AFTER_NOTATION_NAME_STATE;
7633            
7634        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7635          $self->{line_prev} = $self->{line};
7636          $self->{column_prev} = $self->{column};
7637          $self->{column}++;
7638          $self->{nc}
7639              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7640        } else {
7641          $self->{set_nc}->($self);
7642        }
7643      
7644            redo A;
7645          } elsif ($self->{nc} == 0x003E) { # >
7646            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7647            
7648        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7649          $self->{line_prev} = $self->{line};
7650          $self->{column_prev} = $self->{column};
7651          $self->{column}++;
7652          $self->{nc}
7653              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7654        } else {
7655          $self->{set_nc}->($self);
7656        }
7657      
7658            return  ($self->{ct}); # ENTITY
7659            redo A;
7660          } elsif ($self->{nc} == -1) {
7661            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7662            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7663            
7664        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7665          $self->{line_prev} = $self->{line};
7666          $self->{column_prev} = $self->{column};
7667          $self->{column}++;
7668          $self->{nc}
7669              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7670        } else {
7671          $self->{set_nc}->($self);
7672        }
7673      
7674            return  ($self->{ct}); # ENTITY
7675            redo A;
7676          } else {
7677            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7678            ## Stay in the state.
7679            
7680        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7681          $self->{line_prev} = $self->{line};
7682          $self->{column_prev} = $self->{column};
7683          $self->{column}++;
7684          $self->{nc}
7685              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7686        } else {
7687          $self->{set_nc}->($self);
7688        }
7689      
7690            redo A;
7691          }
7692        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
7693          if ($is_space->{$self->{nc}}) {
7694            ## Stay in the state.
7695            
7696        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7697          $self->{line_prev} = $self->{line};
7698          $self->{column_prev} = $self->{column};
7699          $self->{column}++;
7700          $self->{nc}
7701              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7702        } else {
7703          $self->{set_nc}->($self);
7704        }
7705      
7706            redo A;
7707          } elsif ($self->{nc} == 0x003E) { # >
7708            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7709            
7710        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7711          $self->{line_prev} = $self->{line};
7712          $self->{column_prev} = $self->{column};
7713          $self->{column}++;
7714          $self->{nc}
7715              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7716        } else {
7717          $self->{set_nc}->($self);
7718        }
7719      
7720            return  ($self->{ct}); # ENTITY
7721            redo A;
7722          } elsif ($self->{nc} == -1) {
7723            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7724            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7725            
7726        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7727          $self->{line_prev} = $self->{line};
7728          $self->{column_prev} = $self->{column};
7729          $self->{column}++;
7730          $self->{nc}
7731              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7732        } else {
7733          $self->{set_nc}->($self);
7734        }
7735      
7736            return  ($self->{ct}); # ENTITY
7737            redo A;
7738          } else {
7739            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after notation name'); ## TODO: type
7740            $self->{state} = BOGUS_MD_STATE;
7741            ## Reconsume.
7742            redo A;
7743          }
7744    
7745    
7746        } elsif ($self->{state} == BOGUS_MD_STATE) {
7747          if ($self->{nc} == 0x003E) { # >
7748            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7749            
7750        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7751          $self->{line_prev} = $self->{line};
7752          $self->{column_prev} = $self->{column};
7753          $self->{column}++;
7754          $self->{nc}
7755              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7756        } else {
7757          $self->{set_nc}->($self);
7758        }
7759      
7760            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7761            redo A;
7762          } elsif ($self->{nc} == -1) {
7763            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7764            ## Reconsume.
7765            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7766            redo A;
7767          } else {
7768            ## Stay in the state.
7769            
7770        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7771          $self->{line_prev} = $self->{line};
7772          $self->{column_prev} = $self->{column};
7773          $self->{column}++;
7774          $self->{nc}
7775              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7776        } else {
7777          $self->{set_nc}->($self);
7778        }
7779      
7780            redo A;
7781          }
7782      } else {      } else {
7783        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
7784      }      }

Legend:
Removed from v.1.15  
changed lines
  Added in v.1.18

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24