/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.17 by wakaba, Sun Oct 19 04:39:25 2008 UTC | revision 1.18 by wakaba, Sun Oct 19 06:14:57 2008 UTC
# Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO
177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180  sub BOGUS_MD_STATE () { 85 }  sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub BOGUS_MD_STATE () { 91 }
187    
188  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
189  ## list and descriptions)  ## list and descriptions)
# Line 4172  sub _get_next_token ($) { Line 4178  sub _get_next_token ($) {
4178        }        }
4179      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4180        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4181                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4182          ## Stay in the state            
4183              $self->{state} = BEFORE_NDATA_STATE;
4184            } else {
4185              
4186              ## Stay in the state
4187            }
4188                    
4189      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4190        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4209  sub _get_next_token ($) { Line 4220  sub _get_next_token ($) {
4220        
4221          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4222          redo A;          redo A;
4223  ## TODO: "NDATA"        } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4224                   ($self->{nc} == 0x004E or # N
4225                    $self->{nc} == 0x006E)) { # n
4226            
4227            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4228            $self->{state} = NDATA_STATE;
4229            $self->{kwd} = chr $self->{nc};
4230            
4231        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4232          $self->{line_prev} = $self->{line};
4233          $self->{column_prev} = $self->{column};
4234          $self->{column}++;
4235          $self->{nc}
4236              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4237        } else {
4238          $self->{set_nc}->($self);
4239        }
4240      
4241            redo A;
4242        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4243          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4244                        
# Line 4271  sub _get_next_token ($) { Line 4300  sub _get_next_token ($) {
4300        
4301          redo A;          redo A;
4302        }        }
4303        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4304          if ($is_space->{$self->{nc}}) {
4305            
4306            ## Stay in the state.
4307            
4308        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4309          $self->{line_prev} = $self->{line};
4310          $self->{column_prev} = $self->{column};
4311          $self->{column}++;
4312          $self->{nc}
4313              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4314        } else {
4315          $self->{set_nc}->($self);
4316        }
4317      
4318            redo A;
4319          } elsif ($self->{nc} == 0x003E) { # >
4320            
4321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4322            
4323        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4324          $self->{line_prev} = $self->{line};
4325          $self->{column_prev} = $self->{column};
4326          $self->{column}++;
4327          $self->{nc}
4328              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4329        } else {
4330          $self->{set_nc}->($self);
4331        }
4332      
4333            return  ($self->{ct}); # ENTITY
4334            redo A;
4335          } elsif ($self->{nc} == 0x004E or # N
4336                   $self->{nc} == 0x006E) { # n
4337            
4338            $self->{state} = NDATA_STATE;
4339            $self->{kwd} = chr $self->{nc};
4340            
4341        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4342          $self->{line_prev} = $self->{line};
4343          $self->{column_prev} = $self->{column};
4344          $self->{column}++;
4345          $self->{nc}
4346              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4347        } else {
4348          $self->{set_nc}->($self);
4349        }
4350      
4351            redo A;
4352          } elsif ($self->{nc} == -1) {
4353            
4354            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4355            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4356            ## reconsume
4357            return  ($self->{ct}); # ENTITY
4358            redo A;
4359          } else {
4360            
4361            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4362            $self->{state} = BOGUS_MD_STATE;
4363            
4364        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4365          $self->{line_prev} = $self->{line};
4366          $self->{column_prev} = $self->{column};
4367          $self->{column}++;
4368          $self->{nc}
4369              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4370        } else {
4371          $self->{set_nc}->($self);
4372        }
4373      
4374            redo A;
4375          }
4376      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4377        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4378                    
# Line 7342  sub _get_next_token ($) { Line 7444  sub _get_next_token ($) {
7444          ## Reconsume.          ## Reconsume.
7445          redo A;          redo A;
7446        }        }
7447        } elsif ($self->{state} == NDATA_STATE) {
7448          ## ASCII case-insensitive
7449          if ($self->{nc} == [
7450                undef,
7451                0x0044, # D
7452                0x0041, # A
7453                0x0054, # T
7454              ]->[length $self->{kwd}] or
7455              $self->{nc} == [
7456                undef,
7457                0x0064, # d
7458                0x0061, # a
7459                0x0074, # t
7460              ]->[length $self->{kwd}]) {
7461            
7462            ## Stay in the state.
7463            $self->{kwd} .= chr $self->{nc};
7464            
7465        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7466          $self->{line_prev} = $self->{line};
7467          $self->{column_prev} = $self->{column};
7468          $self->{column}++;
7469          $self->{nc}
7470              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7471        } else {
7472          $self->{set_nc}->($self);
7473        }
7474      
7475            redo A;
7476          } elsif ((length $self->{kwd}) == 4 and
7477                   ($self->{nc} == 0x0041 or # A
7478                    $self->{nc} == 0x0061)) { # a
7479            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7480              
7481              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7482                              text => 'NDATA',
7483                              line => $self->{line_prev},
7484                              column => $self->{column_prev} - 4);
7485            } else {
7486              
7487            }
7488            $self->{state} = AFTER_NDATA_STATE;
7489            
7490        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7491          $self->{line_prev} = $self->{line};
7492          $self->{column_prev} = $self->{column};
7493          $self->{column}++;
7494          $self->{nc}
7495              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7496        } else {
7497          $self->{set_nc}->($self);
7498        }
7499      
7500            redo A;
7501          } else {
7502            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7503                            line => $self->{line_prev},
7504                            column => $self->{column_prev} + 1
7505                                - length $self->{kwd});
7506            
7507            $self->{state} = BOGUS_MD_STATE;
7508            ## Reconsume.
7509            redo A;
7510          }
7511        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7512          if ($is_space->{$self->{nc}}) {
7513            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7514            
7515        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7516          $self->{line_prev} = $self->{line};
7517          $self->{column_prev} = $self->{column};
7518          $self->{column}++;
7519          $self->{nc}
7520              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7521        } else {
7522          $self->{set_nc}->($self);
7523        }
7524      
7525            redo A;
7526          } elsif ($self->{nc} == 0x003E) { # >
7527            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7528            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7529            
7530        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7531          $self->{line_prev} = $self->{line};
7532          $self->{column_prev} = $self->{column};
7533          $self->{column}++;
7534          $self->{nc}
7535              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7536        } else {
7537          $self->{set_nc}->($self);
7538        }
7539      
7540            return  ($self->{ct}); # ENTITY
7541            redo A;
7542          } elsif ($self->{nc} == -1) {
7543            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7544            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7545            
7546        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7547          $self->{line_prev} = $self->{line};
7548          $self->{column_prev} = $self->{column};
7549          $self->{column}++;
7550          $self->{nc}
7551              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7552        } else {
7553          $self->{set_nc}->($self);
7554        }
7555      
7556            return  ($self->{ct}); # ENTITY
7557            redo A;
7558          } else {
7559            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7560                            line => $self->{line_prev},
7561                            column => $self->{column_prev} + 1
7562                                - length $self->{kwd});
7563            $self->{state} = BOGUS_MD_STATE;
7564            ## Reconsume.
7565            redo A;
7566          }
7567        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7568          if ($is_space->{$self->{nc}}) {
7569            ## Stay in the state.
7570            
7571        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7572          $self->{line_prev} = $self->{line};
7573          $self->{column_prev} = $self->{column};
7574          $self->{column}++;
7575          $self->{nc}
7576              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7577        } else {
7578          $self->{set_nc}->($self);
7579        }
7580      
7581            redo A;
7582          } elsif ($self->{nc} == 0x003E) { # >
7583            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7584            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7585            
7586        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7587          $self->{line_prev} = $self->{line};
7588          $self->{column_prev} = $self->{column};
7589          $self->{column}++;
7590          $self->{nc}
7591              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7592        } else {
7593          $self->{set_nc}->($self);
7594        }
7595      
7596            return  ($self->{ct}); # ENTITY
7597            redo A;
7598          } elsif ($self->{nc} == -1) {
7599            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7600            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7601            
7602        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7603          $self->{line_prev} = $self->{line};
7604          $self->{column_prev} = $self->{column};
7605          $self->{column}++;
7606          $self->{nc}
7607              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7608        } else {
7609          $self->{set_nc}->($self);
7610        }
7611      
7612            return  ($self->{ct}); # ENTITY
7613            redo A;
7614          } else {
7615            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7616            $self->{state} = NOTATION_NAME_STATE;
7617            
7618        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7619          $self->{line_prev} = $self->{line};
7620          $self->{column_prev} = $self->{column};
7621          $self->{column}++;
7622          $self->{nc}
7623              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7624        } else {
7625          $self->{set_nc}->($self);
7626        }
7627      
7628            redo A;
7629          }
7630        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7631          if ($is_space->{$self->{nc}}) {
7632            $self->{state} = AFTER_NOTATION_NAME_STATE;
7633            
7634        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7635          $self->{line_prev} = $self->{line};
7636          $self->{column_prev} = $self->{column};
7637          $self->{column}++;
7638          $self->{nc}
7639              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7640        } else {
7641          $self->{set_nc}->($self);
7642        }
7643      
7644            redo A;
7645          } elsif ($self->{nc} == 0x003E) { # >
7646            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7647            
7648        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7649          $self->{line_prev} = $self->{line};
7650          $self->{column_prev} = $self->{column};
7651          $self->{column}++;
7652          $self->{nc}
7653              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7654        } else {
7655          $self->{set_nc}->($self);
7656        }
7657      
7658            return  ($self->{ct}); # ENTITY
7659            redo A;
7660          } elsif ($self->{nc} == -1) {
7661            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7662            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7663            
7664        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7665          $self->{line_prev} = $self->{line};
7666          $self->{column_prev} = $self->{column};
7667          $self->{column}++;
7668          $self->{nc}
7669              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7670        } else {
7671          $self->{set_nc}->($self);
7672        }
7673      
7674            return  ($self->{ct}); # ENTITY
7675            redo A;
7676          } else {
7677            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7678            ## Stay in the state.
7679            
7680        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7681          $self->{line_prev} = $self->{line};
7682          $self->{column_prev} = $self->{column};
7683          $self->{column}++;
7684          $self->{nc}
7685              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7686        } else {
7687          $self->{set_nc}->($self);
7688        }
7689      
7690            redo A;
7691          }
7692        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
7693          if ($is_space->{$self->{nc}}) {
7694            ## Stay in the state.
7695            
7696        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7697          $self->{line_prev} = $self->{line};
7698          $self->{column_prev} = $self->{column};
7699          $self->{column}++;
7700          $self->{nc}
7701              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7702        } else {
7703          $self->{set_nc}->($self);
7704        }
7705      
7706            redo A;
7707          } elsif ($self->{nc} == 0x003E) { # >
7708            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7709            
7710        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7711          $self->{line_prev} = $self->{line};
7712          $self->{column_prev} = $self->{column};
7713          $self->{column}++;
7714          $self->{nc}
7715              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7716        } else {
7717          $self->{set_nc}->($self);
7718        }
7719      
7720            return  ($self->{ct}); # ENTITY
7721            redo A;
7722          } elsif ($self->{nc} == -1) {
7723            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7724            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7725            
7726        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7727          $self->{line_prev} = $self->{line};
7728          $self->{column_prev} = $self->{column};
7729          $self->{column}++;
7730          $self->{nc}
7731              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7732        } else {
7733          $self->{set_nc}->($self);
7734        }
7735      
7736            return  ($self->{ct}); # ENTITY
7737            redo A;
7738          } else {
7739            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after notation name'); ## TODO: type
7740            $self->{state} = BOGUS_MD_STATE;
7741            ## Reconsume.
7742            redo A;
7743          }
7744    
7745    
7746      } elsif ($self->{state} == BOGUS_MD_STATE) {      } elsif ($self->{state} == BOGUS_MD_STATE) {
7747        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >

Legend:
Removed from v.1.17  
changed lines
  Added in v.1.18

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24