/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.7 by wakaba, Tue Oct 14 15:25:50 2008 UTC | revision 1.8 by wakaba, Wed Oct 15 04:38:22 2008 UTC
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117    ## XML states
118    sub PI_STATE () { 51 }
119    sub PI_TARGET_STATE () { 52 }
120    sub PI_TARGET_AFTER_STATE () { 53 }
121    sub PI_DATA_STATE () { 54 }
122    sub PI_AFTER_STATE () { 55 }
123    sub PI_DATA_AFTER_STATE () { 56 }
124    
125  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
126  ## list and descriptions)  ## list and descriptions)
127    
# Line 630  sub _get_next_token ($) { Line 638  sub _get_next_token ($) {
638    
639            redo A;            redo A;
640          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
641                        if ($self->{is_xml}) {
642            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
643                            line => $self->{line_prev},              $self->{state} = PI_STATE;
644                            column => $self->{column_prev});              
645            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
646            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
647                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
648                                      column => $self->{column_prev},        $self->{column}++;
649                                     };        $self->{nc}
650            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
651            redo A;      } else {
652          $self->{set_nc}->($self);
653        }
654      
655                redo A;
656              } else {
657                
658                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
659                                line => $self->{line_prev},
660                                column => $self->{column_prev});
661                $self->{state} = BOGUS_COMMENT_STATE;
662                $self->{ct} = {type => COMMENT_TOKEN, data => '',
663                               line => $self->{line_prev},
664                               column => $self->{column_prev},
665                              };
666                ## $self->{nc} is intentionally left as is
667                redo A;
668              }
669          } else {          } else {
670                        
671            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
# Line 2228  sub _get_next_token ($) { Line 2253  sub _get_next_token ($) {
2253          redo A;          redo A;
2254        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
2255                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
           
   
2256          if ($self->{is_xml} and          if ($self->{is_xml} and
2257              not $self->{tainted} and              not $self->{tainted} and
2258              @{$self->{open_elements} or []} == 0) {              @{$self->{open_elements} or []} == 0) {
2259              
2260            $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2261                            line => $self->{line_prev},                            line => $self->{line_prev},
2262                            column => $self->{column_prev} - 7);                            column => $self->{column_prev} - 7);
2263            $self->{tainted} = 1;            $self->{tainted} = 1;
2264            } else {
2265              
2266          }          }
2267    
2268          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
# Line 3643  sub _get_next_token ($) { Line 3669  sub _get_next_token ($) {
3669          redo A;          redo A;
3670        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3671          if ($self->{is_xml}) {          if ($self->{is_xml}) {
3672              
3673            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3674            } else {
3675              
3676          }          }
3677    
3678          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 4260  sub _get_next_token ($) { Line 4289  sub _get_next_token ($) {
4289          ## Reconsume.          ## Reconsume.
4290          redo A;          redo A;
4291        }        }
4292    
4293        ## XML-only states
4294    
4295        } elsif ($self->{state} == PI_STATE) {
4296          if ($is_space->{$self->{nc}} or
4297              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4298              $self->{nc} == -1) {
4299            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4300                            line => $self->{line_prev},
4301                            column => $self->{column_prev}
4302                                - 1 * ($self->{nc} != -1));
4303            $self->{state} = BOGUS_COMMENT_STATE;
4304            ## Reconsume.
4305            $self->{ct} = {type => COMMENT_TOKEN,
4306                           data => '?',
4307                           line => $self->{line_prev},
4308                           column => $self->{column_prev}
4309                               - 1 * ($self->{nc} != -1),
4310                          };
4311            redo A;
4312          } else {
4313            $self->{ct} = {type => PI_TOKEN,
4314                           target => chr $self->{nc},
4315                           data => '',
4316                           line => $self->{line_prev},
4317                           column => $self->{column_prev} - 1,
4318                          };
4319            $self->{state} = PI_TARGET_STATE;
4320            
4321        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4322          $self->{line_prev} = $self->{line};
4323          $self->{column_prev} = $self->{column};
4324          $self->{column}++;
4325          $self->{nc}
4326              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4327        } else {
4328          $self->{set_nc}->($self);
4329        }
4330      
4331            redo A;
4332          }
4333        } elsif ($self->{state} == PI_TARGET_STATE) {
4334          if ($is_space->{$self->{nc}}) {
4335            $self->{state} = PI_TARGET_AFTER_STATE;
4336            
4337        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4338          $self->{line_prev} = $self->{line};
4339          $self->{column_prev} = $self->{column};
4340          $self->{column}++;
4341          $self->{nc}
4342              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4343        } else {
4344          $self->{set_nc}->($self);
4345        }
4346      
4347            redo A;
4348          } elsif ($self->{nc} == -1) {
4349            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4350            $self->{state} = DATA_STATE;
4351            $self->{s_kwd} = '';
4352            ## Reconsume.
4353            return  ($self->{ct}); # pi
4354            redo A;
4355          } elsif ($self->{nc} == 0x003F) { # ?
4356            $self->{state} = PI_AFTER_STATE;
4357            
4358        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4359          $self->{line_prev} = $self->{line};
4360          $self->{column_prev} = $self->{column};
4361          $self->{column}++;
4362          $self->{nc}
4363              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4364        } else {
4365          $self->{set_nc}->($self);
4366        }
4367      
4368            redo A;
4369          } else {
4370            ## XML5: typo ("tag name" -> "target")
4371            $self->{ct}->{target} .= chr $self->{nc}; # pi
4372            
4373        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4374          $self->{line_prev} = $self->{line};
4375          $self->{column_prev} = $self->{column};
4376          $self->{column}++;
4377          $self->{nc}
4378              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4379        } else {
4380          $self->{set_nc}->($self);
4381        }
4382      
4383            redo A;
4384          }
4385        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4386          if ($is_space->{$self->{nc}}) {
4387            ## Stay in the state.
4388            
4389        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4390          $self->{line_prev} = $self->{line};
4391          $self->{column_prev} = $self->{column};
4392          $self->{column}++;
4393          $self->{nc}
4394              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4395        } else {
4396          $self->{set_nc}->($self);
4397        }
4398      
4399            redo A;
4400          } else {
4401            $self->{state} = PI_DATA_STATE;
4402            ## Reprocess.
4403            redo A;
4404          }
4405        } elsif ($self->{state} == PI_DATA_STATE) {
4406          if ($self->{nc} == 0x003F) { # ?
4407            $self->{state} = PI_DATA_AFTER_STATE;
4408            
4409        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410          $self->{line_prev} = $self->{line};
4411          $self->{column_prev} = $self->{column};
4412          $self->{column}++;
4413          $self->{nc}
4414              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415        } else {
4416          $self->{set_nc}->($self);
4417        }
4418      
4419            redo A;
4420          } elsif ($self->{nc} == -1) {
4421            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4422            $self->{state} = DATA_STATE;
4423            $self->{s_kwd} = '';
4424            ## Reprocess.
4425            return  ($self->{ct}); # pi
4426            redo A;
4427          } else {
4428            $self->{ct}->{data} .= chr $self->{nc}; # pi
4429            $self->{read_until}->($self->{ct}->{data}, q[?],
4430                                  length $self->{ct}->{data});
4431            ## Stay in the state.
4432            
4433        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4434          $self->{line_prev} = $self->{line};
4435          $self->{column_prev} = $self->{column};
4436          $self->{column}++;
4437          $self->{nc}
4438              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4439        } else {
4440          $self->{set_nc}->($self);
4441        }
4442      
4443            ## Reprocess.
4444            redo A;
4445          }
4446        } elsif ($self->{state} == PI_AFTER_STATE) {
4447          if ($self->{nc} == 0x003E) { # >
4448            $self->{state} = DATA_STATE;
4449            $self->{s_kwd} = '';
4450            
4451        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4452          $self->{line_prev} = $self->{line};
4453          $self->{column_prev} = $self->{column};
4454          $self->{column}++;
4455          $self->{nc}
4456              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4457        } else {
4458          $self->{set_nc}->($self);
4459        }
4460      
4461            return  ($self->{ct}); # pi
4462            redo A;
4463          } elsif ($self->{nc} == 0x003F) { # ?
4464            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4465                            line => $self->{line_prev},
4466                            column => $self->{column_prev}); ## XML5: no error
4467            $self->{ct}->{data} .= '?';
4468            $self->{state} = PI_DATA_AFTER_STATE;
4469            
4470        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4471          $self->{line_prev} = $self->{line};
4472          $self->{column_prev} = $self->{column};
4473          $self->{column}++;
4474          $self->{nc}
4475              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4476        } else {
4477          $self->{set_nc}->($self);
4478        }
4479      
4480            redo A;
4481          } else {
4482            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4483                            line => $self->{line_prev},
4484                            column => $self->{column_prev}
4485                                + 1 * ($self->{nc} == -1)); ## XML5: no error
4486            $self->{ct}->{data} .= '?'; ## XML5: not appended
4487            $self->{state} = PI_DATA_STATE;
4488            ## Reprocess.
4489            redo A;
4490          }
4491        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4492          ## XML5: Same as "pi after state" in XML5
4493          if ($self->{nc} == 0x003E) { # >
4494            $self->{state} = DATA_STATE;
4495            $self->{s_kwd} = '';
4496            
4497        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4498          $self->{line_prev} = $self->{line};
4499          $self->{column_prev} = $self->{column};
4500          $self->{column}++;
4501          $self->{nc}
4502              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4503        } else {
4504          $self->{set_nc}->($self);
4505        }
4506      
4507            return  ($self->{ct}); # pi
4508            redo A;
4509          } elsif ($self->{nc} == 0x003F) { # ?
4510            $self->{ct}->{data} .= '?';
4511            ## Stay in the state.
4512            
4513        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4514          $self->{line_prev} = $self->{line};
4515          $self->{column_prev} = $self->{column};
4516          $self->{column}++;
4517          $self->{nc}
4518              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4519        } else {
4520          $self->{set_nc}->($self);
4521        }
4522      
4523            redo A;
4524          } else {
4525            $self->{ct}->{data} .= '?'; ## XML5: not appended
4526            $self->{state} = PI_DATA_STATE;
4527            ## Reprocess.
4528            redo A;
4529          }
4530            
4531      } else {      } else {
4532        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
4533      }      }

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.8

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24