/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.17 by wakaba, Sun Oct 19 04:39:25 2008 UTC | revision 1.19 by wakaba, Sun Oct 19 07:19:00 2008 UTC
# Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO
177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180  sub BOGUS_MD_STATE () { 85 }  sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }
187    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }
188    sub ENTITY_VALUE_ENTITY_STATE () { 93 }
189    sub BOGUS_MD_STATE () { 94 }
190    
191  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
192  ## list and descriptions)  ## list and descriptions)
# Line 2256  sub _get_next_token ($) { Line 2265  sub _get_next_token ($) {
2265          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2266          !!!next-input-character;          !!!next-input-character;
2267          redo A;          redo A;
2268  ## TODO: " and ' for ENTITY        } elsif ($self->{nc} == 0x0022 and # "
2269                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2270                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2271            !!!cp (167.21);
2272            $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2273            $self->{ct}->{value} = ''; # ENTITY
2274            !!!next-input-character;
2275            redo A;
2276          } elsif ($self->{nc} == 0x0027 and # '
2277                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2278                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2279            !!!cp (167.22);
2280            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2281            $self->{ct}->{value} = ''; # ENTITY
2282            !!!next-input-character;
2283            redo A;
2284        } elsif ($self->{is_xml} and        } elsif ($self->{is_xml} and
2285                 $self->{ct}->{type} == DOCTYPE_TOKEN and                 $self->{ct}->{type} == DOCTYPE_TOKEN and
2286                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
# Line 2812  sub _get_next_token ($) { Line 2836  sub _get_next_token ($) {
2836        }        }
2837      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2838        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2839          !!!cp (215);          if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2840          ## Stay in the state            !!!cp (215.1);
2841              $self->{state} = BEFORE_NDATA_STATE;
2842            } else {
2843              !!!cp (215);
2844              ## Stay in the state
2845            }
2846          !!!next-input-character;          !!!next-input-character;
2847          redo A;          redo A;
2848        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
# Line 2829  sub _get_next_token ($) { Line 2858  sub _get_next_token ($) {
2858          !!!next-input-character;          !!!next-input-character;
2859          !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION          !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2860          redo A;          redo A;
2861  ## TODO: "NDATA"        } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2862                   ($self->{nc} == 0x004E or # N
2863                    $self->{nc} == 0x006E)) { # n
2864            !!!cp (216.2);
2865            !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2866            $self->{state} = NDATA_STATE;
2867            $self->{kwd} = chr $self->{nc};
2868            !!!next-input-character;
2869            redo A;
2870        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2871          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2872            !!!cp (217);            !!!cp (217);
# Line 2871  sub _get_next_token ($) { Line 2908  sub _get_next_token ($) {
2908          !!!next-input-character;          !!!next-input-character;
2909          redo A;          redo A;
2910        }        }
2911        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2912          if ($is_space->{$self->{nc}}) {
2913            !!!cp (218.3);
2914            ## Stay in the state.
2915            !!!next-input-character;
2916            redo A;
2917          } elsif ($self->{nc} == 0x003E) { # >
2918            !!!cp (218.4);
2919            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920            !!!next-input-character;
2921            !!!emit ($self->{ct}); # ENTITY
2922            redo A;
2923          } elsif ($self->{nc} == 0x004E or # N
2924                   $self->{nc} == 0x006E) { # n
2925            !!!cp (218.5);
2926            $self->{state} = NDATA_STATE;
2927            $self->{kwd} = chr $self->{nc};
2928            !!!next-input-character;
2929            redo A;
2930          } elsif ($self->{nc} == -1) {
2931            !!!cp (218.6);
2932            !!!parse-error (type => 'unclosed md'); ## TODO: type
2933            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2934            ## reconsume
2935            !!!emit ($self->{ct}); # ENTITY
2936            redo A;
2937          } else {
2938            !!!cp (218.7);
2939            !!!parse-error (type => 'string after SYSTEM literal');
2940            $self->{state} = BOGUS_MD_STATE;
2941            !!!next-input-character;
2942            redo A;
2943          }
2944      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2945        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2946          !!!cp (219);          !!!cp (219);
# Line 4502  sub _get_next_token ($) { Line 4572  sub _get_next_token ($) {
4572          ## Reconsume.          ## Reconsume.
4573          redo A;          redo A;
4574        }        }
4575        } elsif ($self->{state} == NDATA_STATE) {
4576          ## ASCII case-insensitive
4577          if ($self->{nc} == [
4578                undef,
4579                0x0044, # D
4580                0x0041, # A
4581                0x0054, # T
4582              ]->[length $self->{kwd}] or
4583              $self->{nc} == [
4584                undef,
4585                0x0064, # d
4586                0x0061, # a
4587                0x0074, # t
4588              ]->[length $self->{kwd}]) {
4589            !!!cp (172.2);
4590            ## Stay in the state.
4591            $self->{kwd} .= chr $self->{nc};
4592            !!!next-input-character;
4593            redo A;
4594          } elsif ((length $self->{kwd}) == 4 and
4595                   ($self->{nc} == 0x0041 or # A
4596                    $self->{nc} == 0x0061)) { # a
4597            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4598              !!!cp (172.3);
4599              !!!parse-error (type => 'lowercase keyword', ## TODO: type
4600                              text => 'NDATA',
4601                              line => $self->{line_prev},
4602                              column => $self->{column_prev} - 4);
4603            } else {
4604              !!!cp (172.4);
4605            }
4606            $self->{state} = AFTER_NDATA_STATE;
4607            !!!next-input-character;
4608            redo A;
4609          } else {
4610            !!!parse-error (type => 'string after literal', ## TODO: type
4611                            line => $self->{line_prev},
4612                            column => $self->{column_prev} + 1
4613                                - length $self->{kwd});
4614            !!!cp (172.5);
4615            $self->{state} = BOGUS_MD_STATE;
4616            ## Reconsume.
4617            redo A;
4618          }
4619        } elsif ($self->{state} == AFTER_NDATA_STATE) {
4620          if ($is_space->{$self->{nc}}) {
4621            $self->{state} = BEFORE_NOTATION_NAME_STATE;
4622            !!!next-input-character;
4623            redo A;
4624          } elsif ($self->{nc} == 0x003E) { # >
4625            !!!parse-error (type => 'no notation name'); ## TODO: type
4626            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4627            !!!next-input-character;
4628            !!!emit ($self->{ct}); # ENTITY
4629            redo A;
4630          } elsif ($self->{nc} == -1) {
4631            !!!parse-error (type => 'unclosed md'); ## TODO: type
4632            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4633            !!!next-input-character;
4634            !!!emit ($self->{ct}); # ENTITY
4635            redo A;
4636          } else {
4637            !!!parse-error (type => 'string after literal', ## TODO: type
4638                            line => $self->{line_prev},
4639                            column => $self->{column_prev} + 1
4640                                - length $self->{kwd});
4641            $self->{state} = BOGUS_MD_STATE;
4642            ## Reconsume.
4643            redo A;
4644          }
4645        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4646          if ($is_space->{$self->{nc}}) {
4647            ## Stay in the state.
4648            !!!next-input-character;
4649            redo A;
4650          } elsif ($self->{nc} == 0x003E) { # >
4651            !!!parse-error (type => 'no notation name'); ## TODO: type
4652            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653            !!!next-input-character;
4654            !!!emit ($self->{ct}); # ENTITY
4655            redo A;
4656          } elsif ($self->{nc} == -1) {
4657            !!!parse-error (type => 'unclosed md'); ## TODO: type
4658            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4659            !!!next-input-character;
4660            !!!emit ($self->{ct}); # ENTITY
4661            redo A;
4662          } else {
4663            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4664            $self->{state} = NOTATION_NAME_STATE;
4665            !!!next-input-character;
4666            redo A;
4667          }
4668        } elsif ($self->{state} == NOTATION_NAME_STATE) {
4669          if ($is_space->{$self->{nc}}) {
4670            $self->{state} = AFTER_NOTATION_NAME_STATE;
4671            !!!next-input-character;
4672            redo A;
4673          } elsif ($self->{nc} == 0x003E) { # >
4674            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4675            !!!next-input-character;
4676            !!!emit ($self->{ct}); # ENTITY
4677            redo A;
4678          } elsif ($self->{nc} == -1) {
4679            !!!parse-error (type => 'unclosed md'); ## TODO: type
4680            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4681            !!!next-input-character;
4682            !!!emit ($self->{ct}); # ENTITY
4683            redo A;
4684          } else {
4685            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4686            ## Stay in the state.
4687            !!!next-input-character;
4688            redo A;
4689          }
4690        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4691          if ($self->{nc} == 0x0022) { # "
4692            $self->{state} = AFTER_NOTATION_NAME_STATE;
4693            !!!next-input-character;
4694            redo A;
4695          } elsif ($self->{nc} == 0x0026) { # &
4696            $self->{prev_state} = $self->{state};
4697            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4698            $self->{entity_add} = 0x0022; # "
4699            !!!next-input-character;
4700            redo A;
4701    ## TODO: %
4702          } elsif ($self->{nc} == -1) {
4703            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4704            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4705            ## Reconsume.
4706            !!!emit ($self->{ct}); # ENTITY
4707            redo A;
4708          } else {
4709            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4710            !!!next-input-character;
4711            redo A;
4712          }
4713        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4714          if ($self->{nc} == 0x0027) { # '
4715            $self->{state} = AFTER_NOTATION_NAME_STATE;
4716            !!!next-input-character;
4717            redo A;
4718          } elsif ($self->{nc} == 0x0026) { # &
4719            $self->{prev_state} = $self->{state};
4720            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4721            $self->{entity_add} = 0x0027; # '
4722            !!!next-input-character;
4723            redo A;
4724    ## TODO: %
4725          } elsif ($self->{nc} == -1) {
4726            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4727            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4728            ## Reconsume.
4729            !!!emit ($self->{ct}); # ENTITY
4730            redo A;
4731          } else {
4732            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4733            !!!next-input-character;
4734            redo A;
4735          }
4736        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4737          ## TODO: XMLize
4738    
4739          if ($is_space->{$self->{nc}} or
4740              {
4741                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4742                $self->{entity_add} => 1,
4743              }->{$self->{nc}}) {
4744            ## Don't consume
4745            ## No error
4746            ## Return nothing.
4747            #
4748          } elsif ($self->{nc} == 0x0023) { # #
4749            $self->{ca} = $self->{ct};
4750            $self->{state} = ENTITY_HASH_STATE;
4751            $self->{kwd} = '#';
4752            !!!next-input-character;
4753            redo A;
4754          } elsif ((0x0041 <= $self->{nc} and
4755                    $self->{nc} <= 0x005A) or # A..Z
4756                   (0x0061 <= $self->{nc} and
4757                    $self->{nc} <= 0x007A)) { # a..z
4758            #
4759          } else {
4760            !!!parse-error (type => 'bare ero');
4761            ## Return nothing.
4762            #
4763          }
4764    
4765          $self->{ct}->{value} .= '&';
4766          $self->{state} = $self->{prev_state};
4767          ## Reconsume.
4768          redo A;
4769        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4770          if ($is_space->{$self->{nc}}) {
4771            ## Stay in the state.
4772            !!!next-input-character;
4773            redo A;
4774          } elsif ($self->{nc} == 0x003E) { # >
4775            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4776            !!!next-input-character;
4777            !!!emit ($self->{ct}); # ENTITY
4778            redo A;
4779          } elsif ($self->{nc} == -1) {
4780            !!!parse-error (type => 'unclosed md'); ## TODO: type
4781            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4782            !!!next-input-character;
4783            !!!emit ($self->{ct}); # ENTITY
4784            redo A;
4785          } else {
4786            !!!parse-error (type => 'string after notation name'); ## TODO: type
4787            $self->{state} = BOGUS_MD_STATE;
4788            ## Reconsume.
4789            redo A;
4790          }
4791      } elsif ($self->{state} == BOGUS_MD_STATE) {      } elsif ($self->{state} == BOGUS_MD_STATE) {
4792        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4793          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;

Legend:
Removed from v.1.17
Changed lines
Added in v.1.19

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24