/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.19 by wakaba, Sun Oct 19 07:19:00 2008 UTC | revision 1.20 by wakaba, Sun Oct 19 08:20:29 2008 UTC
# Line 182  sub NDATA_STATE () { 86 } Line 182  sub NDATA_STATE () { 86 }
182  sub AFTER_NDATA_STATE () { 87 }  sub AFTER_NDATA_STATE () { 87 }
183  sub BEFORE_NOTATION_NAME_STATE () { 88 }  sub BEFORE_NOTATION_NAME_STATE () { 88 }
184  sub NOTATION_NAME_STATE () { 89 }  sub NOTATION_NAME_STATE () { 89 }
185  sub AFTER_NOTATION_NAME_STATE () { 90 }  sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186  sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }  sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187  sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }  sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188  sub ENTITY_VALUE_ENTITY_STATE () { 93 }  sub AFTER_ELEMENT_NAME_STATE () { 93 }
189  sub BOGUS_MD_STATE () { 94 }  sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190    sub CONTENT_KEYWORD_STATE () { 95 }
191    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192    sub CM_ELEMENT_NAME_STATE () { 97 }
193    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195    sub AFTER_MD_DEF_STATE () { 100 }
196    sub BOGUS_MD_STATE () { 101 }
197    
198  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
199  ## list and descriptions)  ## list and descriptions)
# Line 4043  sub _get_next_token ($) { Line 4050  sub _get_next_token ($) {
4050          if ($self->{ct}->{type} == ATTLIST_TOKEN) {          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4051            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4052          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4053            ## TODO: ...            $self->{state} = AFTER_ELEMENT_NAME_STATE;
           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;  
4054          } else { # ENTITY/NOTATION          } else { # ENTITY/NOTATION
4055            $self->{state} = AFTER_DOCTYPE_NAME_STATE;            $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4056          }          }
# Line 4667  sub _get_next_token ($) { Line 4673  sub _get_next_token ($) {
4673        }        }
4674      } elsif ($self->{state} == NOTATION_NAME_STATE) {      } elsif ($self->{state} == NOTATION_NAME_STATE) {
4675        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4676          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
4677          !!!next-input-character;          !!!next-input-character;
4678          redo A;          redo A;
4679        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
# Line 4689  sub _get_next_token ($) { Line 4695  sub _get_next_token ($) {
4695        }        }
4696      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4697        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
4698          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
4699          !!!next-input-character;          !!!next-input-character;
4700          redo A;          redo A;
4701        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 4712  sub _get_next_token ($) { Line 4718  sub _get_next_token ($) {
4718        }        }
4719      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4720        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
4721          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
4722          !!!next-input-character;          !!!next-input-character;
4723          redo A;          redo A;
4724        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 4766  sub _get_next_token ($) { Line 4772  sub _get_next_token ($) {
4772        $self->{state} = $self->{prev_state};        $self->{state} = $self->{prev_state};
4773        ## Reconsume.        ## Reconsume.
4774        redo A;        redo A;
4775      } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {      } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4776          if ($is_space->{$self->{nc}}) {
4777            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4778            !!!next-input-character;
4779            redo A;
4780          } elsif ($self->{nc} == 0x0028) { # (
4781            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4782            $self->{ct}->{content} = ['('];
4783            $self->{group_depth} = 1;
4784            !!!next-input-character;
4785            redo A;
4786          } elsif ($self->{nc} == 0x003E) { # >
4787            !!!parse-error (type => 'no md def'); ## TODO: type
4788            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4789            !!!next-input-character;
4790            !!!emit ($self->{ct}); # ELEMENT
4791            redo A;
4792          } elsif ($self->{nc} == -1) {
4793            !!!parse-error (type => 'unclosed md'); ## TODO: type
4794            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4795            !!!next-input-character;
4796            !!!emit ($self->{ct}); # ELEMENT
4797            redo A;
4798          } else {
4799            $self->{ct}->{content} = [chr $self->{nc}];
4800            $self->{state} = CONTENT_KEYWORD_STATE;
4801            !!!next-input-character;
4802            redo A;
4803          }
4804        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4805          if ($is_space->{$self->{nc}}) {
4806            $self->{state} = AFTER_MD_DEF_STATE;
4807            !!!next-input-character;
4808            redo A;
4809          } elsif ($self->{nc} == 0x003E) { # >
4810            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4811            !!!next-input-character;
4812            !!!emit ($self->{ct}); # ELEMENT
4813            redo A;
4814          } elsif ($self->{nc} == -1) {
4815            !!!parse-error (type => 'unclosed md'); ## TODO: type
4816            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4817            !!!next-input-character;
4818            !!!emit ($self->{ct}); # ELEMENT
4819            redo A;
4820          } else {
4821            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4822            ## Stay in the state.
4823            !!!next-input-character;
4824            redo A;
4825          }
4826        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4827        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4828          ## Stay in the state.          ## Stay in the state.
4829          !!!next-input-character;          !!!next-input-character;
4830          redo A;          redo A;
4831          } elsif ($self->{nc} == 0x0028) { # (
4832            $self->{group_depth}++;
4833            push @{$self->{ct}->{content}}, chr $self->{nc};
4834            ## Stay in the state.
4835            !!!next-input-character;
4836            redo A;
4837          } elsif ($self->{nc} == 0x007C or # |
4838                   $self->{nc} == 0x002C) { # ,
4839            !!!parse-error (type => 'empty element name'); ## TODO: type
4840            ## Stay in the state.
4841            !!!next-input-character;
4842            redo A;
4843          } elsif ($self->{nc} == 0x0029) { # )
4844            !!!parse-error (type => 'empty element name'); ## TODO: type
4845            push @{$self->{ct}->{content}}, chr $self->{nc};
4846            $self->{group_depth}--;
4847            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4848            !!!next-input-character;
4849            redo A;
4850        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4851            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4852            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4853          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4854          !!!next-input-character;          !!!next-input-character;
4855          !!!emit ($self->{ct}); # ENTITY          !!!emit ($self->{ct}); # ELEMENT
4856          redo A;          redo A;
4857        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4858          !!!parse-error (type => 'unclosed md'); ## TODO: type          !!!parse-error (type => 'unclosed md'); ## TODO: type
4859            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4860          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4861          !!!next-input-character;          !!!next-input-character;
4862          !!!emit ($self->{ct}); # ENTITY          !!!emit ($self->{ct}); # ELEMENT
4863            redo A;
4864          } else {
4865            push @{$self->{ct}->{content}}, chr $self->{nc};
4866            $self->{state} = CM_ELEMENT_NAME_STATE;
4867            !!!next-input-character;
4868            redo A;
4869          }
4870        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4871          if ($is_space->{$self->{nc}}) {
4872            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4873            !!!next-input-character;
4874            redo A;
4875          } elsif ($self->{nc} == 0x002A or # *
4876                   $self->{nc} == 0x002B or # +
4877                   $self->{nc} == 0x003F) { # ?
4878            push @{$self->{ct}->{content}}, chr $self->{nc};
4879            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4880            !!!next-input-character;
4881            redo A;
4882          } elsif ($self->{nc} == 0x007C or # |
4883                   $self->{nc} == 0x002C) { # ,
4884            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4885            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4886            !!!next-input-character;
4887            redo A;
4888          } elsif ($self->{nc} == 0x0029) { # )
4889            $self->{group_depth}--;
4890            push @{$self->{ct}->{content}}, chr $self->{nc};
4891            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4892            !!!next-input-character;
4893            redo A;
4894          } elsif ($self->{nc} == 0x003E) { # >
4895            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4896            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4897            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4898            !!!next-input-character;
4899            !!!emit ($self->{ct}); # ELEMENT
4900            redo A;
4901          } elsif ($self->{nc} == -1) {
4902            !!!parse-error (type => 'unclosed md'); ## TODO: type
4903            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4904            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4905            !!!next-input-character;
4906            !!!emit ($self->{ct}); # ELEMENT
4907            redo A;
4908          } else {
4909            $self->{ct}->{content}->[-1] .= chr $self->{nc};
4910            ## Stay in the state.
4911            !!!next-input-character;
4912            redo A;
4913          }
4914        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4915          if ($is_space->{$self->{nc}}) {
4916            ## Stay in the state.
4917            !!!next-input-character;
4918            redo A;
4919          } elsif ($self->{nc} == 0x007C or # |
4920                   $self->{nc} == 0x002C) { # ,
4921            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4922            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4923            !!!next-input-character;
4924            redo A;
4925          } elsif ($self->{nc} == 0x0029) { # )
4926            $self->{group_depth}--;
4927            push @{$self->{ct}->{content}}, chr $self->{nc};
4928            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4929            !!!next-input-character;
4930            redo A;
4931          } elsif ($self->{nc} == 0x003E) { # >
4932            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4933            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4934            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4935            !!!next-input-character;
4936            !!!emit ($self->{ct}); # ELEMENT
4937            redo A;
4938          } elsif ($self->{nc} == -1) {
4939            !!!parse-error (type => 'unclosed md'); ## TODO: type
4940            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4941            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4942            !!!next-input-character;
4943            !!!emit ($self->{ct}); # ELEMENT
4944            redo A;
4945          } else {
4946            !!!parse-error (type => 'after element name'); ## TODO: type
4947            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4948            $self->{state} = BOGUS_MD_STATE;
4949            !!!next-input-character;
4950            redo A;
4951          }
4952        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
4953          if ($is_space->{$self->{nc}}) {
4954            if ($self->{group_depth}) {
4955              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4956            } else {
4957              $self->{state} = AFTER_MD_DEF_STATE;
4958            }
4959            !!!next-input-character;
4960            redo A;
4961          } elsif ($self->{nc} == 0x002A or # *
4962                   $self->{nc} == 0x002B or # +
4963                   $self->{nc} == 0x003F) { # ?
4964            push @{$self->{ct}->{content}}, chr $self->{nc};
4965            if ($self->{group_depth}) {
4966              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4967            } else {
4968              $self->{state} = AFTER_MD_DEF_STATE;
4969            }
4970            !!!next-input-character;
4971            redo A;
4972          } elsif ($self->{nc} == 0x0029) { # )
4973            if ($self->{group_depth}) {
4974              $self->{group_depth}--;
4975              push @{$self->{ct}->{content}}, chr $self->{nc};
4976              ## Stay in the state.
4977              !!!next-input-character;
4978              redo A;
4979            } else {
4980              !!!parse-error (type => 'string after md def'); ## TODO: type
4981              $self->{state} = BOGUS_MD_STATE;
4982              ## Reconsume.
4983              redo A;
4984            }
4985          } elsif ($self->{nc} == 0x003E) { # >
4986            if ($self->{group_depth}) {
4987              !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4988              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4989            }
4990            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4991            !!!next-input-character;
4992            !!!emit ($self->{ct}); # ELEMENT
4993            redo A;
4994          } elsif ($self->{nc} == -1) {
4995            !!!parse-error (type => 'unclosed md'); ## TODO: type
4996            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4997            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4998            !!!next-input-character;
4999            !!!emit ($self->{ct}); # ELEMENT
5000            redo A;
5001          } else {
5002            if ($self->{group_depth}) {
5003              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5004            } else {
5005              !!!parse-error (type => 'string after md def'); ## TODO: type
5006              $self->{state} = BOGUS_MD_STATE;
5007            }
5008            ## Reconsume.
5009            redo A;
5010          }
5011        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5012          if ($is_space->{$self->{nc}}) {
5013            ## Stay in the state.
5014            !!!next-input-character;
5015            redo A;
5016          } elsif ($self->{nc} == 0x003E) { # >
5017            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5018            !!!next-input-character;
5019            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5020            redo A;
5021          } elsif ($self->{nc} == -1) {
5022            !!!parse-error (type => 'unclosed md'); ## TODO: type
5023            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5024            !!!next-input-character;
5025            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5026          redo A;          redo A;
5027        } else {        } else {
5028          !!!parse-error (type => 'string after notation name'); ## TODO: type          !!!parse-error (type => 'string after md def'); ## TODO: type
5029          $self->{state} = BOGUS_MD_STATE;          $self->{state} = BOGUS_MD_STATE;
5030          ## Reconsume.          ## Reconsume.
5031          redo A;          redo A;

Legend:
Removed from v.1.19  
changed lines
  Added in v.1.20

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24