/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.17 by wakaba, Sun Oct 19 04:39:25 2008 UTC | revision 1.21 by wakaba, Sun Oct 19 09:25:21 2008 UTC
# Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO Line 177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATIO
177  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }  sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }  sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180  sub BOGUS_MD_STATE () { 85 }  sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188    sub AFTER_ELEMENT_NAME_STATE () { 93 }
189    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190    sub CONTENT_KEYWORD_STATE () { 95 }
191    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192    sub CM_ELEMENT_NAME_STATE () { 97 }
193    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195    sub AFTER_MD_DEF_STATE () { 100 }
196    sub BOGUS_MD_STATE () { 101 }
197    
198  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
199  ## list and descriptions)  ## list and descriptions)
# Line 2256  sub _get_next_token ($) { Line 2272  sub _get_next_token ($) {
2272          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2273          !!!next-input-character;          !!!next-input-character;
2274          redo A;          redo A;
2275  ## TODO: " and ' for ENTITY        } elsif ($self->{nc} == 0x0022 and # "
2276                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2277                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2278            !!!cp (167.21);
2279            $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2280            $self->{ct}->{value} = ''; # ENTITY
2281            !!!next-input-character;
2282            redo A;
2283          } elsif ($self->{nc} == 0x0027 and # '
2284                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2285                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2286            !!!cp (167.22);
2287            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2288            $self->{ct}->{value} = ''; # ENTITY
2289            !!!next-input-character;
2290            redo A;
2291        } elsif ($self->{is_xml} and        } elsif ($self->{is_xml} and
2292                 $self->{ct}->{type} == DOCTYPE_TOKEN and                 $self->{ct}->{type} == DOCTYPE_TOKEN and
2293                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
# Line 2812  sub _get_next_token ($) { Line 2843  sub _get_next_token ($) {
2843        }        }
2844      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2845        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2846          !!!cp (215);          if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2847          ## Stay in the state            !!!cp (215.1);
2848              $self->{state} = BEFORE_NDATA_STATE;
2849            } else {
2850              !!!cp (215);
2851              ## Stay in the state
2852            }
2853          !!!next-input-character;          !!!next-input-character;
2854          redo A;          redo A;
2855        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
# Line 2829  sub _get_next_token ($) { Line 2865  sub _get_next_token ($) {
2865          !!!next-input-character;          !!!next-input-character;
2866          !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION          !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2867          redo A;          redo A;
2868  ## TODO: "NDATA"        } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2869                   ($self->{nc} == 0x004E or # N
2870                    $self->{nc} == 0x006E)) { # n
2871            !!!cp (216.2);
2872            !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2873            $self->{state} = NDATA_STATE;
2874            $self->{kwd} = chr $self->{nc};
2875            !!!next-input-character;
2876            redo A;
2877        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2878          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2879            !!!cp (217);            !!!cp (217);
# Line 2871  sub _get_next_token ($) { Line 2915  sub _get_next_token ($) {
2915          !!!next-input-character;          !!!next-input-character;
2916          redo A;          redo A;
2917        }        }
2918        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2919          if ($is_space->{$self->{nc}}) {
2920            !!!cp (218.3);
2921            ## Stay in the state.
2922            !!!next-input-character;
2923            redo A;
2924          } elsif ($self->{nc} == 0x003E) { # >
2925            !!!cp (218.4);
2926            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2927            !!!next-input-character;
2928            !!!emit ($self->{ct}); # ENTITY
2929            redo A;
2930          } elsif ($self->{nc} == 0x004E or # N
2931                   $self->{nc} == 0x006E) { # n
2932            !!!cp (218.5);
2933            $self->{state} = NDATA_STATE;
2934            $self->{kwd} = chr $self->{nc};
2935            !!!next-input-character;
2936            redo A;
2937          } elsif ($self->{nc} == -1) {
2938            !!!cp (218.6);
2939            !!!parse-error (type => 'unclosed md'); ## TODO: type
2940            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2941            ## reconsume
2942            !!!emit ($self->{ct}); # ENTITY
2943            redo A;
2944          } else {
2945            !!!cp (218.7);
2946            !!!parse-error (type => 'string after SYSTEM literal');
2947            $self->{state} = BOGUS_MD_STATE;
2948            !!!next-input-character;
2949            redo A;
2950          }
2951      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2952        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2953          !!!cp (219);          !!!cp (219);
# Line 3056  sub _get_next_token ($) { Line 3133  sub _get_next_token ($) {
3133          redo A;          redo A;
3134        }        }
3135      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
3136        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
           $self->{nc} == 0x0058) { # X  
3137          !!!cp (995);          !!!cp (995);
3138          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
3139          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3140          !!!next-input-character;          !!!next-input-character;
3141          redo A;          redo A;
3142          } elsif ($self->{nc} == 0x0058) { # X
3143            !!!cp (995.1);
3144            if ($self->{is_xml}) {
3145              !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3146            }
3147            $self->{state} = HEXREF_X_STATE;
3148            $self->{kwd} .= chr $self->{nc};
3149            !!!next-input-character;
3150            redo A;
3151        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
3152                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
3153          !!!cp (994);          !!!cp (994);
# Line 3270  sub _get_next_token ($) { Line 3355  sub _get_next_token ($) {
3355          redo A;          redo A;
3356        }        }
3357      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3358        if (length $self->{kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
3359            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
3360            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
3361              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
3362             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
3363              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
3364             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B) { # ;
             $self->{nc} <= 0x0039) or # 9  
            $self->{nc} == 0x003B)) { # ;  
3365          our $EntityChar;          our $EntityChar;
3366          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3367          if (defined $EntityChar->{$self->{kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
3368                $self->{ge}->{$self->{kwd}}) {
3369            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3370              !!!cp (1020);              if (defined $self->{ge}->{$self->{kwd}}) {
3371              $self->{entity__value} = $EntityChar->{$self->{kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3372                    !!!cp (1020.1);
3373                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3374                  } else {
3375                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3376                      !!!cp (1020.2);
3377                      !!!parse-error (type => 'unparsed entity', ## TODO: type
3378                                      value => $self->{kwd});
3379                    } else {
3380                      !!!cp (1020.3);
3381                    }
3382                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3383                  }
3384                } else {
3385                  if ($self->{is_xml}) {
3386                    !!!cp (1020.4);
3387                    !!!parse-error (type => 'entity not declared', ## TODO: type
3388                                    value => $self->{kwd},
3389                                    level => {
3390                                              'amp;' => $self->{level}->{warn},
3391                                              'quot;' => $self->{level}->{warn},
3392                                              'lt;' => $self->{level}->{warn},
3393                                              'gt;' => $self->{level}->{warn},
3394                                              'apos;' => $self->{level}->{warn},
3395                                             }->{$self->{kwd}} ||
3396                                             $self->{level}->{must});
3397                  } else {
3398                    !!!cp (1020);
3399                  }
3400                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
3401                }
3402              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3403              !!!next-input-character;              !!!next-input-character;
3404              #              #
# Line 3973  sub _get_next_token ($) { Line 4087  sub _get_next_token ($) {
4087          if ($self->{ct}->{type} == ATTLIST_TOKEN) {          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4088            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4089          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4090            ## TODO: ...            $self->{state} = AFTER_ELEMENT_NAME_STATE;
           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;  
4091          } else { # ENTITY/NOTATION          } else { # ENTITY/NOTATION
4092            $self->{state} = AFTER_DOCTYPE_NAME_STATE;            $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4093          }          }
# Line 4502  sub _get_next_token ($) { Line 4615  sub _get_next_token ($) {
4615          ## Reconsume.          ## Reconsume.
4616          redo A;          redo A;
4617        }        }
4618        } elsif ($self->{state} == NDATA_STATE) {
4619          ## ASCII case-insensitive
4620          if ($self->{nc} == [
4621                undef,
4622                0x0044, # D
4623                0x0041, # A
4624                0x0054, # T
4625              ]->[length $self->{kwd}] or
4626              $self->{nc} == [
4627                undef,
4628                0x0064, # d
4629                0x0061, # a
4630                0x0074, # t
4631              ]->[length $self->{kwd}]) {
4632            !!!cp (172.2);
4633            ## Stay in the state.
4634            $self->{kwd} .= chr $self->{nc};
4635            !!!next-input-character;
4636            redo A;
4637          } elsif ((length $self->{kwd}) == 4 and
4638                   ($self->{nc} == 0x0041 or # A
4639                    $self->{nc} == 0x0061)) { # a
4640            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4641              !!!cp (172.3);
4642              !!!parse-error (type => 'lowercase keyword', ## TODO: type
4643                              text => 'NDATA',
4644                              line => $self->{line_prev},
4645                              column => $self->{column_prev} - 4);
4646            } else {
4647              !!!cp (172.4);
4648            }
4649            $self->{state} = AFTER_NDATA_STATE;
4650            !!!next-input-character;
4651            redo A;
4652          } else {
4653            !!!parse-error (type => 'string after literal', ## TODO: type
4654                            line => $self->{line_prev},
4655                            column => $self->{column_prev} + 1
4656                                - length $self->{kwd});
4657            !!!cp (172.5);
4658            $self->{state} = BOGUS_MD_STATE;
4659            ## Reconsume.
4660            redo A;
4661          }
4662        } elsif ($self->{state} == AFTER_NDATA_STATE) {
4663          if ($is_space->{$self->{nc}}) {
4664            $self->{state} = BEFORE_NOTATION_NAME_STATE;
4665            !!!next-input-character;
4666            redo A;
4667          } elsif ($self->{nc} == 0x003E) { # >
4668            !!!parse-error (type => 'no notation name'); ## TODO: type
4669            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4670            !!!next-input-character;
4671            !!!emit ($self->{ct}); # ENTITY
4672            redo A;
4673          } elsif ($self->{nc} == -1) {
4674            !!!parse-error (type => 'unclosed md'); ## TODO: type
4675            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4676            !!!next-input-character;
4677            !!!emit ($self->{ct}); # ENTITY
4678            redo A;
4679          } else {
4680            !!!parse-error (type => 'string after literal', ## TODO: type
4681                            line => $self->{line_prev},
4682                            column => $self->{column_prev} + 1
4683                                - length $self->{kwd});
4684            $self->{state} = BOGUS_MD_STATE;
4685            ## Reconsume.
4686            redo A;
4687          }
4688        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4689          if ($is_space->{$self->{nc}}) {
4690            ## Stay in the state.
4691            !!!next-input-character;
4692            redo A;
4693          } elsif ($self->{nc} == 0x003E) { # >
4694            !!!parse-error (type => 'no notation name'); ## TODO: type
4695            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4696            !!!next-input-character;
4697            !!!emit ($self->{ct}); # ENTITY
4698            redo A;
4699          } elsif ($self->{nc} == -1) {
4700            !!!parse-error (type => 'unclosed md'); ## TODO: type
4701            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4702            !!!next-input-character;
4703            !!!emit ($self->{ct}); # ENTITY
4704            redo A;
4705          } else {
4706            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4707            $self->{state} = NOTATION_NAME_STATE;
4708            !!!next-input-character;
4709            redo A;
4710          }
4711        } elsif ($self->{state} == NOTATION_NAME_STATE) {
4712          if ($is_space->{$self->{nc}}) {
4713            $self->{state} = AFTER_MD_DEF_STATE;
4714            !!!next-input-character;
4715            redo A;
4716          } elsif ($self->{nc} == 0x003E) { # >
4717            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4718            !!!next-input-character;
4719            !!!emit ($self->{ct}); # ENTITY
4720            redo A;
4721          } elsif ($self->{nc} == -1) {
4722            !!!parse-error (type => 'unclosed md'); ## TODO: type
4723            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4724            !!!next-input-character;
4725            !!!emit ($self->{ct}); # ENTITY
4726            redo A;
4727          } else {
4728            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4729            ## Stay in the state.
4730            !!!next-input-character;
4731            redo A;
4732          }
4733        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4734          if ($self->{nc} == 0x0022) { # "
4735            $self->{state} = AFTER_MD_DEF_STATE;
4736            !!!next-input-character;
4737            redo A;
4738          } elsif ($self->{nc} == 0x0026) { # &
4739            $self->{prev_state} = $self->{state};
4740            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4741            $self->{entity_add} = 0x0022; # "
4742            !!!next-input-character;
4743            redo A;
4744    ## TODO: %
4745          } elsif ($self->{nc} == -1) {
4746            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4747            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4748            ## Reconsume.
4749            !!!emit ($self->{ct}); # ENTITY
4750            redo A;
4751          } else {
4752            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4753            !!!next-input-character;
4754            redo A;
4755          }
4756        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4757          if ($self->{nc} == 0x0027) { # '
4758            $self->{state} = AFTER_MD_DEF_STATE;
4759            !!!next-input-character;
4760            redo A;
4761          } elsif ($self->{nc} == 0x0026) { # &
4762            $self->{prev_state} = $self->{state};
4763            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4764            $self->{entity_add} = 0x0027; # '
4765            !!!next-input-character;
4766            redo A;
4767    ## TODO: %
4768          } elsif ($self->{nc} == -1) {
4769            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4770            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4771            ## Reconsume.
4772            !!!emit ($self->{ct}); # ENTITY
4773            redo A;
4774          } else {
4775            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4776            !!!next-input-character;
4777            redo A;
4778          }
4779        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4780          ## TODO: XMLize
4781    
4782          if ($is_space->{$self->{nc}} or
4783              {
4784                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4785                $self->{entity_add} => 1,
4786              }->{$self->{nc}}) {
4787            ## Don't consume
4788            ## No error
4789            ## Return nothing.
4790            #
4791          } elsif ($self->{nc} == 0x0023) { # #
4792            $self->{ca} = $self->{ct};
4793            $self->{state} = ENTITY_HASH_STATE;
4794            $self->{kwd} = '#';
4795            !!!next-input-character;
4796            redo A;
4797          } elsif ((0x0041 <= $self->{nc} and
4798                    $self->{nc} <= 0x005A) or # A..Z
4799                   (0x0061 <= $self->{nc} and
4800                    $self->{nc} <= 0x007A)) { # a..z
4801            #
4802          } else {
4803            !!!parse-error (type => 'bare ero');
4804            ## Return nothing.
4805            #
4806          }
4807    
4808          $self->{ct}->{value} .= '&';
4809          $self->{state} = $self->{prev_state};
4810          ## Reconsume.
4811          redo A;
4812        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4813          if ($is_space->{$self->{nc}}) {
4814            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4815            !!!next-input-character;
4816            redo A;
4817          } elsif ($self->{nc} == 0x0028) { # (
4818            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4819            $self->{ct}->{content} = ['('];
4820            $self->{group_depth} = 1;
4821            !!!next-input-character;
4822            redo A;
4823          } elsif ($self->{nc} == 0x003E) { # >
4824            !!!parse-error (type => 'no md def'); ## TODO: type
4825            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4826            !!!next-input-character;
4827            !!!emit ($self->{ct}); # ELEMENT
4828            redo A;
4829          } elsif ($self->{nc} == -1) {
4830            !!!parse-error (type => 'unclosed md'); ## TODO: type
4831            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4832            !!!next-input-character;
4833            !!!emit ($self->{ct}); # ELEMENT
4834            redo A;
4835          } else {
4836            $self->{ct}->{content} = [chr $self->{nc}];
4837            $self->{state} = CONTENT_KEYWORD_STATE;
4838            !!!next-input-character;
4839            redo A;
4840          }
4841        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4842          if ($is_space->{$self->{nc}}) {
4843            $self->{state} = AFTER_MD_DEF_STATE;
4844            !!!next-input-character;
4845            redo A;
4846          } elsif ($self->{nc} == 0x003E) { # >
4847            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4848            !!!next-input-character;
4849            !!!emit ($self->{ct}); # ELEMENT
4850            redo A;
4851          } elsif ($self->{nc} == -1) {
4852            !!!parse-error (type => 'unclosed md'); ## TODO: type
4853            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4854            !!!next-input-character;
4855            !!!emit ($self->{ct}); # ELEMENT
4856            redo A;
4857          } else {
4858            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4859            ## Stay in the state.
4860            !!!next-input-character;
4861            redo A;
4862          }
4863        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4864          if ($is_space->{$self->{nc}}) {
4865            ## Stay in the state.
4866            !!!next-input-character;
4867            redo A;
4868          } elsif ($self->{nc} == 0x0028) { # (
4869            $self->{group_depth}++;
4870            push @{$self->{ct}->{content}}, chr $self->{nc};
4871            ## Stay in the state.
4872            !!!next-input-character;
4873            redo A;
4874          } elsif ($self->{nc} == 0x007C or # |
4875                   $self->{nc} == 0x002C) { # ,
4876            !!!parse-error (type => 'empty element name'); ## TODO: type
4877            ## Stay in the state.
4878            !!!next-input-character;
4879            redo A;
4880          } elsif ($self->{nc} == 0x0029) { # )
4881            !!!parse-error (type => 'empty element name'); ## TODO: type
4882            push @{$self->{ct}->{content}}, chr $self->{nc};
4883            $self->{group_depth}--;
4884            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4885            !!!next-input-character;
4886            redo A;
4887          } elsif ($self->{nc} == 0x003E) { # >
4888            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4889            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4890            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4891            !!!next-input-character;
4892            !!!emit ($self->{ct}); # ELEMENT
4893            redo A;
4894          } elsif ($self->{nc} == -1) {
4895            !!!parse-error (type => 'unclosed md'); ## TODO: type
4896            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4897            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4898            !!!next-input-character;
4899            !!!emit ($self->{ct}); # ELEMENT
4900            redo A;
4901          } else {
4902            push @{$self->{ct}->{content}}, chr $self->{nc};
4903            $self->{state} = CM_ELEMENT_NAME_STATE;
4904            !!!next-input-character;
4905            redo A;
4906          }
4907        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4908          if ($is_space->{$self->{nc}}) {
4909            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4910            !!!next-input-character;
4911            redo A;
4912          } elsif ($self->{nc} == 0x002A or # *
4913                   $self->{nc} == 0x002B or # +
4914                   $self->{nc} == 0x003F) { # ?
4915            push @{$self->{ct}->{content}}, chr $self->{nc};
4916            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4917            !!!next-input-character;
4918            redo A;
4919          } elsif ($self->{nc} == 0x007C or # |
4920                   $self->{nc} == 0x002C) { # ,
4921            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4922            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4923            !!!next-input-character;
4924            redo A;
4925          } elsif ($self->{nc} == 0x0029) { # )
4926            $self->{group_depth}--;
4927            push @{$self->{ct}->{content}}, chr $self->{nc};
4928            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4929            !!!next-input-character;
4930            redo A;
4931          } elsif ($self->{nc} == 0x003E) { # >
4932            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4933            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4934            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4935            !!!next-input-character;
4936            !!!emit ($self->{ct}); # ELEMENT
4937            redo A;
4938          } elsif ($self->{nc} == -1) {
4939            !!!parse-error (type => 'unclosed md'); ## TODO: type
4940            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4941            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4942            !!!next-input-character;
4943            !!!emit ($self->{ct}); # ELEMENT
4944            redo A;
4945          } else {
4946            $self->{ct}->{content}->[-1] .= chr $self->{nc};
4947            ## Stay in the state.
4948            !!!next-input-character;
4949            redo A;
4950          }
4951        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4952          if ($is_space->{$self->{nc}}) {
4953            ## Stay in the state.
4954            !!!next-input-character;
4955            redo A;
4956          } elsif ($self->{nc} == 0x007C or # |
4957                   $self->{nc} == 0x002C) { # ,
4958            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4959            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4960            !!!next-input-character;
4961            redo A;
4962          } elsif ($self->{nc} == 0x0029) { # )
4963            $self->{group_depth}--;
4964            push @{$self->{ct}->{content}}, chr $self->{nc};
4965            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4966            !!!next-input-character;
4967            redo A;
4968          } elsif ($self->{nc} == 0x003E) { # >
4969            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4970            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4971            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4972            !!!next-input-character;
4973            !!!emit ($self->{ct}); # ELEMENT
4974            redo A;
4975          } elsif ($self->{nc} == -1) {
4976            !!!parse-error (type => 'unclosed md'); ## TODO: type
4977            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4978            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4979            !!!next-input-character;
4980            !!!emit ($self->{ct}); # ELEMENT
4981            redo A;
4982          } else {
4983            !!!parse-error (type => 'after element name'); ## TODO: type
4984            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4985            $self->{state} = BOGUS_MD_STATE;
4986            !!!next-input-character;
4987            redo A;
4988          }
4989        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
4990          if ($is_space->{$self->{nc}}) {
4991            if ($self->{group_depth}) {
4992              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4993            } else {
4994              $self->{state} = AFTER_MD_DEF_STATE;
4995            }
4996            !!!next-input-character;
4997            redo A;
4998          } elsif ($self->{nc} == 0x002A or # *
4999                   $self->{nc} == 0x002B or # +
5000                   $self->{nc} == 0x003F) { # ?
5001            push @{$self->{ct}->{content}}, chr $self->{nc};
5002            if ($self->{group_depth}) {
5003              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5004            } else {
5005              $self->{state} = AFTER_MD_DEF_STATE;
5006            }
5007            !!!next-input-character;
5008            redo A;
5009          } elsif ($self->{nc} == 0x0029) { # )
5010            if ($self->{group_depth}) {
5011              $self->{group_depth}--;
5012              push @{$self->{ct}->{content}}, chr $self->{nc};
5013              ## Stay in the state.
5014              !!!next-input-character;
5015              redo A;
5016            } else {
5017              !!!parse-error (type => 'string after md def'); ## TODO: type
5018              $self->{state} = BOGUS_MD_STATE;
5019              ## Reconsume.
5020              redo A;
5021            }
5022          } elsif ($self->{nc} == 0x003E) { # >
5023            if ($self->{group_depth}) {
5024              !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5025              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5026            }
5027            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5028            !!!next-input-character;
5029            !!!emit ($self->{ct}); # ELEMENT
5030            redo A;
5031          } elsif ($self->{nc} == -1) {
5032            !!!parse-error (type => 'unclosed md'); ## TODO: type
5033            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5034            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5035            !!!next-input-character;
5036            !!!emit ($self->{ct}); # ELEMENT
5037            redo A;
5038          } else {
5039            if ($self->{group_depth}) {
5040              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5041            } else {
5042              !!!parse-error (type => 'string after md def'); ## TODO: type
5043              $self->{state} = BOGUS_MD_STATE;
5044            }
5045            ## Reconsume.
5046            redo A;
5047          }
5048        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5049          if ($is_space->{$self->{nc}}) {
5050            ## Stay in the state.
5051            !!!next-input-character;
5052            redo A;
5053          } elsif ($self->{nc} == 0x003E) { # >
5054            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5055            !!!next-input-character;
5056            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5057            redo A;
5058          } elsif ($self->{nc} == -1) {
5059            !!!parse-error (type => 'unclosed md'); ## TODO: type
5060            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5061            !!!next-input-character;
5062            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5063            redo A;
5064          } else {
5065            !!!parse-error (type => 'string after md def'); ## TODO: type
5066            $self->{state} = BOGUS_MD_STATE;
5067            ## Reconsume.
5068            redo A;
5069          }
5070      } elsif ($self->{state} == BOGUS_MD_STATE) {      } elsif ($self->{state} == BOGUS_MD_STATE) {
5071        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
5072          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;

Legend:
Removed from v.1.17  
changed lines
  Added in v.1.21

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24