/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.19 by wakaba, Sun Oct 19 07:19:00 2008 UTC | revision 1.27 by wakaba, Thu Jul 2 22:24:28 2009 UTC
# Line 182  sub NDATA_STATE () { 86 } Line 182  sub NDATA_STATE () { 86 }
182  sub AFTER_NDATA_STATE () { 87 }  sub AFTER_NDATA_STATE () { 87 }
183  sub BEFORE_NOTATION_NAME_STATE () { 88 }  sub BEFORE_NOTATION_NAME_STATE () { 88 }
184  sub NOTATION_NAME_STATE () { 89 }  sub NOTATION_NAME_STATE () { 89 }
185  sub AFTER_NOTATION_NAME_STATE () { 90 }  sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186  sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }  sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187  sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }  sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188  sub ENTITY_VALUE_ENTITY_STATE () { 93 }  sub AFTER_ELEMENT_NAME_STATE () { 93 }
189  sub BOGUS_MD_STATE () { 94 }  sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190    sub CONTENT_KEYWORD_STATE () { 95 }
191    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192    sub CM_ELEMENT_NAME_STATE () { 97 }
193    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195    sub AFTER_MD_DEF_STATE () { 100 }
196    sub BOGUS_MD_STATE () { 101 }
197    
198  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
199  ## list and descriptions)  ## list and descriptions)
# Line 1263  sub _get_next_token ($) { Line 1270  sub _get_next_token ($) {
1270    
1271          redo A;          redo A;
1272        } else {        } else {
1273          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1274            !!!cp (93);            !!!cp (93);
1275            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1276            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
# Line 1309  sub _get_next_token ($) { Line 1316  sub _get_next_token ($) {
1316          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
1317          !!!next-input-character;          !!!next-input-character;
1318          redo A;          redo A;
1319          } elsif ($self->{is_xml} and
1320                   $is_space->{$self->{nc}}) {
1321            !!!cp (97.1);
1322            $self->{ca}->{value} .= ' ';
1323            ## Stay in the state.
1324            !!!next-input-character;
1325            redo A;
1326        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1327          !!!parse-error (type => 'unclosed attribute value');          !!!parse-error (type => 'unclosed attribute value');
1328          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1356  sub _get_next_token ($) { Line 1370  sub _get_next_token ($) {
1370          }          }
1371          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1372          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1373                                q["&<],                                qq["&<\x09\x0C\x20],
1374                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1375    
1376          ## Stay in the state          ## Stay in the state
# Line 1393  sub _get_next_token ($) { Line 1407  sub _get_next_token ($) {
1407          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
1408          !!!next-input-character;          !!!next-input-character;
1409          redo A;          redo A;
1410          } elsif ($self->{is_xml} and
1411                   $is_space->{$self->{nc}}) {
1412            !!!cp (103.1);
1413            $self->{ca}->{value} .= ' ';
1414            ## Stay in the state.
1415            !!!next-input-character;
1416            redo A;
1417        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1418          !!!parse-error (type => 'unclosed attribute value');          !!!parse-error (type => 'unclosed attribute value');
1419          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1440  sub _get_next_token ($) { Line 1461  sub _get_next_token ($) {
1461          }          }
1462          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1463          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1464                                q['&<],                                qq['&<\x09\x0C\x20],
1465                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1466    
1467          ## Stay in the state          ## Stay in the state
# Line 1552  sub _get_next_token ($) { Line 1573  sub _get_next_token ($) {
1573               0x0022 => 1, # "               0x0022 => 1, # "
1574               0x0027 => 1, # '               0x0027 => 1, # '
1575               0x003D => 1, # =               0x003D => 1, # =
1576                 0x003C => 1, # <
1577              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1578            !!!cp (115);            !!!cp (115);
1579            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1561  sub _get_next_token ($) { Line 1583  sub _get_next_token ($) {
1583          }          }
1584          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1585          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1586                                q["'=& >],                                qq["'=& \x09\x0C>],
1587                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1588    
1589          ## Stay in the state          ## Stay in the state
# Line 2097  sub _get_next_token ($) { Line 2119  sub _get_next_token ($) {
2119          redo A;          redo A;
2120        } else {        } else {
2121          !!!cp (154);          !!!cp (154);
         ## XML5: Not a parse error.  
         !!!parse-error (type => 'dash in comment',  
                         line => $self->{line_prev},  
                         column => $self->{column_prev});  
2122          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2123          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
2124          !!!next-input-character;          !!!next-input-character;
# Line 3071  sub _get_next_token ($) { Line 3089  sub _get_next_token ($) {
3089              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3090              $self->{entity_add} => 1,              $self->{entity_add} => 1,
3091            }->{$self->{nc}}) {            }->{$self->{nc}}) {
3092          !!!cp (1001);          if ($self->{is_xml}) {
3093              !!!cp (1001.1);
3094              !!!parse-error (type => 'bare ero',
3095                              line => $self->{line_prev},
3096                              column => $self->{column_prev}
3097                                  + ($self->{nc} == -1 ? 1 : 0));
3098            } else {
3099              !!!cp (1001);
3100              ## No error
3101            }
3102          ## Don't consume          ## Don't consume
         ## No error  
3103          ## Return nothing.          ## Return nothing.
3104          #          #
3105        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
# Line 3082  sub _get_next_token ($) { Line 3108  sub _get_next_token ($) {
3108          $self->{kwd} = '#';          $self->{kwd} = '#';
3109          !!!next-input-character;          !!!next-input-character;
3110          redo A;          redo A;
3111        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
3112                   (0x0041 <= $self->{nc} and
3113                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
3114                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
3115                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
# Line 3126  sub _get_next_token ($) { Line 3153  sub _get_next_token ($) {
3153          redo A;          redo A;
3154        }        }
3155      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
3156        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
           $self->{nc} == 0x0058) { # X  
3157          !!!cp (995);          !!!cp (995);
3158          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
3159          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3160          !!!next-input-character;          !!!next-input-character;
3161          redo A;          redo A;
3162          } elsif ($self->{nc} == 0x0058) { # X
3163            !!!cp (995.1);
3164            if ($self->{is_xml}) {
3165              !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3166            }
3167            $self->{state} = HEXREF_X_STATE;
3168            $self->{kwd} .= chr $self->{nc};
3169            !!!next-input-character;
3170            redo A;
3171        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
3172                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
3173          !!!cp (994);          !!!cp (994);
# Line 3193  sub _get_next_token ($) { Line 3228  sub _get_next_token ($) {
3228        my $code = $self->{kwd};        my $code = $self->{kwd};
3229        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3230        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3231        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
3232              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3233              ($self->{is_xml} and $code == 0x0000)) {
3234          !!!cp (1015);          !!!cp (1015);
3235          !!!parse-error (type => 'invalid character reference',          !!!parse-error (type => 'invalid character reference',
3236                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3306  sub _get_next_token ($) { Line 3343  sub _get_next_token ($) {
3343        my $code = $self->{kwd};        my $code = $self->{kwd};
3344        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3345        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3346        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
3347              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3348              ($self->{is_xml} and $code == 0x0000)) {
3349          !!!cp (1008);          !!!cp (1008);
3350          !!!parse-error (type => 'invalid character reference',          !!!parse-error (type => 'invalid character reference',
3351                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3340  sub _get_next_token ($) { Line 3379  sub _get_next_token ($) {
3379          redo A;          redo A;
3380        }        }
3381      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3382        if (length $self->{kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
3383            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
3384            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
3385              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
3386             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
3387              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
3388             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B or # ;
3389              $self->{nc} <= 0x0039) or # 9            ($self->{is_xml} and
3390             $self->{nc} == 0x003B)) { # ;             not ($is_space->{$self->{nc}} or
3391                    {
3392                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3393                      $self->{entity_add} => 1,
3394                    }->{$self->{nc}}))) {
3395          our $EntityChar;          our $EntityChar;
3396          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3397          if (defined $EntityChar->{$self->{kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
3398                $self->{ge}->{$self->{kwd}}) {
3399            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3400              !!!cp (1020);              if (defined $self->{ge}->{$self->{kwd}}) {
3401              $self->{entity__value} = $EntityChar->{$self->{kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3402                    !!!cp (1020.1);
3403                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3404                  } else {
3405                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3406                      !!!cp (1020.2);
3407                      !!!parse-error (type => 'unparsed entity', ## TODO: type
3408                                      value => $self->{kwd});
3409                    } else {
3410                      !!!cp (1020.3);
3411                    }
3412                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3413                  }
3414                } else {
3415                  if ($self->{is_xml}) {
3416                    !!!cp (1020.4);
3417                    !!!parse-error (type => 'entity not declared', ## TODO: type
3418                                    value => $self->{kwd},
3419                                    level => {
3420                                              'amp;' => $self->{level}->{warn},
3421                                              'quot;' => $self->{level}->{warn},
3422                                              'lt;' => $self->{level}->{warn},
3423                                              'gt;' => $self->{level}->{warn},
3424                                              'apos;' => $self->{level}->{warn},
3425                                             }->{$self->{kwd}} ||
3426                                             $self->{level}->{must});
3427                  } else {
3428                    !!!cp (1020);
3429                  }
3430                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
3431                }
3432              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3433              !!!next-input-character;              !!!next-input-character;
3434              #              #
# Line 3600  sub _get_next_token ($) { Line 3674  sub _get_next_token ($) {
3674          ## XML5: Not defined yet.          ## XML5: Not defined yet.
3675    
3676          ## TODO:          ## TODO:
3677    
3678            if (not $self->{stop_processing} and
3679                not $self->{document}->xml_standalone) {
3680              !!!parse-error (type => 'stop processing', ## TODO: type
3681                              level => $self->{level}->{info});
3682              $self->{stop_processing} = 1;
3683            }
3684    
3685          !!!next-input-character;          !!!next-input-character;
3686          redo A;          redo A;
3687        } elsif ($self->{nc} == 0x005D) { # ]        } elsif ($self->{nc} == 0x005D) { # ]
# Line 3834  sub _get_next_token ($) { Line 3916  sub _get_next_token ($) {
3916          }          }
3917          $self->{ct} = {type => ELEMENT_TOKEN, name => '',          $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3918                         line => $self->{line_prev},                         line => $self->{line_prev},
3919                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
3920          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
3921          !!!next-input-character;          !!!next-input-character;
3922          redo A;          redo A;
# Line 3882  sub _get_next_token ($) { Line 3964  sub _get_next_token ($) {
3964          $self->{ct} = {type => ATTLIST_TOKEN, name => '',          $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3965                         attrdefs => [],                         attrdefs => [],
3966                         line => $self->{line_prev},                         line => $self->{line_prev},
3967                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
3968          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
3969          !!!next-input-character;          !!!next-input-character;
3970          redo A;          redo A;
# Line 3931  sub _get_next_token ($) { Line 4013  sub _get_next_token ($) {
4013          }          }
4014          $self->{ct} = {type => NOTATION_TOKEN, name => '',          $self->{ct} = {type => NOTATION_TOKEN, name => '',
4015                         line => $self->{line_prev},                         line => $self->{line_prev},
4016                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 8};
4017          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
4018          !!!next-input-character;          !!!next-input-character;
4019          redo A;          redo A;
# Line 4043  sub _get_next_token ($) { Line 4125  sub _get_next_token ($) {
4125          if ($self->{ct}->{type} == ATTLIST_TOKEN) {          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4126            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4127          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4128            ## TODO: ...            $self->{state} = AFTER_ELEMENT_NAME_STATE;
           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;  
4129          } else { # ENTITY/NOTATION          } else { # ENTITY/NOTATION
4130            $self->{state} = AFTER_DOCTYPE_NAME_STATE;            $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4131          }          }
# Line 4667  sub _get_next_token ($) { Line 4748  sub _get_next_token ($) {
4748        }        }
4749      } elsif ($self->{state} == NOTATION_NAME_STATE) {      } elsif ($self->{state} == NOTATION_NAME_STATE) {
4750        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4751          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
4752          !!!next-input-character;          !!!next-input-character;
4753          redo A;          redo A;
4754        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
# Line 4689  sub _get_next_token ($) { Line 4770  sub _get_next_token ($) {
4770        }        }
4771      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4772        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
4773          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
4774          !!!next-input-character;          !!!next-input-character;
4775          redo A;          redo A;
4776        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 4712  sub _get_next_token ($) { Line 4793  sub _get_next_token ($) {
4793        }        }
4794      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4795        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
4796          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
4797          !!!next-input-character;          !!!next-input-character;
4798          redo A;          redo A;
4799        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 4734  sub _get_next_token ($) { Line 4815  sub _get_next_token ($) {
4815          redo A;          redo A;
4816        }        }
4817      } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
       ## TODO: XMLize  
   
4818        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
4819            {            {
4820              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4821              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4822            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4823            !!!parse-error (type => 'bare ero',
4824                            line => $self->{line_prev},
4825                            column => $self->{column_prev}
4826                                + ($self->{nc} == -1 ? 1 : 0));
4827          ## Don't consume          ## Don't consume
         ## No error  
4828          ## Return nothing.          ## Return nothing.
4829          #          #
4830        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
# Line 4751  sub _get_next_token ($) { Line 4833  sub _get_next_token ($) {
4833          $self->{kwd} = '#';          $self->{kwd} = '#';
4834          !!!next-input-character;          !!!next-input-character;
4835          redo A;          redo A;
       } elsif ((0x0041 <= $self->{nc} and  
                 $self->{nc} <= 0x005A) or # A..Z  
                (0x0061 <= $self->{nc} and  
                 $self->{nc} <= 0x007A)) { # a..z  
         #  
4836        } else {        } else {
         !!!parse-error (type => 'bare ero');  
         ## Return nothing.  
4837          #          #
4838        }        }
4839    
# Line 4766  sub _get_next_token ($) { Line 4841  sub _get_next_token ($) {
4841        $self->{state} = $self->{prev_state};        $self->{state} = $self->{prev_state};
4842        ## Reconsume.        ## Reconsume.
4843        redo A;        redo A;
4844      } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {      } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4845          if ($is_space->{$self->{nc}}) {
4846            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4847            !!!next-input-character;
4848            redo A;
4849          } elsif ($self->{nc} == 0x0028) { # (
4850            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4851            $self->{ct}->{content} = ['('];
4852            $self->{group_depth} = 1;
4853            !!!next-input-character;
4854            redo A;
4855          } elsif ($self->{nc} == 0x003E) { # >
4856            !!!parse-error (type => 'no md def'); ## TODO: type
4857            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4858            !!!next-input-character;
4859            !!!emit ($self->{ct}); # ELEMENT
4860            redo A;
4861          } elsif ($self->{nc} == -1) {
4862            !!!parse-error (type => 'unclosed md'); ## TODO: type
4863            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4864            !!!next-input-character;
4865            !!!emit ($self->{ct}); # ELEMENT
4866            redo A;
4867          } else {
4868            $self->{ct}->{content} = [chr $self->{nc}];
4869            $self->{state} = CONTENT_KEYWORD_STATE;
4870            !!!next-input-character;
4871            redo A;
4872          }
4873        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4874        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4875            $self->{state} = AFTER_MD_DEF_STATE;
4876            !!!next-input-character;
4877            redo A;
4878          } elsif ($self->{nc} == 0x003E) { # >
4879            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4880            !!!next-input-character;
4881            !!!emit ($self->{ct}); # ELEMENT
4882            redo A;
4883          } elsif ($self->{nc} == -1) {
4884            !!!parse-error (type => 'unclosed md'); ## TODO: type
4885            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4886            !!!next-input-character;
4887            !!!emit ($self->{ct}); # ELEMENT
4888            redo A;
4889          } else {
4890            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4891          ## Stay in the state.          ## Stay in the state.
4892          !!!next-input-character;          !!!next-input-character;
4893          redo A;          redo A;
4894          }
4895        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4896          if ($is_space->{$self->{nc}}) {
4897            ## Stay in the state.
4898            !!!next-input-character;
4899            redo A;
4900          } elsif ($self->{nc} == 0x0028) { # (
4901            $self->{group_depth}++;
4902            push @{$self->{ct}->{content}}, chr $self->{nc};
4903            ## Stay in the state.
4904            !!!next-input-character;
4905            redo A;
4906          } elsif ($self->{nc} == 0x007C or # |
4907                   $self->{nc} == 0x002C) { # ,
4908            !!!parse-error (type => 'empty element name'); ## TODO: type
4909            ## Stay in the state.
4910            !!!next-input-character;
4911            redo A;
4912          } elsif ($self->{nc} == 0x0029) { # )
4913            !!!parse-error (type => 'empty element name'); ## TODO: type
4914            push @{$self->{ct}->{content}}, chr $self->{nc};
4915            $self->{group_depth}--;
4916            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4917            !!!next-input-character;
4918            redo A;
4919        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4920            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4921            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4922          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4923          !!!next-input-character;          !!!next-input-character;
4924          !!!emit ($self->{ct}); # ENTITY          !!!emit ($self->{ct}); # ELEMENT
4925          redo A;          redo A;
4926        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4927          !!!parse-error (type => 'unclosed md'); ## TODO: type          !!!parse-error (type => 'unclosed md'); ## TODO: type
4928            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4929          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4930          !!!next-input-character;          !!!next-input-character;
4931          !!!emit ($self->{ct}); # ENTITY          !!!emit ($self->{ct}); # ELEMENT
4932            redo A;
4933          } else {
4934            push @{$self->{ct}->{content}}, chr $self->{nc};
4935            $self->{state} = CM_ELEMENT_NAME_STATE;
4936            !!!next-input-character;
4937            redo A;
4938          }
4939        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4940          if ($is_space->{$self->{nc}}) {
4941            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4942            !!!next-input-character;
4943            redo A;
4944          } elsif ($self->{nc} == 0x002A or # *
4945                   $self->{nc} == 0x002B or # +
4946                   $self->{nc} == 0x003F) { # ?
4947            push @{$self->{ct}->{content}}, chr $self->{nc};
4948            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4949            !!!next-input-character;
4950            redo A;
4951          } elsif ($self->{nc} == 0x007C or # |
4952                   $self->{nc} == 0x002C) { # ,
4953            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4954            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4955            !!!next-input-character;
4956            redo A;
4957          } elsif ($self->{nc} == 0x0029) { # )
4958            $self->{group_depth}--;
4959            push @{$self->{ct}->{content}}, chr $self->{nc};
4960            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4961            !!!next-input-character;
4962            redo A;
4963          } elsif ($self->{nc} == 0x003E) { # >
4964            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4965            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4966            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4967            !!!next-input-character;
4968            !!!emit ($self->{ct}); # ELEMENT
4969            redo A;
4970          } elsif ($self->{nc} == -1) {
4971            !!!parse-error (type => 'unclosed md'); ## TODO: type
4972            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4973            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4974            !!!next-input-character;
4975            !!!emit ($self->{ct}); # ELEMENT
4976            redo A;
4977          } else {
4978            $self->{ct}->{content}->[-1] .= chr $self->{nc};
4979            ## Stay in the state.
4980            !!!next-input-character;
4981            redo A;
4982          }
4983        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4984          if ($is_space->{$self->{nc}}) {
4985            ## Stay in the state.
4986            !!!next-input-character;
4987            redo A;
4988          } elsif ($self->{nc} == 0x007C or # |
4989                   $self->{nc} == 0x002C) { # ,
4990            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4991            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4992            !!!next-input-character;
4993            redo A;
4994          } elsif ($self->{nc} == 0x0029) { # )
4995            $self->{group_depth}--;
4996            push @{$self->{ct}->{content}}, chr $self->{nc};
4997            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4998            !!!next-input-character;
4999            redo A;
5000          } elsif ($self->{nc} == 0x003E) { # >
5001            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5002            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5003            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5004            !!!next-input-character;
5005            !!!emit ($self->{ct}); # ELEMENT
5006            redo A;
5007          } elsif ($self->{nc} == -1) {
5008            !!!parse-error (type => 'unclosed md'); ## TODO: type
5009            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5010            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5011            !!!next-input-character;
5012            !!!emit ($self->{ct}); # ELEMENT
5013            redo A;
5014          } else {
5015            !!!parse-error (type => 'after element name'); ## TODO: type
5016            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5017            $self->{state} = BOGUS_MD_STATE;
5018            !!!next-input-character;
5019            redo A;
5020          }
5021        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5022          if ($is_space->{$self->{nc}}) {
5023            if ($self->{group_depth}) {
5024              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5025            } else {
5026              $self->{state} = AFTER_MD_DEF_STATE;
5027            }
5028            !!!next-input-character;
5029            redo A;
5030          } elsif ($self->{nc} == 0x002A or # *
5031                   $self->{nc} == 0x002B or # +
5032                   $self->{nc} == 0x003F) { # ?
5033            push @{$self->{ct}->{content}}, chr $self->{nc};
5034            if ($self->{group_depth}) {
5035              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5036            } else {
5037              $self->{state} = AFTER_MD_DEF_STATE;
5038            }
5039            !!!next-input-character;
5040            redo A;
5041          } elsif ($self->{nc} == 0x0029) { # )
5042            if ($self->{group_depth}) {
5043              $self->{group_depth}--;
5044              push @{$self->{ct}->{content}}, chr $self->{nc};
5045              ## Stay in the state.
5046              !!!next-input-character;
5047              redo A;
5048            } else {
5049              !!!parse-error (type => 'string after md def'); ## TODO: type
5050              $self->{state} = BOGUS_MD_STATE;
5051              ## Reconsume.
5052              redo A;
5053            }
5054          } elsif ($self->{nc} == 0x003E) { # >
5055            if ($self->{group_depth}) {
5056              !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5057              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5058            }
5059            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5060            !!!next-input-character;
5061            !!!emit ($self->{ct}); # ELEMENT
5062            redo A;
5063          } elsif ($self->{nc} == -1) {
5064            !!!parse-error (type => 'unclosed md'); ## TODO: type
5065            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5066            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5067            !!!next-input-character;
5068            !!!emit ($self->{ct}); # ELEMENT
5069            redo A;
5070          } else {
5071            if ($self->{group_depth}) {
5072              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5073            } else {
5074              !!!parse-error (type => 'string after md def'); ## TODO: type
5075              $self->{state} = BOGUS_MD_STATE;
5076            }
5077            ## Reconsume.
5078            redo A;
5079          }
5080        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5081          if ($is_space->{$self->{nc}}) {
5082            ## Stay in the state.
5083            !!!next-input-character;
5084            redo A;
5085          } elsif ($self->{nc} == 0x003E) { # >
5086            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5087            !!!next-input-character;
5088            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5089            redo A;
5090          } elsif ($self->{nc} == -1) {
5091            !!!parse-error (type => 'unclosed md'); ## TODO: type
5092            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5093            !!!next-input-character;
5094            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5095          redo A;          redo A;
5096        } else {        } else {
5097          !!!parse-error (type => 'string after notation name'); ## TODO: type          !!!parse-error (type => 'string after md def'); ## TODO: type
5098          $self->{state} = BOGUS_MD_STATE;          $self->{state} = BOGUS_MD_STATE;
5099          ## Reconsume.          ## Reconsume.
5100          redo A;          redo A;

Legend:
Removed from v.1.19  
changed lines
  Added in v.1.27

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24