/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.19 by wakaba, Sun Oct 19 07:19:00 2008 UTC | revision 1.31 by wakaba, Sat Sep 5 09:26:55 2009 UTC
# Line 105  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108    sub COMMENT_END_BANG_STATE () { 102 } ## LAST
109  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
110  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
111  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 182  sub NDATA_STATE () { 86 } Line 183  sub NDATA_STATE () { 86 }
183  sub AFTER_NDATA_STATE () { 87 }  sub AFTER_NDATA_STATE () { 87 }
184  sub BEFORE_NOTATION_NAME_STATE () { 88 }  sub BEFORE_NOTATION_NAME_STATE () { 88 }
185  sub NOTATION_NAME_STATE () { 89 }  sub NOTATION_NAME_STATE () { 89 }
186  sub AFTER_NOTATION_NAME_STATE () { 90 }  sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
187  sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }  sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
188  sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }  sub ENTITY_VALUE_ENTITY_STATE () { 92 }
189  sub ENTITY_VALUE_ENTITY_STATE () { 93 }  sub AFTER_ELEMENT_NAME_STATE () { 93 }
190  sub BOGUS_MD_STATE () { 94 }  sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
191    sub CONTENT_KEYWORD_STATE () { 95 }
192    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
193    sub CM_ELEMENT_NAME_STATE () { 97 }
194    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
195    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
196    sub AFTER_MD_DEF_STATE () { 100 }
197    sub BOGUS_MD_STATE () { 101 }
198    
199  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
200  ## list and descriptions)  ## list and descriptions)
# Line 941  sub _get_next_token ($) { Line 949  sub _get_next_token ($) {
949          if ({          if ({
950               0x0022 => 1, # "               0x0022 => 1, # "
951               0x0027 => 1, # '               0x0027 => 1, # '
952                 0x003C => 1, # <
953               0x003D => 1, # =               0x003D => 1, # =
954              }->{$self->{nc}}) {              }->{$self->{nc}}) {
955            !!!cp (55);            !!!cp (55);
# Line 1063  sub _get_next_token ($) { Line 1072  sub _get_next_token ($) {
1072    
1073          redo A;          redo A;
1074        } else {        } else {
1075          if ($self->{nc} == 0x0022 or # "          if ({
1076              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1077                 0x0027 => 1, # '
1078                 0x003C => 1, # <
1079                }->{$self->{nc}}) {
1080            !!!cp (69);            !!!cp (69);
1081            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1082            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
# Line 1175  sub _get_next_token ($) { Line 1187  sub _get_next_token ($) {
1187            !!!cp (78.2);            !!!cp (78.2);
1188          }          }
1189    
1190          if ($self->{nc} == 0x0022 or # "          if ({
1191              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1192                 0x0027 => 1, # '
1193                 0x003C => 1, # <
1194                }->{$self->{nc}}) {
1195            !!!cp (78);            !!!cp (78);
1196            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1197            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
# Line 1263  sub _get_next_token ($) { Line 1278  sub _get_next_token ($) {
1278    
1279          redo A;          redo A;
1280        } else {        } else {
1281          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1282            !!!cp (93);            !!!cp (93);
1283            ## XML5: Not a parse error.            ## XML5: Not a parse error.
1284            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
# Line 1309  sub _get_next_token ($) { Line 1324  sub _get_next_token ($) {
1324          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
1325          !!!next-input-character;          !!!next-input-character;
1326          redo A;          redo A;
1327          } elsif ($self->{is_xml} and
1328                   $is_space->{$self->{nc}}) {
1329            !!!cp (97.1);
1330            $self->{ca}->{value} .= ' ';
1331            ## Stay in the state.
1332            !!!next-input-character;
1333            redo A;
1334        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1335          !!!parse-error (type => 'unclosed attribute value');          !!!parse-error (type => 'unclosed attribute value');
1336          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1356  sub _get_next_token ($) { Line 1378  sub _get_next_token ($) {
1378          }          }
1379          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1380          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1381                                q["&<],                                qq["&<\x09\x0C\x20],
1382                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1383    
1384          ## Stay in the state          ## Stay in the state
# Line 1393  sub _get_next_token ($) { Line 1415  sub _get_next_token ($) {
1415          $self->{state} = ENTITY_STATE;          $self->{state} = ENTITY_STATE;
1416          !!!next-input-character;          !!!next-input-character;
1417          redo A;          redo A;
1418          } elsif ($self->{is_xml} and
1419                   $is_space->{$self->{nc}}) {
1420            !!!cp (103.1);
1421            $self->{ca}->{value} .= ' ';
1422            ## Stay in the state.
1423            !!!next-input-character;
1424            redo A;
1425        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1426          !!!parse-error (type => 'unclosed attribute value');          !!!parse-error (type => 'unclosed attribute value');
1427          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
# Line 1440  sub _get_next_token ($) { Line 1469  sub _get_next_token ($) {
1469          }          }
1470          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1471          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1472                                q['&<],                                qq['&<\x09\x0C\x20],
1473                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1474    
1475          ## Stay in the state          ## Stay in the state
# Line 1552  sub _get_next_token ($) { Line 1581  sub _get_next_token ($) {
1581               0x0022 => 1, # "               0x0022 => 1, # "
1582               0x0027 => 1, # '               0x0027 => 1, # '
1583               0x003D => 1, # =               0x003D => 1, # =
1584                 0x003C => 1, # <
1585              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1586            !!!cp (115);            !!!cp (115);
1587            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1561  sub _get_next_token ($) { Line 1591  sub _get_next_token ($) {
1591          }          }
1592          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1593          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1594                                q["'=& >],                                qq["'=& \x09\x0C>],
1595                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1596    
1597          ## Stay in the state          ## Stay in the state
# Line 2053  sub _get_next_token ($) { Line 2083  sub _get_next_token ($) {
2083          !!!next-input-character;          !!!next-input-character;
2084          redo A;          redo A;
2085        }        }
2086      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE or
2087                 $self->{state} == COMMENT_END_BANG_STATE) {
2088        ## XML5: "Comment end state" and "DOCTYPE comment end state".        ## XML5: "Comment end state" and "DOCTYPE comment end state".
2089          ## (No comment end bang state.)
2090    
2091        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2092          if ($self->{in_subset}) {          if ($self->{in_subset}) {
# Line 2071  sub _get_next_token ($) { Line 2103  sub _get_next_token ($) {
2103    
2104          redo A;          redo A;
2105        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2106          !!!cp (152);          if ($self->{state} == COMMENT_END_BANG_STATE) {
2107          ## XML5: Not a parse error.            !!!cp (154.3);
2108          !!!parse-error (type => 'dash in comment',            $self->{ct}->{data} .= '--!'; # comment
2109                          line => $self->{line_prev},            $self->{state} = COMMENT_END_DASH_STATE;
2110                          column => $self->{column_prev});          } else {
2111          $self->{ct}->{data} .= '-'; # comment            !!!cp (152);
2112          ## Stay in the state            ## XML5: Not a parse error.
2113              !!!parse-error (type => 'dash in comment',
2114                              line => $self->{line_prev},
2115                              column => $self->{column_prev});
2116              $self->{ct}->{data} .= '-'; # comment
2117              ## Stay in the state
2118            }
2119            !!!next-input-character;
2120            redo A;
2121          } elsif ($self->{nc} == 0x0021 and # !
2122                   $self->{state} != COMMENT_END_BANG_STATE) {
2123            !!!parse-error (type => 'comment end bang'); # XXX error type
2124            $self->{state} = COMMENT_END_BANG_STATE;
2125          !!!next-input-character;          !!!next-input-character;
2126          redo A;          redo A;
2127        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
# Line 2090  sub _get_next_token ($) { Line 2134  sub _get_next_token ($) {
2134            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2135            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2136          }          }
2137          ## reconsume          ## Reconsume.
2138    
2139          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
2140    
2141          redo A;          redo A;
2142        } else {        } else {
2143          !!!cp (154);          !!!cp (154);
2144          ## XML5: Not a parse error.          if ($self->{state} == COMMENT_END_BANG_STATE) {
2145          !!!parse-error (type => 'dash in comment',            $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
2146                          line => $self->{line_prev},          } else {
2147                          column => $self->{column_prev});            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2148          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          }
2149          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
2150          !!!next-input-character;          !!!next-input-character;
2151          redo A;          redo A;
# Line 2112  sub _get_next_token ($) { Line 2156  sub _get_next_token ($) {
2156          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2157          !!!next-input-character;          !!!next-input-character;
2158          redo A;          redo A;
2159          } elsif ($self->{nc} == -1) {
2160            !!!cp (155.1);
2161            !!!parse-error (type => 'unclosed DOCTYPE');
2162            $self->{ct}->{quirks} = 1;
2163    
2164            $self->{state} = DATA_STATE;
2165            ## Reconsume.
2166            !!!emit ($self->{ct}); # DOCTYPE (quirks)
2167    
2168            redo A;
2169        } else {        } else {
2170          !!!cp (156);          !!!cp (156);
2171          ## XML5: Unless EOF, swith to the bogus comment state.          ## XML5: Swith to the bogus comment state.
2172          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
2173          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2174          ## reconsume          ## reconsume
# Line 2139  sub _get_next_token ($) { Line 2193  sub _get_next_token ($) {
2193          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2194    
2195          redo A;          redo A;
2196          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2197            !!!cp (158.1);
2198            $self->{ct}->{name} # DOCTYPE
2199                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2200            delete $self->{ct}->{quirks};
2201            $self->{state} = DOCTYPE_NAME_STATE;
2202            !!!next-input-character;
2203            redo A;
2204        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2205          !!!cp (159);          !!!cp (159);
2206          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
# Line 2185  sub _get_next_token ($) { Line 2247  sub _get_next_token ($) {
2247          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2248    
2249          redo A;          redo A;
2250          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2251            !!!cp (162.1);
2252            $self->{ct}->{name} # DOCTYPE
2253                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2254            delete $self->{ct}->{quirks};
2255            ## Stay in the state.
2256            !!!next-input-character;
2257            redo A;
2258        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2259          !!!cp (163);          !!!cp (163);
2260          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
# Line 2206  sub _get_next_token ($) { Line 2276  sub _get_next_token ($) {
2276          redo A;          redo A;
2277        } else {        } else {
2278          !!!cp (164);          !!!cp (164);
2279          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2280            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
2281          !!!next-input-character;          !!!next-input-character;
2282          redo A;          redo A;
2283        }        }
# Line 3071  sub _get_next_token ($) { Line 3140  sub _get_next_token ($) {
3140              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3141              $self->{entity_add} => 1,              $self->{entity_add} => 1,
3142            }->{$self->{nc}}) {            }->{$self->{nc}}) {
3143          !!!cp (1001);          if ($self->{is_xml}) {
3144              !!!cp (1001.1);
3145              !!!parse-error (type => 'bare ero',
3146                              line => $self->{line_prev},
3147                              column => $self->{column_prev}
3148                                  + ($self->{nc} == -1 ? 1 : 0));
3149            } else {
3150              !!!cp (1001);
3151              ## No error
3152            }
3153          ## Don't consume          ## Don't consume
         ## No error  
3154          ## Return nothing.          ## Return nothing.
3155          #          #
3156        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
# Line 3082  sub _get_next_token ($) { Line 3159  sub _get_next_token ($) {
3159          $self->{kwd} = '#';          $self->{kwd} = '#';
3160          !!!next-input-character;          !!!next-input-character;
3161          redo A;          redo A;
3162        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
3163                   (0x0041 <= $self->{nc} and
3164                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
3165                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
3166                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
# Line 3126  sub _get_next_token ($) { Line 3204  sub _get_next_token ($) {
3204          redo A;          redo A;
3205        }        }
3206      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
3207        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
           $self->{nc} == 0x0058) { # X  
3208          !!!cp (995);          !!!cp (995);
3209          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
3210          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3211          !!!next-input-character;          !!!next-input-character;
3212          redo A;          redo A;
3213          } elsif ($self->{nc} == 0x0058) { # X
3214            !!!cp (995.1);
3215            if ($self->{is_xml}) {
3216              !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3217            }
3218            $self->{state} = HEXREF_X_STATE;
3219            $self->{kwd} .= chr $self->{nc};
3220            !!!next-input-character;
3221            redo A;
3222        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
3223                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
3224          !!!cp (994);          !!!cp (994);
# Line 3193  sub _get_next_token ($) { Line 3279  sub _get_next_token ($) {
3279        my $code = $self->{kwd};        my $code = $self->{kwd};
3280        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3281        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3282        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
3283              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3284              ($self->{is_xml} and $code == 0x0000)) {
3285          !!!cp (1015);          !!!cp (1015);
3286          !!!parse-error (type => 'invalid character reference',          !!!parse-error (type => 'invalid character reference',
3287                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3306  sub _get_next_token ($) { Line 3394  sub _get_next_token ($) {
3394        my $code = $self->{kwd};        my $code = $self->{kwd};
3395        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3396        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3397        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
3398              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3399              ($self->{is_xml} and $code == 0x0000)) {
3400          !!!cp (1008);          !!!cp (1008);
3401          !!!parse-error (type => 'invalid character reference',          !!!parse-error (type => 'invalid character reference',
3402                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3340  sub _get_next_token ($) { Line 3430  sub _get_next_token ($) {
3430          redo A;          redo A;
3431        }        }
3432      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3433        if (length $self->{kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
3434            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
3435            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
3436              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
3437             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
3438              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
3439             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B or # ;
3440              $self->{nc} <= 0x0039) or # 9            ($self->{is_xml} and
3441             $self->{nc} == 0x003B)) { # ;             not ($is_space->{$self->{nc}} or
3442                    {
3443                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3444                      $self->{entity_add} => 1,
3445                    }->{$self->{nc}}))) {
3446          our $EntityChar;          our $EntityChar;
3447          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3448          if (defined $EntityChar->{$self->{kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
3449                $self->{ge}->{$self->{kwd}}) {
3450            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3451              !!!cp (1020);              if (defined $self->{ge}->{$self->{kwd}}) {
3452              $self->{entity__value} = $EntityChar->{$self->{kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3453                    !!!cp (1020.1);
3454                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3455                  } else {
3456                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3457                      !!!cp (1020.2);
3458                      !!!parse-error (type => 'unparsed entity', ## TODO: type
3459                                      value => $self->{kwd});
3460                    } else {
3461                      !!!cp (1020.3);
3462                    }
3463                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3464                  }
3465                } else {
3466                  if ($self->{is_xml}) {
3467                    !!!cp (1020.4);
3468                    !!!parse-error (type => 'entity not declared', ## TODO: type
3469                                    value => $self->{kwd},
3470                                    level => {
3471                                              'amp;' => $self->{level}->{warn},
3472                                              'quot;' => $self->{level}->{warn},
3473                                              'lt;' => $self->{level}->{warn},
3474                                              'gt;' => $self->{level}->{warn},
3475                                              'apos;' => $self->{level}->{warn},
3476                                             }->{$self->{kwd}} ||
3477                                             $self->{level}->{must});
3478                  } else {
3479                    !!!cp (1020);
3480                  }
3481                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
3482                }
3483              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3484              !!!next-input-character;              !!!next-input-character;
3485              #              #
# Line 3600  sub _get_next_token ($) { Line 3725  sub _get_next_token ($) {
3725          ## XML5: Not defined yet.          ## XML5: Not defined yet.
3726    
3727          ## TODO:          ## TODO:
3728    
3729            if (not $self->{stop_processing} and
3730                not $self->{document}->xml_standalone) {
3731              !!!parse-error (type => 'stop processing', ## TODO: type
3732                              level => $self->{level}->{info});
3733              $self->{stop_processing} = 1;
3734            }
3735    
3736          !!!next-input-character;          !!!next-input-character;
3737          redo A;          redo A;
3738        } elsif ($self->{nc} == 0x005D) { # ]        } elsif ($self->{nc} == 0x005D) { # ]
# Line 3834  sub _get_next_token ($) { Line 3967  sub _get_next_token ($) {
3967          }          }
3968          $self->{ct} = {type => ELEMENT_TOKEN, name => '',          $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3969                         line => $self->{line_prev},                         line => $self->{line_prev},
3970                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
3971          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
3972          !!!next-input-character;          !!!next-input-character;
3973          redo A;          redo A;
# Line 3882  sub _get_next_token ($) { Line 4015  sub _get_next_token ($) {
4015          $self->{ct} = {type => ATTLIST_TOKEN, name => '',          $self->{ct} = {type => ATTLIST_TOKEN, name => '',
4016                         attrdefs => [],                         attrdefs => [],
4017                         line => $self->{line_prev},                         line => $self->{line_prev},
4018                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 7};
4019          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
4020          !!!next-input-character;          !!!next-input-character;
4021          redo A;          redo A;
# Line 3931  sub _get_next_token ($) { Line 4064  sub _get_next_token ($) {
4064          }          }
4065          $self->{ct} = {type => NOTATION_TOKEN, name => '',          $self->{ct} = {type => NOTATION_TOKEN, name => '',
4066                         line => $self->{line_prev},                         line => $self->{line_prev},
4067                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 8};
4068          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
4069          !!!next-input-character;          !!!next-input-character;
4070          redo A;          redo A;
# Line 4043  sub _get_next_token ($) { Line 4176  sub _get_next_token ($) {
4176          if ($self->{ct}->{type} == ATTLIST_TOKEN) {          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4177            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4178          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4179            ## TODO: ...            $self->{state} = AFTER_ELEMENT_NAME_STATE;
           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;  
4180          } else { # ENTITY/NOTATION          } else { # ENTITY/NOTATION
4181            $self->{state} = AFTER_DOCTYPE_NAME_STATE;            $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4182          }          }
# Line 4667  sub _get_next_token ($) { Line 4799  sub _get_next_token ($) {
4799        }        }
4800      } elsif ($self->{state} == NOTATION_NAME_STATE) {      } elsif ($self->{state} == NOTATION_NAME_STATE) {
4801        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4802          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
4803          !!!next-input-character;          !!!next-input-character;
4804          redo A;          redo A;
4805        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
# Line 4689  sub _get_next_token ($) { Line 4821  sub _get_next_token ($) {
4821        }        }
4822      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4823        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
4824          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
4825          !!!next-input-character;          !!!next-input-character;
4826          redo A;          redo A;
4827        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 4712  sub _get_next_token ($) { Line 4844  sub _get_next_token ($) {
4844        }        }
4845      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4846        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
4847          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
4848          !!!next-input-character;          !!!next-input-character;
4849          redo A;          redo A;
4850        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 4734  sub _get_next_token ($) { Line 4866  sub _get_next_token ($) {
4866          redo A;          redo A;
4867        }        }
4868      } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
       ## TODO: XMLize  
   
4869        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
4870            {            {
4871              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4872              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4873            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4874            !!!parse-error (type => 'bare ero',
4875                            line => $self->{line_prev},
4876                            column => $self->{column_prev}
4877                                + ($self->{nc} == -1 ? 1 : 0));
4878          ## Don't consume          ## Don't consume
         ## No error  
4879          ## Return nothing.          ## Return nothing.
4880          #          #
4881        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
# Line 4751  sub _get_next_token ($) { Line 4884  sub _get_next_token ($) {
4884          $self->{kwd} = '#';          $self->{kwd} = '#';
4885          !!!next-input-character;          !!!next-input-character;
4886          redo A;          redo A;
       } elsif ((0x0041 <= $self->{nc} and  
                 $self->{nc} <= 0x005A) or # A..Z  
                (0x0061 <= $self->{nc} and  
                 $self->{nc} <= 0x007A)) { # a..z  
         #  
4887        } else {        } else {
         !!!parse-error (type => 'bare ero');  
         ## Return nothing.  
4888          #          #
4889        }        }
4890    
# Line 4766  sub _get_next_token ($) { Line 4892  sub _get_next_token ($) {
4892        $self->{state} = $self->{prev_state};        $self->{state} = $self->{prev_state};
4893        ## Reconsume.        ## Reconsume.
4894        redo A;        redo A;
4895      } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {      } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4896          if ($is_space->{$self->{nc}}) {
4897            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4898            !!!next-input-character;
4899            redo A;
4900          } elsif ($self->{nc} == 0x0028) { # (
4901            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4902            $self->{ct}->{content} = ['('];
4903            $self->{group_depth} = 1;
4904            !!!next-input-character;
4905            redo A;
4906          } elsif ($self->{nc} == 0x003E) { # >
4907            !!!parse-error (type => 'no md def'); ## TODO: type
4908            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4909            !!!next-input-character;
4910            !!!emit ($self->{ct}); # ELEMENT
4911            redo A;
4912          } elsif ($self->{nc} == -1) {
4913            !!!parse-error (type => 'unclosed md'); ## TODO: type
4914            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4915            !!!next-input-character;
4916            !!!emit ($self->{ct}); # ELEMENT
4917            redo A;
4918          } else {
4919            $self->{ct}->{content} = [chr $self->{nc}];
4920            $self->{state} = CONTENT_KEYWORD_STATE;
4921            !!!next-input-character;
4922            redo A;
4923          }
4924        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4925          if ($is_space->{$self->{nc}}) {
4926            $self->{state} = AFTER_MD_DEF_STATE;
4927            !!!next-input-character;
4928            redo A;
4929          } elsif ($self->{nc} == 0x003E) { # >
4930            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4931            !!!next-input-character;
4932            !!!emit ($self->{ct}); # ELEMENT
4933            redo A;
4934          } elsif ($self->{nc} == -1) {
4935            !!!parse-error (type => 'unclosed md'); ## TODO: type
4936            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4937            !!!next-input-character;
4938            !!!emit ($self->{ct}); # ELEMENT
4939            redo A;
4940          } else {
4941            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4942            ## Stay in the state.
4943            !!!next-input-character;
4944            redo A;
4945          }
4946        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4947        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4948          ## Stay in the state.          ## Stay in the state.
4949          !!!next-input-character;          !!!next-input-character;
4950          redo A;          redo A;
4951          } elsif ($self->{nc} == 0x0028) { # (
4952            $self->{group_depth}++;
4953            push @{$self->{ct}->{content}}, chr $self->{nc};
4954            ## Stay in the state.
4955            !!!next-input-character;
4956            redo A;
4957          } elsif ($self->{nc} == 0x007C or # |
4958                   $self->{nc} == 0x002C) { # ,
4959            !!!parse-error (type => 'empty element name'); ## TODO: type
4960            ## Stay in the state.
4961            !!!next-input-character;
4962            redo A;
4963          } elsif ($self->{nc} == 0x0029) { # )
4964            !!!parse-error (type => 'empty element name'); ## TODO: type
4965            push @{$self->{ct}->{content}}, chr $self->{nc};
4966            $self->{group_depth}--;
4967            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4968            !!!next-input-character;
4969            redo A;
4970        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4971            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4972            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4973          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4974          !!!next-input-character;          !!!next-input-character;
4975          !!!emit ($self->{ct}); # ENTITY          !!!emit ($self->{ct}); # ELEMENT
4976          redo A;          redo A;
4977        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4978          !!!parse-error (type => 'unclosed md'); ## TODO: type          !!!parse-error (type => 'unclosed md'); ## TODO: type
4979            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4980          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4981          !!!next-input-character;          !!!next-input-character;
4982          !!!emit ($self->{ct}); # ENTITY          !!!emit ($self->{ct}); # ELEMENT
4983            redo A;
4984          } else {
4985            push @{$self->{ct}->{content}}, chr $self->{nc};
4986            $self->{state} = CM_ELEMENT_NAME_STATE;
4987            !!!next-input-character;
4988            redo A;
4989          }
4990        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4991          if ($is_space->{$self->{nc}}) {
4992            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4993            !!!next-input-character;
4994            redo A;
4995          } elsif ($self->{nc} == 0x002A or # *
4996                   $self->{nc} == 0x002B or # +
4997                   $self->{nc} == 0x003F) { # ?
4998            push @{$self->{ct}->{content}}, chr $self->{nc};
4999            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5000            !!!next-input-character;
5001            redo A;
5002          } elsif ($self->{nc} == 0x007C or # |
5003                   $self->{nc} == 0x002C) { # ,
5004            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5005            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5006            !!!next-input-character;
5007            redo A;
5008          } elsif ($self->{nc} == 0x0029) { # )
5009            $self->{group_depth}--;
5010            push @{$self->{ct}->{content}}, chr $self->{nc};
5011            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5012            !!!next-input-character;
5013            redo A;
5014          } elsif ($self->{nc} == 0x003E) { # >
5015            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5016            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5017            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5018            !!!next-input-character;
5019            !!!emit ($self->{ct}); # ELEMENT
5020            redo A;
5021          } elsif ($self->{nc} == -1) {
5022            !!!parse-error (type => 'unclosed md'); ## TODO: type
5023            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5024            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5025            !!!next-input-character;
5026            !!!emit ($self->{ct}); # ELEMENT
5027            redo A;
5028          } else {
5029            $self->{ct}->{content}->[-1] .= chr $self->{nc};
5030            ## Stay in the state.
5031            !!!next-input-character;
5032            redo A;
5033          }
5034        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5035          if ($is_space->{$self->{nc}}) {
5036            ## Stay in the state.
5037            !!!next-input-character;
5038            redo A;
5039          } elsif ($self->{nc} == 0x007C or # |
5040                   $self->{nc} == 0x002C) { # ,
5041            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5042            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5043            !!!next-input-character;
5044            redo A;
5045          } elsif ($self->{nc} == 0x0029) { # )
5046            $self->{group_depth}--;
5047            push @{$self->{ct}->{content}}, chr $self->{nc};
5048            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5049            !!!next-input-character;
5050            redo A;
5051          } elsif ($self->{nc} == 0x003E) { # >
5052            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5053            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5054            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5055            !!!next-input-character;
5056            !!!emit ($self->{ct}); # ELEMENT
5057            redo A;
5058          } elsif ($self->{nc} == -1) {
5059            !!!parse-error (type => 'unclosed md'); ## TODO: type
5060            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5061            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5062            !!!next-input-character;
5063            !!!emit ($self->{ct}); # ELEMENT
5064            redo A;
5065          } else {
5066            !!!parse-error (type => 'after element name'); ## TODO: type
5067            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5068            $self->{state} = BOGUS_MD_STATE;
5069            !!!next-input-character;
5070            redo A;
5071          }
5072        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5073          if ($is_space->{$self->{nc}}) {
5074            if ($self->{group_depth}) {
5075              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5076            } else {
5077              $self->{state} = AFTER_MD_DEF_STATE;
5078            }
5079            !!!next-input-character;
5080            redo A;
5081          } elsif ($self->{nc} == 0x002A or # *
5082                   $self->{nc} == 0x002B or # +
5083                   $self->{nc} == 0x003F) { # ?
5084            push @{$self->{ct}->{content}}, chr $self->{nc};
5085            if ($self->{group_depth}) {
5086              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5087            } else {
5088              $self->{state} = AFTER_MD_DEF_STATE;
5089            }
5090            !!!next-input-character;
5091            redo A;
5092          } elsif ($self->{nc} == 0x0029) { # )
5093            if ($self->{group_depth}) {
5094              $self->{group_depth}--;
5095              push @{$self->{ct}->{content}}, chr $self->{nc};
5096              ## Stay in the state.
5097              !!!next-input-character;
5098              redo A;
5099            } else {
5100              !!!parse-error (type => 'string after md def'); ## TODO: type
5101              $self->{state} = BOGUS_MD_STATE;
5102              ## Reconsume.
5103              redo A;
5104            }
5105          } elsif ($self->{nc} == 0x003E) { # >
5106            if ($self->{group_depth}) {
5107              !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5108              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5109            }
5110            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5111            !!!next-input-character;
5112            !!!emit ($self->{ct}); # ELEMENT
5113            redo A;
5114          } elsif ($self->{nc} == -1) {
5115            !!!parse-error (type => 'unclosed md'); ## TODO: type
5116            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5117            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5118            !!!next-input-character;
5119            !!!emit ($self->{ct}); # ELEMENT
5120            redo A;
5121          } else {
5122            if ($self->{group_depth}) {
5123              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5124            } else {
5125              !!!parse-error (type => 'string after md def'); ## TODO: type
5126              $self->{state} = BOGUS_MD_STATE;
5127            }
5128            ## Reconsume.
5129            redo A;
5130          }
5131        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5132          if ($is_space->{$self->{nc}}) {
5133            ## Stay in the state.
5134            !!!next-input-character;
5135            redo A;
5136          } elsif ($self->{nc} == 0x003E) { # >
5137            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5138            !!!next-input-character;
5139            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5140            redo A;
5141          } elsif ($self->{nc} == -1) {
5142            !!!parse-error (type => 'unclosed md'); ## TODO: type
5143            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5144            !!!next-input-character;
5145            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5146          redo A;          redo A;
5147        } else {        } else {
5148          !!!parse-error (type => 'string after notation name'); ## TODO: type          !!!parse-error (type => 'string after md def'); ## TODO: type
5149          $self->{state} = BOGUS_MD_STATE;          $self->{state} = BOGUS_MD_STATE;
5150          ## Reconsume.          ## Reconsume.
5151          redo A;          redo A;

Legend:
Removed from v.1.19  
changed lines
  Added in v.1.31

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24