/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.14 by wakaba, Fri Oct 17 07:14:29 2008 UTC | revision 1.19 by wakaba, Sun Oct 19 07:19:00 2008 UTC
# Line 164  sub BEFORE_MD_NAME_STATE () { 68 } Line 164  sub BEFORE_MD_NAME_STATE () { 68 }
164  sub MD_NAME_STATE () { 69 }  sub MD_NAME_STATE () { 69 }
165  sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }  sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166  sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }  sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }
187    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }
188    sub ENTITY_VALUE_ENTITY_STATE () { 93 }
189    sub BOGUS_MD_STATE () { 94 }
190    
191  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
192  ## list and descriptions)  ## list and descriptions)
# Line 1257  sub _get_next_token ($) { Line 1280  sub _get_next_token ($) {
1280          redo A;          redo A;
1281        }        }
1282      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1283        ## XML5: "Tag attribute value double quoted state".        ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1284          ## ATTLIST attribute value double quoted state".
1285                
1286        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1287          !!!cp (95);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1288          ## XML5: "Tag attribute name before state".            !!!cp (95.1);
1289          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1290              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1291              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1292            } else {
1293              !!!cp (95);
1294              ## XML5: "Tag attribute name before state".
1295              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1296            }
1297          !!!next-input-character;          !!!next-input-character;
1298          redo A;          redo A;
1299        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1283  sub _get_next_token ($) { Line 1314  sub _get_next_token ($) {
1314          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1315            !!!cp (97);            !!!cp (97);
1316            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1317    
1318              $self->{state} = DATA_STATE;
1319              $self->{s_kwd} = '';
1320              ## reconsume
1321              !!!emit ($self->{ct}); # start tag
1322              redo A;
1323          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1325            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1292  sub _get_next_token ($) { Line 1329  sub _get_next_token ($) {
1329              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1330              !!!cp (99);              !!!cp (99);
1331            }            }
1332    
1333              $self->{state} = DATA_STATE;
1334              $self->{s_kwd} = '';
1335              ## reconsume
1336              !!!emit ($self->{ct}); # end tag
1337              redo A;
1338            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1339              ## XML5: No parse error above; not defined yet.
1340              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1341              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1342              ## Reconsume.
1343              !!!emit ($self->{ct}); # ATTLIST
1344              redo A;
1345          } else {          } else {
1346            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1347          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1348        } else {        } else {
1349            ## XML5 [ATTLIST]: Not defined yet.
1350          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1351            !!!cp (100);            !!!cp (100);
1352            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1320  sub _get_next_token ($) { Line 1364  sub _get_next_token ($) {
1364          redo A;          redo A;
1365        }        }
1366      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1367        ## XML5: "Tag attribute value single quoted state".        ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1368          ## ATTLIST attribute value single quoted state".
1369    
1370        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1371          !!!cp (101);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1372          ## XML5: "Before attribute name state" (sic).            !!!cp (101.1);
1373          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1374              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1375              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1376            } else {
1377              !!!cp (101);
1378              ## XML5: "Before attribute name state" (sic).
1379              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1380            }
1381          !!!next-input-character;          !!!next-input-character;
1382          redo A;          redo A;
1383        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1346  sub _get_next_token ($) { Line 1398  sub _get_next_token ($) {
1398          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1399            !!!cp (103);            !!!cp (103);
1400            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1401    
1402              $self->{state} = DATA_STATE;
1403              $self->{s_kwd} = '';
1404              ## reconsume
1405              !!!emit ($self->{ct}); # start tag
1406              redo A;
1407          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1408            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1409            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1355  sub _get_next_token ($) { Line 1413  sub _get_next_token ($) {
1413              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1414              !!!cp (105);              !!!cp (105);
1415            }            }
1416    
1417              $self->{state} = DATA_STATE;
1418              $self->{s_kwd} = '';
1419              ## reconsume
1420              !!!emit ($self->{ct}); # end tag
1421              redo A;
1422            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1423              ## XML5: No parse error above; not defined yet.
1424              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1425              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1426              ## Reconsume.
1427              !!!emit ($self->{ct}); # ATTLIST
1428              redo A;
1429          } else {          } else {
1430            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1431          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1432        } else {        } else {
1433            ## XML5 [ATTLIST]: Not defined yet.
1434          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1435            !!!cp (106);            !!!cp (106);
1436            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1386  sub _get_next_token ($) { Line 1451  sub _get_next_token ($) {
1451        ## XML5: "Tag attribute value unquoted state".        ## XML5: "Tag attribute value unquoted state".
1452    
1453        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1454          !!!cp (107);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1455          ## XML5: "Tag attribute name before state".            !!!cp (107.1);
1456          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1457              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1458            } else {
1459              !!!cp (107);
1460              ## XML5: "Tag attribute name before state".
1461              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1462            }
1463          !!!next-input-character;          !!!next-input-character;
1464          redo A;          redo A;
1465        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1409  sub _get_next_token ($) { Line 1480  sub _get_next_token ($) {
1480          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1481            !!!cp (109);            !!!cp (109);
1482            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1483    
1484              $self->{state} = DATA_STATE;
1485              $self->{s_kwd} = '';
1486              !!!next-input-character;
1487              !!!emit ($self->{ct}); # start tag
1488              redo A;
1489          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1490            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1491            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1418  sub _get_next_token ($) { Line 1495  sub _get_next_token ($) {
1495              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1496              !!!cp (111);              !!!cp (111);
1497            }            }
1498    
1499              $self->{state} = DATA_STATE;
1500              $self->{s_kwd} = '';
1501              !!!next-input-character;
1502              !!!emit ($self->{ct}); # end tag
1503              redo A;
1504            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1505              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1506              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1507              !!!next-input-character;
1508              !!!emit ($self->{ct}); # ATTLIST
1509              redo A;
1510          } else {          } else {
1511            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1512          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         !!!next-input-character;  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1513        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!parse-error (type => 'unclosed tag');  
1514          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1515            !!!cp (112);            !!!cp (112);
1516              !!!parse-error (type => 'unclosed tag');
1517            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1518    
1519              $self->{state} = DATA_STATE;
1520              $self->{s_kwd} = '';
1521              ## reconsume
1522              !!!emit ($self->{ct}); # start tag
1523              redo A;
1524          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1525              !!!parse-error (type => 'unclosed tag');
1526            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1527            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
1528              !!!cp (113);              !!!cp (113);
# Line 1442  sub _get_next_token ($) { Line 1531  sub _get_next_token ($) {
1531              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1532              !!!cp (114);              !!!cp (114);
1533            }            }
1534    
1535              $self->{state} = DATA_STATE;
1536              $self->{s_kwd} = '';
1537              ## reconsume
1538              !!!emit ($self->{ct}); # end tag
1539              redo A;
1540            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1541              !!!parse-error (type => 'unclosed md'); ## TODO: type
1542              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1543              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1544              ## Reconsume.
1545              !!!emit ($self->{ct}); # ATTLIST
1546              redo A;
1547          } else {          } else {
1548            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1549          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1550        } else {        } else {
1551          if ({          if ({
1552               0x0022 => 1, # "               0x0022 => 1, # "
# Line 2127  sub _get_next_token ($) { Line 2222  sub _get_next_token ($) {
2222          !!!next-input-character;          !!!next-input-character;
2223          redo A;          redo A;
2224        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2225          !!!cp (166);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2226          $self->{state} = DATA_STATE;            !!!cp (166);
2227          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2228              $self->{s_kwd} = '';
2229            } else {
2230              !!!cp (166.1);
2231              !!!parse-error (type => 'no md def'); ## TODO: type
2232              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2233            }
2234            
2235          !!!next-input-character;          !!!next-input-character;
2236            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2237          redo A;          redo A;
2238        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2239          !!!cp (167);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2240          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (167);
2241          $self->{state} = DATA_STATE;            !!!parse-error (type => 'unclosed DOCTYPE');
2242          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2243          ## reconsume            $self->{s_kwd} = '';
2244              $self->{ct}->{quirks} = 1;
2245          $self->{ct}->{quirks} = 1;          } else {
2246          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (167.12);
2247              !!!parse-error (type => 'unclosed md'); ## TODO: type
2248              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2249            }
2250            
2251            ## Reconsume.
2252            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2253          redo A;          redo A;
2254        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2255                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
# Line 2160  sub _get_next_token ($) { Line 2265  sub _get_next_token ($) {
2265          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2266          !!!next-input-character;          !!!next-input-character;
2267          redo A;          redo A;
2268        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{nc} == 0x0022 and # "
2269                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2270                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2271            !!!cp (167.21);
2272            $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2273            $self->{ct}->{value} = ''; # ENTITY
2274            !!!next-input-character;
2275            redo A;
2276          } elsif ($self->{nc} == 0x0027 and # '
2277                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2278                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2279            !!!cp (167.22);
2280            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2281            $self->{ct}->{value} = ''; # ENTITY
2282            !!!next-input-character;
2283            redo A;
2284          } elsif ($self->{is_xml} and
2285                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2286                   $self->{nc} == 0x005B) { # [
2287          !!!cp (167.3);          !!!cp (167.3);
2288          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2289          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
# Line 2169  sub _get_next_token ($) { Line 2292  sub _get_next_token ($) {
2292          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2293          redo A;          redo A;
2294        } else {        } else {
2295          !!!cp (180);          !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2296          !!!parse-error (type => 'string after DOCTYPE name');  
2297          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2298              !!!cp (180);
2299              $self->{ct}->{quirks} = 1;
2300              $self->{state} = BOGUS_DOCTYPE_STATE;
2301            } else {
2302              !!!cp (180.1);
2303              $self->{state} = BOGUS_MD_STATE;
2304            }
2305    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
2306          !!!next-input-character;          !!!next-input-character;
2307          redo A;          redo A;
2308        }        }
# Line 2215  sub _get_next_token ($) { Line 2344  sub _get_next_token ($) {
2344          !!!next-input-character;          !!!next-input-character;
2345          redo A;          redo A;
2346        } else {        } else {
2347          !!!cp (169);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2348                          line => $self->{line_prev},                          line => $self->{line_prev},
2349                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2350          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2351              !!!cp (169);
2352          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2353              $self->{state} = BOGUS_DOCTYPE_STATE;
2354            } else {
2355              !!!cp (169.1);
2356              $self->{state} = BOGUS_MD_STATE;
2357            }
2358          ## Reconsume.          ## Reconsume.
2359          redo A;          redo A;
2360        }        }
# Line 2263  sub _get_next_token ($) { Line 2396  sub _get_next_token ($) {
2396          !!!next-input-character;          !!!next-input-character;
2397          redo A;          redo A;
2398        } else {        } else {
2399          !!!cp (172);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2400                          line => $self->{line_prev},                          line => $self->{line_prev},
2401                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2402          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2403              !!!cp (172);
2404          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2405              $self->{state} = BOGUS_DOCTYPE_STATE;
2406            } else {
2407              !!!cp (172.1);
2408              $self->{state} = BOGUS_MD_STATE;
2409            }
2410          ## Reconsume.          ## Reconsume.
2411          redo A;          redo A;
2412        }        }
# Line 2292  sub _get_next_token ($) { Line 2429  sub _get_next_token ($) {
2429          !!!next-input-character;          !!!next-input-character;
2430          redo A;          redo A;
2431        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
         !!!cp (184);  
2432          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2433            
2434          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2435          $self->{s_kwd} = '';            !!!cp (184);
2436              $self->{state} = DATA_STATE;
2437              $self->{s_kwd} = '';
2438              $self->{ct}->{quirks} = 1;
2439            } else {
2440              !!!cp (184.1);
2441              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2442            }
2443            
2444          !!!next-input-character;          !!!next-input-character;
2445            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2446          redo A;          redo A;
2447        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2448          !!!cp (185);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2449          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (185);
2450              !!!parse-error (type => 'unclosed DOCTYPE');
2451          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2452          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2453              $self->{ct}->{quirks} = 1;
2454            } else {
2455              !!!cp (185.1);
2456              !!!parse-error (type => 'unclosed md'); ## TODO: type
2457              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2458            }
2459            
2460          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
2461          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2462          redo A;          redo A;
2463        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2464                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2465                   $self->{nc} == 0x005B) { # [
2466          !!!cp (186.1);          !!!cp (186.1);
2467          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2468          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
# Line 2325  sub _get_next_token ($) { Line 2472  sub _get_next_token ($) {
2472          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2473          redo A;          redo A;
2474        } else {        } else {
         !!!cp (186);  
2475          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
2476    
2477          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2478              !!!cp (186);
2479              $self->{ct}->{quirks} = 1;
2480              $self->{state} = BOGUS_DOCTYPE_STATE;
2481            } else {
2482              !!!cp (186.2);
2483              $self->{state} = BOGUS_MD_STATE;
2484            }
2485    
2486          !!!next-input-character;          !!!next-input-character;
2487          redo A;          redo A;
2488        }        }
# Line 2340  sub _get_next_token ($) { Line 2493  sub _get_next_token ($) {
2493          !!!next-input-character;          !!!next-input-character;
2494          redo A;          redo A;
2495        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (188);  
2496          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2497    
2498          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2499          $self->{s_kwd} = '';            !!!cp (188);
2500          !!!next-input-character;            $self->{state} = DATA_STATE;
2501              $self->{s_kwd} = '';
2502          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2503          !!!emit ($self->{ct}); # DOCTYPE          } else {
2504              !!!cp (188.1);
2505              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2506            }
2507    
2508            !!!next-input-character;
2509            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2510          redo A;          redo A;
2511        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (189);  
2512          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2513    
2514          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2515          $self->{s_kwd} = '';            !!!cp (189);
2516          ## reconsume            $self->{state} = DATA_STATE;
2517              $self->{s_kwd} = '';
2518          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2519            } else {
2520              !!!cp (189.1);
2521              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2522            }
2523            
2524            ## Reconsume.
2525          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2526          redo A;          redo A;
2527        } else {        } else {
2528          !!!cp (190);          !!!cp (190);
2529          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2530          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
2531                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2532    
# Line 2381  sub _get_next_token ($) { Line 2541  sub _get_next_token ($) {
2541          !!!next-input-character;          !!!next-input-character;
2542          redo A;          redo A;
2543        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (192);  
2544          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2545    
2546          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2547          $self->{s_kwd} = '';            !!!cp (192);
2548          !!!next-input-character;            $self->{state} = DATA_STATE;
2549              $self->{s_kwd} = '';
2550          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2551          !!!emit ($self->{ct}); # DOCTYPE          } else {
2552              !!!cp (192.1);
2553              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2554            }
2555    
2556            !!!next-input-character;
2557            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2558          redo A;          redo A;
2559        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (193);  
2560          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2561    
2562          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2563          $self->{s_kwd} = '';            !!!cp (193);
2564              $self->{state} = DATA_STATE;
2565              $self->{s_kwd} = '';
2566              $self->{ct}->{quirks} = 1;
2567            } else {
2568              !!!cp (193.1);
2569              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2570            }
2571          
2572          ## reconsume          ## reconsume
2573            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2574          redo A;          redo A;
2575        } else {        } else {
2576          !!!cp (194);          !!!cp (194);
2577          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2578          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
2579                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2580    
# Line 2423  sub _get_next_token ($) { Line 2590  sub _get_next_token ($) {
2590          redo A;          redo A;
2591        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
2592          !!!cp (196);          !!!cp (196);
2593          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2594          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2595          !!!next-input-character;          !!!next-input-character;
2596          redo A;          redo A;
2597        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
2598          !!!cp (197);          !!!cp (197);
2599          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2600          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2601          !!!next-input-character;          !!!next-input-character;
2602          redo A;          redo A;
2603        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2604          if ($self->{is_xml}) {          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2605            !!!cp (198.1);            if ($self->{is_xml}) {
2606            !!!parse-error (type => 'no SYSTEM literal');              !!!cp (198.1);
2607                !!!parse-error (type => 'no SYSTEM literal');
2608              } else {
2609                !!!cp (198);
2610              }
2611              $self->{state} = DATA_STATE;
2612              $self->{s_kwd} = '';
2613          } else {          } else {
2614            !!!cp (198);            if ($self->{ct}->{type} == NOTATION_TOKEN) {
2615                !!!cp (198.2);
2616              } else {
2617                !!!cp (198.3);
2618                !!!parse-error (type => 'no SYSTEM literal');            
2619              }
2620              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2621          }          }
2622          $self->{state} = DATA_STATE;          
         $self->{s_kwd} = '';  
2623          !!!next-input-character;          !!!next-input-character;
2624            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2625          redo A;          redo A;
2626        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2627          !!!cp (199);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2628          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (199);
2629              !!!parse-error (type => 'unclosed DOCTYPE');
2630          $self->{state} = DATA_STATE;            
2631          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2632              $self->{s_kwd} = '';
2633              $self->{ct}->{quirks} = 1;
2634            } else {
2635              !!!parse-error (type => 'unclosed md'); ## TODO: type
2636              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2637            }
2638            
2639          ## reconsume          ## reconsume
2640            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2641          redo A;          redo A;
2642        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2643                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2644                   $self->{nc} == 0x005B) { # [
2645          !!!cp (200.1);          !!!cp (200.1);
2646          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2647          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
# Line 2469  sub _get_next_token ($) { Line 2651  sub _get_next_token ($) {
2651          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2652          redo A;          redo A;
2653        } else {        } else {
         !!!cp (200);  
2654          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
2655    
2656          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2657              !!!cp (200);
2658              $self->{ct}->{quirks} = 1;
2659              $self->{state} = BOGUS_DOCTYPE_STATE;
2660            } else {
2661              !!!cp (200.2);
2662              $self->{state} = BOGUS_MD_STATE;
2663            }
2664    
2665          !!!next-input-character;          !!!next-input-character;
2666          redo A;          redo A;
2667        }        }
# Line 2496  sub _get_next_token ($) { Line 2684  sub _get_next_token ($) {
2684          !!!next-input-character;          !!!next-input-character;
2685          redo A;          redo A;
2686        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (204);  
2687          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
2688          !!!next-input-character;          !!!next-input-character;
2689    
2690          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2691          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (204);
2692              $self->{state} = DATA_STATE;
2693              $self->{s_kwd} = '';
2694              $self->{ct}->{quirks} = 1;
2695            } else {
2696              !!!cp (204.1);
2697              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2698            }
2699    
2700            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2701          redo A;          redo A;
2702        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2703          !!!cp (205);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2704          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (205);
2705              !!!parse-error (type => 'unclosed DOCTYPE');
2706          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2707          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2708              $self->{ct}->{quirks} = 1;
2709            } else {
2710              !!!cp (205.1);
2711              !!!parse-error (type => 'unclosed md'); ## TODO: type
2712              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2713            }
2714            
2715          ## reconsume          ## reconsume
2716            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2717          redo A;          redo A;
2718        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2719                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2720                   $self->{nc} == 0x005B) { # [
2721          !!!cp (206.1);          !!!cp (206.1);
2722          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2723    
# Line 2529  sub _get_next_token ($) { Line 2728  sub _get_next_token ($) {
2728          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2729          redo A;          redo A;
2730        } else {        } else {
         !!!cp (206);  
2731          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
2732    
2733          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2734              !!!cp (206);          
2735              $self->{ct}->{quirks} = 1;
2736              $self->{state} = BOGUS_DOCTYPE_STATE;
2737            } else {
2738              !!!cp (206.2);
2739              $self->{state} = BOGUS_MD_STATE;
2740            }
2741    
2742          !!!next-input-character;          !!!next-input-character;
2743          redo A;          redo A;
2744        }        }
# Line 2544  sub _get_next_token ($) { Line 2749  sub _get_next_token ($) {
2749          !!!next-input-character;          !!!next-input-character;
2750          redo A;          redo A;
2751        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
         !!!cp (208);  
2752          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2753    
2754          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2755          $self->{s_kwd} = '';            !!!cp (208);
2756              $self->{state} = DATA_STATE;
2757              $self->{s_kwd} = '';
2758              $self->{ct}->{quirks} = 1;
2759            } else {
2760              !!!cp (208.1);
2761              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2762            }
2763            
2764          !!!next-input-character;          !!!next-input-character;
2765            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2766          redo A;          redo A;
2767        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (209);  
2768          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2769    
2770          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2771          $self->{s_kwd} = '';            !!!cp (209);
2772              $self->{state} = DATA_STATE;
2773              $self->{s_kwd} = '';
2774              $self->{ct}->{quirks} = 1;
2775            } else {
2776              !!!cp (209.1);
2777              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2778            }
2779            
2780          ## reconsume          ## reconsume
2781            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2782          redo A;          redo A;
2783        } else {        } else {
2784          !!!cp (210);          !!!cp (210);
2785          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2786          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
2787                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2788    
# Line 2597  sub _get_next_token ($) { Line 2809  sub _get_next_token ($) {
2809    
2810          redo A;          redo A;
2811        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (213);  
2812          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2813    
2814          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2815          $self->{s_kwd} = '';            !!!cp (213);
2816          ## reconsume            $self->{state} = DATA_STATE;
2817              $self->{s_kwd} = '';
2818          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2819          !!!emit ($self->{ct}); # DOCTYPE          } else {
2820              !!!cp (213.1);
2821              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822            }
2823    
2824            ## reconsume
2825            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2826          redo A;          redo A;
2827        } else {        } else {
2828          !!!cp (214);          !!!cp (214);
2829          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2830          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
2831                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2832    
# Line 2621  sub _get_next_token ($) { Line 2836  sub _get_next_token ($) {
2836        }        }
2837      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2838        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2839          !!!cp (215);          if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2840          ## Stay in the state            !!!cp (215.1);
2841              $self->{state} = BEFORE_NDATA_STATE;
2842            } else {
2843              !!!cp (215);
2844              ## Stay in the state
2845            }
2846          !!!next-input-character;          !!!next-input-character;
2847          redo A;          redo A;
2848        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2849          !!!cp (216);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2850          $self->{state} = DATA_STATE;            !!!cp (216);
2851          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2852          !!!next-input-character;            $self->{s_kwd} = '';
2853            } else {
2854          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (216.1);
2855              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856            }
2857    
2858            !!!next-input-character;
2859            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2860            redo A;
2861          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2862                   ($self->{nc} == 0x004E or # N
2863                    $self->{nc} == 0x006E)) { # n
2864            !!!cp (216.2);
2865            !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2866            $self->{state} = NDATA_STATE;
2867            $self->{kwd} = chr $self->{nc};
2868            !!!next-input-character;
2869          redo A;          redo A;
2870        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2871          !!!cp (217);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2872          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (217);
2873          $self->{state} = DATA_STATE;            !!!parse-error (type => 'unclosed DOCTYPE');
2874          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2875          ## reconsume            $self->{s_kwd} = '';
2876              $self->{ct}->{quirks} = 1;
2877          $self->{ct}->{quirks} = 1;          } else {
2878          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (217.1);
2879              !!!parse-error (type => 'unclosed md'); ## TODO: type
2880              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2881            }
2882    
2883            ## reconsume
2884            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2885          redo A;          redo A;
2886        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2887                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2888                   $self->{nc} == 0x005B) { # [
2889          !!!cp (218.1);          !!!cp (218.1);
2890          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2891          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
# Line 2654  sub _get_next_token ($) { Line 2894  sub _get_next_token ($) {
2894          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2895          redo A;          redo A;
2896        } else {        } else {
         !!!cp (218);  
2897          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
         #$self->{ct}->{quirks} = 1;  
2898    
2899          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2900              !!!cp (218);
2901              #$self->{ct}->{quirks} = 1;
2902              $self->{state} = BOGUS_DOCTYPE_STATE;
2903            } else {
2904              !!!cp (218.2);
2905              $self->{state} = BOGUS_MD_STATE;
2906            }
2907    
2908            !!!next-input-character;
2909            redo A;
2910          }
2911        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2912          if ($is_space->{$self->{nc}}) {
2913            !!!cp (218.3);
2914            ## Stay in the state.
2915            !!!next-input-character;
2916            redo A;
2917          } elsif ($self->{nc} == 0x003E) { # >
2918            !!!cp (218.4);
2919            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920            !!!next-input-character;
2921            !!!emit ($self->{ct}); # ENTITY
2922            redo A;
2923          } elsif ($self->{nc} == 0x004E or # N
2924                   $self->{nc} == 0x006E) { # n
2925            !!!cp (218.5);
2926            $self->{state} = NDATA_STATE;
2927            $self->{kwd} = chr $self->{nc};
2928            !!!next-input-character;
2929            redo A;
2930          } elsif ($self->{nc} == -1) {
2931            !!!cp (218.6);
2932            !!!parse-error (type => 'unclosed md'); ## TODO: type
2933            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2934            ## reconsume
2935            !!!emit ($self->{ct}); # ENTITY
2936            redo A;
2937          } else {
2938            !!!cp (218.7);
2939            !!!parse-error (type => 'string after SYSTEM literal');
2940            $self->{state} = BOGUS_MD_STATE;
2941          !!!next-input-character;          !!!next-input-character;
2942          redo A;          redo A;
2943        }        }
# Line 3423  sub _get_next_token ($) { Line 3702  sub _get_next_token ($) {
3702          $self->{state} = MD_HYPHEN_STATE;          $self->{state} = MD_HYPHEN_STATE;
3703          !!!next-input-character;          !!!next-input-character;
3704          redo A;          redo A;
3705        } elsif ($self->{nc} == 0x0045) { # E        } elsif ($self->{nc} == 0x0045 or # E
3706                   $self->{nc} == 0x0065) { # e
3707          $self->{state} = MD_E_STATE;          $self->{state} = MD_E_STATE;
3708          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3709          !!!next-input-character;          !!!next-input-character;
3710          redo A;          redo A;
3711        } elsif ($self->{nc} == 0x0041) { # A        } elsif ($self->{nc} == 0x0041 or # A
3712                   $self->{nc} == 0x0061) { # a
3713          $self->{state} = MD_ATTLIST_STATE;          $self->{state} = MD_ATTLIST_STATE;
3714          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3715          !!!next-input-character;          !!!next-input-character;
3716          redo A;          redo A;
3717        } elsif ($self->{nc} == 0x004E) { # N        } elsif ($self->{nc} == 0x004E or # N
3718                   $self->{nc} == 0x006E) { # n
3719          $self->{state} = MD_NOTATION_STATE;          $self->{state} = MD_NOTATION_STATE;
3720          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3721          !!!next-input-character;          !!!next-input-character;
# Line 3451  sub _get_next_token ($) { Line 3733  sub _get_next_token ($) {
3733        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3734        redo A;        redo A;
3735      } elsif ($self->{state} == MD_E_STATE) {      } elsif ($self->{state} == MD_E_STATE) {
3736        if ($self->{nc} == 0x004E) { # N        if ($self->{nc} == 0x004E or # N
3737              $self->{nc} == 0x006E) { # n
3738          $self->{state} = MD_ENTITY_STATE;          $self->{state} = MD_ENTITY_STATE;
3739          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3740          !!!next-input-character;          !!!next-input-character;
3741          redo A;          redo A;
3742        } elsif ($self->{nc} == 0x004C) { # L        } elsif ($self->{nc} == 0x004C or # L
3743                   $self->{nc} == 0x006C) { # l
3744          ## XML5: <!ELEMENT> not supported.          ## XML5: <!ELEMENT> not supported.
3745          $self->{state} = MD_ELEMENT_STATE;          $self->{state} = MD_ELEMENT_STATE;
3746          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
# Line 3474  sub _get_next_token ($) { Line 3758  sub _get_next_token ($) {
3758          redo A;          redo A;
3759        }        }
3760      } elsif ($self->{state} == MD_ENTITY_STATE) {      } elsif ($self->{state} == MD_ENTITY_STATE) {
3761        if ($self->{nc} == {        if ($self->{nc} == [
3762              'EN' => 0x0054, # T              undef,
3763              'ENT' => 0x0049, # I              undef,
3764              'ENTI' => 0x0054, # T              0x0054, # T
3765            }->{$self->{kwd}}) {              0x0049, # I
3766                0x0054, # T
3767              ]->[length $self->{kwd}] or
3768              $self->{nc} == [
3769                undef,
3770                undef,
3771                0x0074, # t
3772                0x0069, # i
3773                0x0074, # t
3774              ]->[length $self->{kwd}]) {
3775          ## Stay in the state.          ## Stay in the state.
3776          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3777          !!!next-input-character;          !!!next-input-character;
3778          redo A;          redo A;
3779        } elsif ($self->{kwd} eq 'ENTIT' and        } elsif ((length $self->{kwd}) == 5 and
3780                 $self->{nc} == 0x0059) { # Y                 ($self->{nc} == 0x0059 or # Y
3781          $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',                  $self->{nc} == 0x0079)) { # y
3782            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3783              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3784                              text => 'ENTITY',
3785                              line => $self->{line_prev},
3786                              column => $self->{column_prev} - 4);
3787            }
3788            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3789                         line => $self->{line_prev},                         line => $self->{line_prev},
3790                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
3791          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
# Line 3503  sub _get_next_token ($) { Line 3803  sub _get_next_token ($) {
3803          redo A;          redo A;
3804        }        }
3805      } elsif ($self->{state} == MD_ELEMENT_STATE) {      } elsif ($self->{state} == MD_ELEMENT_STATE) {
3806        if ($self->{nc} == {        if ($self->{nc} == [
3807              'EL' => 0x0045, # E             undef,
3808              'ELE' => 0x004D, # M             undef,
3809              'ELEM' => 0x0045, # E             0x0045, # E
3810              'ELEME' => 0x004E, # N             0x004D, # M
3811            }->{$self->{kwd}}) {             0x0045, # E
3812               0x004E, # N
3813              ]->[length $self->{kwd}] or
3814              $self->{nc} == [
3815               undef,
3816               undef,
3817               0x0065, # e
3818               0x006D, # m
3819               0x0065, # e
3820               0x006E, # n
3821              ]->[length $self->{kwd}]) {
3822          ## Stay in the state.          ## Stay in the state.
3823          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3824          !!!next-input-character;          !!!next-input-character;
3825          redo A;          redo A;
3826        } elsif ($self->{kwd} eq 'ELEMEN' and        } elsif ((length $self->{kwd}) == 6 and
3827                 $self->{nc} == 0x0054) { # T                 ($self->{nc} == 0x0054 or # T
3828                    $self->{nc} == 0x0074)) { # t
3829            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3830              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3831                              text => 'ELEMENT',
3832                              line => $self->{line_prev},
3833                              column => $self->{column_prev} - 5);
3834            }
3835          $self->{ct} = {type => ELEMENT_TOKEN, name => '',          $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3836                         line => $self->{line_prev},                         line => $self->{line_prev},
3837                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
# Line 3533  sub _get_next_token ($) { Line 3850  sub _get_next_token ($) {
3850          redo A;          redo A;
3851        }        }
3852      } elsif ($self->{state} == MD_ATTLIST_STATE) {      } elsif ($self->{state} == MD_ATTLIST_STATE) {
3853        if ($self->{nc} == {        if ($self->{nc} == [
3854              'A' => 0x0054, # T             undef,
3855              'AT' => 0x0054, # T             0x0054, # T
3856              'ATT' => 0x004C, # L             0x0054, # T
3857              'ATTL' => 0x0049, # I             0x004C, # L
3858              'ATTLI' => 0x0053, # S             0x0049, # I
3859            }->{$self->{kwd}}) {             0x0053, # S
3860              ]->[length $self->{kwd}] or
3861              $self->{nc} == [
3862               undef,
3863               0x0074, # t
3864               0x0074, # t
3865               0x006C, # l
3866               0x0069, # i
3867               0x0073, # s
3868              ]->[length $self->{kwd}]) {
3869          ## Stay in the state.          ## Stay in the state.
3870          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3871          !!!next-input-character;          !!!next-input-character;
3872          redo A;          redo A;
3873        } elsif ($self->{kwd} eq 'ATTLIS' and        } elsif ((length $self->{kwd}) == 6 and
3874                 $self->{nc} == 0x0054) { # T                 ($self->{nc} == 0x0054 or # T
3875                    $self->{nc} == 0x0074)) { # t
3876            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3877              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3878                              text => 'ATTLIST',
3879                              line => $self->{line_prev},
3880                              column => $self->{column_prev} - 5);
3881            }
3882          $self->{ct} = {type => ATTLIST_TOKEN, name => '',          $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3883                           attrdefs => [],
3884                         line => $self->{line_prev},                         line => $self->{line_prev},
3885                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
3886          $self->{state} = DOCTYPE_MD_STATE;          $self->{state} = DOCTYPE_MD_STATE;
# Line 3564  sub _get_next_token ($) { Line 3898  sub _get_next_token ($) {
3898          redo A;          redo A;
3899        }        }
3900      } elsif ($self->{state} == MD_NOTATION_STATE) {      } elsif ($self->{state} == MD_NOTATION_STATE) {
3901        if ($self->{nc} == {        if ($self->{nc} == [
3902              'N' => 0x004F, # O             undef,
3903              'NO' => 0x0054, # T             0x004F, # O
3904              'NOT' => 0x0041, # A             0x0054, # T
3905              'NOTA' => 0x0054, # T             0x0041, # A
3906              'NOTAT' => 0x0049, # I             0x0054, # T
3907              'NOTATI' => 0x004F, # O             0x0049, # I
3908            }->{$self->{kwd}}) {             0x004F, # O
3909              ]->[length $self->{kwd}] or
3910              $self->{nc} == [
3911               undef,
3912               0x006F, # o
3913               0x0074, # t
3914               0x0061, # a
3915               0x0074, # t
3916               0x0069, # i
3917               0x006F, # o
3918              ]->[length $self->{kwd}]) {
3919          ## Stay in the state.          ## Stay in the state.
3920          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3921          !!!next-input-character;          !!!next-input-character;
3922          redo A;          redo A;
3923        } elsif ($self->{kwd} eq 'NOTATIO' and        } elsif ((length $self->{kwd}) == 7 and
3924                 $self->{nc} == 0x004E) { # N                 ($self->{nc} == 0x004E or # N
3925                    $self->{nc} == 0x006E)) { # n
3926            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3927              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3928                              text => 'NOTATION',
3929                              line => $self->{line_prev},
3930                              column => $self->{column_prev} - 6);
3931            }
3932          $self->{ct} = {type => NOTATION_TOKEN, name => '',          $self->{ct} = {type => NOTATION_TOKEN, name => '',
3933                         line => $self->{line_prev},                         line => $self->{line_prev},
3934                         column => $self->{column_prev} - 6};                         column => $self->{column_prev} - 6};
# Line 3689  sub _get_next_token ($) { Line 4040  sub _get_next_token ($) {
4040        ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".        ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4041                
4042        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4043          ## TODO:          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4044          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4045            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4046              ## TODO: ...
4047              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4048            } else { # ENTITY/NOTATION
4049              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4050            }
4051          !!!next-input-character;          !!!next-input-character;
4052          redo A;          redo A;
4053        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4054          if ($self->{ct}->{type} == ATTLIST_TOKEN) {          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4055            #            #
4056          } else {          } else {
4057            !!!parse-error (type => 'no md body'); ## TODO: type            !!!parse-error (type => 'no md def'); ## TODO: type
4058          }          }
4059          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4060          !!!next-input-character;          !!!next-input-character;
# Line 3731  sub _get_next_token ($) { Line 4088  sub _get_next_token ($) {
4088          ## XML5: No parse error.          ## XML5: No parse error.
4089          !!!parse-error (type => 'unclosed md'); ## TODO: type          !!!parse-error (type => 'unclosed md'); ## TODO: type
4090          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4091            !!!emit ($self->{ct});
4092            redo A;
4093          } else {
4094            ## XML5: Not defined yet.
4095            $self->{ca} = {name => chr ($self->{nc}), # attrdef
4096                           tokens => [],
4097                           line => $self->{line}, column => $self->{column}};
4098            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4099            !!!next-input-character;
4100            redo A;
4101          }
4102        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4103          if ($is_space->{$self->{nc}}) {
4104            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4105            !!!next-input-character;
4106            redo A;
4107          } elsif ($self->{nc} == 0x003E) { # >
4108            ## XML5: Same as "anything else".
4109            !!!parse-error (type => 'no attr type'); ## TODO: type
4110            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4111            !!!next-input-character;
4112            !!!emit ($self->{ct}); # ATTLIST
4113            redo A;
4114          } elsif ($self->{nc} == 0x0028) { # (
4115            ## XML5: Same as "anything else".
4116            !!!parse-error (type => 'no space before paren'); ## TODO: type
4117            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4118            !!!next-input-character;
4119            redo A;
4120          } elsif ($self->{nc} == -1) {
4121            ## XML5: No parse error.
4122            !!!parse-error (type => 'unclosed md'); ## TODO: type
4123            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4124            !!!next-input-character;
4125            !!!emit ($self->{ct}); # ATTLIST
4126            redo A;
4127          } else {
4128            ## XML5: Not defined yet.
4129            $self->{ca}->{name} .= chr $self->{nc};
4130            ## Stay in the state.
4131            !!!next-input-character;
4132            redo A;
4133          }
4134        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4135          if ($is_space->{$self->{nc}}) {
4136            ## Stay in the state.
4137            !!!next-input-character;
4138            redo A;
4139          } elsif ($self->{nc} == 0x003E) { # >
4140            ## XML5: Same as "anything else".
4141            !!!parse-error (type => 'no attr type'); ## TODO: type
4142            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4143            !!!next-input-character;
4144            !!!emit ($self->{ct}); # ATTLIST
4145            redo A;
4146          } elsif ($self->{nc} == 0x0028) { # (
4147            ## XML5: Same as "anything else".
4148            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4149            !!!next-input-character;
4150            redo A;
4151          } elsif ($self->{nc} == -1) {
4152            ## XML5: No parse error.
4153            !!!parse-error (type => 'unclosed md'); ## TODO: type
4154            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4155            !!!next-input-character;
4156            !!!emit ($self->{ct});
4157            redo A;
4158          } else {
4159            ## XML5: Not defined yet.
4160            $self->{ca}->{type} = chr $self->{nc};
4161            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4162            !!!next-input-character;
4163            redo A;
4164          }
4165        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4166          if ($is_space->{$self->{nc}}) {
4167            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4168            !!!next-input-character;
4169            redo A;
4170          } elsif ($self->{nc} == 0x0023) { # #
4171            ## XML5: Same as "anything else".
4172            !!!parse-error (type => 'no space before default value'); ## TODO: type
4173            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4174            !!!next-input-character;
4175            redo A;
4176          } elsif ($self->{nc} == 0x0022) { # "
4177            ## XML5: Same as "anything else".
4178            !!!parse-error (type => 'no space before default value'); ## TODO: type
4179            $self->{ca}->{value} = '';
4180            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4181            !!!next-input-character;
4182            redo A;
4183          } elsif ($self->{nc} == 0x0027) { # '
4184            ## XML5: Same as "anything else".
4185            !!!parse-error (type => 'no space before default value'); ## TODO: type
4186            $self->{ca}->{value} = '';
4187            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4188            !!!next-input-character;
4189            redo A;
4190          } elsif ($self->{nc} == 0x003E) { # >
4191            ## XML5: Same as "anything else".
4192            !!!parse-error (type => 'no attr default'); ## TODO: type
4193            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4194            !!!next-input-character;
4195            !!!emit ($self->{ct}); # ATTLIST
4196            redo A;
4197          } elsif ($self->{nc} == 0x0028) { # (
4198            ## XML5: Same as "anything else".
4199            !!!parse-error (type => 'no space before paren'); ## TODO: type
4200            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4201            !!!next-input-character;
4202            redo A;
4203          } elsif ($self->{nc} == -1) {
4204            ## XML5: No parse error.
4205            !!!parse-error (type => 'unclosed md'); ## TODO: type
4206            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4207            !!!next-input-character;
4208            !!!emit ($self->{ct});
4209            redo A;
4210          } else {
4211            ## XML5: Not defined yet.
4212            $self->{ca}->{type} .= chr $self->{nc};
4213            ## Stay in the state.
4214            !!!next-input-character;
4215            redo A;
4216          }
4217        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4218          if ($is_space->{$self->{nc}}) {
4219            ## Stay in the state.
4220            !!!next-input-character;
4221            redo A;
4222          } elsif ($self->{nc} == 0x0028) { # (
4223            ## XML5: Same as "anything else".
4224            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4225            !!!next-input-character;
4226            redo A;
4227          } elsif ($self->{nc} == 0x0023) { # #
4228            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4229            !!!next-input-character;
4230            redo A;
4231          } elsif ($self->{nc} == 0x0022) { # "
4232            ## XML5: Same as "anything else".
4233            $self->{ca}->{value} = '';
4234            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4235            !!!next-input-character;
4236            redo A;
4237          } elsif ($self->{nc} == 0x0027) { # '
4238            ## XML5: Same as "anything else".
4239            $self->{ca}->{value} = '';
4240            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4241            !!!next-input-character;
4242            redo A;
4243          } elsif ($self->{nc} == 0x003E) { # >
4244            ## XML5: Same as "anything else".
4245            !!!parse-error (type => 'no attr default'); ## TODO: type
4246            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247            !!!next-input-character;
4248            !!!emit ($self->{ct}); # ATTLIST
4249            redo A;
4250          } elsif ($self->{nc} == -1) {
4251            ## XML5: No parse error.
4252            !!!parse-error (type => 'unclosed md'); ## TODO: type
4253            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4254            !!!next-input-character;
4255            !!!emit ($self->{ct});
4256            redo A;
4257          } else {
4258            ## XML5: Switch to the "DOCTYPE bogus comment state".
4259            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4260            $self->{ca}->{value} = '';
4261            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4262            ## Reconsume.
4263            redo A;
4264          }
4265        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4266          if ($is_space->{$self->{nc}}) {
4267            ## Stay in the state.
4268            !!!next-input-character;
4269            redo A;
4270          } elsif ($self->{nc} == 0x007C) { # |
4271            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4272            ## Stay in the state.
4273            !!!next-input-character;
4274            redo A;
4275          } elsif ($self->{nc} == 0x0029) { # )
4276            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4277            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4278            !!!next-input-character;
4279            redo A;
4280          } elsif ($self->{nc} == 0x003E) { # >
4281            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4282            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4283            !!!next-input-character;
4284            !!!emit ($self->{ct}); # ATTLIST
4285            redo A;
4286          } elsif ($self->{nc} == -1) {
4287            ## XML5: No parse error.
4288            !!!parse-error (type => 'unclosed md'); ## TODO: type
4289            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4290            !!!next-input-character;
4291            !!!emit ($self->{ct});
4292            redo A;
4293          } else {
4294            push @{$self->{ca}->{tokens}}, chr $self->{nc};
4295            $self->{state} = ALLOWED_TOKEN_STATE;
4296            !!!next-input-character;
4297            redo A;
4298          }
4299        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4300          if ($is_space->{$self->{nc}}) {
4301            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4302            !!!next-input-character;
4303            redo A;
4304          } elsif ($self->{nc} == 0x007C) { # |
4305            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4306            !!!next-input-character;
4307            redo A;
4308          } elsif ($self->{nc} == 0x0029) { # )
4309            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4310            !!!next-input-character;
4311            redo A;
4312          } elsif ($self->{nc} == 0x003E) { # >
4313            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4314            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4315            !!!next-input-character;
4316            !!!emit ($self->{ct}); # ATTLIST
4317            redo A;
4318          } elsif ($self->{nc} == -1) {
4319            ## XML5: No parse error.
4320            !!!parse-error (type => 'unclosed md'); ## TODO: type
4321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4322            !!!next-input-character;
4323            !!!emit ($self->{ct});
4324            redo A;
4325          } else {
4326            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4327            ## Stay in the state.
4328            !!!next-input-character;
4329            redo A;
4330          }
4331        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4332          if ($is_space->{$self->{nc}}) {
4333            ## Stay in the state.
4334            !!!next-input-character;
4335            redo A;
4336          } elsif ($self->{nc} == 0x007C) { # |
4337            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4338            !!!next-input-character;
4339            redo A;
4340          } elsif ($self->{nc} == 0x0029) { # )
4341            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4342            !!!next-input-character;
4343            redo A;
4344          } elsif ($self->{nc} == 0x003E) { # >
4345            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4346            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4347            !!!next-input-character;
4348            !!!emit ($self->{ct}); # ATTLIST
4349            redo A;
4350          } elsif ($self->{nc} == -1) {
4351            ## XML5: No parse error.
4352            !!!parse-error (type => 'unclosed md'); ## TODO: type
4353            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4354            !!!next-input-character;
4355            !!!emit ($self->{ct});
4356            redo A;
4357          } else {
4358            !!!parse-error (type => 'space in allowed token', ## TODO: type
4359                            line => $self->{line_prev},
4360                            column => $self->{column_prev});
4361            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4362            $self->{state} = ALLOWED_TOKEN_STATE;
4363            !!!next-input-character;
4364            redo A;
4365          }
4366        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4367          if ($is_space->{$self->{nc}}) {
4368            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4369            !!!next-input-character;
4370            redo A;
4371          } elsif ($self->{nc} == 0x0023) { # #
4372            !!!parse-error (type => 'no space before default value'); ## TODO: type
4373            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4374            !!!next-input-character;
4375            redo A;
4376          } elsif ($self->{nc} == 0x0022) { # "
4377            !!!parse-error (type => 'no space before default value'); ## TODO: type
4378            $self->{ca}->{value} = '';
4379            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4380            !!!next-input-character;
4381            redo A;
4382          } elsif ($self->{nc} == 0x0027) { # '
4383            !!!parse-error (type => 'no space before default value'); ## TODO: type
4384            $self->{ca}->{value} = '';
4385            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4386            !!!next-input-character;
4387            redo A;
4388          } elsif ($self->{nc} == 0x003E) { # >
4389            !!!parse-error (type => 'no attr default'); ## TODO: type
4390            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4391            !!!next-input-character;
4392            !!!emit ($self->{ct}); # ATTLIST
4393            redo A;
4394          } elsif ($self->{nc} == -1) {
4395            !!!parse-error (type => 'unclosed md'); ## TODO: type
4396            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4397            !!!next-input-character;
4398            !!!emit ($self->{ct});
4399            redo A;
4400          } else {
4401            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4402            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4403            ## Reconsume.
4404            redo A;
4405          }
4406        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4407          if ($is_space->{$self->{nc}}) {
4408            ## Stay in the state.
4409            !!!next-input-character;
4410            redo A;
4411          } elsif ($self->{nc} == 0x0023) { # #
4412            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4413            !!!next-input-character;
4414            redo A;
4415          } elsif ($self->{nc} == 0x0022) { # "
4416            $self->{ca}->{value} = '';
4417            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4418            !!!next-input-character;
4419            redo A;
4420          } elsif ($self->{nc} == 0x0027) { # '
4421            $self->{ca}->{value} = '';
4422            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4423            !!!next-input-character;
4424            redo A;
4425          } elsif ($self->{nc} == 0x003E) { # >
4426            !!!parse-error (type => 'no attr default'); ## TODO: type
4427            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4428            !!!next-input-character;
4429            !!!emit ($self->{ct}); # ATTLIST
4430            redo A;
4431          } elsif ($self->{nc} == -1) {
4432            !!!parse-error (type => 'unclosed md'); ## TODO: type
4433            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4434            !!!next-input-character;
4435            !!!emit ($self->{ct});
4436            redo A;
4437          } else {
4438            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4439            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4440            ## Reconsume.
4441            redo A;
4442          }
4443        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4444          if ($is_space->{$self->{nc}}) {
4445            ## XML5: No parse error.
4446            !!!parse-error (type => 'no default type'); ## TODO: type
4447            $self->{state} = BOGUS_MD_STATE;
4448            ## Reconsume.
4449            redo A;
4450          } elsif ($self->{nc} == 0x0022) { # "
4451            ## XML5: Same as "anything else".
4452            $self->{ca}->{value} = '';
4453            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4454            !!!next-input-character;
4455            redo A;
4456          } elsif ($self->{nc} == 0x0027) { # '
4457            ## XML5: Same as "anything else".
4458            $self->{ca}->{value} = '';
4459            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4460            !!!next-input-character;
4461            redo A;
4462          } elsif ($self->{nc} == 0x003E) { # >
4463            ## XML5: Same as "anything else".
4464            !!!parse-error (type => 'no attr default'); ## TODO: type
4465            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4466            !!!next-input-character;
4467            !!!emit ($self->{ct}); # ATTLIST
4468            redo A;
4469          } elsif ($self->{nc} == -1) {
4470            ## XML5: No parse error.
4471            !!!parse-error (type => 'unclosed md'); ## TODO: type
4472            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4473            !!!next-input-character;
4474            !!!emit ($self->{ct});
4475            redo A;
4476          } else {
4477            $self->{ca}->{default} = chr $self->{nc};
4478            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4479            !!!next-input-character;
4480            redo A;
4481          }
4482        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4483          if ($is_space->{$self->{nc}}) {
4484            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4485            !!!next-input-character;
4486            redo A;
4487          } elsif ($self->{nc} == 0x0022) { # "
4488            ## XML5: Same as "anything else".
4489            !!!parse-error (type => 'no space before default value'); ## TODO: type
4490            $self->{ca}->{value} = '';
4491            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4492            !!!next-input-character;
4493            redo A;
4494          } elsif ($self->{nc} == 0x0027) { # '
4495            ## XML5: Same as "anything else".
4496            !!!parse-error (type => 'no space before default value'); ## TODO: type
4497            $self->{ca}->{value} = '';
4498            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4499            !!!next-input-character;
4500            redo A;
4501          } elsif ($self->{nc} == 0x003E) { # >
4502            ## XML5: Same as "anything else".
4503            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4504            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4505            !!!next-input-character;
4506            !!!emit ($self->{ct}); # ATTLIST
4507            redo A;
4508          } elsif ($self->{nc} == -1) {
4509            ## XML5: No parse error.
4510            !!!parse-error (type => 'unclosed md'); ## TODO: type
4511            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4512            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4513            !!!next-input-character;
4514            !!!emit ($self->{ct});
4515            redo A;
4516          } else {
4517            $self->{ca}->{default} .= chr $self->{nc};
4518            ## Stay in the state.
4519            !!!next-input-character;
4520            redo A;
4521          }
4522        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4523          if ($is_space->{$self->{nc}}) {
4524            ## Stay in the state.
4525            !!!next-input-character;
4526            redo A;
4527          } elsif ($self->{nc} == 0x0022) { # "
4528            $self->{ca}->{value} = '';
4529            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4530            !!!next-input-character;
4531            redo A;
4532          } elsif ($self->{nc} == 0x0027) { # '
4533            $self->{ca}->{value} = '';
4534            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4535            !!!next-input-character;
4536            redo A;
4537          } elsif ($self->{nc} == 0x003E) { # >
4538            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4539            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4540            !!!next-input-character;
4541            !!!emit ($self->{ct}); # ATTLIST
4542            redo A;
4543          } elsif ($self->{nc} == -1) {
4544            ## XML5: No parse error.
4545            !!!parse-error (type => 'unclosed md'); ## TODO: type
4546            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4547            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4548            !!!next-input-character;
4549            !!!emit ($self->{ct});
4550          redo A;          redo A;
4551        } else {        } else {
4552          ## XML5: Not defined yet.          ## XML5: Not defined yet.
4553            if ($self->{ca}->{default} eq 'FIXED') {
4554              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4555            } else {
4556              push @{$self->{ct}->{attrdefs}}, $self->{ca};
4557              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4558            }
4559            ## Reconsume.
4560            redo A;
4561          }
4562        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4563          if ($is_space->{$self->{nc}} or
4564              $self->{nc} == -1 or
4565              $self->{nc} == 0x003E) { # >
4566            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4567            ## Reconsume.
4568            redo A;
4569          } else {
4570            !!!parse-error (type => 'no space before attr name'); ## TODO: type
4571            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4572            ## Reconsume.
4573            redo A;
4574          }
4575        } elsif ($self->{state} == NDATA_STATE) {
4576          ## ASCII case-insensitive
4577          if ($self->{nc} == [
4578                undef,
4579                0x0044, # D
4580                0x0041, # A
4581                0x0054, # T
4582              ]->[length $self->{kwd}] or
4583              $self->{nc} == [
4584                undef,
4585                0x0064, # d
4586                0x0061, # a
4587                0x0074, # t
4588              ]->[length $self->{kwd}]) {
4589            !!!cp (172.2);
4590            ## Stay in the state.
4591            $self->{kwd} .= chr $self->{nc};
4592            !!!next-input-character;
4593            redo A;
4594          } elsif ((length $self->{kwd}) == 4 and
4595                   ($self->{nc} == 0x0041 or # A
4596                    $self->{nc} == 0x0061)) { # a
4597            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4598              !!!cp (172.3);
4599              !!!parse-error (type => 'lowercase keyword', ## TODO: type
4600                              text => 'NDATA',
4601                              line => $self->{line_prev},
4602                              column => $self->{column_prev} - 4);
4603            } else {
4604              !!!cp (172.4);
4605            }
4606            $self->{state} = AFTER_NDATA_STATE;
4607            !!!next-input-character;
4608            redo A;
4609          } else {
4610            !!!parse-error (type => 'string after literal', ## TODO: type
4611                            line => $self->{line_prev},
4612                            column => $self->{column_prev} + 1
4613                                - length $self->{kwd});
4614            !!!cp (172.5);
4615            $self->{state} = BOGUS_MD_STATE;
4616            ## Reconsume.
4617            redo A;
4618          }
4619        } elsif ($self->{state} == AFTER_NDATA_STATE) {
4620          if ($is_space->{$self->{nc}}) {
4621            $self->{state} = BEFORE_NOTATION_NAME_STATE;
4622            !!!next-input-character;
4623            redo A;
4624          } elsif ($self->{nc} == 0x003E) { # >
4625            !!!parse-error (type => 'no notation name'); ## TODO: type
4626            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4627            !!!next-input-character;
4628            !!!emit ($self->{ct}); # ENTITY
4629            redo A;
4630          } elsif ($self->{nc} == -1) {
4631            !!!parse-error (type => 'unclosed md'); ## TODO: type
4632            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4633            !!!next-input-character;
4634            !!!emit ($self->{ct}); # ENTITY
4635            redo A;
4636          } else {
4637            !!!parse-error (type => 'string after literal', ## TODO: type
4638                            line => $self->{line_prev},
4639                            column => $self->{column_prev} + 1
4640                                - length $self->{kwd});
4641            $self->{state} = BOGUS_MD_STATE;
4642            ## Reconsume.
4643            redo A;
4644          }
4645        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4646          if ($is_space->{$self->{nc}}) {
4647            ## Stay in the state.
4648            !!!next-input-character;
4649            redo A;
4650          } elsif ($self->{nc} == 0x003E) { # >
4651            !!!parse-error (type => 'no notation name'); ## TODO: type
4652            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653            !!!next-input-character;
4654            !!!emit ($self->{ct}); # ENTITY
4655            redo A;
4656          } elsif ($self->{nc} == -1) {
4657            !!!parse-error (type => 'unclosed md'); ## TODO: type
4658            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4659            !!!next-input-character;
4660            !!!emit ($self->{ct}); # ENTITY
4661            redo A;
4662          } else {
4663            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4664            $self->{state} = NOTATION_NAME_STATE;
4665            !!!next-input-character;
4666            redo A;
4667          }
4668        } elsif ($self->{state} == NOTATION_NAME_STATE) {
4669          if ($is_space->{$self->{nc}}) {
4670            $self->{state} = AFTER_NOTATION_NAME_STATE;
4671            !!!next-input-character;
4672            redo A;
4673          } elsif ($self->{nc} == 0x003E) { # >
4674            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4675            !!!next-input-character;
4676            !!!emit ($self->{ct}); # ENTITY
4677            redo A;
4678          } elsif ($self->{nc} == -1) {
4679            !!!parse-error (type => 'unclosed md'); ## TODO: type
4680            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4681            !!!next-input-character;
4682            !!!emit ($self->{ct}); # ENTITY
4683            redo A;
4684          } else {
4685            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4686            ## Stay in the state.
4687            !!!next-input-character;
4688            redo A;
4689          }
4690        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4691          if ($self->{nc} == 0x0022) { # "
4692            $self->{state} = AFTER_NOTATION_NAME_STATE;
4693            !!!next-input-character;
4694            redo A;
4695          } elsif ($self->{nc} == 0x0026) { # &
4696            $self->{prev_state} = $self->{state};
4697            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4698            $self->{entity_add} = 0x0022; # "
4699            !!!next-input-character;
4700            redo A;
4701    ## TODO: %
4702          } elsif ($self->{nc} == -1) {
4703            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4704            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4705            ## Reconsume.
4706            !!!emit ($self->{ct}); # ENTITY
4707            redo A;
4708          } else {
4709            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4710            !!!next-input-character;
4711            redo A;
4712          }
4713        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4714          if ($self->{nc} == 0x0027) { # '
4715            $self->{state} = AFTER_NOTATION_NAME_STATE;
4716            !!!next-input-character;
4717            redo A;
4718          } elsif ($self->{nc} == 0x0026) { # &
4719            $self->{prev_state} = $self->{state};
4720            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4721            $self->{entity_add} = 0x0027; # '
4722            !!!next-input-character;
4723            redo A;
4724    ## TODO: %
4725          } elsif ($self->{nc} == -1) {
4726            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4727            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4728            ## Reconsume.
4729            !!!emit ($self->{ct}); # ENTITY
4730            redo A;
4731          } else {
4732            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4733            !!!next-input-character;
4734            redo A;
4735          }
4736        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4737          ## TODO: XMLize
4738    
4739          ## TODO: ...        if ($is_space->{$self->{nc}} or
4740              {
4741                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4742                $self->{entity_add} => 1,
4743              }->{$self->{nc}}) {
4744            ## Don't consume
4745            ## No error
4746            ## Return nothing.
4747            #
4748          } elsif ($self->{nc} == 0x0023) { # #
4749            $self->{ca} = $self->{ct};
4750            $self->{state} = ENTITY_HASH_STATE;
4751            $self->{kwd} = '#';
4752            !!!next-input-character;
4753            redo A;
4754          } elsif ((0x0041 <= $self->{nc} and
4755                    $self->{nc} <= 0x005A) or # A..Z
4756                   (0x0061 <= $self->{nc} and
4757                    $self->{nc} <= 0x007A)) { # a..z
4758            #
4759          } else {
4760            !!!parse-error (type => 'bare ero');
4761            ## Return nothing.
4762            #
4763          }
4764    
4765          $self->{state} = BOGUS_COMMENT_STATE;        $self->{ct}->{value} .= '&';
4766          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded        $self->{state} = $self->{prev_state};
4767          ## Reconsume.
4768          redo A;
4769        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4770          if ($is_space->{$self->{nc}}) {
4771            ## Stay in the state.
4772            !!!next-input-character;
4773            redo A;
4774          } elsif ($self->{nc} == 0x003E) { # >
4775            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4776            !!!next-input-character;
4777            !!!emit ($self->{ct}); # ENTITY
4778            redo A;
4779          } elsif ($self->{nc} == -1) {
4780            !!!parse-error (type => 'unclosed md'); ## TODO: type
4781            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4782            !!!next-input-character;
4783            !!!emit ($self->{ct}); # ENTITY
4784            redo A;
4785          } else {
4786            !!!parse-error (type => 'string after notation name'); ## TODO: type
4787            $self->{state} = BOGUS_MD_STATE;
4788          ## Reconsume.          ## Reconsume.
4789          redo A;          redo A;
4790        }        }
4791        } elsif ($self->{state} == BOGUS_MD_STATE) {
4792          if ($self->{nc} == 0x003E) { # >
4793            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4794            !!!next-input-character;
4795            !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4796            redo A;
4797          } elsif ($self->{nc} == -1) {
4798            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4799            ## Reconsume.
4800            !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4801            redo A;
4802          } else {
4803            ## Stay in the state.
4804            !!!next-input-character;
4805            redo A;
4806          }
4807      } else {      } else {
4808        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
4809      }      }
# Line 3753  sub _get_next_token ($) { Line 4814  sub _get_next_token ($) {
4814    
4815  1;  1;
4816  ## $Date$  ## $Date$
4817                                    

Legend:
Removed from v.1.14  
changed lines
  Added in v.1.19

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24