/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.12 by wakaba, Wed Oct 15 12:49:49 2008 UTC revision 1.21 by wakaba, Sun Oct 19 09:25:21 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
# Line 43  sub END_OF_FILE_TOKEN () { 5 } Line 55  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } ## NOTE: XML only.  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65  ## XML5: XML5 has "empty tag token".  In this implementation, it is  ## XML5: XML5 has "empty tag token".  In this implementation, it is
66  ## represented as a start tag token with $self->{self_closing} flag  ## represented as a start tag token with $self->{self_closing} flag
# Line 133  sub PI_AFTER_STATE () { 55 } Line 151  sub PI_AFTER_STATE () { 55 }
151  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
152  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188    sub AFTER_ELEMENT_NAME_STATE () { 93 }
189    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190    sub CONTENT_KEYWORD_STATE () { 95 }
191    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192    sub CM_ELEMENT_NAME_STATE () { 97 }
193    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195    sub AFTER_MD_DEF_STATE () { 100 }
196    sub BOGUS_MD_STATE () { 101 }
197    
198  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
199  ## list and descriptions)  ## list and descriptions)
# Line 1226  sub _get_next_token ($) { Line 1287  sub _get_next_token ($) {
1287          redo A;          redo A;
1288        }        }
1289      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1290        ## XML5: "Tag attribute value double quoted state".        ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291          ## ATTLIST attribute value double quoted state".
1292                
1293        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1294          !!!cp (95);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295          ## XML5: "Tag attribute name before state".            !!!cp (95.1);
1296          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1297              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299            } else {
1300              !!!cp (95);
1301              ## XML5: "Tag attribute name before state".
1302              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303            }
1304          !!!next-input-character;          !!!next-input-character;
1305          redo A;          redo A;
1306        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1252  sub _get_next_token ($) { Line 1321  sub _get_next_token ($) {
1321          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1322            !!!cp (97);            !!!cp (97);
1323            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1324    
1325              $self->{state} = DATA_STATE;
1326              $self->{s_kwd} = '';
1327              ## reconsume
1328              !!!emit ($self->{ct}); # start tag
1329              redo A;
1330          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1331            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1332            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1261  sub _get_next_token ($) { Line 1336  sub _get_next_token ($) {
1336              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1337              !!!cp (99);              !!!cp (99);
1338            }            }
1339    
1340              $self->{state} = DATA_STATE;
1341              $self->{s_kwd} = '';
1342              ## reconsume
1343              !!!emit ($self->{ct}); # end tag
1344              redo A;
1345            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1346              ## XML5: No parse error above; not defined yet.
1347              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1348              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1349              ## Reconsume.
1350              !!!emit ($self->{ct}); # ATTLIST
1351              redo A;
1352          } else {          } else {
1353            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1354          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1355        } else {        } else {
1356            ## XML5 [ATTLIST]: Not defined yet.
1357          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1358            !!!cp (100);            !!!cp (100);
1359            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1289  sub _get_next_token ($) { Line 1371  sub _get_next_token ($) {
1371          redo A;          redo A;
1372        }        }
1373      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1374        ## XML5: "Tag attribute value single quoted state".        ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1375          ## ATTLIST attribute value single quoted state".
1376    
1377        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1378          !!!cp (101);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1379          ## XML5: "Before attribute name state" (sic).            !!!cp (101.1);
1380          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1381              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1382              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1383            } else {
1384              !!!cp (101);
1385              ## XML5: "Before attribute name state" (sic).
1386              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1387            }
1388          !!!next-input-character;          !!!next-input-character;
1389          redo A;          redo A;
1390        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1315  sub _get_next_token ($) { Line 1405  sub _get_next_token ($) {
1405          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1406            !!!cp (103);            !!!cp (103);
1407            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1408    
1409              $self->{state} = DATA_STATE;
1410              $self->{s_kwd} = '';
1411              ## reconsume
1412              !!!emit ($self->{ct}); # start tag
1413              redo A;
1414          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1415            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1416            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1324  sub _get_next_token ($) { Line 1420  sub _get_next_token ($) {
1420              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1421              !!!cp (105);              !!!cp (105);
1422            }            }
1423    
1424              $self->{state} = DATA_STATE;
1425              $self->{s_kwd} = '';
1426              ## reconsume
1427              !!!emit ($self->{ct}); # end tag
1428              redo A;
1429            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1430              ## XML5: No parse error above; not defined yet.
1431              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1432              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1433              ## Reconsume.
1434              !!!emit ($self->{ct}); # ATTLIST
1435              redo A;
1436          } else {          } else {
1437            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1438          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1439        } else {        } else {
1440            ## XML5 [ATTLIST]: Not defined yet.
1441          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1442            !!!cp (106);            !!!cp (106);
1443            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1355  sub _get_next_token ($) { Line 1458  sub _get_next_token ($) {
1458        ## XML5: "Tag attribute value unquoted state".        ## XML5: "Tag attribute value unquoted state".
1459    
1460        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1461          !!!cp (107);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1462          ## XML5: "Tag attribute name before state".            !!!cp (107.1);
1463          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1464              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1465            } else {
1466              !!!cp (107);
1467              ## XML5: "Tag attribute name before state".
1468              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1469            }
1470          !!!next-input-character;          !!!next-input-character;
1471          redo A;          redo A;
1472        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1378  sub _get_next_token ($) { Line 1487  sub _get_next_token ($) {
1487          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1488            !!!cp (109);            !!!cp (109);
1489            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1490    
1491              $self->{state} = DATA_STATE;
1492              $self->{s_kwd} = '';
1493              !!!next-input-character;
1494              !!!emit ($self->{ct}); # start tag
1495              redo A;
1496          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1497            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1498            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1387  sub _get_next_token ($) { Line 1502  sub _get_next_token ($) {
1502              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1503              !!!cp (111);              !!!cp (111);
1504            }            }
1505    
1506              $self->{state} = DATA_STATE;
1507              $self->{s_kwd} = '';
1508              !!!next-input-character;
1509              !!!emit ($self->{ct}); # end tag
1510              redo A;
1511            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1512              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1513              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1514              !!!next-input-character;
1515              !!!emit ($self->{ct}); # ATTLIST
1516              redo A;
1517          } else {          } else {
1518            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1519          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         !!!next-input-character;  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1520        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!parse-error (type => 'unclosed tag');  
1521          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1522            !!!cp (112);            !!!cp (112);
1523              !!!parse-error (type => 'unclosed tag');
1524            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1525    
1526              $self->{state} = DATA_STATE;
1527              $self->{s_kwd} = '';
1528              ## reconsume
1529              !!!emit ($self->{ct}); # start tag
1530              redo A;
1531          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1532              !!!parse-error (type => 'unclosed tag');
1533            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1534            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
1535              !!!cp (113);              !!!cp (113);
# Line 1411  sub _get_next_token ($) { Line 1538  sub _get_next_token ($) {
1538              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1539              !!!cp (114);              !!!cp (114);
1540            }            }
1541    
1542              $self->{state} = DATA_STATE;
1543              $self->{s_kwd} = '';
1544              ## reconsume
1545              !!!emit ($self->{ct}); # end tag
1546              redo A;
1547            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1548              !!!parse-error (type => 'unclosed md'); ## TODO: type
1549              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1550              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1551              ## Reconsume.
1552              !!!emit ($self->{ct}); # ATTLIST
1553              redo A;
1554          } else {          } else {
1555            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1556          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1557        } else {        } else {
1558          if ({          if ({
1559               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1563  sub _get_next_token ($) { Line 1696  sub _get_next_token ($) {
1696          redo A;          redo A;
1697        }        }
1698      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1699        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1700    
1701        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1702        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
1703                
1704        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1705          !!!cp (124);          if ($self->{in_subset}) {
1706          $self->{state} = DATA_STATE;            !!!cp (123);
1707          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1708            } else {
1709              !!!cp (124);
1710              $self->{state} = DATA_STATE;
1711              $self->{s_kwd} = '';
1712            }
1713          !!!next-input-character;          !!!next-input-character;
1714    
1715          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1716          redo A;          redo A;
1717        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1718          !!!cp (125);          if ($self->{in_subset}) {
1719          $self->{state} = DATA_STATE;            !!!cp (125.1);
1720          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1721            } else {
1722              !!!cp (125);
1723              $self->{state} = DATA_STATE;
1724              $self->{s_kwd} = '';
1725            }
1726          ## reconsume          ## reconsume
1727    
1728          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1596  sub _get_next_token ($) { Line 1739  sub _get_next_token ($) {
1739          redo A;          redo A;
1740        }        }
1741      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1742        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
1743                
1744        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1745          !!!cp (133);          !!!cp (133);
# Line 1772  sub _get_next_token ($) { Line 1915  sub _get_next_token ($) {
1915          !!!next-input-character;          !!!next-input-character;
1916          redo A;          redo A;
1917        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1918          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1919          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1920          $self->{s_kwd} = '';            !!!cp (138.1);
1921              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1922            } else {
1923              !!!cp (138);
1924              $self->{state} = DATA_STATE;
1925              $self->{s_kwd} = '';
1926            }
1927          !!!next-input-character;          !!!next-input-character;
1928    
1929          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1930    
1931          redo A;          redo A;
1932        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1933          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1934          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1935          $self->{s_kwd} = '';            !!!cp (139.1);
1936              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937            } else {
1938              !!!cp (139);
1939              $self->{state} = DATA_STATE;
1940              $self->{s_kwd} = '';
1941            }
1942          ## reconsume          ## reconsume
1943    
1944          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1806  sub _get_next_token ($) { Line 1959  sub _get_next_token ($) {
1959          !!!next-input-character;          !!!next-input-character;
1960          redo A;          redo A;
1961        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1962          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1963          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1964          $self->{s_kwd} = '';            !!!cp (142.1);
1965              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1966            } else {
1967              !!!cp (142);
1968              $self->{state} = DATA_STATE;
1969              $self->{s_kwd} = '';
1970            }
1971          !!!next-input-character;          !!!next-input-character;
1972    
1973          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1974    
1975          redo A;          redo A;
1976        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1977          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1978          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1979          $self->{s_kwd} = '';            !!!cp (143.1);
1980              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981            } else {
1982              !!!cp (143);
1983              $self->{state} = DATA_STATE;
1984              $self->{s_kwd} = '';
1985            }
1986          ## reconsume          ## reconsume
1987    
1988          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1834  sub _get_next_token ($) { Line 1997  sub _get_next_token ($) {
1997          redo A;          redo A;
1998        }        }
1999      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2000          ## XML5: "Comment state" and "DOCTYPE comment state".
2001    
2002        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2003          !!!cp (145);          !!!cp (145);
2004          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
2005          !!!next-input-character;          !!!next-input-character;
2006          redo A;          redo A;
2007        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
2008          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2009          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2010          $self->{s_kwd} = '';            !!!cp (146.1);
2011              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2012            } else {
2013              !!!cp (146);
2014              $self->{state} = DATA_STATE;
2015              $self->{s_kwd} = '';
2016            }
2017          ## reconsume          ## reconsume
2018    
2019          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1861  sub _get_next_token ($) { Line 2031  sub _get_next_token ($) {
2031          redo A;          redo A;
2032        }        }
2033      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2034        ## XML5: "comment dash state".        ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2035    
2036        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2037          !!!cp (148);          !!!cp (148);
# Line 1869  sub _get_next_token ($) { Line 2039  sub _get_next_token ($) {
2039          !!!next-input-character;          !!!next-input-character;
2040          redo A;          redo A;
2041        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
2042          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2043          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2044          $self->{s_kwd} = '';            !!!cp (149.1);
2045              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2046            } else {
2047              !!!cp (149);
2048              $self->{state} = DATA_STATE;
2049              $self->{s_kwd} = '';
2050            }
2051          ## reconsume          ## reconsume
2052    
2053          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1886  sub _get_next_token ($) { Line 2061  sub _get_next_token ($) {
2061          redo A;          redo A;
2062        }        }
2063      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2064          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2065    
2066        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2067          !!!cp (151);          if ($self->{in_subset}) {
2068          $self->{state} = DATA_STATE;            !!!cp (151.1);
2069          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2070            } else {
2071              !!!cp (151);
2072              $self->{state} = DATA_STATE;
2073              $self->{s_kwd} = '';
2074            }
2075          !!!next-input-character;          !!!next-input-character;
2076    
2077          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1906  sub _get_next_token ($) { Line 2088  sub _get_next_token ($) {
2088          !!!next-input-character;          !!!next-input-character;
2089          redo A;          redo A;
2090        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
2091          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2092          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2093          $self->{s_kwd} = '';            !!!cp (153.1);
2094              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2095            } else {
2096              !!!cp (153);
2097              $self->{state} = DATA_STATE;
2098              $self->{s_kwd} = '';
2099            }
2100          ## reconsume          ## reconsume
2101    
2102          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1973  sub _get_next_token ($) { Line 2160  sub _get_next_token ($) {
2160          !!!cp (159.1);          !!!cp (159.1);
2161          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2162          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2163            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2164            $self->{in_subset} = 1;
2165          !!!next-input-character;          !!!next-input-character;
2166            !!!emit ($self->{ct}); # DOCTYPE
2167          redo A;          redo A;
2168        } else {        } else {
2169          !!!cp (160);          !!!cp (160);
# Line 2016  sub _get_next_token ($) { Line 2206  sub _get_next_token ($) {
2206        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2207          !!!cp (163.1);          !!!cp (163.1);
2208          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2209            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2210            $self->{in_subset} = 1;
2211          !!!next-input-character;          !!!next-input-character;
2212            !!!emit ($self->{ct}); # DOCTYPE
2213          redo A;          redo A;
2214        } else {        } else {
2215          !!!cp (164);          !!!cp (164);
# Line 2036  sub _get_next_token ($) { Line 2229  sub _get_next_token ($) {
2229          !!!next-input-character;          !!!next-input-character;
2230          redo A;          redo A;
2231        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2232          !!!cp (166);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2233          $self->{state} = DATA_STATE;            !!!cp (166);
2234          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2235              $self->{s_kwd} = '';
2236            } else {
2237              !!!cp (166.1);
2238              !!!parse-error (type => 'no md def'); ## TODO: type
2239              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2240            }
2241            
2242          !!!next-input-character;          !!!next-input-character;
2243            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2244          redo A;          redo A;
2245        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2246          !!!cp (167);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2247          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (167);
2248          $self->{state} = DATA_STATE;            !!!parse-error (type => 'unclosed DOCTYPE');
2249          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2250          ## reconsume            $self->{s_kwd} = '';
2251              $self->{ct}->{quirks} = 1;
2252          $self->{ct}->{quirks} = 1;          } else {
2253          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (167.12);
2254              !!!parse-error (type => 'unclosed md'); ## TODO: type
2255              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2256            }
2257            
2258            ## Reconsume.
2259            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2260          redo A;          redo A;
2261        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2262                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
# Line 2069  sub _get_next_token ($) { Line 2272  sub _get_next_token ($) {
2272          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2273          !!!next-input-character;          !!!next-input-character;
2274          redo A;          redo A;
2275        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{nc} == 0x0022 and # "
2276                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2277                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2278            !!!cp (167.21);
2279            $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2280            $self->{ct}->{value} = ''; # ENTITY
2281            !!!next-input-character;
2282            redo A;
2283          } elsif ($self->{nc} == 0x0027 and # '
2284                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2285                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2286            !!!cp (167.22);
2287            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2288            $self->{ct}->{value} = ''; # ENTITY
2289            !!!next-input-character;
2290            redo A;
2291          } elsif ($self->{is_xml} and
2292                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2293                   $self->{nc} == 0x005B) { # [
2294          !!!cp (167.3);          !!!cp (167.3);
2295          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2296          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2297            $self->{in_subset} = 1;
2298          !!!next-input-character;          !!!next-input-character;
2299            !!!emit ($self->{ct}); # DOCTYPE
2300          redo A;          redo A;
2301        } else {        } else {
2302          !!!cp (180);          !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2303          !!!parse-error (type => 'string after DOCTYPE name');  
2304          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2305              !!!cp (180);
2306              $self->{ct}->{quirks} = 1;
2307              $self->{state} = BOGUS_DOCTYPE_STATE;
2308            } else {
2309              !!!cp (180.1);
2310              $self->{state} = BOGUS_MD_STATE;
2311            }
2312    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
2313          !!!next-input-character;          !!!next-input-character;
2314          redo A;          redo A;
2315        }        }
# Line 2122  sub _get_next_token ($) { Line 2351  sub _get_next_token ($) {
2351          !!!next-input-character;          !!!next-input-character;
2352          redo A;          redo A;
2353        } else {        } else {
2354          !!!cp (169);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2355                          line => $self->{line_prev},                          line => $self->{line_prev},
2356                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2357          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2358              !!!cp (169);
2359          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2360              $self->{state} = BOGUS_DOCTYPE_STATE;
2361            } else {
2362              !!!cp (169.1);
2363              $self->{state} = BOGUS_MD_STATE;
2364            }
2365          ## Reconsume.          ## Reconsume.
2366          redo A;          redo A;
2367        }        }
# Line 2170  sub _get_next_token ($) { Line 2403  sub _get_next_token ($) {
2403          !!!next-input-character;          !!!next-input-character;
2404          redo A;          redo A;
2405        } else {        } else {
2406          !!!cp (172);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2407                          line => $self->{line_prev},                          line => $self->{line_prev},
2408                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2409          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2410              !!!cp (172);
2411          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2412              $self->{state} = BOGUS_DOCTYPE_STATE;
2413            } else {
2414              !!!cp (172.1);
2415              $self->{state} = BOGUS_MD_STATE;
2416            }
2417          ## Reconsume.          ## Reconsume.
2418          redo A;          redo A;
2419        }        }
# Line 2199  sub _get_next_token ($) { Line 2436  sub _get_next_token ($) {
2436          !!!next-input-character;          !!!next-input-character;
2437          redo A;          redo A;
2438        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
         !!!cp (184);  
2439          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2440            
2441          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2442          $self->{s_kwd} = '';            !!!cp (184);
2443              $self->{state} = DATA_STATE;
2444              $self->{s_kwd} = '';
2445              $self->{ct}->{quirks} = 1;
2446            } else {
2447              !!!cp (184.1);
2448              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2449            }
2450            
2451          !!!next-input-character;          !!!next-input-character;
2452            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2453          redo A;          redo A;
2454        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2455          !!!cp (185);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2456          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (185);
2457              !!!parse-error (type => 'unclosed DOCTYPE');
2458          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2459          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2460              $self->{ct}->{quirks} = 1;
2461            } else {
2462              !!!cp (185.1);
2463              !!!parse-error (type => 'unclosed md'); ## TODO: type
2464              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2465            }
2466            
2467          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
2468          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2469          redo A;          redo A;
2470        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2471                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2472                   $self->{nc} == 0x005B) { # [
2473          !!!cp (186.1);          !!!cp (186.1);
2474          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2475          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2476          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2477            $self->{in_subset} = 1;
2478          !!!next-input-character;          !!!next-input-character;
2479            !!!emit ($self->{ct}); # DOCTYPE
2480          redo A;          redo A;
2481        } else {        } else {
         !!!cp (186);  
2482          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
2483    
2484          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2485              !!!cp (186);
2486              $self->{ct}->{quirks} = 1;
2487              $self->{state} = BOGUS_DOCTYPE_STATE;
2488            } else {
2489              !!!cp (186.2);
2490              $self->{state} = BOGUS_MD_STATE;
2491            }
2492    
2493          !!!next-input-character;          !!!next-input-character;
2494          redo A;          redo A;
2495        }        }
# Line 2245  sub _get_next_token ($) { Line 2500  sub _get_next_token ($) {
2500          !!!next-input-character;          !!!next-input-character;
2501          redo A;          redo A;
2502        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (188);  
2503          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2504    
2505          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2506          $self->{s_kwd} = '';            !!!cp (188);
2507          !!!next-input-character;            $self->{state} = DATA_STATE;
2508              $self->{s_kwd} = '';
2509          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2510          !!!emit ($self->{ct}); # DOCTYPE          } else {
2511              !!!cp (188.1);
2512              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2513            }
2514    
2515            !!!next-input-character;
2516            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2517          redo A;          redo A;
2518        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (189);  
2519          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2520    
2521          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2522          $self->{s_kwd} = '';            !!!cp (189);
2523          ## reconsume            $self->{state} = DATA_STATE;
2524              $self->{s_kwd} = '';
2525          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2526            } else {
2527              !!!cp (189.1);
2528              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2529            }
2530            
2531            ## Reconsume.
2532          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2533          redo A;          redo A;
2534        } else {        } else {
2535          !!!cp (190);          !!!cp (190);
2536          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2537          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
2538                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2539    
# Line 2286  sub _get_next_token ($) { Line 2548  sub _get_next_token ($) {
2548          !!!next-input-character;          !!!next-input-character;
2549          redo A;          redo A;
2550        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (192);  
2551          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2552    
2553          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2554          $self->{s_kwd} = '';            !!!cp (192);
2555          !!!next-input-character;            $self->{state} = DATA_STATE;
2556              $self->{s_kwd} = '';
2557          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2558          !!!emit ($self->{ct}); # DOCTYPE          } else {
2559              !!!cp (192.1);
2560              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2561            }
2562    
2563            !!!next-input-character;
2564            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2565          redo A;          redo A;
2566        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (193);  
2567          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2568    
2569          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2570          $self->{s_kwd} = '';            !!!cp (193);
2571              $self->{state} = DATA_STATE;
2572              $self->{s_kwd} = '';
2573              $self->{ct}->{quirks} = 1;
2574            } else {
2575              !!!cp (193.1);
2576              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2577            }
2578          
2579          ## reconsume          ## reconsume
2580            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2581          redo A;          redo A;
2582        } else {        } else {
2583          !!!cp (194);          !!!cp (194);
2584          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2585          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
2586                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2587    
# Line 2328  sub _get_next_token ($) { Line 2597  sub _get_next_token ($) {
2597          redo A;          redo A;
2598        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
2599          !!!cp (196);          !!!cp (196);
2600          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2601          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2602          !!!next-input-character;          !!!next-input-character;
2603          redo A;          redo A;
2604        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
2605          !!!cp (197);          !!!cp (197);
2606          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2607          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2608          !!!next-input-character;          !!!next-input-character;
2609          redo A;          redo A;
2610        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2611          if ($self->{is_xml}) {          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2612            !!!cp (198.1);            if ($self->{is_xml}) {
2613            !!!parse-error (type => 'no SYSTEM literal');              !!!cp (198.1);
2614                !!!parse-error (type => 'no SYSTEM literal');
2615              } else {
2616                !!!cp (198);
2617              }
2618              $self->{state} = DATA_STATE;
2619              $self->{s_kwd} = '';
2620          } else {          } else {
2621            !!!cp (198);            if ($self->{ct}->{type} == NOTATION_TOKEN) {
2622                !!!cp (198.2);
2623              } else {
2624                !!!cp (198.3);
2625                !!!parse-error (type => 'no SYSTEM literal');            
2626              }
2627              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2628          }          }
2629          $self->{state} = DATA_STATE;          
         $self->{s_kwd} = '';  
2630          !!!next-input-character;          !!!next-input-character;
2631            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2632          redo A;          redo A;
2633        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2634          !!!cp (199);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2635          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (199);
2636              !!!parse-error (type => 'unclosed DOCTYPE');
2637          $self->{state} = DATA_STATE;            
2638          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2639              $self->{s_kwd} = '';
2640              $self->{ct}->{quirks} = 1;
2641            } else {
2642              !!!parse-error (type => 'unclosed md'); ## TODO: type
2643              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2644            }
2645            
2646          ## reconsume          ## reconsume
2647            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2648          redo A;          redo A;
2649        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2650                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2651                   $self->{nc} == 0x005B) { # [
2652          !!!cp (200.1);          !!!cp (200.1);
2653          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2654          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2655          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2656            $self->{in_subset} = 1;
2657          !!!next-input-character;          !!!next-input-character;
2658            !!!emit ($self->{ct}); # DOCTYPE
2659          redo A;          redo A;
2660        } else {        } else {
         !!!cp (200);  
2661          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
2662    
2663          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2664              !!!cp (200);
2665              $self->{ct}->{quirks} = 1;
2666              $self->{state} = BOGUS_DOCTYPE_STATE;
2667            } else {
2668              !!!cp (200.2);
2669              $self->{state} = BOGUS_MD_STATE;
2670            }
2671    
2672          !!!next-input-character;          !!!next-input-character;
2673          redo A;          redo A;
2674        }        }
# Line 2399  sub _get_next_token ($) { Line 2691  sub _get_next_token ($) {
2691          !!!next-input-character;          !!!next-input-character;
2692          redo A;          redo A;
2693        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (204);  
2694          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
2695          !!!next-input-character;          !!!next-input-character;
2696    
2697          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2698          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (204);
2699              $self->{state} = DATA_STATE;
2700              $self->{s_kwd} = '';
2701              $self->{ct}->{quirks} = 1;
2702            } else {
2703              !!!cp (204.1);
2704              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2705            }
2706    
2707            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2708          redo A;          redo A;
2709        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2710          !!!cp (205);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2711          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (205);
2712              !!!parse-error (type => 'unclosed DOCTYPE');
2713          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2714          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2715              $self->{ct}->{quirks} = 1;
2716            } else {
2717              !!!cp (205.1);
2718              !!!parse-error (type => 'unclosed md'); ## TODO: type
2719              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2720            }
2721            
2722          ## reconsume          ## reconsume
2723            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2724          redo A;          redo A;
2725        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2726                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2727                   $self->{nc} == 0x005B) { # [
2728          !!!cp (206.1);          !!!cp (206.1);
2729          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2730    
2731          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2733            $self->{in_subset} = 1;
2734          !!!next-input-character;          !!!next-input-character;
2735            !!!emit ($self->{ct}); # DOCTYPE
2736          redo A;          redo A;
2737        } else {        } else {
         !!!cp (206);  
2738          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
2739    
2740          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2741              !!!cp (206);          
2742              $self->{ct}->{quirks} = 1;
2743              $self->{state} = BOGUS_DOCTYPE_STATE;
2744            } else {
2745              !!!cp (206.2);
2746              $self->{state} = BOGUS_MD_STATE;
2747            }
2748    
2749          !!!next-input-character;          !!!next-input-character;
2750          redo A;          redo A;
2751        }        }
# Line 2445  sub _get_next_token ($) { Line 2756  sub _get_next_token ($) {
2756          !!!next-input-character;          !!!next-input-character;
2757          redo A;          redo A;
2758        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
         !!!cp (208);  
2759          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2760    
2761          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2762          $self->{s_kwd} = '';            !!!cp (208);
2763              $self->{state} = DATA_STATE;
2764              $self->{s_kwd} = '';
2765              $self->{ct}->{quirks} = 1;
2766            } else {
2767              !!!cp (208.1);
2768              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2769            }
2770            
2771          !!!next-input-character;          !!!next-input-character;
2772            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2773          redo A;          redo A;
2774        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (209);  
2775          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2776    
2777          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2778          $self->{s_kwd} = '';            !!!cp (209);
2779              $self->{state} = DATA_STATE;
2780              $self->{s_kwd} = '';
2781              $self->{ct}->{quirks} = 1;
2782            } else {
2783              !!!cp (209.1);
2784              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2785            }
2786            
2787          ## reconsume          ## reconsume
2788            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2789          redo A;          redo A;
2790        } else {        } else {
2791          !!!cp (210);          !!!cp (210);
2792          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2793          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
2794                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2795    
# Line 2498  sub _get_next_token ($) { Line 2816  sub _get_next_token ($) {
2816    
2817          redo A;          redo A;
2818        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (213);  
2819          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2820    
2821          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2822          $self->{s_kwd} = '';            !!!cp (213);
2823          ## reconsume            $self->{state} = DATA_STATE;
2824              $self->{s_kwd} = '';
2825          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2826          !!!emit ($self->{ct}); # DOCTYPE          } else {
2827              !!!cp (213.1);
2828              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2829            }
2830    
2831            ## reconsume
2832            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2833          redo A;          redo A;
2834        } else {        } else {
2835          !!!cp (214);          !!!cp (214);
2836          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2837          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
2838                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2839    
# Line 2522  sub _get_next_token ($) { Line 2843  sub _get_next_token ($) {
2843        }        }
2844      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2845        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2846          !!!cp (215);          if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2847          ## Stay in the state            !!!cp (215.1);
2848              $self->{state} = BEFORE_NDATA_STATE;
2849            } else {
2850              !!!cp (215);
2851              ## Stay in the state
2852            }
2853          !!!next-input-character;          !!!next-input-character;
2854          redo A;          redo A;
2855        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2856          !!!cp (216);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2857          $self->{state} = DATA_STATE;            !!!cp (216);
2858          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2859          !!!next-input-character;            $self->{s_kwd} = '';
2860            } else {
2861          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (216.1);
2862              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2863            }
2864    
2865            !!!next-input-character;
2866            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2867            redo A;
2868          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2869                   ($self->{nc} == 0x004E or # N
2870                    $self->{nc} == 0x006E)) { # n
2871            !!!cp (216.2);
2872            !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2873            $self->{state} = NDATA_STATE;
2874            $self->{kwd} = chr $self->{nc};
2875            !!!next-input-character;
2876          redo A;          redo A;
2877        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2878          !!!cp (217);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2879          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (217);
2880          $self->{state} = DATA_STATE;            !!!parse-error (type => 'unclosed DOCTYPE');
2881          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2882          ## reconsume            $self->{s_kwd} = '';
2883              $self->{ct}->{quirks} = 1;
2884          $self->{ct}->{quirks} = 1;          } else {
2885          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (217.1);
2886              !!!parse-error (type => 'unclosed md'); ## TODO: type
2887              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2888            }
2889    
2890            ## reconsume
2891            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2892          redo A;          redo A;
2893        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2894                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2895                   $self->{nc} == 0x005B) { # [
2896          !!!cp (218.1);          !!!cp (218.1);
2897          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2898          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2899            $self->{in_subset} = 1;
2900          !!!next-input-character;          !!!next-input-character;
2901            !!!emit ($self->{ct}); # DOCTYPE
2902          redo A;          redo A;
2903        } else {        } else {
         !!!cp (218);  
2904          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
         #$self->{ct}->{quirks} = 1;  
2905    
2906          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2907              !!!cp (218);
2908              #$self->{ct}->{quirks} = 1;
2909              $self->{state} = BOGUS_DOCTYPE_STATE;
2910            } else {
2911              !!!cp (218.2);
2912              $self->{state} = BOGUS_MD_STATE;
2913            }
2914    
2915            !!!next-input-character;
2916            redo A;
2917          }
2918        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2919          if ($is_space->{$self->{nc}}) {
2920            !!!cp (218.3);
2921            ## Stay in the state.
2922            !!!next-input-character;
2923            redo A;
2924          } elsif ($self->{nc} == 0x003E) { # >
2925            !!!cp (218.4);
2926            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2927            !!!next-input-character;
2928            !!!emit ($self->{ct}); # ENTITY
2929            redo A;
2930          } elsif ($self->{nc} == 0x004E or # N
2931                   $self->{nc} == 0x006E) { # n
2932            !!!cp (218.5);
2933            $self->{state} = NDATA_STATE;
2934            $self->{kwd} = chr $self->{nc};
2935            !!!next-input-character;
2936            redo A;
2937          } elsif ($self->{nc} == -1) {
2938            !!!cp (218.6);
2939            !!!parse-error (type => 'unclosed md'); ## TODO: type
2940            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2941            ## reconsume
2942            !!!emit ($self->{ct}); # ENTITY
2943            redo A;
2944          } else {
2945            !!!cp (218.7);
2946            !!!parse-error (type => 'string after SYSTEM literal');
2947            $self->{state} = BOGUS_MD_STATE;
2948          !!!next-input-character;          !!!next-input-character;
2949          redo A;          redo A;
2950        }        }
# Line 2572  sub _get_next_token ($) { Line 2959  sub _get_next_token ($) {
2959    
2960          redo A;          redo A;
2961        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2962          if ($self->{ct}->{has_internal_subset}) { # DOCTYPE          !!!cp (220.1);
2963            !!!cp (220.2);          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2964            ## Stay in the state.          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2965            !!!next-input-character;          $self->{in_subset} = 1;
2966            redo A;          !!!next-input-character;
2967          } else {          !!!emit ($self->{ct}); # DOCTYPE
2968            !!!cp (220.1);          redo A;
           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;  
           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE  
           !!!next-input-character;  
           redo A;  
         }  
2969        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2970          !!!cp (220);          !!!cp (220);
2971          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 2751  sub _get_next_token ($) { Line 3133  sub _get_next_token ($) {
3133          redo A;          redo A;
3134        }        }
3135      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
3136        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
           $self->{nc} == 0x0058) { # X  
3137          !!!cp (995);          !!!cp (995);
3138          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
3139          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3140          !!!next-input-character;          !!!next-input-character;
3141          redo A;          redo A;
3142          } elsif ($self->{nc} == 0x0058) { # X
3143            !!!cp (995.1);
3144            if ($self->{is_xml}) {
3145              !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3146            }
3147            $self->{state} = HEXREF_X_STATE;
3148            $self->{kwd} .= chr $self->{nc};
3149            !!!next-input-character;
3150            redo A;
3151        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
3152                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
3153          !!!cp (994);          !!!cp (994);
# Line 2965  sub _get_next_token ($) { Line 3355  sub _get_next_token ($) {
3355          redo A;          redo A;
3356        }        }
3357      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3358        if (length $self->{kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
3359            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
3360            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
3361              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
3362             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
3363              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
3364             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B) { # ;
             $self->{nc} <= 0x0039) or # 9  
            $self->{nc} == 0x003B)) { # ;  
3365          our $EntityChar;          our $EntityChar;
3366          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3367          if (defined $EntityChar->{$self->{kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
3368                $self->{ge}->{$self->{kwd}}) {
3369            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3370              !!!cp (1020);              if (defined $self->{ge}->{$self->{kwd}}) {
3371              $self->{entity__value} = $EntityChar->{$self->{kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3372                    !!!cp (1020.1);
3373                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3374                  } else {
3375                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3376                      !!!cp (1020.2);
3377                      !!!parse-error (type => 'unparsed entity', ## TODO: type
3378                                      value => $self->{kwd});
3379                    } else {
3380                      !!!cp (1020.3);
3381                    }
3382                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3383                  }
3384                } else {
3385                  if ($self->{is_xml}) {
3386                    !!!cp (1020.4);
3387                    !!!parse-error (type => 'entity not declared', ## TODO: type
3388                                    value => $self->{kwd},
3389                                    level => {
3390                                              'amp;' => $self->{level}->{warn},
3391                                              'quot;' => $self->{level}->{warn},
3392                                              'lt;' => $self->{level}->{warn},
3393                                              'gt;' => $self->{level}->{warn},
3394                                              'apos;' => $self->{level}->{warn},
3395                                             }->{$self->{kwd}} ||
3396                                             $self->{level}->{must});
3397                  } else {
3398                    !!!cp (1020);
3399                  }
3400                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
3401                }
3402              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3403              !!!next-input-character;              !!!next-input-character;
3404              #              #
# Line 3065  sub _get_next_token ($) { Line 3484  sub _get_next_token ($) {
3484      ## XML-only states      ## XML-only states
3485    
3486      } elsif ($self->{state} == PI_STATE) {      } elsif ($self->{state} == PI_STATE) {
3487          ## XML5: "Pi state" and "DOCTYPE pi state".
3488    
3489        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
3490            $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"            $self->{nc} == 0x003F or # ?
3491            $self->{nc} == -1) {            $self->{nc} == -1) {
3492            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3493            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
3494            ## "DOCTYPE pi state": Parse error, switch to the "data
3495            ## state".
3496          !!!parse-error (type => 'bare pio', ## TODO: type          !!!parse-error (type => 'bare pio', ## TODO: type
3497                          line => $self->{line_prev},                          line => $self->{line_prev},
3498                          column => $self->{column_prev}                          column => $self->{column_prev}
# Line 3082  sub _get_next_token ($) { Line 3507  sub _get_next_token ($) {
3507                        };                        };
3508          redo A;          redo A;
3509        } else {        } else {
3510            ## XML5: "DOCTYPE pi state": Stay in the state.
3511          $self->{ct} = {type => PI_TOKEN,          $self->{ct} = {type => PI_TOKEN,
3512                         target => chr $self->{nc},                         target => chr $self->{nc},
3513                         data => '',                         data => '',
# Line 3099  sub _get_next_token ($) { Line 3525  sub _get_next_token ($) {
3525          redo A;          redo A;
3526        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3527          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3528          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3529          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3530            } else {
3531              $self->{state} = DATA_STATE;
3532              $self->{s_kwd} = '';
3533            }
3534          ## Reconsume.          ## Reconsume.
3535          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3536          redo A;          redo A;
# Line 3131  sub _get_next_token ($) { Line 3561  sub _get_next_token ($) {
3561          redo A;          redo A;
3562        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3563          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3564          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3565          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3566            } else {
3567              $self->{state} = DATA_STATE;
3568              $self->{s_kwd} = '';
3569            }
3570          ## Reprocess.          ## Reprocess.
3571          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3572          redo A;          redo A;
# Line 3146  sub _get_next_token ($) { Line 3580  sub _get_next_token ($) {
3580          redo A;          redo A;
3581        }        }
3582      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
3583          ## XML5: Part of "Pi after state".
3584    
3585        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3586          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3587          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3588            } else {
3589              $self->{state} = DATA_STATE;
3590              $self->{s_kwd} = '';
3591            }
3592          !!!next-input-character;          !!!next-input-character;
3593          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3594          redo A;          redo A;
# Line 3171  sub _get_next_token ($) { Line 3611  sub _get_next_token ($) {
3611          redo A;          redo A;
3612        }        }
3613      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3614        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3615    
3616        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3617          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3618          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3619            } else {
3620              $self->{state} = DATA_STATE;
3621              $self->{s_kwd} = '';
3622            }
3623          !!!next-input-character;          !!!next-input-character;
3624          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3625          redo A;          redo A;
# Line 3192  sub _get_next_token ($) { Line 3637  sub _get_next_token ($) {
3637    
3638      } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {      } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3639        if ($self->{nc} == 0x003C) { # <        if ($self->{nc} == 0x003C) { # <
3640          ## TODO:          $self->{state} = DOCTYPE_TAG_STATE;
3641          !!!next-input-character;          !!!next-input-character;
3642          redo A;          redo A;
3643        } elsif ($self->{nc} == 0x0025) { # %        } elsif ($self->{nc} == 0x0025) { # %
# Line 3202  sub _get_next_token ($) { Line 3647  sub _get_next_token ($) {
3647          !!!next-input-character;          !!!next-input-character;
3648          redo A;          redo A;
3649        } elsif ($self->{nc} == 0x005D) { # ]        } elsif ($self->{nc} == 0x005D) { # ]
3650            delete $self->{in_subset};
3651          $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3652          !!!next-input-character;          !!!next-input-character;
3653          redo A;          redo A;
# Line 3211  sub _get_next_token ($) { Line 3657  sub _get_next_token ($) {
3657          redo A;          redo A;
3658        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3659          !!!parse-error (type => 'unclosed internal subset'); ## TODO: type          !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3660            delete $self->{in_subset};
3661          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3662          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3663          ## Reconsume.          ## Reconsume.
3664          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3665          redo A;          redo A;
3666        } else {        } else {
3667          unless ($self->{internal_subset_tainted}) {          unless ($self->{internal_subset_tainted}) {
# Line 3231  sub _get_next_token ($) { Line 3678  sub _get_next_token ($) {
3678          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3679          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3680          !!!next-input-character;          !!!next-input-character;
3681          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3682          redo A;          redo A;
3683        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3684          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
3685          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3686          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3687          ## Reconsume.          ## Reconsume.
3688          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3689          redo A;          redo A;
3690        } else {        } else {
3691          ## XML5: No parse error and stay in the state.          ## XML5: No parse error and stay in the state.
3692          !!!parse-error (type => 'string after internal subset'); ## TODO: type          !!!parse-error (type => 'string after internal subset'); ## TODO: type
3693    
3694          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3695            !!!next-input-character;
3696            redo A;
3697          }
3698        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3699          if ($self->{nc} == 0x003E) { # >
3700            $self->{state} = DATA_STATE;
3701            $self->{s_kwd} = '';
3702            !!!next-input-character;
3703            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3704            redo A;
3705          } elsif ($self->{nc} == -1) {
3706            $self->{state} = DATA_STATE;
3707            $self->{s_kwd} = '';
3708            ## Reconsume.
3709            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3710            redo A;
3711          } else {
3712            ## Stay in the state.
3713            !!!next-input-character;
3714            redo A;
3715          }
3716        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3717          if ($self->{nc} == 0x0021) { # !
3718            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3719            !!!next-input-character;
3720            redo A;
3721          } elsif ($self->{nc} == 0x003F) { # ?
3722            $self->{state} = PI_STATE;
3723            !!!next-input-character;
3724            redo A;
3725          } elsif ($self->{nc} == -1) {
3726            !!!parse-error (type => 'bare stago');
3727            $self->{state} = DATA_STATE;
3728            $self->{s_kwd} = '';
3729            ## Reconsume.
3730            redo A;
3731          } else {
3732            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3733                            line => $self->{line_prev},
3734                            column => $self->{column_prev});
3735            $self->{state} = BOGUS_COMMENT_STATE;
3736            $self->{ct} = {type => COMMENT_TOKEN,
3737                           data => '',
3738                          }; ## NOTE: Will be discarded.
3739            !!!next-input-character;
3740            redo A;
3741          }
3742        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3743          ## XML5: "DOCTYPE markup declaration state".
3744          
3745          if ($self->{nc} == 0x002D) { # -
3746            $self->{state} = MD_HYPHEN_STATE;
3747            !!!next-input-character;
3748            redo A;
3749          } elsif ($self->{nc} == 0x0045 or # E
3750                   $self->{nc} == 0x0065) { # e
3751            $self->{state} = MD_E_STATE;
3752            $self->{kwd} = chr $self->{nc};
3753            !!!next-input-character;
3754            redo A;
3755          } elsif ($self->{nc} == 0x0041 or # A
3756                   $self->{nc} == 0x0061) { # a
3757            $self->{state} = MD_ATTLIST_STATE;
3758            $self->{kwd} = chr $self->{nc};
3759            !!!next-input-character;
3760            redo A;
3761          } elsif ($self->{nc} == 0x004E or # N
3762                   $self->{nc} == 0x006E) { # n
3763            $self->{state} = MD_NOTATION_STATE;
3764            $self->{kwd} = chr $self->{nc};
3765            !!!next-input-character;
3766            redo A;
3767          } else {
3768            #
3769          }
3770          
3771          ## XML5: No parse error.
3772          !!!parse-error (type => 'bogus comment',
3773                          line => $self->{line_prev},
3774                          column => $self->{column_prev} - 1);
3775          ## Reconsume.
3776          $self->{state} = BOGUS_COMMENT_STATE;
3777          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3778          redo A;
3779        } elsif ($self->{state} == MD_E_STATE) {
3780          if ($self->{nc} == 0x004E or # N
3781              $self->{nc} == 0x006E) { # n
3782            $self->{state} = MD_ENTITY_STATE;
3783            $self->{kwd} .= chr $self->{nc};
3784            !!!next-input-character;
3785            redo A;
3786          } elsif ($self->{nc} == 0x004C or # L
3787                   $self->{nc} == 0x006C) { # l
3788            ## XML5: <!ELEMENT> not supported.
3789            $self->{state} = MD_ELEMENT_STATE;
3790            $self->{kwd} .= chr $self->{nc};
3791            !!!next-input-character;
3792            redo A;
3793          } else {
3794            ## XML5: No parse error.
3795            !!!parse-error (type => 'bogus comment',
3796                            line => $self->{line_prev},
3797                            column => $self->{column_prev} - 2
3798                                + 1 * ($self->{nc} == -1));
3799            ## Reconsume.
3800            $self->{state} = BOGUS_COMMENT_STATE;
3801            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3802            redo A;
3803          }
3804        } elsif ($self->{state} == MD_ENTITY_STATE) {
3805          if ($self->{nc} == [
3806                undef,
3807                undef,
3808                0x0054, # T
3809                0x0049, # I
3810                0x0054, # T
3811              ]->[length $self->{kwd}] or
3812              $self->{nc} == [
3813                undef,
3814                undef,
3815                0x0074, # t
3816                0x0069, # i
3817                0x0074, # t
3818              ]->[length $self->{kwd}]) {
3819            ## Stay in the state.
3820            $self->{kwd} .= chr $self->{nc};
3821            !!!next-input-character;
3822            redo A;
3823          } elsif ((length $self->{kwd}) == 5 and
3824                   ($self->{nc} == 0x0059 or # Y
3825                    $self->{nc} == 0x0079)) { # y
3826            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3827              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3828                              text => 'ENTITY',
3829                              line => $self->{line_prev},
3830                              column => $self->{column_prev} - 4);
3831            }
3832            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3833                           line => $self->{line_prev},
3834                           column => $self->{column_prev} - 6};
3835            $self->{state} = DOCTYPE_MD_STATE;
3836            !!!next-input-character;
3837            redo A;
3838          } else {
3839            !!!parse-error (type => 'bogus comment',
3840                            line => $self->{line_prev},
3841                            column => $self->{column_prev} - 1
3842                                - (length $self->{kwd})
3843                                + 1 * ($self->{nc} == -1));
3844            $self->{state} = BOGUS_COMMENT_STATE;
3845            ## Reconsume.
3846            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3847            redo A;
3848          }
3849        } elsif ($self->{state} == MD_ELEMENT_STATE) {
3850          if ($self->{nc} == [
3851               undef,
3852               undef,
3853               0x0045, # E
3854               0x004D, # M
3855               0x0045, # E
3856               0x004E, # N
3857              ]->[length $self->{kwd}] or
3858              $self->{nc} == [
3859               undef,
3860               undef,
3861               0x0065, # e
3862               0x006D, # m
3863               0x0065, # e
3864               0x006E, # n
3865              ]->[length $self->{kwd}]) {
3866            ## Stay in the state.
3867            $self->{kwd} .= chr $self->{nc};
3868            !!!next-input-character;
3869            redo A;
3870          } elsif ((length $self->{kwd}) == 6 and
3871                   ($self->{nc} == 0x0054 or # T
3872                    $self->{nc} == 0x0074)) { # t
3873            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3874              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3875                              text => 'ELEMENT',
3876                              line => $self->{line_prev},
3877                              column => $self->{column_prev} - 5);
3878            }
3879            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3880                           line => $self->{line_prev},
3881                           column => $self->{column_prev} - 6};
3882            $self->{state} = DOCTYPE_MD_STATE;
3883            !!!next-input-character;
3884            redo A;
3885          } else {
3886            !!!parse-error (type => 'bogus comment',
3887                            line => $self->{line_prev},
3888                            column => $self->{column_prev} - 1
3889                                - (length $self->{kwd})
3890                                + 1 * ($self->{nc} == -1));
3891            $self->{state} = BOGUS_COMMENT_STATE;
3892            ## Reconsume.
3893            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3894            redo A;
3895          }
3896        } elsif ($self->{state} == MD_ATTLIST_STATE) {
3897          if ($self->{nc} == [
3898               undef,
3899               0x0054, # T
3900               0x0054, # T
3901               0x004C, # L
3902               0x0049, # I
3903               0x0053, # S
3904              ]->[length $self->{kwd}] or
3905              $self->{nc} == [
3906               undef,
3907               0x0074, # t
3908               0x0074, # t
3909               0x006C, # l
3910               0x0069, # i
3911               0x0073, # s
3912              ]->[length $self->{kwd}]) {
3913            ## Stay in the state.
3914            $self->{kwd} .= chr $self->{nc};
3915            !!!next-input-character;
3916            redo A;
3917          } elsif ((length $self->{kwd}) == 6 and
3918                   ($self->{nc} == 0x0054 or # T
3919                    $self->{nc} == 0x0074)) { # t
3920            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3921              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3922                              text => 'ATTLIST',
3923                              line => $self->{line_prev},
3924                              column => $self->{column_prev} - 5);
3925            }
3926            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3927                           attrdefs => [],
3928                           line => $self->{line_prev},
3929                           column => $self->{column_prev} - 6};
3930            $self->{state} = DOCTYPE_MD_STATE;
3931            !!!next-input-character;
3932            redo A;
3933          } else {
3934            !!!parse-error (type => 'bogus comment',
3935                            line => $self->{line_prev},
3936                            column => $self->{column_prev} - 1
3937                                 - (length $self->{kwd})
3938                                 + 1 * ($self->{nc} == -1));
3939            $self->{state} = BOGUS_COMMENT_STATE;
3940            ## Reconsume.
3941            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3942            redo A;
3943          }
3944        } elsif ($self->{state} == MD_NOTATION_STATE) {
3945          if ($self->{nc} == [
3946               undef,
3947               0x004F, # O
3948               0x0054, # T
3949               0x0041, # A
3950               0x0054, # T
3951               0x0049, # I
3952               0x004F, # O
3953              ]->[length $self->{kwd}] or
3954              $self->{nc} == [
3955               undef,
3956               0x006F, # o
3957               0x0074, # t
3958               0x0061, # a
3959               0x0074, # t
3960               0x0069, # i
3961               0x006F, # o
3962              ]->[length $self->{kwd}]) {
3963            ## Stay in the state.
3964            $self->{kwd} .= chr $self->{nc};
3965            !!!next-input-character;
3966            redo A;
3967          } elsif ((length $self->{kwd}) == 7 and
3968                   ($self->{nc} == 0x004E or # N
3969                    $self->{nc} == 0x006E)) { # n
3970            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3971              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3972                              text => 'NOTATION',
3973                              line => $self->{line_prev},
3974                              column => $self->{column_prev} - 6);
3975            }
3976            $self->{ct} = {type => NOTATION_TOKEN, name => '',
3977                           line => $self->{line_prev},
3978                           column => $self->{column_prev} - 6};
3979            $self->{state} = DOCTYPE_MD_STATE;
3980            !!!next-input-character;
3981            redo A;
3982          } else {
3983            !!!parse-error (type => 'bogus comment',
3984                            line => $self->{line_prev},
3985                            column => $self->{column_prev} - 1
3986                                - (length $self->{kwd})
3987                                + 1 * ($self->{nc} == -1));
3988            $self->{state} = BOGUS_COMMENT_STATE;
3989            ## Reconsume.
3990            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3991            redo A;
3992          }
3993        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3994          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3995          ## "DOCTYPE NOTATION state".
3996    
3997          if ($is_space->{$self->{nc}}) {
3998            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3999            $self->{state} = BEFORE_MD_NAME_STATE;
4000            !!!next-input-character;
4001            redo A;
4002          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4003                   $self->{nc} == 0x0025) { # %
4004            ## XML5: Switch to the "DOCTYPE bogus comment state".
4005            !!!parse-error (type => 'no space before md name'); ## TODO: type
4006            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4007            !!!next-input-character;
4008            redo A;
4009          } elsif ($self->{nc} == -1) {
4010            !!!parse-error (type => 'unclosed md'); ## TODO: type
4011            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4012            ## Reconsume.
4013            redo A;
4014          } elsif ($self->{nc} == 0x003E) { # >
4015            ## XML5: Switch to the "DOCTYPE bogus comment state".
4016            !!!parse-error (type => 'no md name'); ## TODO: type
4017            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4018            !!!next-input-character;
4019            redo A;
4020          } else {
4021            ## XML5: Switch to the "DOCTYPE bogus comment state".
4022            !!!parse-error (type => 'no space before md name'); ## TODO: type
4023            $self->{state} = BEFORE_MD_NAME_STATE;
4024            redo A;
4025          }
4026        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4027          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4028          ## before state", "DOCTYPE ATTLIST name before state".
4029    
4030          if ($is_space->{$self->{nc}}) {
4031            ## Stay in the state.
4032            !!!next-input-character;
4033            redo A;
4034          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4035                   $self->{nc} == 0x0025) { # %
4036            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4037            !!!next-input-character;
4038            redo A;
4039          } elsif ($self->{nc} == 0x003E) { # >
4040            ## XML5: Same as "Anything else".
4041            !!!parse-error (type => 'no md name'); ## TODO: type
4042            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4043            !!!next-input-character;
4044            redo A;
4045          } elsif ($self->{nc} == -1) {
4046            !!!parse-error (type => 'unclosed md'); ## TODO: type
4047            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4048            ## Reconsume.
4049            redo A;
4050          } else {
4051            ## XML5: [ATTLIST] Not defined yet.
4052            $self->{ct}->{name} .= chr $self->{nc};
4053            $self->{state} = MD_NAME_STATE;
4054            !!!next-input-character;
4055            redo A;
4056          }
4057        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4058          if ($is_space->{$self->{nc}}) {
4059            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4060            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4061            $self->{state} = BEFORE_MD_NAME_STATE;
4062            !!!next-input-character;
4063            redo A;
4064          } elsif ($self->{nc} == 0x003E) { # >
4065            ## XML5: Same as "Anything else".
4066            !!!parse-error (type => 'no md name'); ## TODO: type
4067            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4068            !!!next-input-character;
4069            redo A;
4070          } elsif ($self->{nc} == -1) {
4071            !!!parse-error (type => 'unclosed md');
4072            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4073            ## Reconsume.
4074            redo A;
4075          } else {
4076            ## XML5: No parse error.
4077            !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4078            $self->{state} = BOGUS_COMMENT_STATE;
4079            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4080            ## Reconsume.
4081            redo A;
4082          }
4083        } elsif ($self->{state} == MD_NAME_STATE) {
4084          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4085          
4086          if ($is_space->{$self->{nc}}) {
4087            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4088              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4089            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4090              $self->{state} = AFTER_ELEMENT_NAME_STATE;
4091            } else { # ENTITY/NOTATION
4092              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4093            }
4094            !!!next-input-character;
4095            redo A;
4096          } elsif ($self->{nc} == 0x003E) { # >
4097            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4098              #
4099            } else {
4100              !!!parse-error (type => 'no md def'); ## TODO: type
4101            }
4102            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4103            !!!next-input-character;
4104            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4105            redo A;
4106          } elsif ($self->{nc} == -1) {
4107            ## XML5: [ATTLIST] No parse error.
4108            !!!parse-error (type => 'unclosed md');
4109            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4110            ## Reconsume.
4111            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4112            redo A;
4113          } else {
4114            ## XML5: [ATTLIST] Not defined yet.
4115            $self->{ct}->{name} .= chr $self->{nc};
4116            ## Stay in the state.
4117            !!!next-input-character;
4118            redo A;
4119          }
4120        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4121          if ($is_space->{$self->{nc}}) {
4122            ## Stay in the state.
4123            !!!next-input-character;
4124            redo A;
4125          } elsif ($self->{nc} == 0x003E) { # >
4126            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4127            !!!next-input-character;
4128            !!!emit ($self->{ct}); # ATTLIST
4129            redo A;
4130          } elsif ($self->{nc} == -1) {
4131            ## XML5: No parse error.
4132            !!!parse-error (type => 'unclosed md'); ## TODO: type
4133            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4134            !!!emit ($self->{ct});
4135            redo A;
4136          } else {
4137            ## XML5: Not defined yet.
4138            $self->{ca} = {name => chr ($self->{nc}), # attrdef
4139                           tokens => [],
4140                           line => $self->{line}, column => $self->{column}};
4141            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4142            !!!next-input-character;
4143            redo A;
4144          }
4145        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4146          if ($is_space->{$self->{nc}}) {
4147            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4148            !!!next-input-character;
4149            redo A;
4150          } elsif ($self->{nc} == 0x003E) { # >
4151            ## XML5: Same as "anything else".
4152            !!!parse-error (type => 'no attr type'); ## TODO: type
4153            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4154            !!!next-input-character;
4155            !!!emit ($self->{ct}); # ATTLIST
4156            redo A;
4157          } elsif ($self->{nc} == 0x0028) { # (
4158            ## XML5: Same as "anything else".
4159            !!!parse-error (type => 'no space before paren'); ## TODO: type
4160            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4161            !!!next-input-character;
4162            redo A;
4163          } elsif ($self->{nc} == -1) {
4164            ## XML5: No parse error.
4165            !!!parse-error (type => 'unclosed md'); ## TODO: type
4166            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4167            !!!next-input-character;
4168            !!!emit ($self->{ct}); # ATTLIST
4169            redo A;
4170          } else {
4171            ## XML5: Not defined yet.
4172            $self->{ca}->{name} .= chr $self->{nc};
4173            ## Stay in the state.
4174            !!!next-input-character;
4175            redo A;
4176          }
4177        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4178          if ($is_space->{$self->{nc}}) {
4179            ## Stay in the state.
4180            !!!next-input-character;
4181            redo A;
4182          } elsif ($self->{nc} == 0x003E) { # >
4183            ## XML5: Same as "anything else".
4184            !!!parse-error (type => 'no attr type'); ## TODO: type
4185            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4186            !!!next-input-character;
4187            !!!emit ($self->{ct}); # ATTLIST
4188            redo A;
4189          } elsif ($self->{nc} == 0x0028) { # (
4190            ## XML5: Same as "anything else".
4191            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4192            !!!next-input-character;
4193            redo A;
4194          } elsif ($self->{nc} == -1) {
4195            ## XML5: No parse error.
4196            !!!parse-error (type => 'unclosed md'); ## TODO: type
4197            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4198            !!!next-input-character;
4199            !!!emit ($self->{ct});
4200            redo A;
4201          } else {
4202            ## XML5: Not defined yet.
4203            $self->{ca}->{type} = chr $self->{nc};
4204            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4205            !!!next-input-character;
4206            redo A;
4207          }
4208        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4209          if ($is_space->{$self->{nc}}) {
4210            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4211            !!!next-input-character;
4212            redo A;
4213          } elsif ($self->{nc} == 0x0023) { # #
4214            ## XML5: Same as "anything else".
4215            !!!parse-error (type => 'no space before default value'); ## TODO: type
4216            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4217            !!!next-input-character;
4218            redo A;
4219          } elsif ($self->{nc} == 0x0022) { # "
4220            ## XML5: Same as "anything else".
4221            !!!parse-error (type => 'no space before default value'); ## TODO: type
4222            $self->{ca}->{value} = '';
4223            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4224            !!!next-input-character;
4225            redo A;
4226          } elsif ($self->{nc} == 0x0027) { # '
4227            ## XML5: Same as "anything else".
4228            !!!parse-error (type => 'no space before default value'); ## TODO: type
4229            $self->{ca}->{value} = '';
4230            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4231            !!!next-input-character;
4232            redo A;
4233          } elsif ($self->{nc} == 0x003E) { # >
4234            ## XML5: Same as "anything else".
4235            !!!parse-error (type => 'no attr default'); ## TODO: type
4236            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4237            !!!next-input-character;
4238            !!!emit ($self->{ct}); # ATTLIST
4239            redo A;
4240          } elsif ($self->{nc} == 0x0028) { # (
4241            ## XML5: Same as "anything else".
4242            !!!parse-error (type => 'no space before paren'); ## TODO: type
4243            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4244            !!!next-input-character;
4245            redo A;
4246          } elsif ($self->{nc} == -1) {
4247            ## XML5: No parse error.
4248            !!!parse-error (type => 'unclosed md'); ## TODO: type
4249            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4250            !!!next-input-character;
4251            !!!emit ($self->{ct});
4252            redo A;
4253          } else {
4254            ## XML5: Not defined yet.
4255            $self->{ca}->{type} .= chr $self->{nc};
4256            ## Stay in the state.
4257            !!!next-input-character;
4258            redo A;
4259          }
4260        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4261          if ($is_space->{$self->{nc}}) {
4262            ## Stay in the state.
4263            !!!next-input-character;
4264            redo A;
4265          } elsif ($self->{nc} == 0x0028) { # (
4266            ## XML5: Same as "anything else".
4267            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4268            !!!next-input-character;
4269            redo A;
4270          } elsif ($self->{nc} == 0x0023) { # #
4271            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4272            !!!next-input-character;
4273            redo A;
4274          } elsif ($self->{nc} == 0x0022) { # "
4275            ## XML5: Same as "anything else".
4276            $self->{ca}->{value} = '';
4277            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4278            !!!next-input-character;
4279            redo A;
4280          } elsif ($self->{nc} == 0x0027) { # '
4281            ## XML5: Same as "anything else".
4282            $self->{ca}->{value} = '';
4283            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4284            !!!next-input-character;
4285            redo A;
4286          } elsif ($self->{nc} == 0x003E) { # >
4287            ## XML5: Same as "anything else".
4288            !!!parse-error (type => 'no attr default'); ## TODO: type
4289            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4290            !!!next-input-character;
4291            !!!emit ($self->{ct}); # ATTLIST
4292            redo A;
4293          } elsif ($self->{nc} == -1) {
4294            ## XML5: No parse error.
4295            !!!parse-error (type => 'unclosed md'); ## TODO: type
4296            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4297            !!!next-input-character;
4298            !!!emit ($self->{ct});
4299            redo A;
4300          } else {
4301            ## XML5: Switch to the "DOCTYPE bogus comment state".
4302            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4303            $self->{ca}->{value} = '';
4304            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4305            ## Reconsume.
4306            redo A;
4307          }
4308        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4309          if ($is_space->{$self->{nc}}) {
4310            ## Stay in the state.
4311            !!!next-input-character;
4312            redo A;
4313          } elsif ($self->{nc} == 0x007C) { # |
4314            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4315            ## Stay in the state.
4316            !!!next-input-character;
4317            redo A;
4318          } elsif ($self->{nc} == 0x0029) { # )
4319            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4320            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4321            !!!next-input-character;
4322            redo A;
4323          } elsif ($self->{nc} == 0x003E) { # >
4324            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4325            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4326            !!!next-input-character;
4327            !!!emit ($self->{ct}); # ATTLIST
4328            redo A;
4329          } elsif ($self->{nc} == -1) {
4330            ## XML5: No parse error.
4331            !!!parse-error (type => 'unclosed md'); ## TODO: type
4332            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4333            !!!next-input-character;
4334            !!!emit ($self->{ct});
4335            redo A;
4336          } else {
4337            push @{$self->{ca}->{tokens}}, chr $self->{nc};
4338            $self->{state} = ALLOWED_TOKEN_STATE;
4339            !!!next-input-character;
4340            redo A;
4341          }
4342        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4343          if ($is_space->{$self->{nc}}) {
4344            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4345            !!!next-input-character;
4346            redo A;
4347          } elsif ($self->{nc} == 0x007C) { # |
4348            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4349            !!!next-input-character;
4350            redo A;
4351          } elsif ($self->{nc} == 0x0029) { # )
4352            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4353            !!!next-input-character;
4354            redo A;
4355          } elsif ($self->{nc} == 0x003E) { # >
4356            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4357            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4358            !!!next-input-character;
4359            !!!emit ($self->{ct}); # ATTLIST
4360            redo A;
4361          } elsif ($self->{nc} == -1) {
4362            ## XML5: No parse error.
4363            !!!parse-error (type => 'unclosed md'); ## TODO: type
4364            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4365            !!!next-input-character;
4366            !!!emit ($self->{ct});
4367            redo A;
4368          } else {
4369            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4370            ## Stay in the state.
4371            !!!next-input-character;
4372            redo A;
4373          }
4374        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4375          if ($is_space->{$self->{nc}}) {
4376            ## Stay in the state.
4377            !!!next-input-character;
4378            redo A;
4379          } elsif ($self->{nc} == 0x007C) { # |
4380            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4381            !!!next-input-character;
4382            redo A;
4383          } elsif ($self->{nc} == 0x0029) { # )
4384            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4385            !!!next-input-character;
4386            redo A;
4387          } elsif ($self->{nc} == 0x003E) { # >
4388            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4389            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4390            !!!next-input-character;
4391            !!!emit ($self->{ct}); # ATTLIST
4392            redo A;
4393          } elsif ($self->{nc} == -1) {
4394            ## XML5: No parse error.
4395            !!!parse-error (type => 'unclosed md'); ## TODO: type
4396            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4397            !!!next-input-character;
4398            !!!emit ($self->{ct});
4399            redo A;
4400          } else {
4401            !!!parse-error (type => 'space in allowed token', ## TODO: type
4402                            line => $self->{line_prev},
4403                            column => $self->{column_prev});
4404            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4405            $self->{state} = ALLOWED_TOKEN_STATE;
4406            !!!next-input-character;
4407            redo A;
4408          }
4409        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4410          if ($is_space->{$self->{nc}}) {
4411            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4412            !!!next-input-character;
4413            redo A;
4414          } elsif ($self->{nc} == 0x0023) { # #
4415            !!!parse-error (type => 'no space before default value'); ## TODO: type
4416            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4417            !!!next-input-character;
4418            redo A;
4419          } elsif ($self->{nc} == 0x0022) { # "
4420            !!!parse-error (type => 'no space before default value'); ## TODO: type
4421            $self->{ca}->{value} = '';
4422            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4423            !!!next-input-character;
4424            redo A;
4425          } elsif ($self->{nc} == 0x0027) { # '
4426            !!!parse-error (type => 'no space before default value'); ## TODO: type
4427            $self->{ca}->{value} = '';
4428            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4429            !!!next-input-character;
4430            redo A;
4431          } elsif ($self->{nc} == 0x003E) { # >
4432            !!!parse-error (type => 'no attr default'); ## TODO: type
4433            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4434            !!!next-input-character;
4435            !!!emit ($self->{ct}); # ATTLIST
4436            redo A;
4437          } elsif ($self->{nc} == -1) {
4438            !!!parse-error (type => 'unclosed md'); ## TODO: type
4439            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4440            !!!next-input-character;
4441            !!!emit ($self->{ct});
4442            redo A;
4443          } else {
4444            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4445            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4446            ## Reconsume.
4447            redo A;
4448          }
4449        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4450          if ($is_space->{$self->{nc}}) {
4451            ## Stay in the state.
4452            !!!next-input-character;
4453            redo A;
4454          } elsif ($self->{nc} == 0x0023) { # #
4455            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4456            !!!next-input-character;
4457            redo A;
4458          } elsif ($self->{nc} == 0x0022) { # "
4459            $self->{ca}->{value} = '';
4460            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4461            !!!next-input-character;
4462            redo A;
4463          } elsif ($self->{nc} == 0x0027) { # '
4464            $self->{ca}->{value} = '';
4465            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4466            !!!next-input-character;
4467            redo A;
4468          } elsif ($self->{nc} == 0x003E) { # >
4469            !!!parse-error (type => 'no attr default'); ## TODO: type
4470            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4471            !!!next-input-character;
4472            !!!emit ($self->{ct}); # ATTLIST
4473            redo A;
4474          } elsif ($self->{nc} == -1) {
4475            !!!parse-error (type => 'unclosed md'); ## TODO: type
4476            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4477            !!!next-input-character;
4478            !!!emit ($self->{ct});
4479            redo A;
4480          } else {
4481            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4482            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4483            ## Reconsume.
4484            redo A;
4485          }
4486        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4487          if ($is_space->{$self->{nc}}) {
4488            ## XML5: No parse error.
4489            !!!parse-error (type => 'no default type'); ## TODO: type
4490            $self->{state} = BOGUS_MD_STATE;
4491            ## Reconsume.
4492            redo A;
4493          } elsif ($self->{nc} == 0x0022) { # "
4494            ## XML5: Same as "anything else".
4495            $self->{ca}->{value} = '';
4496            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4497            !!!next-input-character;
4498            redo A;
4499          } elsif ($self->{nc} == 0x0027) { # '
4500            ## XML5: Same as "anything else".
4501            $self->{ca}->{value} = '';
4502            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4503            !!!next-input-character;
4504            redo A;
4505          } elsif ($self->{nc} == 0x003E) { # >
4506            ## XML5: Same as "anything else".
4507            !!!parse-error (type => 'no attr default'); ## TODO: type
4508            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4509            !!!next-input-character;
4510            !!!emit ($self->{ct}); # ATTLIST
4511            redo A;
4512          } elsif ($self->{nc} == -1) {
4513            ## XML5: No parse error.
4514            !!!parse-error (type => 'unclosed md'); ## TODO: type
4515            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4516            !!!next-input-character;
4517            !!!emit ($self->{ct});
4518            redo A;
4519          } else {
4520            $self->{ca}->{default} = chr $self->{nc};
4521            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4522            !!!next-input-character;
4523            redo A;
4524          }
4525        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4526          if ($is_space->{$self->{nc}}) {
4527            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4528            !!!next-input-character;
4529            redo A;
4530          } elsif ($self->{nc} == 0x0022) { # "
4531            ## XML5: Same as "anything else".
4532            !!!parse-error (type => 'no space before default value'); ## TODO: type
4533            $self->{ca}->{value} = '';
4534            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4535            !!!next-input-character;
4536            redo A;
4537          } elsif ($self->{nc} == 0x0027) { # '
4538            ## XML5: Same as "anything else".
4539            !!!parse-error (type => 'no space before default value'); ## TODO: type
4540            $self->{ca}->{value} = '';
4541            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4542            !!!next-input-character;
4543            redo A;
4544          } elsif ($self->{nc} == 0x003E) { # >
4545            ## XML5: Same as "anything else".
4546            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4547            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4548            !!!next-input-character;
4549            !!!emit ($self->{ct}); # ATTLIST
4550            redo A;
4551          } elsif ($self->{nc} == -1) {
4552            ## XML5: No parse error.
4553            !!!parse-error (type => 'unclosed md'); ## TODO: type
4554            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4555            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4556            !!!next-input-character;
4557            !!!emit ($self->{ct});
4558            redo A;
4559          } else {
4560            $self->{ca}->{default} .= chr $self->{nc};
4561            ## Stay in the state.
4562            !!!next-input-character;
4563            redo A;
4564          }
4565        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4566          if ($is_space->{$self->{nc}}) {
4567            ## Stay in the state.
4568            !!!next-input-character;
4569            redo A;
4570          } elsif ($self->{nc} == 0x0022) { # "
4571            $self->{ca}->{value} = '';
4572            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4573            !!!next-input-character;
4574            redo A;
4575          } elsif ($self->{nc} == 0x0027) { # '
4576            $self->{ca}->{value} = '';
4577            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4578            !!!next-input-character;
4579            redo A;
4580          } elsif ($self->{nc} == 0x003E) { # >
4581            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4582            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4583            !!!next-input-character;
4584            !!!emit ($self->{ct}); # ATTLIST
4585            redo A;
4586          } elsif ($self->{nc} == -1) {
4587            ## XML5: No parse error.
4588            !!!parse-error (type => 'unclosed md'); ## TODO: type
4589            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4590            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4591            !!!next-input-character;
4592            !!!emit ($self->{ct});
4593            redo A;
4594          } else {
4595            ## XML5: Not defined yet.
4596            if ($self->{ca}->{default} eq 'FIXED') {
4597              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4598            } else {
4599              push @{$self->{ct}->{attrdefs}}, $self->{ca};
4600              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4601            }
4602            ## Reconsume.
4603            redo A;
4604          }
4605        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4606          if ($is_space->{$self->{nc}} or
4607              $self->{nc} == -1 or
4608              $self->{nc} == 0x003E) { # >
4609            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4610            ## Reconsume.
4611            redo A;
4612          } else {
4613            !!!parse-error (type => 'no space before attr name'); ## TODO: type
4614            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4615            ## Reconsume.
4616            redo A;
4617          }
4618        } elsif ($self->{state} == NDATA_STATE) {
4619          ## ASCII case-insensitive
4620          if ($self->{nc} == [
4621                undef,
4622                0x0044, # D
4623                0x0041, # A
4624                0x0054, # T
4625              ]->[length $self->{kwd}] or
4626              $self->{nc} == [
4627                undef,
4628                0x0064, # d
4629                0x0061, # a
4630                0x0074, # t
4631              ]->[length $self->{kwd}]) {
4632            !!!cp (172.2);
4633            ## Stay in the state.
4634            $self->{kwd} .= chr $self->{nc};
4635            !!!next-input-character;
4636            redo A;
4637          } elsif ((length $self->{kwd}) == 4 and
4638                   ($self->{nc} == 0x0041 or # A
4639                    $self->{nc} == 0x0061)) { # a
4640            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4641              !!!cp (172.3);
4642              !!!parse-error (type => 'lowercase keyword', ## TODO: type
4643                              text => 'NDATA',
4644                              line => $self->{line_prev},
4645                              column => $self->{column_prev} - 4);
4646            } else {
4647              !!!cp (172.4);
4648            }
4649            $self->{state} = AFTER_NDATA_STATE;
4650            !!!next-input-character;
4651            redo A;
4652          } else {
4653            !!!parse-error (type => 'string after literal', ## TODO: type
4654                            line => $self->{line_prev},
4655                            column => $self->{column_prev} + 1
4656                                - length $self->{kwd});
4657            !!!cp (172.5);
4658            $self->{state} = BOGUS_MD_STATE;
4659            ## Reconsume.
4660            redo A;
4661          }
4662        } elsif ($self->{state} == AFTER_NDATA_STATE) {
4663          if ($is_space->{$self->{nc}}) {
4664            $self->{state} = BEFORE_NOTATION_NAME_STATE;
4665            !!!next-input-character;
4666            redo A;
4667          } elsif ($self->{nc} == 0x003E) { # >
4668            !!!parse-error (type => 'no notation name'); ## TODO: type
4669            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4670            !!!next-input-character;
4671            !!!emit ($self->{ct}); # ENTITY
4672            redo A;
4673          } elsif ($self->{nc} == -1) {
4674            !!!parse-error (type => 'unclosed md'); ## TODO: type
4675            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4676            !!!next-input-character;
4677            !!!emit ($self->{ct}); # ENTITY
4678            redo A;
4679          } else {
4680            !!!parse-error (type => 'string after literal', ## TODO: type
4681                            line => $self->{line_prev},
4682                            column => $self->{column_prev} + 1
4683                                - length $self->{kwd});
4684            $self->{state} = BOGUS_MD_STATE;
4685            ## Reconsume.
4686            redo A;
4687          }
4688        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4689          if ($is_space->{$self->{nc}}) {
4690            ## Stay in the state.
4691            !!!next-input-character;
4692            redo A;
4693          } elsif ($self->{nc} == 0x003E) { # >
4694            !!!parse-error (type => 'no notation name'); ## TODO: type
4695            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4696            !!!next-input-character;
4697            !!!emit ($self->{ct}); # ENTITY
4698            redo A;
4699          } elsif ($self->{nc} == -1) {
4700            !!!parse-error (type => 'unclosed md'); ## TODO: type
4701            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4702            !!!next-input-character;
4703            !!!emit ($self->{ct}); # ENTITY
4704            redo A;
4705          } else {
4706            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4707            $self->{state} = NOTATION_NAME_STATE;
4708            !!!next-input-character;
4709            redo A;
4710          }
4711        } elsif ($self->{state} == NOTATION_NAME_STATE) {
4712          if ($is_space->{$self->{nc}}) {
4713            $self->{state} = AFTER_MD_DEF_STATE;
4714            !!!next-input-character;
4715            redo A;
4716          } elsif ($self->{nc} == 0x003E) { # >
4717            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4718            !!!next-input-character;
4719            !!!emit ($self->{ct}); # ENTITY
4720            redo A;
4721          } elsif ($self->{nc} == -1) {
4722            !!!parse-error (type => 'unclosed md'); ## TODO: type
4723            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4724            !!!next-input-character;
4725            !!!emit ($self->{ct}); # ENTITY
4726            redo A;
4727          } else {
4728            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4729            ## Stay in the state.
4730            !!!next-input-character;
4731            redo A;
4732          }
4733        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4734          if ($self->{nc} == 0x0022) { # "
4735            $self->{state} = AFTER_MD_DEF_STATE;
4736            !!!next-input-character;
4737            redo A;
4738          } elsif ($self->{nc} == 0x0026) { # &
4739            $self->{prev_state} = $self->{state};
4740            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4741            $self->{entity_add} = 0x0022; # "
4742            !!!next-input-character;
4743            redo A;
4744    ## TODO: %
4745          } elsif ($self->{nc} == -1) {
4746            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4747            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4748            ## Reconsume.
4749            !!!emit ($self->{ct}); # ENTITY
4750            redo A;
4751          } else {
4752            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4753            !!!next-input-character;
4754            redo A;
4755          }
4756        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4757          if ($self->{nc} == 0x0027) { # '
4758            $self->{state} = AFTER_MD_DEF_STATE;
4759            !!!next-input-character;
4760            redo A;
4761          } elsif ($self->{nc} == 0x0026) { # &
4762            $self->{prev_state} = $self->{state};
4763            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4764            $self->{entity_add} = 0x0027; # '
4765            !!!next-input-character;
4766            redo A;
4767    ## TODO: %
4768          } elsif ($self->{nc} == -1) {
4769            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4770            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4771            ## Reconsume.
4772            !!!emit ($self->{ct}); # ENTITY
4773            redo A;
4774          } else {
4775            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4776            !!!next-input-character;
4777            redo A;
4778          }
4779        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4780          ## TODO: XMLize
4781    
4782          if ($is_space->{$self->{nc}} or
4783              {
4784                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4785                $self->{entity_add} => 1,
4786              }->{$self->{nc}}) {
4787            ## Don't consume
4788            ## No error
4789            ## Return nothing.
4790            #
4791          } elsif ($self->{nc} == 0x0023) { # #
4792            $self->{ca} = $self->{ct};
4793            $self->{state} = ENTITY_HASH_STATE;
4794            $self->{kwd} = '#';
4795            !!!next-input-character;
4796            redo A;
4797          } elsif ((0x0041 <= $self->{nc} and
4798                    $self->{nc} <= 0x005A) or # A..Z
4799                   (0x0061 <= $self->{nc} and
4800                    $self->{nc} <= 0x007A)) { # a..z
4801            #
4802          } else {
4803            !!!parse-error (type => 'bare ero');
4804            ## Return nothing.
4805            #
4806          }
4807    
4808          $self->{ct}->{value} .= '&';
4809          $self->{state} = $self->{prev_state};
4810          ## Reconsume.
4811          redo A;
4812        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4813          if ($is_space->{$self->{nc}}) {
4814            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4815            !!!next-input-character;
4816            redo A;
4817          } elsif ($self->{nc} == 0x0028) { # (
4818            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4819            $self->{ct}->{content} = ['('];
4820            $self->{group_depth} = 1;
4821            !!!next-input-character;
4822            redo A;
4823          } elsif ($self->{nc} == 0x003E) { # >
4824            !!!parse-error (type => 'no md def'); ## TODO: type
4825            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4826            !!!next-input-character;
4827            !!!emit ($self->{ct}); # ELEMENT
4828            redo A;
4829          } elsif ($self->{nc} == -1) {
4830            !!!parse-error (type => 'unclosed md'); ## TODO: type
4831            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4832            !!!next-input-character;
4833            !!!emit ($self->{ct}); # ELEMENT
4834            redo A;
4835          } else {
4836            $self->{ct}->{content} = [chr $self->{nc}];
4837            $self->{state} = CONTENT_KEYWORD_STATE;
4838            !!!next-input-character;
4839            redo A;
4840          }
4841        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4842          if ($is_space->{$self->{nc}}) {
4843            $self->{state} = AFTER_MD_DEF_STATE;
4844            !!!next-input-character;
4845            redo A;
4846          } elsif ($self->{nc} == 0x003E) { # >
4847            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4848            !!!next-input-character;
4849            !!!emit ($self->{ct}); # ELEMENT
4850            redo A;
4851          } elsif ($self->{nc} == -1) {
4852            !!!parse-error (type => 'unclosed md'); ## TODO: type
4853            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4854            !!!next-input-character;
4855            !!!emit ($self->{ct}); # ELEMENT
4856            redo A;
4857          } else {
4858            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4859            ## Stay in the state.
4860            !!!next-input-character;
4861            redo A;
4862          }
4863        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4864          if ($is_space->{$self->{nc}}) {
4865            ## Stay in the state.
4866            !!!next-input-character;
4867            redo A;
4868          } elsif ($self->{nc} == 0x0028) { # (
4869            $self->{group_depth}++;
4870            push @{$self->{ct}->{content}}, chr $self->{nc};
4871            ## Stay in the state.
4872            !!!next-input-character;
4873            redo A;
4874          } elsif ($self->{nc} == 0x007C or # |
4875                   $self->{nc} == 0x002C) { # ,
4876            !!!parse-error (type => 'empty element name'); ## TODO: type
4877            ## Stay in the state.
4878            !!!next-input-character;
4879            redo A;
4880          } elsif ($self->{nc} == 0x0029) { # )
4881            !!!parse-error (type => 'empty element name'); ## TODO: type
4882            push @{$self->{ct}->{content}}, chr $self->{nc};
4883            $self->{group_depth}--;
4884            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4885            !!!next-input-character;
4886            redo A;
4887          } elsif ($self->{nc} == 0x003E) { # >
4888            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4889            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4890            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4891            !!!next-input-character;
4892            !!!emit ($self->{ct}); # ELEMENT
4893            redo A;
4894          } elsif ($self->{nc} == -1) {
4895            !!!parse-error (type => 'unclosed md'); ## TODO: type
4896            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4897            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4898            !!!next-input-character;
4899            !!!emit ($self->{ct}); # ELEMENT
4900            redo A;
4901          } else {
4902            push @{$self->{ct}->{content}}, chr $self->{nc};
4903            $self->{state} = CM_ELEMENT_NAME_STATE;
4904            !!!next-input-character;
4905            redo A;
4906          }
4907        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4908          if ($is_space->{$self->{nc}}) {
4909            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4910            !!!next-input-character;
4911            redo A;
4912          } elsif ($self->{nc} == 0x002A or # *
4913                   $self->{nc} == 0x002B or # +
4914                   $self->{nc} == 0x003F) { # ?
4915            push @{$self->{ct}->{content}}, chr $self->{nc};
4916            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4917            !!!next-input-character;
4918            redo A;
4919          } elsif ($self->{nc} == 0x007C or # |
4920                   $self->{nc} == 0x002C) { # ,
4921            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4922            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4923            !!!next-input-character;
4924            redo A;
4925          } elsif ($self->{nc} == 0x0029) { # )
4926            $self->{group_depth}--;
4927            push @{$self->{ct}->{content}}, chr $self->{nc};
4928            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4929            !!!next-input-character;
4930            redo A;
4931          } elsif ($self->{nc} == 0x003E) { # >
4932            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4933            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4934            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4935            !!!next-input-character;
4936            !!!emit ($self->{ct}); # ELEMENT
4937            redo A;
4938          } elsif ($self->{nc} == -1) {
4939            !!!parse-error (type => 'unclosed md'); ## TODO: type
4940            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4941            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4942            !!!next-input-character;
4943            !!!emit ($self->{ct}); # ELEMENT
4944            redo A;
4945          } else {
4946            $self->{ct}->{content}->[-1] .= chr $self->{nc};
4947            ## Stay in the state.
4948            !!!next-input-character;
4949            redo A;
4950          }
4951        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4952          if ($is_space->{$self->{nc}}) {
4953            ## Stay in the state.
4954            !!!next-input-character;
4955            redo A;
4956          } elsif ($self->{nc} == 0x007C or # |
4957                   $self->{nc} == 0x002C) { # ,
4958            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4959            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4960            !!!next-input-character;
4961            redo A;
4962          } elsif ($self->{nc} == 0x0029) { # )
4963            $self->{group_depth}--;
4964            push @{$self->{ct}->{content}}, chr $self->{nc};
4965            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4966            !!!next-input-character;
4967            redo A;
4968          } elsif ($self->{nc} == 0x003E) { # >
4969            !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4970            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4971            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4972            !!!next-input-character;
4973            !!!emit ($self->{ct}); # ELEMENT
4974            redo A;
4975          } elsif ($self->{nc} == -1) {
4976            !!!parse-error (type => 'unclosed md'); ## TODO: type
4977            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4978            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4979            !!!next-input-character;
4980            !!!emit ($self->{ct}); # ELEMENT
4981            redo A;
4982          } else {
4983            !!!parse-error (type => 'after element name'); ## TODO: type
4984            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4985            $self->{state} = BOGUS_MD_STATE;
4986            !!!next-input-character;
4987            redo A;
4988          }
4989        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
4990          if ($is_space->{$self->{nc}}) {
4991            if ($self->{group_depth}) {
4992              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4993            } else {
4994              $self->{state} = AFTER_MD_DEF_STATE;
4995            }
4996            !!!next-input-character;
4997            redo A;
4998          } elsif ($self->{nc} == 0x002A or # *
4999                   $self->{nc} == 0x002B or # +
5000                   $self->{nc} == 0x003F) { # ?
5001            push @{$self->{ct}->{content}}, chr $self->{nc};
5002            if ($self->{group_depth}) {
5003              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5004            } else {
5005              $self->{state} = AFTER_MD_DEF_STATE;
5006            }
5007            !!!next-input-character;
5008            redo A;
5009          } elsif ($self->{nc} == 0x0029) { # )
5010            if ($self->{group_depth}) {
5011              $self->{group_depth}--;
5012              push @{$self->{ct}->{content}}, chr $self->{nc};
5013              ## Stay in the state.
5014              !!!next-input-character;
5015              redo A;
5016            } else {
5017              !!!parse-error (type => 'string after md def'); ## TODO: type
5018              $self->{state} = BOGUS_MD_STATE;
5019              ## Reconsume.
5020              redo A;
5021            }
5022          } elsif ($self->{nc} == 0x003E) { # >
5023            if ($self->{group_depth}) {
5024              !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5025              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5026            }
5027            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5028            !!!next-input-character;
5029            !!!emit ($self->{ct}); # ELEMENT
5030            redo A;
5031          } elsif ($self->{nc} == -1) {
5032            !!!parse-error (type => 'unclosed md'); ## TODO: type
5033            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5034            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5035            !!!next-input-character;
5036            !!!emit ($self->{ct}); # ELEMENT
5037            redo A;
5038          } else {
5039            if ($self->{group_depth}) {
5040              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5041            } else {
5042              !!!parse-error (type => 'string after md def'); ## TODO: type
5043              $self->{state} = BOGUS_MD_STATE;
5044            }
5045            ## Reconsume.
5046            redo A;
5047          }
5048        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5049          if ($is_space->{$self->{nc}}) {
5050            ## Stay in the state.
5051            !!!next-input-character;
5052            redo A;
5053          } elsif ($self->{nc} == 0x003E) { # >
5054            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5055            !!!next-input-character;
5056            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5057            redo A;
5058          } elsif ($self->{nc} == -1) {
5059            !!!parse-error (type => 'unclosed md'); ## TODO: type
5060            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5061            !!!next-input-character;
5062            !!!emit ($self->{ct}); # ENTITY/ELEMENT
5063            redo A;
5064          } else {
5065            !!!parse-error (type => 'string after md def'); ## TODO: type
5066            $self->{state} = BOGUS_MD_STATE;
5067            ## Reconsume.
5068            redo A;
5069          }
5070        } elsif ($self->{state} == BOGUS_MD_STATE) {
5071          if ($self->{nc} == 0x003E) { # >
5072            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5073            !!!next-input-character;
5074            !!!emit ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
5075            redo A;
5076          } elsif ($self->{nc} == -1) {
5077            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5078            ## Reconsume.
5079            !!!emit ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
5080            redo A;
5081          } else {
5082            ## Stay in the state.
5083          !!!next-input-character;          !!!next-input-character;
5084          redo A;          redo A;
5085        }        }
           
5086      } else {      } else {
5087        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
5088      }      }
# Line 3259  sub _get_next_token ($) { Line 5093  sub _get_next_token ($) {
5093    
5094  1;  1;
5095  ## $Date$  ## $Date$
5096                                    

Legend:
Removed from v.1.12  
changed lines
  Added in v.1.21

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24