/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.12 by wakaba, Wed Oct 15 12:49:49 2008 UTC | revision 1.19 by wakaba, Sun Oct 19 07:19:00 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
# Line 43  sub END_OF_FILE_TOKEN () { 5 } Line 55  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } ## NOTE: XML only.  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65  ## XML5: XML5 has "empty tag token".  In this implementation, it is  ## XML5: XML5 has "empty tag token".  In this implementation, it is
66  ## represented as a start tag token with $self->{self_closing} flag  ## represented as a start tag token with $self->{self_closing} flag
# Line 133  sub PI_AFTER_STATE () { 55 } Line 151  sub PI_AFTER_STATE () { 55 }
151  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
152  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }
187    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }
188    sub ENTITY_VALUE_ENTITY_STATE () { 93 }
189    sub BOGUS_MD_STATE () { 94 }
190    
191  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
192  ## list and descriptions)  ## list and descriptions)
# Line 1226  sub _get_next_token ($) { Line 1280  sub _get_next_token ($) {
1280          redo A;          redo A;
1281        }        }
1282      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1283        ## XML5: "Tag attribute value double quoted state".        ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1284          ## ATTLIST attribute value double quoted state".
1285                
1286        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1287          !!!cp (95);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1288          ## XML5: "Tag attribute name before state".            !!!cp (95.1);
1289          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1290              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1291              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1292            } else {
1293              !!!cp (95);
1294              ## XML5: "Tag attribute name before state".
1295              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1296            }
1297          !!!next-input-character;          !!!next-input-character;
1298          redo A;          redo A;
1299        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1252  sub _get_next_token ($) { Line 1314  sub _get_next_token ($) {
1314          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1315            !!!cp (97);            !!!cp (97);
1316            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1317    
1318              $self->{state} = DATA_STATE;
1319              $self->{s_kwd} = '';
1320              ## reconsume
1321              !!!emit ($self->{ct}); # start tag
1322              redo A;
1323          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1325            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1261  sub _get_next_token ($) { Line 1329  sub _get_next_token ($) {
1329              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1330              !!!cp (99);              !!!cp (99);
1331            }            }
1332    
1333              $self->{state} = DATA_STATE;
1334              $self->{s_kwd} = '';
1335              ## reconsume
1336              !!!emit ($self->{ct}); # end tag
1337              redo A;
1338            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1339              ## XML5: No parse error above; not defined yet.
1340              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1341              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1342              ## Reconsume.
1343              !!!emit ($self->{ct}); # ATTLIST
1344              redo A;
1345          } else {          } else {
1346            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1347          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1348        } else {        } else {
1349            ## XML5 [ATTLIST]: Not defined yet.
1350          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1351            !!!cp (100);            !!!cp (100);
1352            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1289  sub _get_next_token ($) { Line 1364  sub _get_next_token ($) {
1364          redo A;          redo A;
1365        }        }
1366      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1367        ## XML5: "Tag attribute value single quoted state".        ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1368          ## ATTLIST attribute value single quoted state".
1369    
1370        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1371          !!!cp (101);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1372          ## XML5: "Before attribute name state" (sic).            !!!cp (101.1);
1373          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1374              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1375              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1376            } else {
1377              !!!cp (101);
1378              ## XML5: "Before attribute name state" (sic).
1379              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1380            }
1381          !!!next-input-character;          !!!next-input-character;
1382          redo A;          redo A;
1383        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1315  sub _get_next_token ($) { Line 1398  sub _get_next_token ($) {
1398          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1399            !!!cp (103);            !!!cp (103);
1400            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1401    
1402              $self->{state} = DATA_STATE;
1403              $self->{s_kwd} = '';
1404              ## reconsume
1405              !!!emit ($self->{ct}); # start tag
1406              redo A;
1407          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1408            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1409            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1324  sub _get_next_token ($) { Line 1413  sub _get_next_token ($) {
1413              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1414              !!!cp (105);              !!!cp (105);
1415            }            }
1416    
1417              $self->{state} = DATA_STATE;
1418              $self->{s_kwd} = '';
1419              ## reconsume
1420              !!!emit ($self->{ct}); # end tag
1421              redo A;
1422            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1423              ## XML5: No parse error above; not defined yet.
1424              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1425              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1426              ## Reconsume.
1427              !!!emit ($self->{ct}); # ATTLIST
1428              redo A;
1429          } else {          } else {
1430            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1431          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1432        } else {        } else {
1433            ## XML5 [ATTLIST]: Not defined yet.
1434          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1435            !!!cp (106);            !!!cp (106);
1436            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1355  sub _get_next_token ($) { Line 1451  sub _get_next_token ($) {
1451        ## XML5: "Tag attribute value unquoted state".        ## XML5: "Tag attribute value unquoted state".
1452    
1453        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1454          !!!cp (107);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1455          ## XML5: "Tag attribute name before state".            !!!cp (107.1);
1456          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1457              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1458            } else {
1459              !!!cp (107);
1460              ## XML5: "Tag attribute name before state".
1461              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1462            }
1463          !!!next-input-character;          !!!next-input-character;
1464          redo A;          redo A;
1465        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1378  sub _get_next_token ($) { Line 1480  sub _get_next_token ($) {
1480          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1481            !!!cp (109);            !!!cp (109);
1482            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1483    
1484              $self->{state} = DATA_STATE;
1485              $self->{s_kwd} = '';
1486              !!!next-input-character;
1487              !!!emit ($self->{ct}); # start tag
1488              redo A;
1489          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1490            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1491            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1387  sub _get_next_token ($) { Line 1495  sub _get_next_token ($) {
1495              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1496              !!!cp (111);              !!!cp (111);
1497            }            }
1498    
1499              $self->{state} = DATA_STATE;
1500              $self->{s_kwd} = '';
1501              !!!next-input-character;
1502              !!!emit ($self->{ct}); # end tag
1503              redo A;
1504            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1505              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1506              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1507              !!!next-input-character;
1508              !!!emit ($self->{ct}); # ATTLIST
1509              redo A;
1510          } else {          } else {
1511            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1512          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         !!!next-input-character;  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1513        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!parse-error (type => 'unclosed tag');  
1514          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1515            !!!cp (112);            !!!cp (112);
1516              !!!parse-error (type => 'unclosed tag');
1517            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1518    
1519              $self->{state} = DATA_STATE;
1520              $self->{s_kwd} = '';
1521              ## reconsume
1522              !!!emit ($self->{ct}); # start tag
1523              redo A;
1524          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1525              !!!parse-error (type => 'unclosed tag');
1526            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1527            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
1528              !!!cp (113);              !!!cp (113);
# Line 1411  sub _get_next_token ($) { Line 1531  sub _get_next_token ($) {
1531              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1532              !!!cp (114);              !!!cp (114);
1533            }            }
1534    
1535              $self->{state} = DATA_STATE;
1536              $self->{s_kwd} = '';
1537              ## reconsume
1538              !!!emit ($self->{ct}); # end tag
1539              redo A;
1540            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1541              !!!parse-error (type => 'unclosed md'); ## TODO: type
1542              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1543              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1544              ## Reconsume.
1545              !!!emit ($self->{ct}); # ATTLIST
1546              redo A;
1547          } else {          } else {
1548            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1549          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1550        } else {        } else {
1551          if ({          if ({
1552               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1563  sub _get_next_token ($) { Line 1689  sub _get_next_token ($) {
1689          redo A;          redo A;
1690        }        }
1691      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1692        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1693    
1694        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1695        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
1696                
1697        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1698          !!!cp (124);          if ($self->{in_subset}) {
1699          $self->{state} = DATA_STATE;            !!!cp (123);
1700          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1701            } else {
1702              !!!cp (124);
1703              $self->{state} = DATA_STATE;
1704              $self->{s_kwd} = '';
1705            }
1706          !!!next-input-character;          !!!next-input-character;
1707    
1708          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1709          redo A;          redo A;
1710        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1711          !!!cp (125);          if ($self->{in_subset}) {
1712          $self->{state} = DATA_STATE;            !!!cp (125.1);
1713          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1714            } else {
1715              !!!cp (125);
1716              $self->{state} = DATA_STATE;
1717              $self->{s_kwd} = '';
1718            }
1719          ## reconsume          ## reconsume
1720    
1721          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1596  sub _get_next_token ($) { Line 1732  sub _get_next_token ($) {
1732          redo A;          redo A;
1733        }        }
1734      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1735        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
1736                
1737        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1738          !!!cp (133);          !!!cp (133);
# Line 1772  sub _get_next_token ($) { Line 1908  sub _get_next_token ($) {
1908          !!!next-input-character;          !!!next-input-character;
1909          redo A;          redo A;
1910        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1911          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1912          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1913          $self->{s_kwd} = '';            !!!cp (138.1);
1914              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1915            } else {
1916              !!!cp (138);
1917              $self->{state} = DATA_STATE;
1918              $self->{s_kwd} = '';
1919            }
1920          !!!next-input-character;          !!!next-input-character;
1921    
1922          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1923    
1924          redo A;          redo A;
1925        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1926          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1927          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1928          $self->{s_kwd} = '';            !!!cp (139.1);
1929              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1930            } else {
1931              !!!cp (139);
1932              $self->{state} = DATA_STATE;
1933              $self->{s_kwd} = '';
1934            }
1935          ## reconsume          ## reconsume
1936    
1937          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1806  sub _get_next_token ($) { Line 1952  sub _get_next_token ($) {
1952          !!!next-input-character;          !!!next-input-character;
1953          redo A;          redo A;
1954        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1955          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1956          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1957          $self->{s_kwd} = '';            !!!cp (142.1);
1958              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1959            } else {
1960              !!!cp (142);
1961              $self->{state} = DATA_STATE;
1962              $self->{s_kwd} = '';
1963            }
1964          !!!next-input-character;          !!!next-input-character;
1965    
1966          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1967    
1968          redo A;          redo A;
1969        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1970          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1971          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1972          $self->{s_kwd} = '';            !!!cp (143.1);
1973              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1974            } else {
1975              !!!cp (143);
1976              $self->{state} = DATA_STATE;
1977              $self->{s_kwd} = '';
1978            }
1979          ## reconsume          ## reconsume
1980    
1981          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1834  sub _get_next_token ($) { Line 1990  sub _get_next_token ($) {
1990          redo A;          redo A;
1991        }        }
1992      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
1993          ## XML5: "Comment state" and "DOCTYPE comment state".
1994    
1995        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1996          !!!cp (145);          !!!cp (145);
1997          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
1998          !!!next-input-character;          !!!next-input-character;
1999          redo A;          redo A;
2000        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
2001          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2002          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2003          $self->{s_kwd} = '';            !!!cp (146.1);
2004              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2005            } else {
2006              !!!cp (146);
2007              $self->{state} = DATA_STATE;
2008              $self->{s_kwd} = '';
2009            }
2010          ## reconsume          ## reconsume
2011    
2012          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1861  sub _get_next_token ($) { Line 2024  sub _get_next_token ($) {
2024          redo A;          redo A;
2025        }        }
2026      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2027        ## XML5: "comment dash state".        ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2028    
2029        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2030          !!!cp (148);          !!!cp (148);
# Line 1869  sub _get_next_token ($) { Line 2032  sub _get_next_token ($) {
2032          !!!next-input-character;          !!!next-input-character;
2033          redo A;          redo A;
2034        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
2035          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2036          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2037          $self->{s_kwd} = '';            !!!cp (149.1);
2038              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2039            } else {
2040              !!!cp (149);
2041              $self->{state} = DATA_STATE;
2042              $self->{s_kwd} = '';
2043            }
2044          ## reconsume          ## reconsume
2045    
2046          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1886  sub _get_next_token ($) { Line 2054  sub _get_next_token ($) {
2054          redo A;          redo A;
2055        }        }
2056      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2057          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2058    
2059        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2060          !!!cp (151);          if ($self->{in_subset}) {
2061          $self->{state} = DATA_STATE;            !!!cp (151.1);
2062          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2063            } else {
2064              !!!cp (151);
2065              $self->{state} = DATA_STATE;
2066              $self->{s_kwd} = '';
2067            }
2068          !!!next-input-character;          !!!next-input-character;
2069    
2070          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1906  sub _get_next_token ($) { Line 2081  sub _get_next_token ($) {
2081          !!!next-input-character;          !!!next-input-character;
2082          redo A;          redo A;
2083        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
2084          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2085          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2086          $self->{s_kwd} = '';            !!!cp (153.1);
2087              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2088            } else {
2089              !!!cp (153);
2090              $self->{state} = DATA_STATE;
2091              $self->{s_kwd} = '';
2092            }
2093          ## reconsume          ## reconsume
2094    
2095          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1973  sub _get_next_token ($) { Line 2153  sub _get_next_token ($) {
2153          !!!cp (159.1);          !!!cp (159.1);
2154          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2155          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2156            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2157            $self->{in_subset} = 1;
2158          !!!next-input-character;          !!!next-input-character;
2159            !!!emit ($self->{ct}); # DOCTYPE
2160          redo A;          redo A;
2161        } else {        } else {
2162          !!!cp (160);          !!!cp (160);
# Line 2016  sub _get_next_token ($) { Line 2199  sub _get_next_token ($) {
2199        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2200          !!!cp (163.1);          !!!cp (163.1);
2201          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2202            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2203            $self->{in_subset} = 1;
2204          !!!next-input-character;          !!!next-input-character;
2205            !!!emit ($self->{ct}); # DOCTYPE
2206          redo A;          redo A;
2207        } else {        } else {
2208          !!!cp (164);          !!!cp (164);
# Line 2036  sub _get_next_token ($) { Line 2222  sub _get_next_token ($) {
2222          !!!next-input-character;          !!!next-input-character;
2223          redo A;          redo A;
2224        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2225          !!!cp (166);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2226          $self->{state} = DATA_STATE;            !!!cp (166);
2227          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2228              $self->{s_kwd} = '';
2229            } else {
2230              !!!cp (166.1);
2231              !!!parse-error (type => 'no md def'); ## TODO: type
2232              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2233            }
2234            
2235          !!!next-input-character;          !!!next-input-character;
2236            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2237          redo A;          redo A;
2238        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2239          !!!cp (167);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2240          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (167);
2241          $self->{state} = DATA_STATE;            !!!parse-error (type => 'unclosed DOCTYPE');
2242          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2243          ## reconsume            $self->{s_kwd} = '';
2244              $self->{ct}->{quirks} = 1;
2245          $self->{ct}->{quirks} = 1;          } else {
2246          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (167.12);
2247              !!!parse-error (type => 'unclosed md'); ## TODO: type
2248              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2249            }
2250            
2251            ## Reconsume.
2252            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2253          redo A;          redo A;
2254        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2255                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
# Line 2069  sub _get_next_token ($) { Line 2265  sub _get_next_token ($) {
2265          $self->{kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2266          !!!next-input-character;          !!!next-input-character;
2267          redo A;          redo A;
2268        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{nc} == 0x0022 and # "
2269                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2270                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2271            !!!cp (167.21);
2272            $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2273            $self->{ct}->{value} = ''; # ENTITY
2274            !!!next-input-character;
2275            redo A;
2276          } elsif ($self->{nc} == 0x0027 and # '
2277                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2278                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2279            !!!cp (167.22);
2280            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2281            $self->{ct}->{value} = ''; # ENTITY
2282            !!!next-input-character;
2283            redo A;
2284          } elsif ($self->{is_xml} and
2285                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2286                   $self->{nc} == 0x005B) { # [
2287          !!!cp (167.3);          !!!cp (167.3);
2288          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2289          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2290            $self->{in_subset} = 1;
2291          !!!next-input-character;          !!!next-input-character;
2292            !!!emit ($self->{ct}); # DOCTYPE
2293          redo A;          redo A;
2294        } else {        } else {
2295          !!!cp (180);          !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2296          !!!parse-error (type => 'string after DOCTYPE name');  
2297          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2298              !!!cp (180);
2299              $self->{ct}->{quirks} = 1;
2300              $self->{state} = BOGUS_DOCTYPE_STATE;
2301            } else {
2302              !!!cp (180.1);
2303              $self->{state} = BOGUS_MD_STATE;
2304            }
2305    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
2306          !!!next-input-character;          !!!next-input-character;
2307          redo A;          redo A;
2308        }        }
# Line 2122  sub _get_next_token ($) { Line 2344  sub _get_next_token ($) {
2344          !!!next-input-character;          !!!next-input-character;
2345          redo A;          redo A;
2346        } else {        } else {
2347          !!!cp (169);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2348                          line => $self->{line_prev},                          line => $self->{line_prev},
2349                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2350          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2351              !!!cp (169);
2352          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2353              $self->{state} = BOGUS_DOCTYPE_STATE;
2354            } else {
2355              !!!cp (169.1);
2356              $self->{state} = BOGUS_MD_STATE;
2357            }
2358          ## Reconsume.          ## Reconsume.
2359          redo A;          redo A;
2360        }        }
# Line 2170  sub _get_next_token ($) { Line 2396  sub _get_next_token ($) {
2396          !!!next-input-character;          !!!next-input-character;
2397          redo A;          redo A;
2398        } else {        } else {
2399          !!!cp (172);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2400                          line => $self->{line_prev},                          line => $self->{line_prev},
2401                          column => $self->{column_prev} + 1 - length $self->{kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2402          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2403              !!!cp (172);
2404          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2405              $self->{state} = BOGUS_DOCTYPE_STATE;
2406            } else {
2407              !!!cp (172.1);
2408              $self->{state} = BOGUS_MD_STATE;
2409            }
2410          ## Reconsume.          ## Reconsume.
2411          redo A;          redo A;
2412        }        }
# Line 2199  sub _get_next_token ($) { Line 2429  sub _get_next_token ($) {
2429          !!!next-input-character;          !!!next-input-character;
2430          redo A;          redo A;
2431        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
         !!!cp (184);  
2432          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2433            
2434          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2435          $self->{s_kwd} = '';            !!!cp (184);
2436              $self->{state} = DATA_STATE;
2437              $self->{s_kwd} = '';
2438              $self->{ct}->{quirks} = 1;
2439            } else {
2440              !!!cp (184.1);
2441              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2442            }
2443            
2444          !!!next-input-character;          !!!next-input-character;
2445            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2446          redo A;          redo A;
2447        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2448          !!!cp (185);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2449          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (185);
2450              !!!parse-error (type => 'unclosed DOCTYPE');
2451          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2452          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2453              $self->{ct}->{quirks} = 1;
2454            } else {
2455              !!!cp (185.1);
2456              !!!parse-error (type => 'unclosed md'); ## TODO: type
2457              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2458            }
2459            
2460          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
2461          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2462          redo A;          redo A;
2463        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2464                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2465                   $self->{nc} == 0x005B) { # [
2466          !!!cp (186.1);          !!!cp (186.1);
2467          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2468          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2469          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2470            $self->{in_subset} = 1;
2471          !!!next-input-character;          !!!next-input-character;
2472            !!!emit ($self->{ct}); # DOCTYPE
2473          redo A;          redo A;
2474        } else {        } else {
         !!!cp (186);  
2475          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
2476    
2477          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2478              !!!cp (186);
2479              $self->{ct}->{quirks} = 1;
2480              $self->{state} = BOGUS_DOCTYPE_STATE;
2481            } else {
2482              !!!cp (186.2);
2483              $self->{state} = BOGUS_MD_STATE;
2484            }
2485    
2486          !!!next-input-character;          !!!next-input-character;
2487          redo A;          redo A;
2488        }        }
# Line 2245  sub _get_next_token ($) { Line 2493  sub _get_next_token ($) {
2493          !!!next-input-character;          !!!next-input-character;
2494          redo A;          redo A;
2495        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (188);  
2496          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2497    
2498          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2499          $self->{s_kwd} = '';            !!!cp (188);
2500          !!!next-input-character;            $self->{state} = DATA_STATE;
2501              $self->{s_kwd} = '';
2502          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2503          !!!emit ($self->{ct}); # DOCTYPE          } else {
2504              !!!cp (188.1);
2505              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2506            }
2507    
2508            !!!next-input-character;
2509            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2510          redo A;          redo A;
2511        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (189);  
2512          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2513    
2514          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2515          $self->{s_kwd} = '';            !!!cp (189);
2516          ## reconsume            $self->{state} = DATA_STATE;
2517              $self->{s_kwd} = '';
2518          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2519            } else {
2520              !!!cp (189.1);
2521              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2522            }
2523            
2524            ## Reconsume.
2525          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2526          redo A;          redo A;
2527        } else {        } else {
2528          !!!cp (190);          !!!cp (190);
2529          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2530          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
2531                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2532    
# Line 2286  sub _get_next_token ($) { Line 2541  sub _get_next_token ($) {
2541          !!!next-input-character;          !!!next-input-character;
2542          redo A;          redo A;
2543        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (192);  
2544          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2545    
2546          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2547          $self->{s_kwd} = '';            !!!cp (192);
2548          !!!next-input-character;            $self->{state} = DATA_STATE;
2549              $self->{s_kwd} = '';
2550          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2551          !!!emit ($self->{ct}); # DOCTYPE          } else {
2552              !!!cp (192.1);
2553              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2554            }
2555    
2556            !!!next-input-character;
2557            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2558          redo A;          redo A;
2559        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (193);  
2560          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2561    
2562          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2563          $self->{s_kwd} = '';            !!!cp (193);
2564              $self->{state} = DATA_STATE;
2565              $self->{s_kwd} = '';
2566              $self->{ct}->{quirks} = 1;
2567            } else {
2568              !!!cp (193.1);
2569              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2570            }
2571          
2572          ## reconsume          ## reconsume
2573            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2574          redo A;          redo A;
2575        } else {        } else {
2576          !!!cp (194);          !!!cp (194);
2577          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2578          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
2579                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2580    
# Line 2328  sub _get_next_token ($) { Line 2590  sub _get_next_token ($) {
2590          redo A;          redo A;
2591        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
2592          !!!cp (196);          !!!cp (196);
2593          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2594          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2595          !!!next-input-character;          !!!next-input-character;
2596          redo A;          redo A;
2597        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
2598          !!!cp (197);          !!!cp (197);
2599          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2600          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2601          !!!next-input-character;          !!!next-input-character;
2602          redo A;          redo A;
2603        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2604          if ($self->{is_xml}) {          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2605            !!!cp (198.1);            if ($self->{is_xml}) {
2606            !!!parse-error (type => 'no SYSTEM literal');              !!!cp (198.1);
2607                !!!parse-error (type => 'no SYSTEM literal');
2608              } else {
2609                !!!cp (198);
2610              }
2611              $self->{state} = DATA_STATE;
2612              $self->{s_kwd} = '';
2613          } else {          } else {
2614            !!!cp (198);            if ($self->{ct}->{type} == NOTATION_TOKEN) {
2615                !!!cp (198.2);
2616              } else {
2617                !!!cp (198.3);
2618                !!!parse-error (type => 'no SYSTEM literal');            
2619              }
2620              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2621          }          }
2622          $self->{state} = DATA_STATE;          
         $self->{s_kwd} = '';  
2623          !!!next-input-character;          !!!next-input-character;
2624            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2625          redo A;          redo A;
2626        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2627          !!!cp (199);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2628          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (199);
2629              !!!parse-error (type => 'unclosed DOCTYPE');
2630          $self->{state} = DATA_STATE;            
2631          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2632              $self->{s_kwd} = '';
2633              $self->{ct}->{quirks} = 1;
2634            } else {
2635              !!!parse-error (type => 'unclosed md'); ## TODO: type
2636              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2637            }
2638            
2639          ## reconsume          ## reconsume
2640            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2641          redo A;          redo A;
2642        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2643                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2644                   $self->{nc} == 0x005B) { # [
2645          !!!cp (200.1);          !!!cp (200.1);
2646          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2647          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2648          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2649            $self->{in_subset} = 1;
2650          !!!next-input-character;          !!!next-input-character;
2651            !!!emit ($self->{ct}); # DOCTYPE
2652          redo A;          redo A;
2653        } else {        } else {
         !!!cp (200);  
2654          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
2655    
2656          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2657              !!!cp (200);
2658              $self->{ct}->{quirks} = 1;
2659              $self->{state} = BOGUS_DOCTYPE_STATE;
2660            } else {
2661              !!!cp (200.2);
2662              $self->{state} = BOGUS_MD_STATE;
2663            }
2664    
2665          !!!next-input-character;          !!!next-input-character;
2666          redo A;          redo A;
2667        }        }
# Line 2399  sub _get_next_token ($) { Line 2684  sub _get_next_token ($) {
2684          !!!next-input-character;          !!!next-input-character;
2685          redo A;          redo A;
2686        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (204);  
2687          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
2688          !!!next-input-character;          !!!next-input-character;
2689    
2690          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2691          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (204);
2692              $self->{state} = DATA_STATE;
2693              $self->{s_kwd} = '';
2694              $self->{ct}->{quirks} = 1;
2695            } else {
2696              !!!cp (204.1);
2697              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2698            }
2699    
2700            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2701          redo A;          redo A;
2702        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2703          !!!cp (205);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2704          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (205);
2705              !!!parse-error (type => 'unclosed DOCTYPE');
2706          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2707          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2708              $self->{ct}->{quirks} = 1;
2709            } else {
2710              !!!cp (205.1);
2711              !!!parse-error (type => 'unclosed md'); ## TODO: type
2712              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2713            }
2714            
2715          ## reconsume          ## reconsume
2716            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2717          redo A;          redo A;
2718        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2719                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2720                   $self->{nc} == 0x005B) { # [
2721          !!!cp (206.1);          !!!cp (206.1);
2722          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2723    
2724          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2725          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2726            $self->{in_subset} = 1;
2727          !!!next-input-character;          !!!next-input-character;
2728            !!!emit ($self->{ct}); # DOCTYPE
2729          redo A;          redo A;
2730        } else {        } else {
         !!!cp (206);  
2731          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
2732    
2733          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2734              !!!cp (206);          
2735              $self->{ct}->{quirks} = 1;
2736              $self->{state} = BOGUS_DOCTYPE_STATE;
2737            } else {
2738              !!!cp (206.2);
2739              $self->{state} = BOGUS_MD_STATE;
2740            }
2741    
2742          !!!next-input-character;          !!!next-input-character;
2743          redo A;          redo A;
2744        }        }
# Line 2445  sub _get_next_token ($) { Line 2749  sub _get_next_token ($) {
2749          !!!next-input-character;          !!!next-input-character;
2750          redo A;          redo A;
2751        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
         !!!cp (208);  
2752          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2753    
2754          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2755          $self->{s_kwd} = '';            !!!cp (208);
2756              $self->{state} = DATA_STATE;
2757              $self->{s_kwd} = '';
2758              $self->{ct}->{quirks} = 1;
2759            } else {
2760              !!!cp (208.1);
2761              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2762            }
2763            
2764          !!!next-input-character;          !!!next-input-character;
2765            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2766          redo A;          redo A;
2767        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (209);  
2768          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2769    
2770          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2771          $self->{s_kwd} = '';            !!!cp (209);
2772              $self->{state} = DATA_STATE;
2773              $self->{s_kwd} = '';
2774              $self->{ct}->{quirks} = 1;
2775            } else {
2776              !!!cp (209.1);
2777              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2778            }
2779            
2780          ## reconsume          ## reconsume
2781            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2782          redo A;          redo A;
2783        } else {        } else {
2784          !!!cp (210);          !!!cp (210);
2785          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2786          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
2787                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2788    
# Line 2498  sub _get_next_token ($) { Line 2809  sub _get_next_token ($) {
2809    
2810          redo A;          redo A;
2811        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (213);  
2812          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2813    
2814          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2815          $self->{s_kwd} = '';            !!!cp (213);
2816          ## reconsume            $self->{state} = DATA_STATE;
2817              $self->{s_kwd} = '';
2818          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2819          !!!emit ($self->{ct}); # DOCTYPE          } else {
2820              !!!cp (213.1);
2821              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822            }
2823    
2824            ## reconsume
2825            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2826          redo A;          redo A;
2827        } else {        } else {
2828          !!!cp (214);          !!!cp (214);
2829          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2830          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
2831                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2832    
# Line 2522  sub _get_next_token ($) { Line 2836  sub _get_next_token ($) {
2836        }        }
2837      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2838        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2839          !!!cp (215);          if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2840          ## Stay in the state            !!!cp (215.1);
2841              $self->{state} = BEFORE_NDATA_STATE;
2842            } else {
2843              !!!cp (215);
2844              ## Stay in the state
2845            }
2846          !!!next-input-character;          !!!next-input-character;
2847          redo A;          redo A;
2848        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2849          !!!cp (216);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2850          $self->{state} = DATA_STATE;            !!!cp (216);
2851          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2852          !!!next-input-character;            $self->{s_kwd} = '';
2853            } else {
2854          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (216.1);
2855              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856            }
2857    
2858            !!!next-input-character;
2859            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2860            redo A;
2861          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2862                   ($self->{nc} == 0x004E or # N
2863                    $self->{nc} == 0x006E)) { # n
2864            !!!cp (216.2);
2865            !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2866            $self->{state} = NDATA_STATE;
2867            $self->{kwd} = chr $self->{nc};
2868            !!!next-input-character;
2869          redo A;          redo A;
2870        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2871          !!!cp (217);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2872          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (217);
2873          $self->{state} = DATA_STATE;            !!!parse-error (type => 'unclosed DOCTYPE');
2874          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2875          ## reconsume            $self->{s_kwd} = '';
2876              $self->{ct}->{quirks} = 1;
2877          $self->{ct}->{quirks} = 1;          } else {
2878          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (217.1);
2879              !!!parse-error (type => 'unclosed md'); ## TODO: type
2880              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2881            }
2882    
2883            ## reconsume
2884            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2885          redo A;          redo A;
2886        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and
2887                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2888                   $self->{nc} == 0x005B) { # [
2889          !!!cp (218.1);          !!!cp (218.1);
2890          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2891          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2892            $self->{in_subset} = 1;
2893          !!!next-input-character;          !!!next-input-character;
2894            !!!emit ($self->{ct}); # DOCTYPE
2895          redo A;          redo A;
2896        } else {        } else {
         !!!cp (218);  
2897          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
         #$self->{ct}->{quirks} = 1;  
2898    
2899          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2900              !!!cp (218);
2901              #$self->{ct}->{quirks} = 1;
2902              $self->{state} = BOGUS_DOCTYPE_STATE;
2903            } else {
2904              !!!cp (218.2);
2905              $self->{state} = BOGUS_MD_STATE;
2906            }
2907    
2908            !!!next-input-character;
2909            redo A;
2910          }
2911        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2912          if ($is_space->{$self->{nc}}) {
2913            !!!cp (218.3);
2914            ## Stay in the state.
2915            !!!next-input-character;
2916            redo A;
2917          } elsif ($self->{nc} == 0x003E) { # >
2918            !!!cp (218.4);
2919            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920            !!!next-input-character;
2921            !!!emit ($self->{ct}); # ENTITY
2922            redo A;
2923          } elsif ($self->{nc} == 0x004E or # N
2924                   $self->{nc} == 0x006E) { # n
2925            !!!cp (218.5);
2926            $self->{state} = NDATA_STATE;
2927            $self->{kwd} = chr $self->{nc};
2928            !!!next-input-character;
2929            redo A;
2930          } elsif ($self->{nc} == -1) {
2931            !!!cp (218.6);
2932            !!!parse-error (type => 'unclosed md'); ## TODO: type
2933            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2934            ## reconsume
2935            !!!emit ($self->{ct}); # ENTITY
2936            redo A;
2937          } else {
2938            !!!cp (218.7);
2939            !!!parse-error (type => 'string after SYSTEM literal');
2940            $self->{state} = BOGUS_MD_STATE;
2941          !!!next-input-character;          !!!next-input-character;
2942          redo A;          redo A;
2943        }        }
# Line 2572  sub _get_next_token ($) { Line 2952  sub _get_next_token ($) {
2952    
2953          redo A;          redo A;
2954        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [        } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2955          if ($self->{ct}->{has_internal_subset}) { # DOCTYPE          !!!cp (220.1);
2956            !!!cp (220.2);          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2957            ## Stay in the state.          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2958            !!!next-input-character;          $self->{in_subset} = 1;
2959            redo A;          !!!next-input-character;
2960          } else {          !!!emit ($self->{ct}); # DOCTYPE
2961            !!!cp (220.1);          redo A;
           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;  
           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE  
           !!!next-input-character;  
           redo A;  
         }  
2962        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2963          !!!cp (220);          !!!cp (220);
2964          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 3065  sub _get_next_token ($) { Line 3440  sub _get_next_token ($) {
3440      ## XML-only states      ## XML-only states
3441    
3442      } elsif ($self->{state} == PI_STATE) {      } elsif ($self->{state} == PI_STATE) {
3443          ## XML5: "Pi state" and "DOCTYPE pi state".
3444    
3445        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
3446            $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"            $self->{nc} == 0x003F or # ?
3447            $self->{nc} == -1) {            $self->{nc} == -1) {
3448            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3449            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
3450            ## "DOCTYPE pi state": Parse error, switch to the "data
3451            ## state".
3452          !!!parse-error (type => 'bare pio', ## TODO: type          !!!parse-error (type => 'bare pio', ## TODO: type
3453                          line => $self->{line_prev},                          line => $self->{line_prev},
3454                          column => $self->{column_prev}                          column => $self->{column_prev}
# Line 3082  sub _get_next_token ($) { Line 3463  sub _get_next_token ($) {
3463                        };                        };
3464          redo A;          redo A;
3465        } else {        } else {
3466            ## XML5: "DOCTYPE pi state": Stay in the state.
3467          $self->{ct} = {type => PI_TOKEN,          $self->{ct} = {type => PI_TOKEN,
3468                         target => chr $self->{nc},                         target => chr $self->{nc},
3469                         data => '',                         data => '',
# Line 3099  sub _get_next_token ($) { Line 3481  sub _get_next_token ($) {
3481          redo A;          redo A;
3482        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3483          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3484          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3485          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3486            } else {
3487              $self->{state} = DATA_STATE;
3488              $self->{s_kwd} = '';
3489            }
3490          ## Reconsume.          ## Reconsume.
3491          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3492          redo A;          redo A;
# Line 3131  sub _get_next_token ($) { Line 3517  sub _get_next_token ($) {
3517          redo A;          redo A;
3518        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3519          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3520          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3521          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3522            } else {
3523              $self->{state} = DATA_STATE;
3524              $self->{s_kwd} = '';
3525            }
3526          ## Reprocess.          ## Reprocess.
3527          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3528          redo A;          redo A;
# Line 3146  sub _get_next_token ($) { Line 3536  sub _get_next_token ($) {
3536          redo A;          redo A;
3537        }        }
3538      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
3539          ## XML5: Part of "Pi after state".
3540    
3541        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3542          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3543          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3544            } else {
3545              $self->{state} = DATA_STATE;
3546              $self->{s_kwd} = '';
3547            }
3548          !!!next-input-character;          !!!next-input-character;
3549          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3550          redo A;          redo A;
# Line 3171  sub _get_next_token ($) { Line 3567  sub _get_next_token ($) {
3567          redo A;          redo A;
3568        }        }
3569      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3570        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3571    
3572        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3573          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3574          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3575            } else {
3576              $self->{state} = DATA_STATE;
3577              $self->{s_kwd} = '';
3578            }
3579          !!!next-input-character;          !!!next-input-character;
3580          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3581          redo A;          redo A;
# Line 3192  sub _get_next_token ($) { Line 3593  sub _get_next_token ($) {
3593    
3594      } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {      } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3595        if ($self->{nc} == 0x003C) { # <        if ($self->{nc} == 0x003C) { # <
3596          ## TODO:          $self->{state} = DOCTYPE_TAG_STATE;
3597          !!!next-input-character;          !!!next-input-character;
3598          redo A;          redo A;
3599        } elsif ($self->{nc} == 0x0025) { # %        } elsif ($self->{nc} == 0x0025) { # %
# Line 3202  sub _get_next_token ($) { Line 3603  sub _get_next_token ($) {
3603          !!!next-input-character;          !!!next-input-character;
3604          redo A;          redo A;
3605        } elsif ($self->{nc} == 0x005D) { # ]        } elsif ($self->{nc} == 0x005D) { # ]
3606            delete $self->{in_subset};
3607          $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3608          !!!next-input-character;          !!!next-input-character;
3609          redo A;          redo A;
# Line 3211  sub _get_next_token ($) { Line 3613  sub _get_next_token ($) {
3613          redo A;          redo A;
3614        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3615          !!!parse-error (type => 'unclosed internal subset'); ## TODO: type          !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3616            delete $self->{in_subset};
3617          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3618          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3619          ## Reconsume.          ## Reconsume.
3620          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3621          redo A;          redo A;
3622        } else {        } else {
3623          unless ($self->{internal_subset_tainted}) {          unless ($self->{internal_subset_tainted}) {
# Line 3231  sub _get_next_token ($) { Line 3634  sub _get_next_token ($) {
3634          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3635          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3636          !!!next-input-character;          !!!next-input-character;
3637          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3638          redo A;          redo A;
3639        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3640          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
3641          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3642          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3643          ## Reconsume.          ## Reconsume.
3644          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3645          redo A;          redo A;
3646        } else {        } else {
3647          ## XML5: No parse error and stay in the state.          ## XML5: No parse error and stay in the state.
3648          !!!parse-error (type => 'string after internal subset'); ## TODO: type          !!!parse-error (type => 'string after internal subset'); ## TODO: type
3649    
3650          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3651            !!!next-input-character;
3652            redo A;
3653          }
3654        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3655          if ($self->{nc} == 0x003E) { # >
3656            $self->{state} = DATA_STATE;
3657            $self->{s_kwd} = '';
3658            !!!next-input-character;
3659            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3660            redo A;
3661          } elsif ($self->{nc} == -1) {
3662            $self->{state} = DATA_STATE;
3663            $self->{s_kwd} = '';
3664            ## Reconsume.
3665            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3666            redo A;
3667          } else {
3668            ## Stay in the state.
3669            !!!next-input-character;
3670            redo A;
3671          }
3672        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3673          if ($self->{nc} == 0x0021) { # !
3674            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3675            !!!next-input-character;
3676            redo A;
3677          } elsif ($self->{nc} == 0x003F) { # ?
3678            $self->{state} = PI_STATE;
3679            !!!next-input-character;
3680            redo A;
3681          } elsif ($self->{nc} == -1) {
3682            !!!parse-error (type => 'bare stago');
3683            $self->{state} = DATA_STATE;
3684            $self->{s_kwd} = '';
3685            ## Reconsume.
3686            redo A;
3687          } else {
3688            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3689                            line => $self->{line_prev},
3690                            column => $self->{column_prev});
3691            $self->{state} = BOGUS_COMMENT_STATE;
3692            $self->{ct} = {type => COMMENT_TOKEN,
3693                           data => '',
3694                          }; ## NOTE: Will be discarded.
3695            !!!next-input-character;
3696            redo A;
3697          }
3698        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3699          ## XML5: "DOCTYPE markup declaration state".
3700          
3701          if ($self->{nc} == 0x002D) { # -
3702            $self->{state} = MD_HYPHEN_STATE;
3703            !!!next-input-character;
3704            redo A;
3705          } elsif ($self->{nc} == 0x0045 or # E
3706                   $self->{nc} == 0x0065) { # e
3707            $self->{state} = MD_E_STATE;
3708            $self->{kwd} = chr $self->{nc};
3709            !!!next-input-character;
3710            redo A;
3711          } elsif ($self->{nc} == 0x0041 or # A
3712                   $self->{nc} == 0x0061) { # a
3713            $self->{state} = MD_ATTLIST_STATE;
3714            $self->{kwd} = chr $self->{nc};
3715            !!!next-input-character;
3716            redo A;
3717          } elsif ($self->{nc} == 0x004E or # N
3718                   $self->{nc} == 0x006E) { # n
3719            $self->{state} = MD_NOTATION_STATE;
3720            $self->{kwd} = chr $self->{nc};
3721            !!!next-input-character;
3722            redo A;
3723          } else {
3724            #
3725          }
3726          
3727          ## XML5: No parse error.
3728          !!!parse-error (type => 'bogus comment',
3729                          line => $self->{line_prev},
3730                          column => $self->{column_prev} - 1);
3731          ## Reconsume.
3732          $self->{state} = BOGUS_COMMENT_STATE;
3733          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3734          redo A;
3735        } elsif ($self->{state} == MD_E_STATE) {
3736          if ($self->{nc} == 0x004E or # N
3737              $self->{nc} == 0x006E) { # n
3738            $self->{state} = MD_ENTITY_STATE;
3739            $self->{kwd} .= chr $self->{nc};
3740            !!!next-input-character;
3741            redo A;
3742          } elsif ($self->{nc} == 0x004C or # L
3743                   $self->{nc} == 0x006C) { # l
3744            ## XML5: <!ELEMENT> not supported.
3745            $self->{state} = MD_ELEMENT_STATE;
3746            $self->{kwd} .= chr $self->{nc};
3747            !!!next-input-character;
3748            redo A;
3749          } else {
3750            ## XML5: No parse error.
3751            !!!parse-error (type => 'bogus comment',
3752                            line => $self->{line_prev},
3753                            column => $self->{column_prev} - 2
3754                                + 1 * ($self->{nc} == -1));
3755            ## Reconsume.
3756            $self->{state} = BOGUS_COMMENT_STATE;
3757            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3758            redo A;
3759          }
3760        } elsif ($self->{state} == MD_ENTITY_STATE) {
3761          if ($self->{nc} == [
3762                undef,
3763                undef,
3764                0x0054, # T
3765                0x0049, # I
3766                0x0054, # T
3767              ]->[length $self->{kwd}] or
3768              $self->{nc} == [
3769                undef,
3770                undef,
3771                0x0074, # t
3772                0x0069, # i
3773                0x0074, # t
3774              ]->[length $self->{kwd}]) {
3775            ## Stay in the state.
3776            $self->{kwd} .= chr $self->{nc};
3777            !!!next-input-character;
3778            redo A;
3779          } elsif ((length $self->{kwd}) == 5 and
3780                   ($self->{nc} == 0x0059 or # Y
3781                    $self->{nc} == 0x0079)) { # y
3782            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3783              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3784                              text => 'ENTITY',
3785                              line => $self->{line_prev},
3786                              column => $self->{column_prev} - 4);
3787            }
3788            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3789                           line => $self->{line_prev},
3790                           column => $self->{column_prev} - 6};
3791            $self->{state} = DOCTYPE_MD_STATE;
3792            !!!next-input-character;
3793            redo A;
3794          } else {
3795            !!!parse-error (type => 'bogus comment',
3796                            line => $self->{line_prev},
3797                            column => $self->{column_prev} - 1
3798                                - (length $self->{kwd})
3799                                + 1 * ($self->{nc} == -1));
3800            $self->{state} = BOGUS_COMMENT_STATE;
3801            ## Reconsume.
3802            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3803            redo A;
3804          }
3805        } elsif ($self->{state} == MD_ELEMENT_STATE) {
3806          if ($self->{nc} == [
3807               undef,
3808               undef,
3809               0x0045, # E
3810               0x004D, # M
3811               0x0045, # E
3812               0x004E, # N
3813              ]->[length $self->{kwd}] or
3814              $self->{nc} == [
3815               undef,
3816               undef,
3817               0x0065, # e
3818               0x006D, # m
3819               0x0065, # e
3820               0x006E, # n
3821              ]->[length $self->{kwd}]) {
3822            ## Stay in the state.
3823            $self->{kwd} .= chr $self->{nc};
3824            !!!next-input-character;
3825            redo A;
3826          } elsif ((length $self->{kwd}) == 6 and
3827                   ($self->{nc} == 0x0054 or # T
3828                    $self->{nc} == 0x0074)) { # t
3829            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3830              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3831                              text => 'ELEMENT',
3832                              line => $self->{line_prev},
3833                              column => $self->{column_prev} - 5);
3834            }
3835            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3836                           line => $self->{line_prev},
3837                           column => $self->{column_prev} - 6};
3838            $self->{state} = DOCTYPE_MD_STATE;
3839            !!!next-input-character;
3840            redo A;
3841          } else {
3842            !!!parse-error (type => 'bogus comment',
3843                            line => $self->{line_prev},
3844                            column => $self->{column_prev} - 1
3845                                - (length $self->{kwd})
3846                                + 1 * ($self->{nc} == -1));
3847            $self->{state} = BOGUS_COMMENT_STATE;
3848            ## Reconsume.
3849            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3850            redo A;
3851          }
3852        } elsif ($self->{state} == MD_ATTLIST_STATE) {
3853          if ($self->{nc} == [
3854               undef,
3855               0x0054, # T
3856               0x0054, # T
3857               0x004C, # L
3858               0x0049, # I
3859               0x0053, # S
3860              ]->[length $self->{kwd}] or
3861              $self->{nc} == [
3862               undef,
3863               0x0074, # t
3864               0x0074, # t
3865               0x006C, # l
3866               0x0069, # i
3867               0x0073, # s
3868              ]->[length $self->{kwd}]) {
3869            ## Stay in the state.
3870            $self->{kwd} .= chr $self->{nc};
3871            !!!next-input-character;
3872            redo A;
3873          } elsif ((length $self->{kwd}) == 6 and
3874                   ($self->{nc} == 0x0054 or # T
3875                    $self->{nc} == 0x0074)) { # t
3876            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3877              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3878                              text => 'ATTLIST',
3879                              line => $self->{line_prev},
3880                              column => $self->{column_prev} - 5);
3881            }
3882            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3883                           attrdefs => [],
3884                           line => $self->{line_prev},
3885                           column => $self->{column_prev} - 6};
3886            $self->{state} = DOCTYPE_MD_STATE;
3887            !!!next-input-character;
3888            redo A;
3889          } else {
3890            !!!parse-error (type => 'bogus comment',
3891                            line => $self->{line_prev},
3892                            column => $self->{column_prev} - 1
3893                                 - (length $self->{kwd})
3894                                 + 1 * ($self->{nc} == -1));
3895            $self->{state} = BOGUS_COMMENT_STATE;
3896            ## Reconsume.
3897            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3898            redo A;
3899          }
3900        } elsif ($self->{state} == MD_NOTATION_STATE) {
3901          if ($self->{nc} == [
3902               undef,
3903               0x004F, # O
3904               0x0054, # T
3905               0x0041, # A
3906               0x0054, # T
3907               0x0049, # I
3908               0x004F, # O
3909              ]->[length $self->{kwd}] or
3910              $self->{nc} == [
3911               undef,
3912               0x006F, # o
3913               0x0074, # t
3914               0x0061, # a
3915               0x0074, # t
3916               0x0069, # i
3917               0x006F, # o
3918              ]->[length $self->{kwd}]) {
3919            ## Stay in the state.
3920            $self->{kwd} .= chr $self->{nc};
3921            !!!next-input-character;
3922            redo A;
3923          } elsif ((length $self->{kwd}) == 7 and
3924                   ($self->{nc} == 0x004E or # N
3925                    $self->{nc} == 0x006E)) { # n
3926            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3927              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3928                              text => 'NOTATION',
3929                              line => $self->{line_prev},
3930                              column => $self->{column_prev} - 6);
3931            }
3932            $self->{ct} = {type => NOTATION_TOKEN, name => '',
3933                           line => $self->{line_prev},
3934                           column => $self->{column_prev} - 6};
3935            $self->{state} = DOCTYPE_MD_STATE;
3936            !!!next-input-character;
3937            redo A;
3938          } else {
3939            !!!parse-error (type => 'bogus comment',
3940                            line => $self->{line_prev},
3941                            column => $self->{column_prev} - 1
3942                                - (length $self->{kwd})
3943                                + 1 * ($self->{nc} == -1));
3944            $self->{state} = BOGUS_COMMENT_STATE;
3945            ## Reconsume.
3946            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3947            redo A;
3948          }
3949        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3950          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3951          ## "DOCTYPE NOTATION state".
3952    
3953          if ($is_space->{$self->{nc}}) {
3954            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3955            $self->{state} = BEFORE_MD_NAME_STATE;
3956            !!!next-input-character;
3957            redo A;
3958          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3959                   $self->{nc} == 0x0025) { # %
3960            ## XML5: Switch to the "DOCTYPE bogus comment state".
3961            !!!parse-error (type => 'no space before md name'); ## TODO: type
3962            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3963            !!!next-input-character;
3964            redo A;
3965          } elsif ($self->{nc} == -1) {
3966            !!!parse-error (type => 'unclosed md'); ## TODO: type
3967            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3968            ## Reconsume.
3969            redo A;
3970          } elsif ($self->{nc} == 0x003E) { # >
3971            ## XML5: Switch to the "DOCTYPE bogus comment state".
3972            !!!parse-error (type => 'no md name'); ## TODO: type
3973            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3974            !!!next-input-character;
3975            redo A;
3976          } else {
3977            ## XML5: Switch to the "DOCTYPE bogus comment state".
3978            !!!parse-error (type => 'no space before md name'); ## TODO: type
3979            $self->{state} = BEFORE_MD_NAME_STATE;
3980            redo A;
3981          }
3982        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3983          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3984          ## before state", "DOCTYPE ATTLIST name before state".
3985    
3986          if ($is_space->{$self->{nc}}) {
3987            ## Stay in the state.
3988            !!!next-input-character;
3989            redo A;
3990          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3991                   $self->{nc} == 0x0025) { # %
3992            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3993            !!!next-input-character;
3994            redo A;
3995          } elsif ($self->{nc} == 0x003E) { # >
3996            ## XML5: Same as "Anything else".
3997            !!!parse-error (type => 'no md name'); ## TODO: type
3998            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3999            !!!next-input-character;
4000            redo A;
4001          } elsif ($self->{nc} == -1) {
4002            !!!parse-error (type => 'unclosed md'); ## TODO: type
4003            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4004            ## Reconsume.
4005            redo A;
4006          } else {
4007            ## XML5: [ATTLIST] Not defined yet.
4008            $self->{ct}->{name} .= chr $self->{nc};
4009            $self->{state} = MD_NAME_STATE;
4010            !!!next-input-character;
4011            redo A;
4012          }
4013        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4014          if ($is_space->{$self->{nc}}) {
4015            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4016            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4017            $self->{state} = BEFORE_MD_NAME_STATE;
4018            !!!next-input-character;
4019            redo A;
4020          } elsif ($self->{nc} == 0x003E) { # >
4021            ## XML5: Same as "Anything else".
4022            !!!parse-error (type => 'no md name'); ## TODO: type
4023            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4024            !!!next-input-character;
4025            redo A;
4026          } elsif ($self->{nc} == -1) {
4027            !!!parse-error (type => 'unclosed md');
4028            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4029            ## Reconsume.
4030            redo A;
4031          } else {
4032            ## XML5: No parse error.
4033            !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4034            $self->{state} = BOGUS_COMMENT_STATE;
4035            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4036            ## Reconsume.
4037            redo A;
4038          }
4039        } elsif ($self->{state} == MD_NAME_STATE) {
4040          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4041          
4042          if ($is_space->{$self->{nc}}) {
4043            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4044              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4045            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4046              ## TODO: ...
4047              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4048            } else { # ENTITY/NOTATION
4049              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4050            }
4051            !!!next-input-character;
4052            redo A;
4053          } elsif ($self->{nc} == 0x003E) { # >
4054            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4055              #
4056            } else {
4057              !!!parse-error (type => 'no md def'); ## TODO: type
4058            }
4059            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4060            !!!next-input-character;
4061            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4062            redo A;
4063          } elsif ($self->{nc} == -1) {
4064            ## XML5: [ATTLIST] No parse error.
4065            !!!parse-error (type => 'unclosed md');
4066            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4067            ## Reconsume.
4068            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4069            redo A;
4070          } else {
4071            ## XML5: [ATTLIST] Not defined yet.
4072            $self->{ct}->{name} .= chr $self->{nc};
4073            ## Stay in the state.
4074            !!!next-input-character;
4075            redo A;
4076          }
4077        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4078          if ($is_space->{$self->{nc}}) {
4079            ## Stay in the state.
4080            !!!next-input-character;
4081            redo A;
4082          } elsif ($self->{nc} == 0x003E) { # >
4083            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4084            !!!next-input-character;
4085            !!!emit ($self->{ct}); # ATTLIST
4086            redo A;
4087          } elsif ($self->{nc} == -1) {
4088            ## XML5: No parse error.
4089            !!!parse-error (type => 'unclosed md'); ## TODO: type
4090            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4091            !!!emit ($self->{ct});
4092            redo A;
4093          } else {
4094            ## XML5: Not defined yet.
4095            $self->{ca} = {name => chr ($self->{nc}), # attrdef
4096                           tokens => [],
4097                           line => $self->{line}, column => $self->{column}};
4098            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4099            !!!next-input-character;
4100            redo A;
4101          }
4102        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4103          if ($is_space->{$self->{nc}}) {
4104            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4105            !!!next-input-character;
4106            redo A;
4107          } elsif ($self->{nc} == 0x003E) { # >
4108            ## XML5: Same as "anything else".
4109            !!!parse-error (type => 'no attr type'); ## TODO: type
4110            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4111            !!!next-input-character;
4112            !!!emit ($self->{ct}); # ATTLIST
4113            redo A;
4114          } elsif ($self->{nc} == 0x0028) { # (
4115            ## XML5: Same as "anything else".
4116            !!!parse-error (type => 'no space before paren'); ## TODO: type
4117            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4118            !!!next-input-character;
4119            redo A;
4120          } elsif ($self->{nc} == -1) {
4121            ## XML5: No parse error.
4122            !!!parse-error (type => 'unclosed md'); ## TODO: type
4123            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4124            !!!next-input-character;
4125            !!!emit ($self->{ct}); # ATTLIST
4126            redo A;
4127          } else {
4128            ## XML5: Not defined yet.
4129            $self->{ca}->{name} .= chr $self->{nc};
4130            ## Stay in the state.
4131            !!!next-input-character;
4132            redo A;
4133          }
4134        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4135          if ($is_space->{$self->{nc}}) {
4136            ## Stay in the state.
4137            !!!next-input-character;
4138            redo A;
4139          } elsif ($self->{nc} == 0x003E) { # >
4140            ## XML5: Same as "anything else".
4141            !!!parse-error (type => 'no attr type'); ## TODO: type
4142            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4143            !!!next-input-character;
4144            !!!emit ($self->{ct}); # ATTLIST
4145            redo A;
4146          } elsif ($self->{nc} == 0x0028) { # (
4147            ## XML5: Same as "anything else".
4148            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4149            !!!next-input-character;
4150            redo A;
4151          } elsif ($self->{nc} == -1) {
4152            ## XML5: No parse error.
4153            !!!parse-error (type => 'unclosed md'); ## TODO: type
4154            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4155            !!!next-input-character;
4156            !!!emit ($self->{ct});
4157            redo A;
4158          } else {
4159            ## XML5: Not defined yet.
4160            $self->{ca}->{type} = chr $self->{nc};
4161            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4162            !!!next-input-character;
4163            redo A;
4164          }
4165        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4166          if ($is_space->{$self->{nc}}) {
4167            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4168            !!!next-input-character;
4169            redo A;
4170          } elsif ($self->{nc} == 0x0023) { # #
4171            ## XML5: Same as "anything else".
4172            !!!parse-error (type => 'no space before default value'); ## TODO: type
4173            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4174            !!!next-input-character;
4175            redo A;
4176          } elsif ($self->{nc} == 0x0022) { # "
4177            ## XML5: Same as "anything else".
4178            !!!parse-error (type => 'no space before default value'); ## TODO: type
4179            $self->{ca}->{value} = '';
4180            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4181            !!!next-input-character;
4182            redo A;
4183          } elsif ($self->{nc} == 0x0027) { # '
4184            ## XML5: Same as "anything else".
4185            !!!parse-error (type => 'no space before default value'); ## TODO: type
4186            $self->{ca}->{value} = '';
4187            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4188            !!!next-input-character;
4189            redo A;
4190          } elsif ($self->{nc} == 0x003E) { # >
4191            ## XML5: Same as "anything else".
4192            !!!parse-error (type => 'no attr default'); ## TODO: type
4193            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4194            !!!next-input-character;
4195            !!!emit ($self->{ct}); # ATTLIST
4196            redo A;
4197          } elsif ($self->{nc} == 0x0028) { # (
4198            ## XML5: Same as "anything else".
4199            !!!parse-error (type => 'no space before paren'); ## TODO: type
4200            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4201            !!!next-input-character;
4202            redo A;
4203          } elsif ($self->{nc} == -1) {
4204            ## XML5: No parse error.
4205            !!!parse-error (type => 'unclosed md'); ## TODO: type
4206            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4207            !!!next-input-character;
4208            !!!emit ($self->{ct});
4209            redo A;
4210          } else {
4211            ## XML5: Not defined yet.
4212            $self->{ca}->{type} .= chr $self->{nc};
4213            ## Stay in the state.
4214            !!!next-input-character;
4215            redo A;
4216          }
4217        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4218          if ($is_space->{$self->{nc}}) {
4219            ## Stay in the state.
4220            !!!next-input-character;
4221            redo A;
4222          } elsif ($self->{nc} == 0x0028) { # (
4223            ## XML5: Same as "anything else".
4224            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4225            !!!next-input-character;
4226            redo A;
4227          } elsif ($self->{nc} == 0x0023) { # #
4228            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4229            !!!next-input-character;
4230            redo A;
4231          } elsif ($self->{nc} == 0x0022) { # "
4232            ## XML5: Same as "anything else".
4233            $self->{ca}->{value} = '';
4234            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4235            !!!next-input-character;
4236            redo A;
4237          } elsif ($self->{nc} == 0x0027) { # '
4238            ## XML5: Same as "anything else".
4239            $self->{ca}->{value} = '';
4240            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4241            !!!next-input-character;
4242            redo A;
4243          } elsif ($self->{nc} == 0x003E) { # >
4244            ## XML5: Same as "anything else".
4245            !!!parse-error (type => 'no attr default'); ## TODO: type
4246            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247            !!!next-input-character;
4248            !!!emit ($self->{ct}); # ATTLIST
4249            redo A;
4250          } elsif ($self->{nc} == -1) {
4251            ## XML5: No parse error.
4252            !!!parse-error (type => 'unclosed md'); ## TODO: type
4253            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4254            !!!next-input-character;
4255            !!!emit ($self->{ct});
4256            redo A;
4257          } else {
4258            ## XML5: Switch to the "DOCTYPE bogus comment state".
4259            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4260            $self->{ca}->{value} = '';
4261            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4262            ## Reconsume.
4263            redo A;
4264          }
4265        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4266          if ($is_space->{$self->{nc}}) {
4267            ## Stay in the state.
4268            !!!next-input-character;
4269            redo A;
4270          } elsif ($self->{nc} == 0x007C) { # |
4271            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4272            ## Stay in the state.
4273            !!!next-input-character;
4274            redo A;
4275          } elsif ($self->{nc} == 0x0029) { # )
4276            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4277            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4278            !!!next-input-character;
4279            redo A;
4280          } elsif ($self->{nc} == 0x003E) { # >
4281            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4282            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4283            !!!next-input-character;
4284            !!!emit ($self->{ct}); # ATTLIST
4285            redo A;
4286          } elsif ($self->{nc} == -1) {
4287            ## XML5: No parse error.
4288            !!!parse-error (type => 'unclosed md'); ## TODO: type
4289            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4290            !!!next-input-character;
4291            !!!emit ($self->{ct});
4292            redo A;
4293          } else {
4294            push @{$self->{ca}->{tokens}}, chr $self->{nc};
4295            $self->{state} = ALLOWED_TOKEN_STATE;
4296            !!!next-input-character;
4297            redo A;
4298          }
4299        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4300          if ($is_space->{$self->{nc}}) {
4301            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4302            !!!next-input-character;
4303            redo A;
4304          } elsif ($self->{nc} == 0x007C) { # |
4305            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4306            !!!next-input-character;
4307            redo A;
4308          } elsif ($self->{nc} == 0x0029) { # )
4309            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4310            !!!next-input-character;
4311            redo A;
4312          } elsif ($self->{nc} == 0x003E) { # >
4313            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4314            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4315            !!!next-input-character;
4316            !!!emit ($self->{ct}); # ATTLIST
4317            redo A;
4318          } elsif ($self->{nc} == -1) {
4319            ## XML5: No parse error.
4320            !!!parse-error (type => 'unclosed md'); ## TODO: type
4321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4322            !!!next-input-character;
4323            !!!emit ($self->{ct});
4324            redo A;
4325          } else {
4326            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4327            ## Stay in the state.
4328            !!!next-input-character;
4329            redo A;
4330          }
4331        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4332          if ($is_space->{$self->{nc}}) {
4333            ## Stay in the state.
4334            !!!next-input-character;
4335            redo A;
4336          } elsif ($self->{nc} == 0x007C) { # |
4337            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4338            !!!next-input-character;
4339            redo A;
4340          } elsif ($self->{nc} == 0x0029) { # )
4341            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4342            !!!next-input-character;
4343            redo A;
4344          } elsif ($self->{nc} == 0x003E) { # >
4345            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4346            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4347            !!!next-input-character;
4348            !!!emit ($self->{ct}); # ATTLIST
4349            redo A;
4350          } elsif ($self->{nc} == -1) {
4351            ## XML5: No parse error.
4352            !!!parse-error (type => 'unclosed md'); ## TODO: type
4353            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4354            !!!next-input-character;
4355            !!!emit ($self->{ct});
4356            redo A;
4357          } else {
4358            !!!parse-error (type => 'space in allowed token', ## TODO: type
4359                            line => $self->{line_prev},
4360                            column => $self->{column_prev});
4361            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4362            $self->{state} = ALLOWED_TOKEN_STATE;
4363            !!!next-input-character;
4364            redo A;
4365          }
4366        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4367          if ($is_space->{$self->{nc}}) {
4368            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4369            !!!next-input-character;
4370            redo A;
4371          } elsif ($self->{nc} == 0x0023) { # #
4372            !!!parse-error (type => 'no space before default value'); ## TODO: type
4373            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4374            !!!next-input-character;
4375            redo A;
4376          } elsif ($self->{nc} == 0x0022) { # "
4377            !!!parse-error (type => 'no space before default value'); ## TODO: type
4378            $self->{ca}->{value} = '';
4379            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4380            !!!next-input-character;
4381            redo A;
4382          } elsif ($self->{nc} == 0x0027) { # '
4383            !!!parse-error (type => 'no space before default value'); ## TODO: type
4384            $self->{ca}->{value} = '';
4385            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4386            !!!next-input-character;
4387            redo A;
4388          } elsif ($self->{nc} == 0x003E) { # >
4389            !!!parse-error (type => 'no attr default'); ## TODO: type
4390            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4391            !!!next-input-character;
4392            !!!emit ($self->{ct}); # ATTLIST
4393            redo A;
4394          } elsif ($self->{nc} == -1) {
4395            !!!parse-error (type => 'unclosed md'); ## TODO: type
4396            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4397            !!!next-input-character;
4398            !!!emit ($self->{ct});
4399            redo A;
4400          } else {
4401            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4402            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4403            ## Reconsume.
4404            redo A;
4405          }
4406        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4407          if ($is_space->{$self->{nc}}) {
4408            ## Stay in the state.
4409            !!!next-input-character;
4410            redo A;
4411          } elsif ($self->{nc} == 0x0023) { # #
4412            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4413            !!!next-input-character;
4414            redo A;
4415          } elsif ($self->{nc} == 0x0022) { # "
4416            $self->{ca}->{value} = '';
4417            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4418            !!!next-input-character;
4419            redo A;
4420          } elsif ($self->{nc} == 0x0027) { # '
4421            $self->{ca}->{value} = '';
4422            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4423            !!!next-input-character;
4424            redo A;
4425          } elsif ($self->{nc} == 0x003E) { # >
4426            !!!parse-error (type => 'no attr default'); ## TODO: type
4427            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4428            !!!next-input-character;
4429            !!!emit ($self->{ct}); # ATTLIST
4430            redo A;
4431          } elsif ($self->{nc} == -1) {
4432            !!!parse-error (type => 'unclosed md'); ## TODO: type
4433            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4434            !!!next-input-character;
4435            !!!emit ($self->{ct});
4436            redo A;
4437          } else {
4438            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4439            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4440            ## Reconsume.
4441            redo A;
4442          }
4443        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4444          if ($is_space->{$self->{nc}}) {
4445            ## XML5: No parse error.
4446            !!!parse-error (type => 'no default type'); ## TODO: type
4447            $self->{state} = BOGUS_MD_STATE;
4448            ## Reconsume.
4449            redo A;
4450          } elsif ($self->{nc} == 0x0022) { # "
4451            ## XML5: Same as "anything else".
4452            $self->{ca}->{value} = '';
4453            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4454            !!!next-input-character;
4455            redo A;
4456          } elsif ($self->{nc} == 0x0027) { # '
4457            ## XML5: Same as "anything else".
4458            $self->{ca}->{value} = '';
4459            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4460            !!!next-input-character;
4461            redo A;
4462          } elsif ($self->{nc} == 0x003E) { # >
4463            ## XML5: Same as "anything else".
4464            !!!parse-error (type => 'no attr default'); ## TODO: type
4465            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4466            !!!next-input-character;
4467            !!!emit ($self->{ct}); # ATTLIST
4468            redo A;
4469          } elsif ($self->{nc} == -1) {
4470            ## XML5: No parse error.
4471            !!!parse-error (type => 'unclosed md'); ## TODO: type
4472            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4473            !!!next-input-character;
4474            !!!emit ($self->{ct});
4475            redo A;
4476          } else {
4477            $self->{ca}->{default} = chr $self->{nc};
4478            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4479            !!!next-input-character;
4480            redo A;
4481          }
4482        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4483          if ($is_space->{$self->{nc}}) {
4484            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4485            !!!next-input-character;
4486            redo A;
4487          } elsif ($self->{nc} == 0x0022) { # "
4488            ## XML5: Same as "anything else".
4489            !!!parse-error (type => 'no space before default value'); ## TODO: type
4490            $self->{ca}->{value} = '';
4491            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4492            !!!next-input-character;
4493            redo A;
4494          } elsif ($self->{nc} == 0x0027) { # '
4495            ## XML5: Same as "anything else".
4496            !!!parse-error (type => 'no space before default value'); ## TODO: type
4497            $self->{ca}->{value} = '';
4498            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4499            !!!next-input-character;
4500            redo A;
4501          } elsif ($self->{nc} == 0x003E) { # >
4502            ## XML5: Same as "anything else".
4503            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4504            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4505            !!!next-input-character;
4506            !!!emit ($self->{ct}); # ATTLIST
4507            redo A;
4508          } elsif ($self->{nc} == -1) {
4509            ## XML5: No parse error.
4510            !!!parse-error (type => 'unclosed md'); ## TODO: type
4511            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4512            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4513            !!!next-input-character;
4514            !!!emit ($self->{ct});
4515            redo A;
4516          } else {
4517            $self->{ca}->{default} .= chr $self->{nc};
4518            ## Stay in the state.
4519            !!!next-input-character;
4520            redo A;
4521          }
4522        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4523          if ($is_space->{$self->{nc}}) {
4524            ## Stay in the state.
4525            !!!next-input-character;
4526            redo A;
4527          } elsif ($self->{nc} == 0x0022) { # "
4528            $self->{ca}->{value} = '';
4529            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4530            !!!next-input-character;
4531            redo A;
4532          } elsif ($self->{nc} == 0x0027) { # '
4533            $self->{ca}->{value} = '';
4534            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4535            !!!next-input-character;
4536            redo A;
4537          } elsif ($self->{nc} == 0x003E) { # >
4538            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4539            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4540            !!!next-input-character;
4541            !!!emit ($self->{ct}); # ATTLIST
4542            redo A;
4543          } elsif ($self->{nc} == -1) {
4544            ## XML5: No parse error.
4545            !!!parse-error (type => 'unclosed md'); ## TODO: type
4546            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4547            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4548            !!!next-input-character;
4549            !!!emit ($self->{ct});
4550            redo A;
4551          } else {
4552            ## XML5: Not defined yet.
4553            if ($self->{ca}->{default} eq 'FIXED') {
4554              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4555            } else {
4556              push @{$self->{ct}->{attrdefs}}, $self->{ca};
4557              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4558            }
4559            ## Reconsume.
4560            redo A;
4561          }
4562        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4563          if ($is_space->{$self->{nc}} or
4564              $self->{nc} == -1 or
4565              $self->{nc} == 0x003E) { # >
4566            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4567            ## Reconsume.
4568            redo A;
4569          } else {
4570            !!!parse-error (type => 'no space before attr name'); ## TODO: type
4571            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4572            ## Reconsume.
4573            redo A;
4574          }
4575        } elsif ($self->{state} == NDATA_STATE) {
4576          ## ASCII case-insensitive
4577          if ($self->{nc} == [
4578                undef,
4579                0x0044, # D
4580                0x0041, # A
4581                0x0054, # T
4582              ]->[length $self->{kwd}] or
4583              $self->{nc} == [
4584                undef,
4585                0x0064, # d
4586                0x0061, # a
4587                0x0074, # t
4588              ]->[length $self->{kwd}]) {
4589            !!!cp (172.2);
4590            ## Stay in the state.
4591            $self->{kwd} .= chr $self->{nc};
4592            !!!next-input-character;
4593            redo A;
4594          } elsif ((length $self->{kwd}) == 4 and
4595                   ($self->{nc} == 0x0041 or # A
4596                    $self->{nc} == 0x0061)) { # a
4597            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4598              !!!cp (172.3);
4599              !!!parse-error (type => 'lowercase keyword', ## TODO: type
4600                              text => 'NDATA',
4601                              line => $self->{line_prev},
4602                              column => $self->{column_prev} - 4);
4603            } else {
4604              !!!cp (172.4);
4605            }
4606            $self->{state} = AFTER_NDATA_STATE;
4607            !!!next-input-character;
4608            redo A;
4609          } else {
4610            !!!parse-error (type => 'string after literal', ## TODO: type
4611                            line => $self->{line_prev},
4612                            column => $self->{column_prev} + 1
4613                                - length $self->{kwd});
4614            !!!cp (172.5);
4615            $self->{state} = BOGUS_MD_STATE;
4616            ## Reconsume.
4617            redo A;
4618          }
4619        } elsif ($self->{state} == AFTER_NDATA_STATE) {
4620          if ($is_space->{$self->{nc}}) {
4621            $self->{state} = BEFORE_NOTATION_NAME_STATE;
4622            !!!next-input-character;
4623            redo A;
4624          } elsif ($self->{nc} == 0x003E) { # >
4625            !!!parse-error (type => 'no notation name'); ## TODO: type
4626            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4627            !!!next-input-character;
4628            !!!emit ($self->{ct}); # ENTITY
4629            redo A;
4630          } elsif ($self->{nc} == -1) {
4631            !!!parse-error (type => 'unclosed md'); ## TODO: type
4632            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4633            !!!next-input-character;
4634            !!!emit ($self->{ct}); # ENTITY
4635            redo A;
4636          } else {
4637            !!!parse-error (type => 'string after literal', ## TODO: type
4638                            line => $self->{line_prev},
4639                            column => $self->{column_prev} + 1
4640                                - length $self->{kwd});
4641            $self->{state} = BOGUS_MD_STATE;
4642            ## Reconsume.
4643            redo A;
4644          }
4645        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4646          if ($is_space->{$self->{nc}}) {
4647            ## Stay in the state.
4648            !!!next-input-character;
4649            redo A;
4650          } elsif ($self->{nc} == 0x003E) { # >
4651            !!!parse-error (type => 'no notation name'); ## TODO: type
4652            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653            !!!next-input-character;
4654            !!!emit ($self->{ct}); # ENTITY
4655            redo A;
4656          } elsif ($self->{nc} == -1) {
4657            !!!parse-error (type => 'unclosed md'); ## TODO: type
4658            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4659            !!!next-input-character;
4660            !!!emit ($self->{ct}); # ENTITY
4661            redo A;
4662          } else {
4663            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4664            $self->{state} = NOTATION_NAME_STATE;
4665            !!!next-input-character;
4666            redo A;
4667          }
4668        } elsif ($self->{state} == NOTATION_NAME_STATE) {
4669          if ($is_space->{$self->{nc}}) {
4670            $self->{state} = AFTER_NOTATION_NAME_STATE;
4671            !!!next-input-character;
4672            redo A;
4673          } elsif ($self->{nc} == 0x003E) { # >
4674            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4675            !!!next-input-character;
4676            !!!emit ($self->{ct}); # ENTITY
4677            redo A;
4678          } elsif ($self->{nc} == -1) {
4679            !!!parse-error (type => 'unclosed md'); ## TODO: type
4680            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4681            !!!next-input-character;
4682            !!!emit ($self->{ct}); # ENTITY
4683            redo A;
4684          } else {
4685            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4686            ## Stay in the state.
4687            !!!next-input-character;
4688            redo A;
4689          }
4690        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4691          if ($self->{nc} == 0x0022) { # "
4692            $self->{state} = AFTER_NOTATION_NAME_STATE;
4693            !!!next-input-character;
4694            redo A;
4695          } elsif ($self->{nc} == 0x0026) { # &
4696            $self->{prev_state} = $self->{state};
4697            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4698            $self->{entity_add} = 0x0022; # "
4699            !!!next-input-character;
4700            redo A;
4701    ## TODO: %
4702          } elsif ($self->{nc} == -1) {
4703            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4704            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4705            ## Reconsume.
4706            !!!emit ($self->{ct}); # ENTITY
4707            redo A;
4708          } else {
4709            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4710            !!!next-input-character;
4711            redo A;
4712          }
4713        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4714          if ($self->{nc} == 0x0027) { # '
4715            $self->{state} = AFTER_NOTATION_NAME_STATE;
4716            !!!next-input-character;
4717            redo A;
4718          } elsif ($self->{nc} == 0x0026) { # &
4719            $self->{prev_state} = $self->{state};
4720            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4721            $self->{entity_add} = 0x0027; # '
4722            !!!next-input-character;
4723            redo A;
4724    ## TODO: %
4725          } elsif ($self->{nc} == -1) {
4726            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4727            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4728            ## Reconsume.
4729            !!!emit ($self->{ct}); # ENTITY
4730            redo A;
4731          } else {
4732            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4733            !!!next-input-character;
4734            redo A;
4735          }
4736        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4737          ## TODO: XMLize
4738    
4739          if ($is_space->{$self->{nc}} or
4740              {
4741                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4742                $self->{entity_add} => 1,
4743              }->{$self->{nc}}) {
4744            ## Don't consume
4745            ## No error
4746            ## Return nothing.
4747            #
4748          } elsif ($self->{nc} == 0x0023) { # #
4749            $self->{ca} = $self->{ct};
4750            $self->{state} = ENTITY_HASH_STATE;
4751            $self->{kwd} = '#';
4752            !!!next-input-character;
4753            redo A;
4754          } elsif ((0x0041 <= $self->{nc} and
4755                    $self->{nc} <= 0x005A) or # A..Z
4756                   (0x0061 <= $self->{nc} and
4757                    $self->{nc} <= 0x007A)) { # a..z
4758            #
4759          } else {
4760            !!!parse-error (type => 'bare ero');
4761            ## Return nothing.
4762            #
4763          }
4764    
4765          $self->{ct}->{value} .= '&';
4766          $self->{state} = $self->{prev_state};
4767          ## Reconsume.
4768          redo A;
4769        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4770          if ($is_space->{$self->{nc}}) {
4771            ## Stay in the state.
4772            !!!next-input-character;
4773            redo A;
4774          } elsif ($self->{nc} == 0x003E) { # >
4775            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4776            !!!next-input-character;
4777            !!!emit ($self->{ct}); # ENTITY
4778            redo A;
4779          } elsif ($self->{nc} == -1) {
4780            !!!parse-error (type => 'unclosed md'); ## TODO: type
4781            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4782            !!!next-input-character;
4783            !!!emit ($self->{ct}); # ENTITY
4784            redo A;
4785          } else {
4786            !!!parse-error (type => 'string after notation name'); ## TODO: type
4787            $self->{state} = BOGUS_MD_STATE;
4788            ## Reconsume.
4789            redo A;
4790          }
4791        } elsif ($self->{state} == BOGUS_MD_STATE) {
4792          if ($self->{nc} == 0x003E) { # >
4793            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4794            !!!next-input-character;
4795            !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4796            redo A;
4797          } elsif ($self->{nc} == -1) {
4798            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4799            ## Reconsume.
4800            !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4801            redo A;
4802          } else {
4803            ## Stay in the state.
4804          !!!next-input-character;          !!!next-input-character;
4805          redo A;          redo A;
4806        }        }
           
4807      } else {      } else {
4808        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
4809      }      }
# Line 3259  sub _get_next_token ($) { Line 4814  sub _get_next_token ($) {
4814    
4815  1;  1;
4816  ## $Date$  ## $Date$
4817                                    

Legend:
Removed from v.1.12  
changed lines
  Added in v.1.19

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24