/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC revision 1.14 by wakaba, Fri Oct 17 07:14:29 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} flag set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145  ## XML states  ## XML-only states
146  sub PI_STATE () { 51 }  sub PI_STATE () { 51 }
147  sub PI_TARGET_STATE () { 52 }  sub PI_TARGET_STATE () { 52 }
148  sub PI_TARGET_AFTER_STATE () { 53 }  sub PI_TARGET_AFTER_STATE () { 53 }
149  sub PI_DATA_STATE () { 54 }  sub PI_DATA_STATE () { 54 }
150  sub PI_AFTER_STATE () { 55 }  sub PI_AFTER_STATE () { 55 }
151  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    
168  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
169  ## list and descriptions)  ## list and descriptions)
# Line 186  sub _initialize_tokenizer ($) { Line 229  sub _initialize_tokenizer ($) {
229    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
230    
231    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
232    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
233      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
234    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
235    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
236    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 221  sub _initialize_tokenizer ($) { Line 265  sub _initialize_tokenizer ($) {
265  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
266  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
267  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
268    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
269    
270  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
271  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
272  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 240  my $is_space = { Line 286  my $is_space = {
286    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
287    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
288    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
289    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
290    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
291    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
292  };  };
# Line 450  sub _get_next_token ($) { Line 496  sub _get_next_token ($) {
496            redo A;            redo A;
497          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
498            !!!cp (15.1);            !!!cp (15.1);
499            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
500            #            #
501          } else {          } else {
502            !!!cp (16);            !!!cp (16);
503              $self->{s_kwd} = '';
504            #            #
505          }          }
506    
507          ## reconsume          ## reconsume
508          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
509          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
510                    line => $self->{line_prev},                    line => $self->{line_prev},
511                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 570  sub _get_next_token ($) { Line 616  sub _get_next_token ($) {
616        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
617          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
618            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
619            $self->{s_kwd} = '';            $self->{kwd} = '';
620            ## Reconsume.            ## Reconsume.
621            redo A;            redo A;
622          } else {          } else {
# Line 673  sub _get_next_token ($) { Line 719  sub _get_next_token ($) {
719          redo A;          redo A;
720        }        }
721      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
722        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
723        if (length $ch) {        if (length $ch) {
724          my $CH = $ch;          my $CH = $ch;
725          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 681  sub _get_next_token ($) { Line 727  sub _get_next_token ($) {
727          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
728            !!!cp (24);            !!!cp (24);
729            ## Stay in the state.            ## Stay in the state.
730            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
731            !!!next-input-character;            !!!next-input-character;
732            redo A;            redo A;
733          } else {          } else {
# Line 690  sub _get_next_token ($) { Line 736  sub _get_next_token ($) {
736            $self->{s_kwd} = '';            $self->{s_kwd} = '';
737            ## Reconsume.            ## Reconsume.
738            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
739                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
740                      line => $self->{line_prev},                      line => $self->{line_prev},
741                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
742                     });                     });
743            redo A;            redo A;
744          }          }
# Line 708  sub _get_next_token ($) { Line 754  sub _get_next_token ($) {
754            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
755            $self->{s_kwd} = '';            $self->{s_kwd} = '';
756            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
757                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
758                      line => $self->{line_prev},                      line => $self->{line_prev},
759                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
760                     });                     });
761            redo A;            redo A;
762          } else {          } else {
# Line 719  sub _get_next_token ($) { Line 765  sub _get_next_token ($) {
765                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
766                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
767                   line => $self->{line_prev},                   line => $self->{line_prev},
768                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
769            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
770            ## Reconsume.            ## Reconsume.
771            redo A;            redo A;
# Line 1548  sub _get_next_token ($) { Line 1594  sub _get_next_token ($) {
1594          redo A;          redo A;
1595        }        }
1596      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1597        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1598    
1599        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1600        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
1601                
1602        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1603          !!!cp (124);          if ($self->{in_subset}) {
1604          $self->{state} = DATA_STATE;            !!!cp (123);
1605          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1606            } else {
1607              !!!cp (124);
1608              $self->{state} = DATA_STATE;
1609              $self->{s_kwd} = '';
1610            }
1611          !!!next-input-character;          !!!next-input-character;
1612    
1613          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1614          redo A;          redo A;
1615        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1616          !!!cp (125);          if ($self->{in_subset}) {
1617          $self->{state} = DATA_STATE;            !!!cp (125.1);
1618          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1619            } else {
1620              !!!cp (125);
1621              $self->{state} = DATA_STATE;
1622              $self->{s_kwd} = '';
1623            }
1624          ## reconsume          ## reconsume
1625    
1626          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1581  sub _get_next_token ($) { Line 1637  sub _get_next_token ($) {
1637          redo A;          redo A;
1638        }        }
1639      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1640        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
1641                
1642        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1643          !!!cp (133);          !!!cp (133);
# Line 1593  sub _get_next_token ($) { Line 1649  sub _get_next_token ($) {
1649          ## ASCII case-insensitive.          ## ASCII case-insensitive.
1650          !!!cp (130);          !!!cp (130);
1651          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
1652          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
1653          !!!next-input-character;          !!!next-input-character;
1654          redo A;          redo A;
1655        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
# Line 1602  sub _get_next_token ($) { Line 1658  sub _get_next_token ($) {
1658                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1659          !!!cp (135.4);                          !!!cp (135.4);                
1660          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
1661          $self->{s_kwd} = '[';          $self->{kwd} = '[';
1662          !!!next-input-character;          !!!next-input-character;
1663          redo A;          redo A;
1664        } else {        } else {
# Line 1652  sub _get_next_token ($) { Line 1708  sub _get_next_token ($) {
1708              0x0054, # T              0x0054, # T
1709              0x0059, # Y              0x0059, # Y
1710              0x0050, # P              0x0050, # P
1711            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
1712            $self->{nc} == [            $self->{nc} == [
1713              undef,              undef,
1714              0x006F, # o              0x006F, # o
# Line 1660  sub _get_next_token ($) { Line 1716  sub _get_next_token ($) {
1716              0x0074, # t              0x0074, # t
1717              0x0079, # y              0x0079, # y
1718              0x0070, # p              0x0070, # p
1719            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
1720          !!!cp (131);          !!!cp (131);
1721          ## Stay in the state.          ## Stay in the state.
1722          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1723          !!!next-input-character;          !!!next-input-character;
1724          redo A;          redo A;
1725        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
1726                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1727                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1728          if ($self->{s_kwd} ne 'DOCTYP') {          if ($self->{is_xml} and
1729                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1730            !!!cp (129);            !!!cp (129);
1731            ## XML5: case-sensitive.            ## XML5: case-sensitive.
1732            !!!parse-error (type => 'lowercase keyword', ## TODO            !!!parse-error (type => 'lowercase keyword', ## TODO
# Line 1691  sub _get_next_token ($) { Line 1748  sub _get_next_token ($) {
1748          !!!cp (132);                  !!!cp (132);        
1749          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1750                          line => $self->{line_prev},                          line => $self->{line_prev},
1751                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1752          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1753          ## Reconsume.          ## Reconsume.
1754          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1755                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1756                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1757                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1758                                   };                                   };
1759          redo A;          redo A;
1760        }        }
# Line 1708  sub _get_next_token ($) { Line 1765  sub _get_next_token ($) {
1765              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
1766              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
1767              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
1768            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
1769          !!!cp (135.1);          !!!cp (135.1);
1770          ## Stay in the state.          ## Stay in the state.
1771          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1772          !!!next-input-character;          !!!next-input-character;
1773          redo A;          redo A;
1774        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
1775                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1776          if ($self->{is_xml} and          if ($self->{is_xml} and
1777              not $self->{tainted} and              not $self->{tainted} and
# Line 1739  sub _get_next_token ($) { Line 1796  sub _get_next_token ($) {
1796          !!!cp (135.3);          !!!cp (135.3);
1797          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1798                          line => $self->{line_prev},                          line => $self->{line_prev},
1799                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1800          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1801          ## Reconsume.          ## Reconsume.
1802          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1803                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1804                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1805                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1806                                   };                                   };
1807          redo A;          redo A;
1808        }        }
# Line 1756  sub _get_next_token ($) { Line 1813  sub _get_next_token ($) {
1813          !!!next-input-character;          !!!next-input-character;
1814          redo A;          redo A;
1815        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1816          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1817          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1818          $self->{s_kwd} = '';            !!!cp (138.1);
1819              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1820            } else {
1821              !!!cp (138);
1822              $self->{state} = DATA_STATE;
1823              $self->{s_kwd} = '';
1824            }
1825          !!!next-input-character;          !!!next-input-character;
1826    
1827          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1828    
1829          redo A;          redo A;
1830        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1831          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1832          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1833          $self->{s_kwd} = '';            !!!cp (139.1);
1834              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1835            } else {
1836              !!!cp (139);
1837              $self->{state} = DATA_STATE;
1838              $self->{s_kwd} = '';
1839            }
1840          ## reconsume          ## reconsume
1841    
1842          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1790  sub _get_next_token ($) { Line 1857  sub _get_next_token ($) {
1857          !!!next-input-character;          !!!next-input-character;
1858          redo A;          redo A;
1859        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1860          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1861          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1862          $self->{s_kwd} = '';            !!!cp (142.1);
1863              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1864            } else {
1865              !!!cp (142);
1866              $self->{state} = DATA_STATE;
1867              $self->{s_kwd} = '';
1868            }
1869          !!!next-input-character;          !!!next-input-character;
1870    
1871          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1872    
1873          redo A;          redo A;
1874        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1875          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1876          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1877          $self->{s_kwd} = '';            !!!cp (143.1);
1878              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1879            } else {
1880              !!!cp (143);
1881              $self->{state} = DATA_STATE;
1882              $self->{s_kwd} = '';
1883            }
1884          ## reconsume          ## reconsume
1885    
1886          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1818  sub _get_next_token ($) { Line 1895  sub _get_next_token ($) {
1895          redo A;          redo A;
1896        }        }
1897      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
1898          ## XML5: "Comment state" and "DOCTYPE comment state".
1899    
1900        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1901          !!!cp (145);          !!!cp (145);
1902          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
1903          !!!next-input-character;          !!!next-input-character;
1904          redo A;          redo A;
1905        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
1906          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1907          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1908          $self->{s_kwd} = '';            !!!cp (146.1);
1909              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1910            } else {
1911              !!!cp (146);
1912              $self->{state} = DATA_STATE;
1913              $self->{s_kwd} = '';
1914            }
1915          ## reconsume          ## reconsume
1916    
1917          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1845  sub _get_next_token ($) { Line 1929  sub _get_next_token ($) {
1929          redo A;          redo A;
1930        }        }
1931      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1932        ## XML5: "comment dash state".        ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
1933    
1934        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1935          !!!cp (148);          !!!cp (148);
# Line 1853  sub _get_next_token ($) { Line 1937  sub _get_next_token ($) {
1937          !!!next-input-character;          !!!next-input-character;
1938          redo A;          redo A;
1939        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
1940          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1941          $self->{s_kwd} = '';          if ($self->{in_subset}) {
1942          $self->{state} = DATA_STATE;            !!!cp (149.1);
1943          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1944            } else {
1945              !!!cp (149);
1946              $self->{state} = DATA_STATE;
1947              $self->{s_kwd} = '';
1948            }
1949          ## reconsume          ## reconsume
1950    
1951          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1871  sub _get_next_token ($) { Line 1959  sub _get_next_token ($) {
1959          redo A;          redo A;
1960        }        }
1961      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
1962          ## XML5: "Comment end state" and "DOCTYPE comment end state".
1963    
1964        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1965          !!!cp (151);          if ($self->{in_subset}) {
1966          $self->{state} = DATA_STATE;            !!!cp (151.1);
1967          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1968            } else {
1969              !!!cp (151);
1970              $self->{state} = DATA_STATE;
1971              $self->{s_kwd} = '';
1972            }
1973          !!!next-input-character;          !!!next-input-character;
1974    
1975          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1891  sub _get_next_token ($) { Line 1986  sub _get_next_token ($) {
1986          !!!next-input-character;          !!!next-input-character;
1987          redo A;          redo A;
1988        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
1989          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1990          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1991          $self->{s_kwd} = '';            !!!cp (153.1);
1992              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1993            } else {
1994              !!!cp (153);
1995              $self->{state} = DATA_STATE;
1996              $self->{s_kwd} = '';
1997            }
1998          ## reconsume          ## reconsume
1999    
2000          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1919  sub _get_next_token ($) { Line 2019  sub _get_next_token ($) {
2019          redo A;          redo A;
2020        } else {        } else {
2021          !!!cp (156);          !!!cp (156);
2022            ## XML5: Unless EOF, switch to the bogus comment state.
2023          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
2024          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2025          ## reconsume          ## reconsume
2026          redo A;          redo A;
2027        }        }
2028      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2029          ## XML5: "DOCTYPE root name before state".
2030    
2031        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2032          !!!cp (157);          !!!cp (157);
2033          ## Stay in the state          ## Stay in the state
# Line 1932  sub _get_next_token ($) { Line 2035  sub _get_next_token ($) {
2035          redo A;          redo A;
2036        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2037          !!!cp (158);          !!!cp (158);
2038            ## XML5: No parse error.
2039          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2040          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2041          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 1950  sub _get_next_token ($) { Line 2054  sub _get_next_token ($) {
2054          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2055    
2056          redo A;          redo A;
2057          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2058            !!!cp (159.1);
2059            !!!parse-error (type => 'no DOCTYPE name');
2060            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2062            $self->{in_subset} = 1;
2063            !!!next-input-character;
2064            !!!emit ($self->{ct}); # DOCTYPE
2065            redo A;
2066        } else {        } else {
2067          !!!cp (160);          !!!cp (160);
2068          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 1959  sub _get_next_token ($) { Line 2072  sub _get_next_token ($) {
2072          redo A;          redo A;
2073        }        }
2074      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2075  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2076    
2077          ## ISSUE: Redundant "First," in the spec.
2078    
2079        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2080          !!!cp (161);          !!!cp (161);
2081          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 1985  sub _get_next_token ($) { Line 2101  sub _get_next_token ($) {
2101          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2102    
2103          redo A;          redo A;
2104          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2105            !!!cp (163.1);
2106            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2107            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2108            $self->{in_subset} = 1;
2109            !!!next-input-character;
2110            !!!emit ($self->{ct}); # DOCTYPE
2111            redo A;
2112        } else {        } else {
2113          !!!cp (164);          !!!cp (164);
2114          $self->{ct}->{name}          $self->{ct}->{name}
# Line 1994  sub _get_next_token ($) { Line 2118  sub _get_next_token ($) {
2118          redo A;          redo A;
2119        }        }
2120      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2121          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2122          ## state", but implemented differently.
2123    
2124        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2125          !!!cp (165);          !!!cp (165);
2126          ## Stay in the state          ## Stay in the state
# Line 2021  sub _get_next_token ($) { Line 2148  sub _get_next_token ($) {
2148          redo A;          redo A;
2149        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2150                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2151            !!!cp (167.1);
2152          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2153          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2154          !!!next-input-character;          !!!next-input-character;
2155          redo A;          redo A;
2156        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2157                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2158            !!!cp (167.2);
2159          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2160          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2161          !!!next-input-character;          !!!next-input-character;
2162          redo A;          redo A;
2163          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2164            !!!cp (167.3);
2165            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2166            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2167            $self->{in_subset} = 1;
2168            !!!next-input-character;
2169            !!!emit ($self->{ct}); # DOCTYPE
2170            redo A;
2171        } else {        } else {
2172          !!!cp (180);          !!!cp (180);
2173          !!!parse-error (type => 'string after DOCTYPE name');          !!!parse-error (type => 'string after DOCTYPE name');
# Line 2048  sub _get_next_token ($) { Line 2185  sub _get_next_token ($) {
2185              0x0042, # B              0x0042, # B
2186              0x004C, # L              0x004C, # L
2187              0x0049, # I              0x0049, # I
2188            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2189            $self->{nc} == [            $self->{nc} == [
2190              undef,              undef,
2191              0x0075, # u              0x0075, # u
2192              0x0062, # b              0x0062, # b
2193              0x006C, # l              0x006C, # l
2194              0x0069, # i              0x0069, # i
2195            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2196          !!!cp (175);          !!!cp (175);
2197          ## Stay in the state.          ## Stay in the state.
2198          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2199          !!!next-input-character;          !!!next-input-character;
2200          redo A;          redo A;
2201        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2202                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2203                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2204          !!!cp (168);          if ($self->{is_xml} and
2205                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2206              !!!cp (168.1);
2207              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2208                              text => 'PUBLIC',
2209                              line => $self->{line_prev},
2210                              column => $self->{column_prev} - 4);
2211            } else {
2212              !!!cp (168);
2213            }
2214          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2215          !!!next-input-character;          !!!next-input-character;
2216          redo A;          redo A;
# Line 2072  sub _get_next_token ($) { Line 2218  sub _get_next_token ($) {
2218          !!!cp (169);          !!!cp (169);
2219          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2220                          line => $self->{line_prev},                          line => $self->{line_prev},
2221                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2222          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2223    
2224          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2087  sub _get_next_token ($) { Line 2233  sub _get_next_token ($) {
2233              0x0053, # S              0x0053, # S
2234              0x0054, # T              0x0054, # T
2235              0x0045, # E              0x0045, # E
2236            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2237            $self->{nc} == [            $self->{nc} == [
2238              undef,              undef,
2239              0x0079, # y              0x0079, # y
2240              0x0073, # s              0x0073, # s
2241              0x0074, # t              0x0074, # t
2242              0x0065, # e              0x0065, # e
2243            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2244          !!!cp (170);          !!!cp (170);
2245          ## Stay in the state.          ## Stay in the state.
2246          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2247          !!!next-input-character;          !!!next-input-character;
2248          redo A;          redo A;
2249        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2250                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2251                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2252          !!!cp (171);          if ($self->{is_xml} and
2253                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2254              !!!cp (171.1);
2255              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2256                              text => 'SYSTEM',
2257                              line => $self->{line_prev},
2258                              column => $self->{column_prev} - 4);
2259            } else {
2260              !!!cp (171);
2261            }
2262          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2263          !!!next-input-character;          !!!next-input-character;
2264          redo A;          redo A;
# Line 2111  sub _get_next_token ($) { Line 2266  sub _get_next_token ($) {
2266          !!!cp (172);          !!!cp (172);
2267          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2268                          line => $self->{line_prev},                          line => $self->{line_prev},
2269                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2270          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2271    
2272          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2160  sub _get_next_token ($) { Line 2315  sub _get_next_token ($) {
2315          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2316    
2317          redo A;          redo A;
2318          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2319            !!!cp (186.1);
2320            !!!parse-error (type => 'no PUBLIC literal');
2321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2322            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2323            $self->{in_subset} = 1;
2324            !!!next-input-character;
2325            !!!emit ($self->{ct}); # DOCTYPE
2326            redo A;
2327        } else {        } else {
2328          !!!cp (186);          !!!cp (186);
2329          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
# Line 2270  sub _get_next_token ($) { Line 2434  sub _get_next_token ($) {
2434          !!!next-input-character;          !!!next-input-character;
2435          redo A;          redo A;
2436        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2437          !!!cp (198);          if ($self->{is_xml}) {
2438              !!!cp (198.1);
2439              !!!parse-error (type => 'no SYSTEM literal');
2440            } else {
2441              !!!cp (198);
2442            }
2443          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2444          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2445          !!!next-input-character;          !!!next-input-character;
# Line 2290  sub _get_next_token ($) { Line 2459  sub _get_next_token ($) {
2459          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2460    
2461          redo A;          redo A;
2462          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2463            !!!cp (200.1);
2464            !!!parse-error (type => 'no SYSTEM literal');
2465            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2466            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2467            $self->{in_subset} = 1;
2468            !!!next-input-character;
2469            !!!emit ($self->{ct}); # DOCTYPE
2470            redo A;
2471        } else {        } else {
2472          !!!cp (200);          !!!cp (200);
2473          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
# Line 2340  sub _get_next_token ($) { Line 2518  sub _get_next_token ($) {
2518          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2519    
2520          redo A;          redo A;
2521          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2522            !!!cp (206.1);
2523            !!!parse-error (type => 'no SYSTEM literal');
2524    
2525            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2526            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2527            $self->{in_subset} = 1;
2528            !!!next-input-character;
2529            !!!emit ($self->{ct}); # DOCTYPE
2530            redo A;
2531        } else {        } else {
2532          !!!cp (206);          !!!cp (206);
2533          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
# Line 2355  sub _get_next_token ($) { Line 2543  sub _get_next_token ($) {
2543          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2544          !!!next-input-character;          !!!next-input-character;
2545          redo A;          redo A;
2546        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2547          !!!cp (208);          !!!cp (208);
2548          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2549    
# Line 2396  sub _get_next_token ($) { Line 2584  sub _get_next_token ($) {
2584          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2585          !!!next-input-character;          !!!next-input-character;
2586          redo A;          redo A;
2587        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2588          !!!cp (212);          !!!cp (212);
2589          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2590    
# Line 2457  sub _get_next_token ($) { Line 2645  sub _get_next_token ($) {
2645          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2646    
2647          redo A;          redo A;
2648          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2649            !!!cp (218.1);
2650            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2651            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2652            $self->{in_subset} = 1;
2653            !!!next-input-character;
2654            !!!emit ($self->{ct}); # DOCTYPE
2655            redo A;
2656        } else {        } else {
2657          !!!cp (218);          !!!cp (218);
2658          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
# Line 2476  sub _get_next_token ($) { Line 2672  sub _get_next_token ($) {
2672          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2673    
2674          redo A;          redo A;
2675          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2676            !!!cp (220.1);
2677            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2678            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2679            $self->{in_subset} = 1;
2680            !!!next-input-character;
2681            !!!emit ($self->{ct}); # DOCTYPE
2682            redo A;
2683        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2684          !!!cp (220);          !!!cp (220);
2685          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 2488  sub _get_next_token ($) { Line 2692  sub _get_next_token ($) {
2692        } else {        } else {
2693          !!!cp (221);          !!!cp (221);
2694          my $s = '';          my $s = '';
2695          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
2696    
2697          ## Stay in the state          ## Stay in the state
2698          !!!next-input-character;          !!!next-input-character;
# Line 2596  sub _get_next_token ($) { Line 2800  sub _get_next_token ($) {
2800        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
2801          !!!cp (999);          !!!cp (999);
2802          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
2803          $self->{s_kwd} = '#';          $self->{kwd} = '#';
2804          !!!next-input-character;          !!!next-input-character;
2805          redo A;          redo A;
2806        } elsif ((0x0041 <= $self->{nc} and        } elsif ((0x0041 <= $self->{nc} and
# Line 2606  sub _get_next_token ($) { Line 2810  sub _get_next_token ($) {
2810          !!!cp (998);          !!!cp (998);
2811          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
2812          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
2813          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2814          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
2815          $self->{entity__match} = 0;          $self->{entity__match} = 0;
2816          !!!next-input-character;          !!!next-input-character;
2817          redo A;          redo A;
# Line 2647  sub _get_next_token ($) { Line 2851  sub _get_next_token ($) {
2851            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
2852          !!!cp (995);          !!!cp (995);
2853          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
2854          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2855          !!!next-input-character;          !!!next-input-character;
2856          redo A;          redo A;
2857        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
2858                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
2859          !!!cp (994);          !!!cp (994);
2860          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
2861          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
2862          !!!next-input-character;          !!!next-input-character;
2863          redo A;          redo A;
2864        } else {        } else {
# Line 2690  sub _get_next_token ($) { Line 2894  sub _get_next_token ($) {
2894        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
2895            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
2896          !!!cp (1012);          !!!cp (1012);
2897          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
2898          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2899                    
2900          ## Stay in the state.          ## Stay in the state.
2901          !!!next-input-character;          !!!next-input-character;
# Line 2707  sub _get_next_token ($) { Line 2911  sub _get_next_token ($) {
2911          #          #
2912        }        }
2913    
2914        my $code = $self->{s_kwd};        my $code = $self->{kwd};
2915        my $l = $self->{line_prev};        my $l = $self->{line_prev};
2916        my $c = $self->{column_prev};        my $c = $self->{column_prev};
2917        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2750  sub _get_next_token ($) { Line 2954  sub _get_next_token ($) {
2954          # 0..9, A..F, a..f          # 0..9, A..F, a..f
2955          !!!cp (990);          !!!cp (990);
2956          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
2957          $self->{s_kwd} = 0;          $self->{kwd} = 0;
2958          ## Reconsume.          ## Reconsume.
2959          redo A;          redo A;
2960        } else {        } else {
# Line 2768  sub _get_next_token ($) { Line 2972  sub _get_next_token ($) {
2972            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2973            ## Reconsume.            ## Reconsume.
2974            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2975                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
2976                      line => $self->{line_prev},                      line => $self->{line_prev},
2977                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
2978                     });                     });
2979            redo A;            redo A;
2980          } else {          } else {
2981            !!!cp (989);            !!!cp (989);
2982            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
2983            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2984            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2985            ## Reconsume.            ## Reconsume.
# Line 2786  sub _get_next_token ($) { Line 2990  sub _get_next_token ($) {
2990        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2991          # 0..9          # 0..9
2992          !!!cp (1002);          !!!cp (1002);
2993          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2994          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2995          ## Stay in the state.          ## Stay in the state.
2996          !!!next-input-character;          !!!next-input-character;
2997          redo A;          redo A;
2998        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
2999                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
3000          !!!cp (1003);          !!!cp (1003);
3001          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3002          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
3003          ## Stay in the state.          ## Stay in the state.
3004          !!!next-input-character;          !!!next-input-character;
3005          redo A;          redo A;
3006        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
3007                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
3008          !!!cp (1004);          !!!cp (1004);
3009          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3010          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
3011          ## Stay in the state.          ## Stay in the state.
3012          !!!next-input-character;          !!!next-input-character;
3013          redo A;          redo A;
# Line 2820  sub _get_next_token ($) { Line 3024  sub _get_next_token ($) {
3024          #          #
3025        }        }
3026    
3027        my $code = $self->{s_kwd};        my $code = $self->{kwd};
3028        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3029        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3030        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2857  sub _get_next_token ($) { Line 3061  sub _get_next_token ($) {
3061          redo A;          redo A;
3062        }        }
3063      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3064        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
3065            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
3066            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
3067              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 2867  sub _get_next_token ($) { Line 3071  sub _get_next_token ($) {
3071              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
3072             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
3073          our $EntityChar;          our $EntityChar;
3074          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3075          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
3076            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3077              !!!cp (1020);              !!!cp (1020);
3078              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3079              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3080              !!!next-input-character;              !!!next-input-character;
3081              #              #
3082            } else {            } else {
3083              !!!cp (1021);              !!!cp (1021);
3084              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3085              $self->{entity__match} = -1;              $self->{entity__match} = -1;
3086              ## Stay in the state.              ## Stay in the state.
3087              !!!next-input-character;              !!!next-input-character;
# Line 2905  sub _get_next_token ($) { Line 3109  sub _get_next_token ($) {
3109          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
3110              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
3111            !!!cp (1024);            !!!cp (1024);
3112            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
3113            #            #
3114          } else {          } else {
3115            !!!cp (1025);            !!!cp (1025);
# Line 2917  sub _get_next_token ($) { Line 3121  sub _get_next_token ($) {
3121          !!!cp (1026);          !!!cp (1026);
3122          !!!parse-error (type => 'bare ero',          !!!parse-error (type => 'bare ero',
3123                          line => $self->{line_prev},                          line => $self->{line_prev},
3124                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
3125          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
3126          #          #
3127        }        }
3128        
# Line 2941  sub _get_next_token ($) { Line 3145  sub _get_next_token ($) {
3145                    data => $data,                    data => $data,
3146                    has_reference => $has_ref,                    has_reference => $has_ref,
3147                    line => $self->{line_prev},                    line => $self->{line_prev},
3148                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
3149                   });                   });
3150          redo A;          redo A;
3151        } else {        } else {
# Line 2957  sub _get_next_token ($) { Line 3161  sub _get_next_token ($) {
3161      ## XML-only states      ## XML-only states
3162    
3163      } elsif ($self->{state} == PI_STATE) {      } elsif ($self->{state} == PI_STATE) {
3164          ## XML5: "Pi state" and "DOCTYPE pi state".
3165    
3166        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
3167            $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"            $self->{nc} == 0x003F or # ?
3168            $self->{nc} == -1) {            $self->{nc} == -1) {
3169            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3170            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
3171            ## "DOCTYPE pi state": Parse error, switch to the "data
3172            ## state".
3173          !!!parse-error (type => 'bare pio', ## TODO: type          !!!parse-error (type => 'bare pio', ## TODO: type
3174                          line => $self->{line_prev},                          line => $self->{line_prev},
3175                          column => $self->{column_prev}                          column => $self->{column_prev}
# Line 2974  sub _get_next_token ($) { Line 3184  sub _get_next_token ($) {
3184                        };                        };
3185          redo A;          redo A;
3186        } else {        } else {
3187            ## XML5: "DOCTYPE pi state": Stay in the state.
3188          $self->{ct} = {type => PI_TOKEN,          $self->{ct} = {type => PI_TOKEN,
3189                         target => chr $self->{nc},                         target => chr $self->{nc},
3190                         data => '',                         data => '',
# Line 2991  sub _get_next_token ($) { Line 3202  sub _get_next_token ($) {
3202          redo A;          redo A;
3203        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3204          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3205          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3206          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3207            } else {
3208              $self->{state} = DATA_STATE;
3209              $self->{s_kwd} = '';
3210            }
3211          ## Reconsume.          ## Reconsume.
3212          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3213          redo A;          redo A;
# Line 3023  sub _get_next_token ($) { Line 3238  sub _get_next_token ($) {
3238          redo A;          redo A;
3239        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3240          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3241          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3242          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3243            } else {
3244              $self->{state} = DATA_STATE;
3245              $self->{s_kwd} = '';
3246            }
3247          ## Reprocess.          ## Reprocess.
3248          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3249          redo A;          redo A;
# Line 3038  sub _get_next_token ($) { Line 3257  sub _get_next_token ($) {
3257          redo A;          redo A;
3258        }        }
3259      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
3260          ## XML5: Part of "Pi after state".
3261    
3262        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3263          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3264          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3265            } else {
3266              $self->{state} = DATA_STATE;
3267              $self->{s_kwd} = '';
3268            }
3269          !!!next-input-character;          !!!next-input-character;
3270          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3271          redo A;          redo A;
# Line 3063  sub _get_next_token ($) { Line 3288  sub _get_next_token ($) {
3288          redo A;          redo A;
3289        }        }
3290      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3291        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3292    
3293        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3294          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3295          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3296            } else {
3297              $self->{state} = DATA_STATE;
3298              $self->{s_kwd} = '';
3299            }
3300          !!!next-input-character;          !!!next-input-character;
3301          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3302          redo A;          redo A;
# Line 3081  sub _get_next_token ($) { Line 3311  sub _get_next_token ($) {
3311          ## Reprocess.          ## Reprocess.
3312          redo A;          redo A;
3313        }        }
3314            
3315        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3316          if ($self->{nc} == 0x003C) { # <
3317            $self->{state} = DOCTYPE_TAG_STATE;
3318            !!!next-input-character;
3319            redo A;
3320          } elsif ($self->{nc} == 0x0025) { # %
3321            ## XML5: Not defined yet.
3322    
3323            ## TODO:
3324            !!!next-input-character;
3325            redo A;
3326          } elsif ($self->{nc} == 0x005D) { # ]
3327            delete $self->{in_subset};
3328            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3329            !!!next-input-character;
3330            redo A;
3331          } elsif ($is_space->{$self->{nc}}) {
3332            ## Stay in the state.
3333            !!!next-input-character;
3334            redo A;
3335          } elsif ($self->{nc} == -1) {
3336            !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3337            delete $self->{in_subset};
3338            $self->{state} = DATA_STATE;
3339            $self->{s_kwd} = '';
3340            ## Reconsume.
3341            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3342            redo A;
3343          } else {
3344            unless ($self->{internal_subset_tainted}) {
3345              ## XML5: No parse error.
3346              !!!parse-error (type => 'string in internal subset');
3347              $self->{internal_subset_tainted} = 1;
3348            }
3349            ## Stay in the state.
3350            !!!next-input-character;
3351            redo A;
3352          }
3353        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3354          if ($self->{nc} == 0x003E) { # >
3355            $self->{state} = DATA_STATE;
3356            $self->{s_kwd} = '';
3357            !!!next-input-character;
3358            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3359            redo A;
3360          } elsif ($self->{nc} == -1) {
3361            !!!parse-error (type => 'unclosed DOCTYPE');
3362            $self->{state} = DATA_STATE;
3363            $self->{s_kwd} = '';
3364            ## Reconsume.
3365            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3366            redo A;
3367          } else {
3368            ## XML5: No parse error and stay in the state.
3369            !!!parse-error (type => 'string after internal subset'); ## TODO: type
3370    
3371            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3372            !!!next-input-character;
3373            redo A;
3374          }
3375        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3376          if ($self->{nc} == 0x003E) { # >
3377            $self->{state} = DATA_STATE;
3378            $self->{s_kwd} = '';
3379            !!!next-input-character;
3380            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3381            redo A;
3382          } elsif ($self->{nc} == -1) {
3383            $self->{state} = DATA_STATE;
3384            $self->{s_kwd} = '';
3385            ## Reconsume.
3386            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3387            redo A;
3388          } else {
3389            ## Stay in the state.
3390            !!!next-input-character;
3391            redo A;
3392          }
3393        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3394          if ($self->{nc} == 0x0021) { # !
3395            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3396            !!!next-input-character;
3397            redo A;
3398          } elsif ($self->{nc} == 0x003F) { # ?
3399            $self->{state} = PI_STATE;
3400            !!!next-input-character;
3401            redo A;
3402          } elsif ($self->{nc} == -1) {
3403            !!!parse-error (type => 'bare stago');
3404            $self->{state} = DATA_STATE;
3405            $self->{s_kwd} = '';
3406            ## Reconsume.
3407            redo A;
3408          } else {
3409            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3410                            line => $self->{line_prev},
3411                            column => $self->{column_prev});
3412            $self->{state} = BOGUS_COMMENT_STATE;
3413            $self->{ct} = {type => COMMENT_TOKEN,
3414                           data => '',
3415                          }; ## NOTE: Will be discarded.
3416            !!!next-input-character;
3417            redo A;
3418          }
3419        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3420          ## XML5: "DOCTYPE markup declaration state".
3421          
3422          if ($self->{nc} == 0x002D) { # -
3423            $self->{state} = MD_HYPHEN_STATE;
3424            !!!next-input-character;
3425            redo A;
3426          } elsif ($self->{nc} == 0x0045) { # E
3427            $self->{state} = MD_E_STATE;
3428            $self->{kwd} = chr $self->{nc};
3429            !!!next-input-character;
3430            redo A;
3431          } elsif ($self->{nc} == 0x0041) { # A
3432            $self->{state} = MD_ATTLIST_STATE;
3433            $self->{kwd} = chr $self->{nc};
3434            !!!next-input-character;
3435            redo A;
3436          } elsif ($self->{nc} == 0x004E) { # N
3437            $self->{state} = MD_NOTATION_STATE;
3438            $self->{kwd} = chr $self->{nc};
3439            !!!next-input-character;
3440            redo A;
3441          } else {
3442            #
3443          }
3444          
3445          ## XML5: No parse error.
3446          !!!parse-error (type => 'bogus comment',
3447                          line => $self->{line_prev},
3448                          column => $self->{column_prev} - 1);
3449          ## Reconsume.
3450          $self->{state} = BOGUS_COMMENT_STATE;
3451          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3452          redo A;
3453        } elsif ($self->{state} == MD_E_STATE) {
3454          if ($self->{nc} == 0x004E) { # N
3455            $self->{state} = MD_ENTITY_STATE;
3456            $self->{kwd} .= chr $self->{nc};
3457            !!!next-input-character;
3458            redo A;
3459          } elsif ($self->{nc} == 0x004C) { # L
3460            ## XML5: <!ELEMENT> not supported.
3461            $self->{state} = MD_ELEMENT_STATE;
3462            $self->{kwd} .= chr $self->{nc};
3463            !!!next-input-character;
3464            redo A;
3465          } else {
3466            ## XML5: No parse error.
3467            !!!parse-error (type => 'bogus comment',
3468                            line => $self->{line_prev},
3469                            column => $self->{column_prev} - 2
3470                                + 1 * ($self->{nc} == -1));
3471            ## Reconsume.
3472            $self->{state} = BOGUS_COMMENT_STATE;
3473            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3474            redo A;
3475          }
3476        } elsif ($self->{state} == MD_ENTITY_STATE) {
3477          if ($self->{nc} == {
3478                'EN' => 0x0054, # T
3479                'ENT' => 0x0049, # I
3480                'ENTI' => 0x0054, # T
3481              }->{$self->{kwd}}) {
3482            ## Stay in the state.
3483            $self->{kwd} .= chr $self->{nc};
3484            !!!next-input-character;
3485            redo A;
3486          } elsif ($self->{kwd} eq 'ENTIT' and
3487                   $self->{nc} == 0x0059) { # Y
3488            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
3489                           line => $self->{line_prev},
3490                           column => $self->{column_prev} - 6};
3491            $self->{state} = DOCTYPE_MD_STATE;
3492            !!!next-input-character;
3493            redo A;
3494          } else {
3495            !!!parse-error (type => 'bogus comment',
3496                            line => $self->{line_prev},
3497                            column => $self->{column_prev} - 1
3498                                - (length $self->{kwd})
3499                                + 1 * ($self->{nc} == -1));
3500            $self->{state} = BOGUS_COMMENT_STATE;
3501            ## Reconsume.
3502            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3503            redo A;
3504          }
3505        } elsif ($self->{state} == MD_ELEMENT_STATE) {
3506          if ($self->{nc} == {
3507                'EL' => 0x0045, # E
3508                'ELE' => 0x004D, # M
3509                'ELEM' => 0x0045, # E
3510                'ELEME' => 0x004E, # N
3511              }->{$self->{kwd}}) {
3512            ## Stay in the state.
3513            $self->{kwd} .= chr $self->{nc};
3514            !!!next-input-character;
3515            redo A;
3516          } elsif ($self->{kwd} eq 'ELEMEN' and
3517                   $self->{nc} == 0x0054) { # T
3518            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3519                           line => $self->{line_prev},
3520                           column => $self->{column_prev} - 6};
3521            $self->{state} = DOCTYPE_MD_STATE;
3522            !!!next-input-character;
3523            redo A;
3524          } else {
3525            !!!parse-error (type => 'bogus comment',
3526                            line => $self->{line_prev},
3527                            column => $self->{column_prev} - 1
3528                                - (length $self->{kwd})
3529                                + 1 * ($self->{nc} == -1));
3530            $self->{state} = BOGUS_COMMENT_STATE;
3531            ## Reconsume.
3532            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3533            redo A;
3534          }
3535        } elsif ($self->{state} == MD_ATTLIST_STATE) {
3536          if ($self->{nc} == {
3537                'A' => 0x0054, # T
3538                'AT' => 0x0054, # T
3539                'ATT' => 0x004C, # L
3540                'ATTL' => 0x0049, # I
3541                'ATTLI' => 0x0053, # S
3542              }->{$self->{kwd}}) {
3543            ## Stay in the state.
3544            $self->{kwd} .= chr $self->{nc};
3545            !!!next-input-character;
3546            redo A;
3547          } elsif ($self->{kwd} eq 'ATTLIS' and
3548                   $self->{nc} == 0x0054) { # T
3549            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3550                           line => $self->{line_prev},
3551                           column => $self->{column_prev} - 6};
3552            $self->{state} = DOCTYPE_MD_STATE;
3553            !!!next-input-character;
3554            redo A;
3555          } else {
3556            !!!parse-error (type => 'bogus comment',
3557                            line => $self->{line_prev},
3558                            column => $self->{column_prev} - 1
3559                                 - (length $self->{kwd})
3560                                 + 1 * ($self->{nc} == -1));
3561            $self->{state} = BOGUS_COMMENT_STATE;
3562            ## Reconsume.
3563            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3564            redo A;
3565          }
3566        } elsif ($self->{state} == MD_NOTATION_STATE) {
3567          if ($self->{nc} == {
3568                'N' => 0x004F, # O
3569                'NO' => 0x0054, # T
3570                'NOT' => 0x0041, # A
3571                'NOTA' => 0x0054, # T
3572                'NOTAT' => 0x0049, # I
3573                'NOTATI' => 0x004F, # O
3574              }->{$self->{kwd}}) {
3575            ## Stay in the state.
3576            $self->{kwd} .= chr $self->{nc};
3577            !!!next-input-character;
3578            redo A;
3579          } elsif ($self->{kwd} eq 'NOTATIO' and
3580                   $self->{nc} == 0x004E) { # N
3581            $self->{ct} = {type => NOTATION_TOKEN, name => '',
3582                           line => $self->{line_prev},
3583                           column => $self->{column_prev} - 6};
3584            $self->{state} = DOCTYPE_MD_STATE;
3585            !!!next-input-character;
3586            redo A;
3587          } else {
3588            !!!parse-error (type => 'bogus comment',
3589                            line => $self->{line_prev},
3590                            column => $self->{column_prev} - 1
3591                                - (length $self->{kwd})
3592                                + 1 * ($self->{nc} == -1));
3593            $self->{state} = BOGUS_COMMENT_STATE;
3594            ## Reconsume.
3595            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3596            redo A;
3597          }
3598        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3599          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3600          ## "DOCTYPE NOTATION state".
3601    
3602          if ($is_space->{$self->{nc}}) {
3603            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3604            $self->{state} = BEFORE_MD_NAME_STATE;
3605            !!!next-input-character;
3606            redo A;
3607          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3608                   $self->{nc} == 0x0025) { # %
3609            ## XML5: Switch to the "DOCTYPE bogus comment state".
3610            !!!parse-error (type => 'no space before md name'); ## TODO: type
3611            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3612            !!!next-input-character;
3613            redo A;
3614          } elsif ($self->{nc} == -1) {
3615            !!!parse-error (type => 'unclosed md'); ## TODO: type
3616            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3617            ## Reconsume.
3618            redo A;
3619          } elsif ($self->{nc} == 0x003E) { # >
3620            ## XML5: Switch to the "DOCTYPE bogus comment state".
3621            !!!parse-error (type => 'no md name'); ## TODO: type
3622            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3623            !!!next-input-character;
3624            redo A;
3625          } else {
3626            ## XML5: Switch to the "DOCTYPE bogus comment state".
3627            !!!parse-error (type => 'no space before md name'); ## TODO: type
3628            $self->{state} = BEFORE_MD_NAME_STATE;
3629            redo A;
3630          }
3631        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3632          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3633          ## before state", "DOCTYPE ATTLIST name before state".
3634    
3635          if ($is_space->{$self->{nc}}) {
3636            ## Stay in the state.
3637            !!!next-input-character;
3638            redo A;
3639          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3640                   $self->{nc} == 0x0025) { # %
3641            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3642            !!!next-input-character;
3643            redo A;
3644          } elsif ($self->{nc} == 0x003E) { # >
3645            ## XML5: Same as "Anything else".
3646            !!!parse-error (type => 'no md name'); ## TODO: type
3647            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3648            !!!next-input-character;
3649            redo A;
3650          } elsif ($self->{nc} == -1) {
3651            !!!parse-error (type => 'unclosed md'); ## TODO: type
3652            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3653            ## Reconsume.
3654            redo A;
3655          } else {
3656            ## XML5: [ATTLIST] Not defined yet.
3657            $self->{ct}->{name} .= chr $self->{nc};
3658            $self->{state} = MD_NAME_STATE;
3659            !!!next-input-character;
3660            redo A;
3661          }
3662        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3663          if ($is_space->{$self->{nc}}) {
3664            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3665            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3666            $self->{state} = BEFORE_MD_NAME_STATE;
3667            !!!next-input-character;
3668            redo A;
3669          } elsif ($self->{nc} == 0x003E) { # >
3670            ## XML5: Same as "Anything else".
3671            !!!parse-error (type => 'no md name'); ## TODO: type
3672            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3673            !!!next-input-character;
3674            redo A;
3675          } elsif ($self->{nc} == -1) {
3676            !!!parse-error (type => 'unclosed md');
3677            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3678            ## Reconsume.
3679            redo A;
3680          } else {
3681            ## XML5: No parse error.
3682            !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
3683            $self->{state} = BOGUS_COMMENT_STATE;
3684            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3685            ## Reconsume.
3686            redo A;
3687          }
3688        } elsif ($self->{state} == MD_NAME_STATE) {
3689          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
3690          
3691          if ($is_space->{$self->{nc}}) {
3692            ## TODO:
3693            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3694            !!!next-input-character;
3695            redo A;
3696          } elsif ($self->{nc} == 0x003E) { # >
3697            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3698              #
3699            } else {
3700              !!!parse-error (type => 'no md body'); ## TODO: type
3701            }
3702            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3703            !!!next-input-character;
3704            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3705            redo A;
3706          } elsif ($self->{nc} == -1) {
3707            ## XML5: [ATTLIST] No parse error.
3708            !!!parse-error (type => 'unclosed md');
3709            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3710            ## Reconsume.
3711            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3712            redo A;
3713          } else {
3714            ## XML5: [ATTLIST] Not defined yet.
3715            $self->{ct}->{name} .= chr $self->{nc};
3716            ## Stay in the state.
3717            !!!next-input-character;
3718            redo A;
3719          }
3720        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
3721          if ($is_space->{$self->{nc}}) {
3722            ## Stay in the state.
3723            !!!next-input-character;
3724            redo A;
3725          } elsif ($self->{nc} == 0x003E) { # >
3726            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3727            !!!next-input-character;
3728            !!!emit ($self->{ct}); # ATTLIST
3729            redo A;
3730          } elsif ($self->{nc} == -1) {
3731            ## XML5: No parse error.
3732            !!!parse-error (type => 'unclosed md'); ## TODO: type
3733            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3734            redo A;
3735          } else {
3736            ## XML5: Not defined yet.
3737    
3738            ## TODO: ...
3739    
3740            $self->{state} = BOGUS_COMMENT_STATE;
3741            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3742            ## Reconsume.
3743            redo A;
3744          }
3745    
3746      } else {      } else {
3747        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
3748      }      }

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.14

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24