/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC revision 1.19 by wakaba, Sun Oct 19 07:19:00 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145  ## XML states  ## XML-only states
146  sub PI_STATE () { 51 }  sub PI_STATE () { 51 }
147  sub PI_TARGET_STATE () { 52 }  sub PI_TARGET_STATE () { 52 }
148  sub PI_TARGET_AFTER_STATE () { 53 }  sub PI_TARGET_AFTER_STATE () { 53 }
149  sub PI_DATA_STATE () { 54 }  sub PI_DATA_STATE () { 54 }
150  sub PI_AFTER_STATE () { 55 }  sub PI_AFTER_STATE () { 55 }
151  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }
187    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }
188    sub ENTITY_VALUE_ENTITY_STATE () { 93 }
189    sub BOGUS_MD_STATE () { 94 }
190    
191  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
192  ## list and descriptions)  ## list and descriptions)
# Line 186  sub _initialize_tokenizer ($) { Line 252  sub _initialize_tokenizer ($) {
252    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
253    
254    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
255    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
256      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
257    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
258    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
259    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 221  sub _initialize_tokenizer ($) { Line 288  sub _initialize_tokenizer ($) {
288  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
289  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
290  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
291    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
292    
293  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
294  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
295  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 240  my $is_space = { Line 309  my $is_space = {
309    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
310    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
311    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
312    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
313    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
314    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
315  };  };
# Line 450  sub _get_next_token ($) { Line 519  sub _get_next_token ($) {
519            redo A;            redo A;
520          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
521            !!!cp (15.1);            !!!cp (15.1);
522            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
523            #            #
524          } else {          } else {
525            !!!cp (16);            !!!cp (16);
526              $self->{s_kwd} = '';
527            #            #
528          }          }
529    
530          ## reconsume          ## reconsume
531          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
532          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
533                    line => $self->{line_prev},                    line => $self->{line_prev},
534                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 570  sub _get_next_token ($) { Line 639  sub _get_next_token ($) {
639        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
640          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
641            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
642            $self->{s_kwd} = '';            $self->{kwd} = '';
643            ## Reconsume.            ## Reconsume.
644            redo A;            redo A;
645          } else {          } else {
# Line 673  sub _get_next_token ($) { Line 742  sub _get_next_token ($) {
742          redo A;          redo A;
743        }        }
744      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
745        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
746        if (length $ch) {        if (length $ch) {
747          my $CH = $ch;          my $CH = $ch;
748          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 681  sub _get_next_token ($) { Line 750  sub _get_next_token ($) {
750          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
751            !!!cp (24);            !!!cp (24);
752            ## Stay in the state.            ## Stay in the state.
753            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
754            !!!next-input-character;            !!!next-input-character;
755            redo A;            redo A;
756          } else {          } else {
# Line 690  sub _get_next_token ($) { Line 759  sub _get_next_token ($) {
759            $self->{s_kwd} = '';            $self->{s_kwd} = '';
760            ## Reconsume.            ## Reconsume.
761            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
762                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
763                      line => $self->{line_prev},                      line => $self->{line_prev},
764                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
765                     });                     });
766            redo A;            redo A;
767          }          }
# Line 708  sub _get_next_token ($) { Line 777  sub _get_next_token ($) {
777            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
778            $self->{s_kwd} = '';            $self->{s_kwd} = '';
779            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
780                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
781                      line => $self->{line_prev},                      line => $self->{line_prev},
782                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
783                     });                     });
784            redo A;            redo A;
785          } else {          } else {
# Line 719  sub _get_next_token ($) { Line 788  sub _get_next_token ($) {
788                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
789                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
790                   line => $self->{line_prev},                   line => $self->{line_prev},
791                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
792            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
793            ## Reconsume.            ## Reconsume.
794            redo A;            redo A;
# Line 1211  sub _get_next_token ($) { Line 1280  sub _get_next_token ($) {
1280          redo A;          redo A;
1281        }        }
1282      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1283        ## XML5: "Tag attribute value double quoted state".        ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1284          ## ATTLIST attribute value double quoted state".
1285                
1286        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1287          !!!cp (95);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1288          ## XML5: "Tag attribute name before state".            !!!cp (95.1);
1289          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1290              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1291              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1292            } else {
1293              !!!cp (95);
1294              ## XML5: "Tag attribute name before state".
1295              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1296            }
1297          !!!next-input-character;          !!!next-input-character;
1298          redo A;          redo A;
1299        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1237  sub _get_next_token ($) { Line 1314  sub _get_next_token ($) {
1314          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1315            !!!cp (97);            !!!cp (97);
1316            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1317    
1318              $self->{state} = DATA_STATE;
1319              $self->{s_kwd} = '';
1320              ## reconsume
1321              !!!emit ($self->{ct}); # start tag
1322              redo A;
1323          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1325            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1246  sub _get_next_token ($) { Line 1329  sub _get_next_token ($) {
1329              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1330              !!!cp (99);              !!!cp (99);
1331            }            }
1332    
1333              $self->{state} = DATA_STATE;
1334              $self->{s_kwd} = '';
1335              ## reconsume
1336              !!!emit ($self->{ct}); # end tag
1337              redo A;
1338            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1339              ## XML5: No parse error above; not defined yet.
1340              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1341              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1342              ## Reconsume.
1343              !!!emit ($self->{ct}); # ATTLIST
1344              redo A;
1345          } else {          } else {
1346            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1347          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1348        } else {        } else {
1349            ## XML5 [ATTLIST]: Not defined yet.
1350          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1351            !!!cp (100);            !!!cp (100);
1352            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1274  sub _get_next_token ($) { Line 1364  sub _get_next_token ($) {
1364          redo A;          redo A;
1365        }        }
1366      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1367        ## XML5: "Tag attribute value single quoted state".        ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1368          ## ATTLIST attribute value single quoted state".
1369    
1370        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1371          !!!cp (101);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1372          ## XML5: "Before attribute name state" (sic).            !!!cp (101.1);
1373          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1374              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1375              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1376            } else {
1377              !!!cp (101);
1378              ## XML5: "Before attribute name state" (sic).
1379              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1380            }
1381          !!!next-input-character;          !!!next-input-character;
1382          redo A;          redo A;
1383        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1300  sub _get_next_token ($) { Line 1398  sub _get_next_token ($) {
1398          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1399            !!!cp (103);            !!!cp (103);
1400            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1401    
1402              $self->{state} = DATA_STATE;
1403              $self->{s_kwd} = '';
1404              ## reconsume
1405              !!!emit ($self->{ct}); # start tag
1406              redo A;
1407          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1408            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1409            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1309  sub _get_next_token ($) { Line 1413  sub _get_next_token ($) {
1413              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1414              !!!cp (105);              !!!cp (105);
1415            }            }
1416    
1417              $self->{state} = DATA_STATE;
1418              $self->{s_kwd} = '';
1419              ## reconsume
1420              !!!emit ($self->{ct}); # end tag
1421              redo A;
1422            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1423              ## XML5: No parse error above; not defined yet.
1424              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1425              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1426              ## Reconsume.
1427              !!!emit ($self->{ct}); # ATTLIST
1428              redo A;
1429          } else {          } else {
1430            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1431          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1432        } else {        } else {
1433            ## XML5 [ATTLIST]: Not defined yet.
1434          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1435            !!!cp (106);            !!!cp (106);
1436            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1340  sub _get_next_token ($) { Line 1451  sub _get_next_token ($) {
1451        ## XML5: "Tag attribute value unquoted state".        ## XML5: "Tag attribute value unquoted state".
1452    
1453        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1454          !!!cp (107);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1455          ## XML5: "Tag attribute name before state".            !!!cp (107.1);
1456          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1457              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1458            } else {
1459              !!!cp (107);
1460              ## XML5: "Tag attribute name before state".
1461              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1462            }
1463          !!!next-input-character;          !!!next-input-character;
1464          redo A;          redo A;
1465        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1363  sub _get_next_token ($) { Line 1480  sub _get_next_token ($) {
1480          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1481            !!!cp (109);            !!!cp (109);
1482            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1483    
1484              $self->{state} = DATA_STATE;
1485              $self->{s_kwd} = '';
1486              !!!next-input-character;
1487              !!!emit ($self->{ct}); # start tag
1488              redo A;
1489          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1490            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1491            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1372  sub _get_next_token ($) { Line 1495  sub _get_next_token ($) {
1495              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1496              !!!cp (111);              !!!cp (111);
1497            }            }
1498    
1499              $self->{state} = DATA_STATE;
1500              $self->{s_kwd} = '';
1501              !!!next-input-character;
1502              !!!emit ($self->{ct}); # end tag
1503              redo A;
1504            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1505              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1506              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1507              !!!next-input-character;
1508              !!!emit ($self->{ct}); # ATTLIST
1509              redo A;
1510          } else {          } else {
1511            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1512          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         !!!next-input-character;  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1513        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!parse-error (type => 'unclosed tag');  
1514          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1515            !!!cp (112);            !!!cp (112);
1516              !!!parse-error (type => 'unclosed tag');
1517            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1518    
1519              $self->{state} = DATA_STATE;
1520              $self->{s_kwd} = '';
1521              ## reconsume
1522              !!!emit ($self->{ct}); # start tag
1523              redo A;
1524          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1525              !!!parse-error (type => 'unclosed tag');
1526            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1527            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
1528              !!!cp (113);              !!!cp (113);
# Line 1396  sub _get_next_token ($) { Line 1531  sub _get_next_token ($) {
1531              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1532              !!!cp (114);              !!!cp (114);
1533            }            }
1534    
1535              $self->{state} = DATA_STATE;
1536              $self->{s_kwd} = '';
1537              ## reconsume
1538              !!!emit ($self->{ct}); # end tag
1539              redo A;
1540            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1541              !!!parse-error (type => 'unclosed md'); ## TODO: type
1542              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1543              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1544              ## Reconsume.
1545              !!!emit ($self->{ct}); # ATTLIST
1546              redo A;
1547          } else {          } else {
1548            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1549          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1550        } else {        } else {
1551          if ({          if ({
1552               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1548  sub _get_next_token ($) { Line 1689  sub _get_next_token ($) {
1689          redo A;          redo A;
1690        }        }
1691      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1692        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1693    
1694        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1695        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
1696                
1697        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1698          !!!cp (124);          if ($self->{in_subset}) {
1699          $self->{state} = DATA_STATE;            !!!cp (123);
1700          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1701            } else {
1702              !!!cp (124);
1703              $self->{state} = DATA_STATE;
1704              $self->{s_kwd} = '';
1705            }
1706          !!!next-input-character;          !!!next-input-character;
1707    
1708          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1709          redo A;          redo A;
1710        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1711          !!!cp (125);          if ($self->{in_subset}) {
1712          $self->{state} = DATA_STATE;            !!!cp (125.1);
1713          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1714            } else {
1715              !!!cp (125);
1716              $self->{state} = DATA_STATE;
1717              $self->{s_kwd} = '';
1718            }
1719          ## reconsume          ## reconsume
1720    
1721          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1581  sub _get_next_token ($) { Line 1732  sub _get_next_token ($) {
1732          redo A;          redo A;
1733        }        }
1734      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1735        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
1736                
1737        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1738          !!!cp (133);          !!!cp (133);
# Line 1593  sub _get_next_token ($) { Line 1744  sub _get_next_token ($) {
1744          ## ASCII case-insensitive.          ## ASCII case-insensitive.
1745          !!!cp (130);          !!!cp (130);
1746          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
1747          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
1748          !!!next-input-character;          !!!next-input-character;
1749          redo A;          redo A;
1750        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
# Line 1602  sub _get_next_token ($) { Line 1753  sub _get_next_token ($) {
1753                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1754          !!!cp (135.4);                          !!!cp (135.4);                
1755          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
1756          $self->{s_kwd} = '[';          $self->{kwd} = '[';
1757          !!!next-input-character;          !!!next-input-character;
1758          redo A;          redo A;
1759        } else {        } else {
# Line 1652  sub _get_next_token ($) { Line 1803  sub _get_next_token ($) {
1803              0x0054, # T              0x0054, # T
1804              0x0059, # Y              0x0059, # Y
1805              0x0050, # P              0x0050, # P
1806            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
1807            $self->{nc} == [            $self->{nc} == [
1808              undef,              undef,
1809              0x006F, # o              0x006F, # o
# Line 1660  sub _get_next_token ($) { Line 1811  sub _get_next_token ($) {
1811              0x0074, # t              0x0074, # t
1812              0x0079, # y              0x0079, # y
1813              0x0070, # p              0x0070, # p
1814            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
1815          !!!cp (131);          !!!cp (131);
1816          ## Stay in the state.          ## Stay in the state.
1817          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1818          !!!next-input-character;          !!!next-input-character;
1819          redo A;          redo A;
1820        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
1821                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1822                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1823          if ($self->{s_kwd} ne 'DOCTYP') {          if ($self->{is_xml} and
1824                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1825            !!!cp (129);            !!!cp (129);
1826            ## XML5: case-sensitive.            ## XML5: case-sensitive.
1827            !!!parse-error (type => 'lowercase keyword', ## TODO            !!!parse-error (type => 'lowercase keyword', ## TODO
# Line 1691  sub _get_next_token ($) { Line 1843  sub _get_next_token ($) {
1843          !!!cp (132);                  !!!cp (132);        
1844          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1845                          line => $self->{line_prev},                          line => $self->{line_prev},
1846                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1847          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1848          ## Reconsume.          ## Reconsume.
1849          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1850                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1851                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1852                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1853                                   };                                   };
1854          redo A;          redo A;
1855        }        }
# Line 1708  sub _get_next_token ($) { Line 1860  sub _get_next_token ($) {
1860              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
1861              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
1862              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
1863            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
1864          !!!cp (135.1);          !!!cp (135.1);
1865          ## Stay in the state.          ## Stay in the state.
1866          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1867          !!!next-input-character;          !!!next-input-character;
1868          redo A;          redo A;
1869        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
1870                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1871          if ($self->{is_xml} and          if ($self->{is_xml} and
1872              not $self->{tainted} and              not $self->{tainted} and
# Line 1739  sub _get_next_token ($) { Line 1891  sub _get_next_token ($) {
1891          !!!cp (135.3);          !!!cp (135.3);
1892          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1893                          line => $self->{line_prev},                          line => $self->{line_prev},
1894                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1895          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1896          ## Reconsume.          ## Reconsume.
1897          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1898                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1899                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1900                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1901                                   };                                   };
1902          redo A;          redo A;
1903        }        }
# Line 1756  sub _get_next_token ($) { Line 1908  sub _get_next_token ($) {
1908          !!!next-input-character;          !!!next-input-character;
1909          redo A;          redo A;
1910        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1911          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1912          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1913          $self->{s_kwd} = '';            !!!cp (138.1);
1914              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1915            } else {
1916              !!!cp (138);
1917              $self->{state} = DATA_STATE;
1918              $self->{s_kwd} = '';
1919            }
1920          !!!next-input-character;          !!!next-input-character;
1921    
1922          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1923    
1924          redo A;          redo A;
1925        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1926          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1927          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1928          $self->{s_kwd} = '';            !!!cp (139.1);
1929              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1930            } else {
1931              !!!cp (139);
1932              $self->{state} = DATA_STATE;
1933              $self->{s_kwd} = '';
1934            }
1935          ## reconsume          ## reconsume
1936    
1937          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1790  sub _get_next_token ($) { Line 1952  sub _get_next_token ($) {
1952          !!!next-input-character;          !!!next-input-character;
1953          redo A;          redo A;
1954        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1955          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1956          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1957          $self->{s_kwd} = '';            !!!cp (142.1);
1958              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1959            } else {
1960              !!!cp (142);
1961              $self->{state} = DATA_STATE;
1962              $self->{s_kwd} = '';
1963            }
1964          !!!next-input-character;          !!!next-input-character;
1965    
1966          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1967    
1968          redo A;          redo A;
1969        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1970          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1971          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1972          $self->{s_kwd} = '';            !!!cp (143.1);
1973              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1974            } else {
1975              !!!cp (143);
1976              $self->{state} = DATA_STATE;
1977              $self->{s_kwd} = '';
1978            }
1979          ## reconsume          ## reconsume
1980    
1981          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1818  sub _get_next_token ($) { Line 1990  sub _get_next_token ($) {
1990          redo A;          redo A;
1991        }        }
1992      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
1993          ## XML5: "Comment state" and "DOCTYPE comment state".
1994    
1995        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1996          !!!cp (145);          !!!cp (145);
1997          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
1998          !!!next-input-character;          !!!next-input-character;
1999          redo A;          redo A;
2000        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
2001          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2002          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2003          $self->{s_kwd} = '';            !!!cp (146.1);
2004              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2005            } else {
2006              !!!cp (146);
2007              $self->{state} = DATA_STATE;
2008              $self->{s_kwd} = '';
2009            }
2010          ## reconsume          ## reconsume
2011    
2012          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1845  sub _get_next_token ($) { Line 2024  sub _get_next_token ($) {
2024          redo A;          redo A;
2025        }        }
2026      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2027        ## XML5: "comment dash state".        ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2028    
2029        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2030          !!!cp (148);          !!!cp (148);
# Line 1853  sub _get_next_token ($) { Line 2032  sub _get_next_token ($) {
2032          !!!next-input-character;          !!!next-input-character;
2033          redo A;          redo A;
2034        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
2035          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2036          $self->{s_kwd} = '';          if ($self->{in_subset}) {
2037          $self->{state} = DATA_STATE;            !!!cp (149.1);
2038          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2039            } else {
2040              !!!cp (149);
2041              $self->{state} = DATA_STATE;
2042              $self->{s_kwd} = '';
2043            }
2044          ## reconsume          ## reconsume
2045    
2046          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1871  sub _get_next_token ($) { Line 2054  sub _get_next_token ($) {
2054          redo A;          redo A;
2055        }        }
2056      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2057          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2058    
2059        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2060          !!!cp (151);          if ($self->{in_subset}) {
2061          $self->{state} = DATA_STATE;            !!!cp (151.1);
2062          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2063            } else {
2064              !!!cp (151);
2065              $self->{state} = DATA_STATE;
2066              $self->{s_kwd} = '';
2067            }
2068          !!!next-input-character;          !!!next-input-character;
2069    
2070          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1891  sub _get_next_token ($) { Line 2081  sub _get_next_token ($) {
2081          !!!next-input-character;          !!!next-input-character;
2082          redo A;          redo A;
2083        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
2084          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2085          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2086          $self->{s_kwd} = '';            !!!cp (153.1);
2087              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2088            } else {
2089              !!!cp (153);
2090              $self->{state} = DATA_STATE;
2091              $self->{s_kwd} = '';
2092            }
2093          ## reconsume          ## reconsume
2094    
2095          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1919  sub _get_next_token ($) { Line 2114  sub _get_next_token ($) {
2114          redo A;          redo A;
2115        } else {        } else {
2116          !!!cp (156);          !!!cp (156);
2117            ## XML5: Unless EOF, switch to the bogus comment state.
2118          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
2119          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2120          ## reconsume          ## reconsume
2121          redo A;          redo A;
2122        }        }
2123      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2124          ## XML5: "DOCTYPE root name before state".
2125    
2126        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2127          !!!cp (157);          !!!cp (157);
2128          ## Stay in the state          ## Stay in the state
# Line 1932  sub _get_next_token ($) { Line 2130  sub _get_next_token ($) {
2130          redo A;          redo A;
2131        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2132          !!!cp (158);          !!!cp (158);
2133            ## XML5: No parse error.
2134          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2135          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2136          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 1950  sub _get_next_token ($) { Line 2149  sub _get_next_token ($) {
2149          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2150    
2151          redo A;          redo A;
2152          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2153            !!!cp (159.1);
2154            !!!parse-error (type => 'no DOCTYPE name');
2155            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2156            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2157            $self->{in_subset} = 1;
2158            !!!next-input-character;
2159            !!!emit ($self->{ct}); # DOCTYPE
2160            redo A;
2161        } else {        } else {
2162          !!!cp (160);          !!!cp (160);
2163          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 1959  sub _get_next_token ($) { Line 2167  sub _get_next_token ($) {
2167          redo A;          redo A;
2168        }        }
2169      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2170  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2171    
2172          ## ISSUE: Redundant "First," in the spec.
2173    
2174        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2175          !!!cp (161);          !!!cp (161);
2176          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 1985  sub _get_next_token ($) { Line 2196  sub _get_next_token ($) {
2196          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2197    
2198          redo A;          redo A;
2199          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2200            !!!cp (163.1);
2201            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2202            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2203            $self->{in_subset} = 1;
2204            !!!next-input-character;
2205            !!!emit ($self->{ct}); # DOCTYPE
2206            redo A;
2207        } else {        } else {
2208          !!!cp (164);          !!!cp (164);
2209          $self->{ct}->{name}          $self->{ct}->{name}
# Line 1994  sub _get_next_token ($) { Line 2213  sub _get_next_token ($) {
2213          redo A;          redo A;
2214        }        }
2215      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2216          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2217          ## state", but implemented differently.
2218    
2219        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2220          !!!cp (165);          !!!cp (165);
2221          ## Stay in the state          ## Stay in the state
2222          !!!next-input-character;          !!!next-input-character;
2223          redo A;          redo A;
2224        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2225          !!!cp (166);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2226          $self->{state} = DATA_STATE;            !!!cp (166);
2227          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2228              $self->{s_kwd} = '';
2229            } else {
2230              !!!cp (166.1);
2231              !!!parse-error (type => 'no md def'); ## TODO: type
2232              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2233            }
2234            
2235          !!!next-input-character;          !!!next-input-character;
2236            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2237          redo A;          redo A;
2238        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2239          !!!cp (167);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2240          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (167);
2241          $self->{state} = DATA_STATE;            !!!parse-error (type => 'unclosed DOCTYPE');
2242          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2243          ## reconsume            $self->{s_kwd} = '';
2244              $self->{ct}->{quirks} = 1;
2245          $self->{ct}->{quirks} = 1;          } else {
2246          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (167.12);
2247              !!!parse-error (type => 'unclosed md'); ## TODO: type
2248              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2249            }
2250            
2251            ## Reconsume.
2252            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2253          redo A;          redo A;
2254        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2255                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2256            !!!cp (167.1);
2257          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2258          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2259          !!!next-input-character;          !!!next-input-character;
2260          redo A;          redo A;
2261        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2262                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2263            !!!cp (167.2);
2264          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2265          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2266            !!!next-input-character;
2267            redo A;
2268          } elsif ($self->{nc} == 0x0022 and # "
2269                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2270                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2271            !!!cp (167.21);
2272            $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2273            $self->{ct}->{value} = ''; # ENTITY
2274          !!!next-input-character;          !!!next-input-character;
2275          redo A;          redo A;
2276          } elsif ($self->{nc} == 0x0027 and # '
2277                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2278                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2279            !!!cp (167.22);
2280            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2281            $self->{ct}->{value} = ''; # ENTITY
2282            !!!next-input-character;
2283            redo A;
2284          } elsif ($self->{is_xml} and
2285                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2286                   $self->{nc} == 0x005B) { # [
2287            !!!cp (167.3);
2288            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2289            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2290            $self->{in_subset} = 1;
2291            !!!next-input-character;
2292            !!!emit ($self->{ct}); # DOCTYPE
2293            redo A;
2294        } else {        } else {
2295          !!!cp (180);          !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2296          !!!parse-error (type => 'string after DOCTYPE name');  
2297          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2298              !!!cp (180);
2299              $self->{ct}->{quirks} = 1;
2300              $self->{state} = BOGUS_DOCTYPE_STATE;
2301            } else {
2302              !!!cp (180.1);
2303              $self->{state} = BOGUS_MD_STATE;
2304            }
2305    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
2306          !!!next-input-character;          !!!next-input-character;
2307          redo A;          redo A;
2308        }        }
# Line 2048  sub _get_next_token ($) { Line 2314  sub _get_next_token ($) {
2314              0x0042, # B              0x0042, # B
2315              0x004C, # L              0x004C, # L
2316              0x0049, # I              0x0049, # I
2317            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2318            $self->{nc} == [            $self->{nc} == [
2319              undef,              undef,
2320              0x0075, # u              0x0075, # u
2321              0x0062, # b              0x0062, # b
2322              0x006C, # l              0x006C, # l
2323              0x0069, # i              0x0069, # i
2324            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2325          !!!cp (175);          !!!cp (175);
2326          ## Stay in the state.          ## Stay in the state.
2327          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2328          !!!next-input-character;          !!!next-input-character;
2329          redo A;          redo A;
2330        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2331                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2332                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2333          !!!cp (168);          if ($self->{is_xml} and
2334                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2335              !!!cp (168.1);
2336              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2337                              text => 'PUBLIC',
2338                              line => $self->{line_prev},
2339                              column => $self->{column_prev} - 4);
2340            } else {
2341              !!!cp (168);
2342            }
2343          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2344          !!!next-input-character;          !!!next-input-character;
2345          redo A;          redo A;
2346        } else {        } else {
2347          !!!cp (169);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2348                          line => $self->{line_prev},                          line => $self->{line_prev},
2349                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2350          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2351              !!!cp (169);
2352          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2353              $self->{state} = BOGUS_DOCTYPE_STATE;
2354            } else {
2355              !!!cp (169.1);
2356              $self->{state} = BOGUS_MD_STATE;
2357            }
2358          ## Reconsume.          ## Reconsume.
2359          redo A;          redo A;
2360        }        }
# Line 2087  sub _get_next_token ($) { Line 2366  sub _get_next_token ($) {
2366              0x0053, # S              0x0053, # S
2367              0x0054, # T              0x0054, # T
2368              0x0045, # E              0x0045, # E
2369            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2370            $self->{nc} == [            $self->{nc} == [
2371              undef,              undef,
2372              0x0079, # y              0x0079, # y
2373              0x0073, # s              0x0073, # s
2374              0x0074, # t              0x0074, # t
2375              0x0065, # e              0x0065, # e
2376            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2377          !!!cp (170);          !!!cp (170);
2378          ## Stay in the state.          ## Stay in the state.
2379          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2380          !!!next-input-character;          !!!next-input-character;
2381          redo A;          redo A;
2382        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2383                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2384                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2385          !!!cp (171);          if ($self->{is_xml} and
2386                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2387              !!!cp (171.1);
2388              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2389                              text => 'SYSTEM',
2390                              line => $self->{line_prev},
2391                              column => $self->{column_prev} - 4);
2392            } else {
2393              !!!cp (171);
2394            }
2395          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2396          !!!next-input-character;          !!!next-input-character;
2397          redo A;          redo A;
2398        } else {        } else {
2399          !!!cp (172);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2400                          line => $self->{line_prev},                          line => $self->{line_prev},
2401                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2402          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2403              !!!cp (172);
2404          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2405              $self->{state} = BOGUS_DOCTYPE_STATE;
2406            } else {
2407              !!!cp (172.1);
2408              $self->{state} = BOGUS_MD_STATE;
2409            }
2410          ## Reconsume.          ## Reconsume.
2411          redo A;          redo A;
2412        }        }
# Line 2137  sub _get_next_token ($) { Line 2429  sub _get_next_token ($) {
2429          !!!next-input-character;          !!!next-input-character;
2430          redo A;          redo A;
2431        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
         !!!cp (184);  
2432          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2433            
2434          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2435          $self->{s_kwd} = '';            !!!cp (184);
2436              $self->{state} = DATA_STATE;
2437              $self->{s_kwd} = '';
2438              $self->{ct}->{quirks} = 1;
2439            } else {
2440              !!!cp (184.1);
2441              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2442            }
2443            
2444          !!!next-input-character;          !!!next-input-character;
2445            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2446          redo A;          redo A;
2447        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2448          !!!cp (185);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2449          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (185);
2450              !!!parse-error (type => 'unclosed DOCTYPE');
2451          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2452          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2453              $self->{ct}->{quirks} = 1;
2454            } else {
2455              !!!cp (185.1);
2456              !!!parse-error (type => 'unclosed md'); ## TODO: type
2457              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2458            }
2459            
2460          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
2461          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2462            redo A;
2463          } elsif ($self->{is_xml} and
2464                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2465                   $self->{nc} == 0x005B) { # [
2466            !!!cp (186.1);
2467            !!!parse-error (type => 'no PUBLIC literal');
2468            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2469            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2470            $self->{in_subset} = 1;
2471            !!!next-input-character;
2472            !!!emit ($self->{ct}); # DOCTYPE
2473          redo A;          redo A;
2474        } else {        } else {
         !!!cp (186);  
2475          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
2476    
2477          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2478              !!!cp (186);
2479              $self->{ct}->{quirks} = 1;
2480              $self->{state} = BOGUS_DOCTYPE_STATE;
2481            } else {
2482              !!!cp (186.2);
2483              $self->{state} = BOGUS_MD_STATE;
2484            }
2485    
2486          !!!next-input-character;          !!!next-input-character;
2487          redo A;          redo A;
2488        }        }
# Line 2176  sub _get_next_token ($) { Line 2493  sub _get_next_token ($) {
2493          !!!next-input-character;          !!!next-input-character;
2494          redo A;          redo A;
2495        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (188);  
2496          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2497    
2498          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2499          $self->{s_kwd} = '';            !!!cp (188);
2500          !!!next-input-character;            $self->{state} = DATA_STATE;
2501              $self->{s_kwd} = '';
2502          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2503          !!!emit ($self->{ct}); # DOCTYPE          } else {
2504              !!!cp (188.1);
2505              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2506            }
2507    
2508            !!!next-input-character;
2509            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2510          redo A;          redo A;
2511        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (189);  
2512          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2513    
2514          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2515          $self->{s_kwd} = '';            !!!cp (189);
2516          ## reconsume            $self->{state} = DATA_STATE;
2517              $self->{s_kwd} = '';
2518          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2519            } else {
2520              !!!cp (189.1);
2521              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2522            }
2523            
2524            ## Reconsume.
2525          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2526          redo A;          redo A;
2527        } else {        } else {
2528          !!!cp (190);          !!!cp (190);
2529          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2530          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
2531                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2532    
# Line 2217  sub _get_next_token ($) { Line 2541  sub _get_next_token ($) {
2541          !!!next-input-character;          !!!next-input-character;
2542          redo A;          redo A;
2543        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (192);  
2544          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2545    
2546          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2547          $self->{s_kwd} = '';            !!!cp (192);
2548          !!!next-input-character;            $self->{state} = DATA_STATE;
2549              $self->{s_kwd} = '';
2550          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2551          !!!emit ($self->{ct}); # DOCTYPE          } else {
2552              !!!cp (192.1);
2553              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2554            }
2555    
2556            !!!next-input-character;
2557            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2558          redo A;          redo A;
2559        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (193);  
2560          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2561    
2562          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2563          $self->{s_kwd} = '';            !!!cp (193);
2564              $self->{state} = DATA_STATE;
2565              $self->{s_kwd} = '';
2566              $self->{ct}->{quirks} = 1;
2567            } else {
2568              !!!cp (193.1);
2569              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2570            }
2571          
2572          ## reconsume          ## reconsume
2573            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2574          redo A;          redo A;
2575        } else {        } else {
2576          !!!cp (194);          !!!cp (194);
2577          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2578          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
2579                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2580    
# Line 2259  sub _get_next_token ($) { Line 2590  sub _get_next_token ($) {
2590          redo A;          redo A;
2591        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
2592          !!!cp (196);          !!!cp (196);
2593          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2594          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2595          !!!next-input-character;          !!!next-input-character;
2596          redo A;          redo A;
2597        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
2598          !!!cp (197);          !!!cp (197);
2599          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2600          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2601          !!!next-input-character;          !!!next-input-character;
2602          redo A;          redo A;
2603        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2604          !!!cp (198);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2605          $self->{state} = DATA_STATE;            if ($self->{is_xml}) {
2606          $self->{s_kwd} = '';              !!!cp (198.1);
2607                !!!parse-error (type => 'no SYSTEM literal');
2608              } else {
2609                !!!cp (198);
2610              }
2611              $self->{state} = DATA_STATE;
2612              $self->{s_kwd} = '';
2613            } else {
2614              if ($self->{ct}->{type} == NOTATION_TOKEN) {
2615                !!!cp (198.2);
2616              } else {
2617                !!!cp (198.3);
2618                !!!parse-error (type => 'no SYSTEM literal');            
2619              }
2620              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2621            }
2622            
2623          !!!next-input-character;          !!!next-input-character;
2624            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2625          redo A;          redo A;
2626        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2627          !!!cp (199);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2628          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (199);
2629              !!!parse-error (type => 'unclosed DOCTYPE');
2630          $self->{state} = DATA_STATE;            
2631          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2632              $self->{s_kwd} = '';
2633              $self->{ct}->{quirks} = 1;
2634            } else {
2635              !!!parse-error (type => 'unclosed md'); ## TODO: type
2636              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2637            }
2638            
2639          ## reconsume          ## reconsume
2640            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2641          $self->{ct}->{quirks} = 1;          redo A;
2642          } elsif ($self->{is_xml} and
2643                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2644                   $self->{nc} == 0x005B) { # [
2645            !!!cp (200.1);
2646            !!!parse-error (type => 'no SYSTEM literal');
2647            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2648            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2649            $self->{in_subset} = 1;
2650            !!!next-input-character;
2651          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2652          redo A;          redo A;
2653        } else {        } else {
         !!!cp (200);  
2654          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
2655    
2656          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2657              !!!cp (200);
2658              $self->{ct}->{quirks} = 1;
2659              $self->{state} = BOGUS_DOCTYPE_STATE;
2660            } else {
2661              !!!cp (200.2);
2662              $self->{state} = BOGUS_MD_STATE;
2663            }
2664    
2665          !!!next-input-character;          !!!next-input-character;
2666          redo A;          redo A;
2667        }        }
# Line 2318  sub _get_next_token ($) { Line 2684  sub _get_next_token ($) {
2684          !!!next-input-character;          !!!next-input-character;
2685          redo A;          redo A;
2686        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (204);  
2687          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
2688          !!!next-input-character;          !!!next-input-character;
2689    
2690          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2691          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (204);
2692              $self->{state} = DATA_STATE;
2693              $self->{s_kwd} = '';
2694              $self->{ct}->{quirks} = 1;
2695            } else {
2696              !!!cp (204.1);
2697              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2698            }
2699    
2700            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2701          redo A;          redo A;
2702        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2703          !!!cp (205);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2704          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (205);
2705              !!!parse-error (type => 'unclosed DOCTYPE');
2706          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2707          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2708              $self->{ct}->{quirks} = 1;
2709            } else {
2710              !!!cp (205.1);
2711              !!!parse-error (type => 'unclosed md'); ## TODO: type
2712              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2713            }
2714            
2715          ## reconsume          ## reconsume
2716            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2717            redo A;
2718          } elsif ($self->{is_xml} and
2719                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2720                   $self->{nc} == 0x005B) { # [
2721            !!!cp (206.1);
2722            !!!parse-error (type => 'no SYSTEM literal');
2723    
2724          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2725            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2726            $self->{in_subset} = 1;
2727            !!!next-input-character;
2728          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2729          redo A;          redo A;
2730        } else {        } else {
         !!!cp (206);  
2731          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
2732    
2733          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2734              !!!cp (206);          
2735              $self->{ct}->{quirks} = 1;
2736              $self->{state} = BOGUS_DOCTYPE_STATE;
2737            } else {
2738              !!!cp (206.2);
2739              $self->{state} = BOGUS_MD_STATE;
2740            }
2741    
2742          !!!next-input-character;          !!!next-input-character;
2743          redo A;          redo A;
2744        }        }
# Line 2355  sub _get_next_token ($) { Line 2748  sub _get_next_token ($) {
2748          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2749          !!!next-input-character;          !!!next-input-character;
2750          redo A;          redo A;
2751        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
         !!!cp (208);  
2752          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2753    
2754          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2755          $self->{s_kwd} = '';            !!!cp (208);
2756              $self->{state} = DATA_STATE;
2757              $self->{s_kwd} = '';
2758              $self->{ct}->{quirks} = 1;
2759            } else {
2760              !!!cp (208.1);
2761              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2762            }
2763            
2764          !!!next-input-character;          !!!next-input-character;
2765            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2766          redo A;          redo A;
2767        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (209);  
2768          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2769    
2770          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2771          $self->{s_kwd} = '';            !!!cp (209);
2772              $self->{state} = DATA_STATE;
2773              $self->{s_kwd} = '';
2774              $self->{ct}->{quirks} = 1;
2775            } else {
2776              !!!cp (209.1);
2777              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2778            }
2779            
2780          ## reconsume          ## reconsume
2781            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2782          redo A;          redo A;
2783        } else {        } else {
2784          !!!cp (210);          !!!cp (210);
2785          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2786          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
2787                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2788    
# Line 2396  sub _get_next_token ($) { Line 2796  sub _get_next_token ($) {
2796          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2797          !!!next-input-character;          !!!next-input-character;
2798          redo A;          redo A;
2799        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2800          !!!cp (212);          !!!cp (212);
2801          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2802    
# Line 2409  sub _get_next_token ($) { Line 2809  sub _get_next_token ($) {
2809    
2810          redo A;          redo A;
2811        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (213);  
2812          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2813    
2814          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2815          $self->{s_kwd} = '';            !!!cp (213);
2816          ## reconsume            $self->{state} = DATA_STATE;
2817              $self->{s_kwd} = '';
2818          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2819          !!!emit ($self->{ct}); # DOCTYPE          } else {
2820              !!!cp (213.1);
2821              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822            }
2823    
2824            ## reconsume
2825            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2826          redo A;          redo A;
2827        } else {        } else {
2828          !!!cp (214);          !!!cp (214);
2829          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2830          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
2831                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2832    
# Line 2433  sub _get_next_token ($) { Line 2836  sub _get_next_token ($) {
2836        }        }
2837      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2838        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2839          !!!cp (215);          if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2840          ## Stay in the state            !!!cp (215.1);
2841              $self->{state} = BEFORE_NDATA_STATE;
2842            } else {
2843              !!!cp (215);
2844              ## Stay in the state
2845            }
2846          !!!next-input-character;          !!!next-input-character;
2847          redo A;          redo A;
2848        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2849          !!!cp (216);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2850          $self->{state} = DATA_STATE;            !!!cp (216);
2851          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2852              $self->{s_kwd} = '';
2853            } else {
2854              !!!cp (216.1);
2855              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856            }
2857    
2858            !!!next-input-character;
2859            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2860            redo A;
2861          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2862                   ($self->{nc} == 0x004E or # N
2863                    $self->{nc} == 0x006E)) { # n
2864            !!!cp (216.2);
2865            !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2866            $self->{state} = NDATA_STATE;
2867            $self->{kwd} = chr $self->{nc};
2868          !!!next-input-character;          !!!next-input-character;
2869            redo A;
2870          } elsif ($self->{nc} == -1) {
2871            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2872              !!!cp (217);
2873              !!!parse-error (type => 'unclosed DOCTYPE');
2874              $self->{state} = DATA_STATE;
2875              $self->{s_kwd} = '';
2876              $self->{ct}->{quirks} = 1;
2877            } else {
2878              !!!cp (217.1);
2879              !!!parse-error (type => 'unclosed md'); ## TODO: type
2880              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2881            }
2882    
2883            ## reconsume
2884            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2885            redo A;
2886          } elsif ($self->{is_xml} and
2887                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2888                   $self->{nc} == 0x005B) { # [
2889            !!!cp (218.1);
2890            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2891            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2892            $self->{in_subset} = 1;
2893            !!!next-input-character;
2894          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2895            redo A;
2896          } else {
2897            !!!parse-error (type => 'string after SYSTEM literal');
2898    
2899            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2900              !!!cp (218);
2901              #$self->{ct}->{quirks} = 1;
2902              $self->{state} = BOGUS_DOCTYPE_STATE;
2903            } else {
2904              !!!cp (218.2);
2905              $self->{state} = BOGUS_MD_STATE;
2906            }
2907    
2908            !!!next-input-character;
2909            redo A;
2910          }
2911        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2912          if ($is_space->{$self->{nc}}) {
2913            !!!cp (218.3);
2914            ## Stay in the state.
2915            !!!next-input-character;
2916            redo A;
2917          } elsif ($self->{nc} == 0x003E) { # >
2918            !!!cp (218.4);
2919            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920            !!!next-input-character;
2921            !!!emit ($self->{ct}); # ENTITY
2922            redo A;
2923          } elsif ($self->{nc} == 0x004E or # N
2924                   $self->{nc} == 0x006E) { # n
2925            !!!cp (218.5);
2926            $self->{state} = NDATA_STATE;
2927            $self->{kwd} = chr $self->{nc};
2928            !!!next-input-character;
2929          redo A;          redo A;
2930        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2931          !!!cp (217);          !!!cp (218.6);
2932          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed md'); ## TODO: type
2933          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
         $self->{s_kwd} = '';  
2934          ## reconsume          ## reconsume
2935            !!!emit ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2936          redo A;          redo A;
2937        } else {        } else {
2938          !!!cp (218);          !!!cp (218.7);
2939          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
2940          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
2941          !!!next-input-character;          !!!next-input-character;
2942          redo A;          redo A;
2943        }        }
# Line 2476  sub _get_next_token ($) { Line 2951  sub _get_next_token ($) {
2951          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2952    
2953          redo A;          redo A;
2954          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2955            !!!cp (220.1);
2956            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2957            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2958            $self->{in_subset} = 1;
2959            !!!next-input-character;
2960            !!!emit ($self->{ct}); # DOCTYPE
2961            redo A;
2962        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2963          !!!cp (220);          !!!cp (220);
2964          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 2488  sub _get_next_token ($) { Line 2971  sub _get_next_token ($) {
2971        } else {        } else {
2972          !!!cp (221);          !!!cp (221);
2973          my $s = '';          my $s = '';
2974          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
2975    
2976          ## Stay in the state          ## Stay in the state
2977          !!!next-input-character;          !!!next-input-character;
# Line 2596  sub _get_next_token ($) { Line 3079  sub _get_next_token ($) {
3079        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
3080          !!!cp (999);          !!!cp (999);
3081          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
3082          $self->{s_kwd} = '#';          $self->{kwd} = '#';
3083          !!!next-input-character;          !!!next-input-character;
3084          redo A;          redo A;
3085        } elsif ((0x0041 <= $self->{nc} and        } elsif ((0x0041 <= $self->{nc} and
# Line 2606  sub _get_next_token ($) { Line 3089  sub _get_next_token ($) {
3089          !!!cp (998);          !!!cp (998);
3090          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
3091          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
3092          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3093          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
3094          $self->{entity__match} = 0;          $self->{entity__match} = 0;
3095          !!!next-input-character;          !!!next-input-character;
3096          redo A;          redo A;
# Line 2647  sub _get_next_token ($) { Line 3130  sub _get_next_token ($) {
3130            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
3131          !!!cp (995);          !!!cp (995);
3132          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
3133          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3134          !!!next-input-character;          !!!next-input-character;
3135          redo A;          redo A;
3136        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
3137                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
3138          !!!cp (994);          !!!cp (994);
3139          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
3140          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
3141          !!!next-input-character;          !!!next-input-character;
3142          redo A;          redo A;
3143        } else {        } else {
# Line 2690  sub _get_next_token ($) { Line 3173  sub _get_next_token ($) {
3173        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
3174            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
3175          !!!cp (1012);          !!!cp (1012);
3176          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
3177          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
3178                    
3179          ## Stay in the state.          ## Stay in the state.
3180          !!!next-input-character;          !!!next-input-character;
# Line 2707  sub _get_next_token ($) { Line 3190  sub _get_next_token ($) {
3190          #          #
3191        }        }
3192    
3193        my $code = $self->{s_kwd};        my $code = $self->{kwd};
3194        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3195        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3196        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2750  sub _get_next_token ($) { Line 3233  sub _get_next_token ($) {
3233          # 0..9, A..F, a..f          # 0..9, A..F, a..f
3234          !!!cp (990);          !!!cp (990);
3235          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
3236          $self->{s_kwd} = 0;          $self->{kwd} = 0;
3237          ## Reconsume.          ## Reconsume.
3238          redo A;          redo A;
3239        } else {        } else {
# Line 2768  sub _get_next_token ($) { Line 3251  sub _get_next_token ($) {
3251            $self->{s_kwd} = '';            $self->{s_kwd} = '';
3252            ## Reconsume.            ## Reconsume.
3253            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
3254                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
3255                      line => $self->{line_prev},                      line => $self->{line_prev},
3256                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
3257                     });                     });
3258            redo A;            redo A;
3259          } else {          } else {
3260            !!!cp (989);            !!!cp (989);
3261            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
3262            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
3263            $self->{s_kwd} = '';            $self->{s_kwd} = '';
3264            ## Reconsume.            ## Reconsume.
# Line 2786  sub _get_next_token ($) { Line 3269  sub _get_next_token ($) {
3269        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3270          # 0..9          # 0..9
3271          !!!cp (1002);          !!!cp (1002);
3272          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3273          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
3274          ## Stay in the state.          ## Stay in the state.
3275          !!!next-input-character;          !!!next-input-character;
3276          redo A;          redo A;
3277        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
3278                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
3279          !!!cp (1003);          !!!cp (1003);
3280          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3281          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
3282          ## Stay in the state.          ## Stay in the state.
3283          !!!next-input-character;          !!!next-input-character;
3284          redo A;          redo A;
3285        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
3286                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
3287          !!!cp (1004);          !!!cp (1004);
3288          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3289          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
3290          ## Stay in the state.          ## Stay in the state.
3291          !!!next-input-character;          !!!next-input-character;
3292          redo A;          redo A;
# Line 2820  sub _get_next_token ($) { Line 3303  sub _get_next_token ($) {
3303          #          #
3304        }        }
3305    
3306        my $code = $self->{s_kwd};        my $code = $self->{kwd};
3307        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3308        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3309        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2857  sub _get_next_token ($) { Line 3340  sub _get_next_token ($) {
3340          redo A;          redo A;
3341        }        }
3342      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3343        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
3344            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
3345            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
3346              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 2867  sub _get_next_token ($) { Line 3350  sub _get_next_token ($) {
3350              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
3351             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
3352          our $EntityChar;          our $EntityChar;
3353          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3354          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
3355            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3356              !!!cp (1020);              !!!cp (1020);
3357              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3358              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3359              !!!next-input-character;              !!!next-input-character;
3360              #              #
3361            } else {            } else {
3362              !!!cp (1021);              !!!cp (1021);
3363              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3364              $self->{entity__match} = -1;              $self->{entity__match} = -1;
3365              ## Stay in the state.              ## Stay in the state.
3366              !!!next-input-character;              !!!next-input-character;
# Line 2905  sub _get_next_token ($) { Line 3388  sub _get_next_token ($) {
3388          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
3389              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
3390            !!!cp (1024);            !!!cp (1024);
3391            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
3392            #            #
3393          } else {          } else {
3394            !!!cp (1025);            !!!cp (1025);
# Line 2917  sub _get_next_token ($) { Line 3400  sub _get_next_token ($) {
3400          !!!cp (1026);          !!!cp (1026);
3401          !!!parse-error (type => 'bare ero',          !!!parse-error (type => 'bare ero',
3402                          line => $self->{line_prev},                          line => $self->{line_prev},
3403                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
3404          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
3405          #          #
3406        }        }
3407        
# Line 2941  sub _get_next_token ($) { Line 3424  sub _get_next_token ($) {
3424                    data => $data,                    data => $data,
3425                    has_reference => $has_ref,                    has_reference => $has_ref,
3426                    line => $self->{line_prev},                    line => $self->{line_prev},
3427                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
3428                   });                   });
3429          redo A;          redo A;
3430        } else {        } else {
# Line 2957  sub _get_next_token ($) { Line 3440  sub _get_next_token ($) {
3440      ## XML-only states      ## XML-only states
3441    
3442      } elsif ($self->{state} == PI_STATE) {      } elsif ($self->{state} == PI_STATE) {
3443          ## XML5: "Pi state" and "DOCTYPE pi state".
3444    
3445        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
3446            $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"            $self->{nc} == 0x003F or # ?
3447            $self->{nc} == -1) {            $self->{nc} == -1) {
3448            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3449            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
3450            ## "DOCTYPE pi state": Parse error, switch to the "data
3451            ## state".
3452          !!!parse-error (type => 'bare pio', ## TODO: type          !!!parse-error (type => 'bare pio', ## TODO: type
3453                          line => $self->{line_prev},                          line => $self->{line_prev},
3454                          column => $self->{column_prev}                          column => $self->{column_prev}
# Line 2974  sub _get_next_token ($) { Line 3463  sub _get_next_token ($) {
3463                        };                        };
3464          redo A;          redo A;
3465        } else {        } else {
3466            ## XML5: "DOCTYPE pi state": Stay in the state.
3467          $self->{ct} = {type => PI_TOKEN,          $self->{ct} = {type => PI_TOKEN,
3468                         target => chr $self->{nc},                         target => chr $self->{nc},
3469                         data => '',                         data => '',
# Line 2991  sub _get_next_token ($) { Line 3481  sub _get_next_token ($) {
3481          redo A;          redo A;
3482        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3483          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3484          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3485          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3486            } else {
3487              $self->{state} = DATA_STATE;
3488              $self->{s_kwd} = '';
3489            }
3490          ## Reconsume.          ## Reconsume.
3491          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3492          redo A;          redo A;
# Line 3023  sub _get_next_token ($) { Line 3517  sub _get_next_token ($) {
3517          redo A;          redo A;
3518        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3519          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3520          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3521          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3522            } else {
3523              $self->{state} = DATA_STATE;
3524              $self->{s_kwd} = '';
3525            }
3526          ## Reprocess.          ## Reprocess.
3527          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3528          redo A;          redo A;
# Line 3038  sub _get_next_token ($) { Line 3536  sub _get_next_token ($) {
3536          redo A;          redo A;
3537        }        }
3538      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
3539          ## XML5: Part of "Pi after state".
3540    
3541        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3542          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3543          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3544            } else {
3545              $self->{state} = DATA_STATE;
3546              $self->{s_kwd} = '';
3547            }
3548          !!!next-input-character;          !!!next-input-character;
3549          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3550          redo A;          redo A;
# Line 3063  sub _get_next_token ($) { Line 3567  sub _get_next_token ($) {
3567          redo A;          redo A;
3568        }        }
3569      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3570        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3571    
3572        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3573          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3574          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3575            } else {
3576              $self->{state} = DATA_STATE;
3577              $self->{s_kwd} = '';
3578            }
3579          !!!next-input-character;          !!!next-input-character;
3580          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3581          redo A;          redo A;
# Line 3081  sub _get_next_token ($) { Line 3590  sub _get_next_token ($) {
3590          ## Reprocess.          ## Reprocess.
3591          redo A;          redo A;
3592        }        }
3593            
3594        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3595          if ($self->{nc} == 0x003C) { # <
3596            $self->{state} = DOCTYPE_TAG_STATE;
3597            !!!next-input-character;
3598            redo A;
3599          } elsif ($self->{nc} == 0x0025) { # %
3600            ## XML5: Not defined yet.
3601    
3602            ## TODO:
3603            !!!next-input-character;
3604            redo A;
3605          } elsif ($self->{nc} == 0x005D) { # ]
3606            delete $self->{in_subset};
3607            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3608            !!!next-input-character;
3609            redo A;
3610          } elsif ($is_space->{$self->{nc}}) {
3611            ## Stay in the state.
3612            !!!next-input-character;
3613            redo A;
3614          } elsif ($self->{nc} == -1) {
3615            !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3616            delete $self->{in_subset};
3617            $self->{state} = DATA_STATE;
3618            $self->{s_kwd} = '';
3619            ## Reconsume.
3620            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3621            redo A;
3622          } else {
3623            unless ($self->{internal_subset_tainted}) {
3624              ## XML5: No parse error.
3625              !!!parse-error (type => 'string in internal subset');
3626              $self->{internal_subset_tainted} = 1;
3627            }
3628            ## Stay in the state.
3629            !!!next-input-character;
3630            redo A;
3631          }
3632        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3633          if ($self->{nc} == 0x003E) { # >
3634            $self->{state} = DATA_STATE;
3635            $self->{s_kwd} = '';
3636            !!!next-input-character;
3637            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3638            redo A;
3639          } elsif ($self->{nc} == -1) {
3640            !!!parse-error (type => 'unclosed DOCTYPE');
3641            $self->{state} = DATA_STATE;
3642            $self->{s_kwd} = '';
3643            ## Reconsume.
3644            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3645            redo A;
3646          } else {
3647            ## XML5: No parse error and stay in the state.
3648            !!!parse-error (type => 'string after internal subset'); ## TODO: type
3649    
3650            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3651            !!!next-input-character;
3652            redo A;
3653          }
3654        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3655          if ($self->{nc} == 0x003E) { # >
3656            $self->{state} = DATA_STATE;
3657            $self->{s_kwd} = '';
3658            !!!next-input-character;
3659            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3660            redo A;
3661          } elsif ($self->{nc} == -1) {
3662            $self->{state} = DATA_STATE;
3663            $self->{s_kwd} = '';
3664            ## Reconsume.
3665            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3666            redo A;
3667          } else {
3668            ## Stay in the state.
3669            !!!next-input-character;
3670            redo A;
3671          }
3672        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3673          if ($self->{nc} == 0x0021) { # !
3674            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3675            !!!next-input-character;
3676            redo A;
3677          } elsif ($self->{nc} == 0x003F) { # ?
3678            $self->{state} = PI_STATE;
3679            !!!next-input-character;
3680            redo A;
3681          } elsif ($self->{nc} == -1) {
3682            !!!parse-error (type => 'bare stago');
3683            $self->{state} = DATA_STATE;
3684            $self->{s_kwd} = '';
3685            ## Reconsume.
3686            redo A;
3687          } else {
3688            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3689                            line => $self->{line_prev},
3690                            column => $self->{column_prev});
3691            $self->{state} = BOGUS_COMMENT_STATE;
3692            $self->{ct} = {type => COMMENT_TOKEN,
3693                           data => '',
3694                          }; ## NOTE: Will be discarded.
3695            !!!next-input-character;
3696            redo A;
3697          }
3698        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3699          ## XML5: "DOCTYPE markup declaration state".
3700          
3701          if ($self->{nc} == 0x002D) { # -
3702            $self->{state} = MD_HYPHEN_STATE;
3703            !!!next-input-character;
3704            redo A;
3705          } elsif ($self->{nc} == 0x0045 or # E
3706                   $self->{nc} == 0x0065) { # e
3707            $self->{state} = MD_E_STATE;
3708            $self->{kwd} = chr $self->{nc};
3709            !!!next-input-character;
3710            redo A;
3711          } elsif ($self->{nc} == 0x0041 or # A
3712                   $self->{nc} == 0x0061) { # a
3713            $self->{state} = MD_ATTLIST_STATE;
3714            $self->{kwd} = chr $self->{nc};
3715            !!!next-input-character;
3716            redo A;
3717          } elsif ($self->{nc} == 0x004E or # N
3718                   $self->{nc} == 0x006E) { # n
3719            $self->{state} = MD_NOTATION_STATE;
3720            $self->{kwd} = chr $self->{nc};
3721            !!!next-input-character;
3722            redo A;
3723          } else {
3724            #
3725          }
3726          
3727          ## XML5: No parse error.
3728          !!!parse-error (type => 'bogus comment',
3729                          line => $self->{line_prev},
3730                          column => $self->{column_prev} - 1);
3731          ## Reconsume.
3732          $self->{state} = BOGUS_COMMENT_STATE;
3733          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3734          redo A;
3735        } elsif ($self->{state} == MD_E_STATE) {
3736          if ($self->{nc} == 0x004E or # N
3737              $self->{nc} == 0x006E) { # n
3738            $self->{state} = MD_ENTITY_STATE;
3739            $self->{kwd} .= chr $self->{nc};
3740            !!!next-input-character;
3741            redo A;
3742          } elsif ($self->{nc} == 0x004C or # L
3743                   $self->{nc} == 0x006C) { # l
3744            ## XML5: <!ELEMENT> not supported.
3745            $self->{state} = MD_ELEMENT_STATE;
3746            $self->{kwd} .= chr $self->{nc};
3747            !!!next-input-character;
3748            redo A;
3749          } else {
3750            ## XML5: No parse error.
3751            !!!parse-error (type => 'bogus comment',
3752                            line => $self->{line_prev},
3753                            column => $self->{column_prev} - 2
3754                                + 1 * ($self->{nc} == -1));
3755            ## Reconsume.
3756            $self->{state} = BOGUS_COMMENT_STATE;
3757            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3758            redo A;
3759          }
3760        } elsif ($self->{state} == MD_ENTITY_STATE) {
3761          if ($self->{nc} == [
3762                undef,
3763                undef,
3764                0x0054, # T
3765                0x0049, # I
3766                0x0054, # T
3767              ]->[length $self->{kwd}] or
3768              $self->{nc} == [
3769                undef,
3770                undef,
3771                0x0074, # t
3772                0x0069, # i
3773                0x0074, # t
3774              ]->[length $self->{kwd}]) {
3775            ## Stay in the state.
3776            $self->{kwd} .= chr $self->{nc};
3777            !!!next-input-character;
3778            redo A;
3779          } elsif ((length $self->{kwd}) == 5 and
3780                   ($self->{nc} == 0x0059 or # Y
3781                    $self->{nc} == 0x0079)) { # y
3782            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3783              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3784                              text => 'ENTITY',
3785                              line => $self->{line_prev},
3786                              column => $self->{column_prev} - 4);
3787            }
3788            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3789                           line => $self->{line_prev},
3790                           column => $self->{column_prev} - 6};
3791            $self->{state} = DOCTYPE_MD_STATE;
3792            !!!next-input-character;
3793            redo A;
3794          } else {
3795            !!!parse-error (type => 'bogus comment',
3796                            line => $self->{line_prev},
3797                            column => $self->{column_prev} - 1
3798                                - (length $self->{kwd})
3799                                + 1 * ($self->{nc} == -1));
3800            $self->{state} = BOGUS_COMMENT_STATE;
3801            ## Reconsume.
3802            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3803            redo A;
3804          }
3805        } elsif ($self->{state} == MD_ELEMENT_STATE) {
3806          if ($self->{nc} == [
3807               undef,
3808               undef,
3809               0x0045, # E
3810               0x004D, # M
3811               0x0045, # E
3812               0x004E, # N
3813              ]->[length $self->{kwd}] or
3814              $self->{nc} == [
3815               undef,
3816               undef,
3817               0x0065, # e
3818               0x006D, # m
3819               0x0065, # e
3820               0x006E, # n
3821              ]->[length $self->{kwd}]) {
3822            ## Stay in the state.
3823            $self->{kwd} .= chr $self->{nc};
3824            !!!next-input-character;
3825            redo A;
3826          } elsif ((length $self->{kwd}) == 6 and
3827                   ($self->{nc} == 0x0054 or # T
3828                    $self->{nc} == 0x0074)) { # t
3829            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3830              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3831                              text => 'ELEMENT',
3832                              line => $self->{line_prev},
3833                              column => $self->{column_prev} - 5);
3834            }
3835            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3836                           line => $self->{line_prev},
3837                           column => $self->{column_prev} - 6};
3838            $self->{state} = DOCTYPE_MD_STATE;
3839            !!!next-input-character;
3840            redo A;
3841          } else {
3842            !!!parse-error (type => 'bogus comment',
3843                            line => $self->{line_prev},
3844                            column => $self->{column_prev} - 1
3845                                - (length $self->{kwd})
3846                                + 1 * ($self->{nc} == -1));
3847            $self->{state} = BOGUS_COMMENT_STATE;
3848            ## Reconsume.
3849            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3850            redo A;
3851          }
3852        } elsif ($self->{state} == MD_ATTLIST_STATE) {
3853          if ($self->{nc} == [
3854               undef,
3855               0x0054, # T
3856               0x0054, # T
3857               0x004C, # L
3858               0x0049, # I
3859               0x0053, # S
3860              ]->[length $self->{kwd}] or
3861              $self->{nc} == [
3862               undef,
3863               0x0074, # t
3864               0x0074, # t
3865               0x006C, # l
3866               0x0069, # i
3867               0x0073, # s
3868              ]->[length $self->{kwd}]) {
3869            ## Stay in the state.
3870            $self->{kwd} .= chr $self->{nc};
3871            !!!next-input-character;
3872            redo A;
3873          } elsif ((length $self->{kwd}) == 6 and
3874                   ($self->{nc} == 0x0054 or # T
3875                    $self->{nc} == 0x0074)) { # t
3876            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3877              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3878                              text => 'ATTLIST',
3879                              line => $self->{line_prev},
3880                              column => $self->{column_prev} - 5);
3881            }
3882            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3883                           attrdefs => [],
3884                           line => $self->{line_prev},
3885                           column => $self->{column_prev} - 6};
3886            $self->{state} = DOCTYPE_MD_STATE;
3887            !!!next-input-character;
3888            redo A;
3889          } else {
3890            !!!parse-error (type => 'bogus comment',
3891                            line => $self->{line_prev},
3892                            column => $self->{column_prev} - 1
3893                                 - (length $self->{kwd})
3894                                 + 1 * ($self->{nc} == -1));
3895            $self->{state} = BOGUS_COMMENT_STATE;
3896            ## Reconsume.
3897            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3898            redo A;
3899          }
3900        } elsif ($self->{state} == MD_NOTATION_STATE) {
3901          if ($self->{nc} == [
3902               undef,
3903               0x004F, # O
3904               0x0054, # T
3905               0x0041, # A
3906               0x0054, # T
3907               0x0049, # I
3908               0x004F, # O
3909              ]->[length $self->{kwd}] or
3910              $self->{nc} == [
3911               undef,
3912               0x006F, # o
3913               0x0074, # t
3914               0x0061, # a
3915               0x0074, # t
3916               0x0069, # i
3917               0x006F, # o
3918              ]->[length $self->{kwd}]) {
3919            ## Stay in the state.
3920            $self->{kwd} .= chr $self->{nc};
3921            !!!next-input-character;
3922            redo A;
3923          } elsif ((length $self->{kwd}) == 7 and
3924                   ($self->{nc} == 0x004E or # N
3925                    $self->{nc} == 0x006E)) { # n
3926            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3927              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3928                              text => 'NOTATION',
3929                              line => $self->{line_prev},
3930                              column => $self->{column_prev} - 6);
3931            }
3932            $self->{ct} = {type => NOTATION_TOKEN, name => '',
3933                           line => $self->{line_prev},
3934                           column => $self->{column_prev} - 6};
3935            $self->{state} = DOCTYPE_MD_STATE;
3936            !!!next-input-character;
3937            redo A;
3938          } else {
3939            !!!parse-error (type => 'bogus comment',
3940                            line => $self->{line_prev},
3941                            column => $self->{column_prev} - 1
3942                                - (length $self->{kwd})
3943                                + 1 * ($self->{nc} == -1));
3944            $self->{state} = BOGUS_COMMENT_STATE;
3945            ## Reconsume.
3946            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3947            redo A;
3948          }
3949        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3950          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3951          ## "DOCTYPE NOTATION state".
3952    
3953          if ($is_space->{$self->{nc}}) {
3954            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3955            $self->{state} = BEFORE_MD_NAME_STATE;
3956            !!!next-input-character;
3957            redo A;
3958          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3959                   $self->{nc} == 0x0025) { # %
3960            ## XML5: Switch to the "DOCTYPE bogus comment state".
3961            !!!parse-error (type => 'no space before md name'); ## TODO: type
3962            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3963            !!!next-input-character;
3964            redo A;
3965          } elsif ($self->{nc} == -1) {
3966            !!!parse-error (type => 'unclosed md'); ## TODO: type
3967            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3968            ## Reconsume.
3969            redo A;
3970          } elsif ($self->{nc} == 0x003E) { # >
3971            ## XML5: Switch to the "DOCTYPE bogus comment state".
3972            !!!parse-error (type => 'no md name'); ## TODO: type
3973            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3974            !!!next-input-character;
3975            redo A;
3976          } else {
3977            ## XML5: Switch to the "DOCTYPE bogus comment state".
3978            !!!parse-error (type => 'no space before md name'); ## TODO: type
3979            $self->{state} = BEFORE_MD_NAME_STATE;
3980            redo A;
3981          }
3982        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3983          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3984          ## before state", "DOCTYPE ATTLIST name before state".
3985    
3986          if ($is_space->{$self->{nc}}) {
3987            ## Stay in the state.
3988            !!!next-input-character;
3989            redo A;
3990          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3991                   $self->{nc} == 0x0025) { # %
3992            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3993            !!!next-input-character;
3994            redo A;
3995          } elsif ($self->{nc} == 0x003E) { # >
3996            ## XML5: Same as "Anything else".
3997            !!!parse-error (type => 'no md name'); ## TODO: type
3998            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3999            !!!next-input-character;
4000            redo A;
4001          } elsif ($self->{nc} == -1) {
4002            !!!parse-error (type => 'unclosed md'); ## TODO: type
4003            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4004            ## Reconsume.
4005            redo A;
4006          } else {
4007            ## XML5: [ATTLIST] Not defined yet.
4008            $self->{ct}->{name} .= chr $self->{nc};
4009            $self->{state} = MD_NAME_STATE;
4010            !!!next-input-character;
4011            redo A;
4012          }
4013        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4014          if ($is_space->{$self->{nc}}) {
4015            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4016            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4017            $self->{state} = BEFORE_MD_NAME_STATE;
4018            !!!next-input-character;
4019            redo A;
4020          } elsif ($self->{nc} == 0x003E) { # >
4021            ## XML5: Same as "Anything else".
4022            !!!parse-error (type => 'no md name'); ## TODO: type
4023            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4024            !!!next-input-character;
4025            redo A;
4026          } elsif ($self->{nc} == -1) {
4027            !!!parse-error (type => 'unclosed md');
4028            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4029            ## Reconsume.
4030            redo A;
4031          } else {
4032            ## XML5: No parse error.
4033            !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4034            $self->{state} = BOGUS_COMMENT_STATE;
4035            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4036            ## Reconsume.
4037            redo A;
4038          }
4039        } elsif ($self->{state} == MD_NAME_STATE) {
4040          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4041          
4042          if ($is_space->{$self->{nc}}) {
4043            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4044              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4045            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4046              ## TODO: ...
4047              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4048            } else { # ENTITY/NOTATION
4049              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4050            }
4051            !!!next-input-character;
4052            redo A;
4053          } elsif ($self->{nc} == 0x003E) { # >
4054            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4055              #
4056            } else {
4057              !!!parse-error (type => 'no md def'); ## TODO: type
4058            }
4059            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4060            !!!next-input-character;
4061            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4062            redo A;
4063          } elsif ($self->{nc} == -1) {
4064            ## XML5: [ATTLIST] No parse error.
4065            !!!parse-error (type => 'unclosed md');
4066            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4067            ## Reconsume.
4068            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4069            redo A;
4070          } else {
4071            ## XML5: [ATTLIST] Not defined yet.
4072            $self->{ct}->{name} .= chr $self->{nc};
4073            ## Stay in the state.
4074            !!!next-input-character;
4075            redo A;
4076          }
4077        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4078          if ($is_space->{$self->{nc}}) {
4079            ## Stay in the state.
4080            !!!next-input-character;
4081            redo A;
4082          } elsif ($self->{nc} == 0x003E) { # >
4083            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4084            !!!next-input-character;
4085            !!!emit ($self->{ct}); # ATTLIST
4086            redo A;
4087          } elsif ($self->{nc} == -1) {
4088            ## XML5: No parse error.
4089            !!!parse-error (type => 'unclosed md'); ## TODO: type
4090            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4091            !!!emit ($self->{ct});
4092            redo A;
4093          } else {
4094            ## XML5: Not defined yet.
4095            $self->{ca} = {name => chr ($self->{nc}), # attrdef
4096                           tokens => [],
4097                           line => $self->{line}, column => $self->{column}};
4098            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4099            !!!next-input-character;
4100            redo A;
4101          }
4102        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4103          if ($is_space->{$self->{nc}}) {
4104            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4105            !!!next-input-character;
4106            redo A;
4107          } elsif ($self->{nc} == 0x003E) { # >
4108            ## XML5: Same as "anything else".
4109            !!!parse-error (type => 'no attr type'); ## TODO: type
4110            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4111            !!!next-input-character;
4112            !!!emit ($self->{ct}); # ATTLIST
4113            redo A;
4114          } elsif ($self->{nc} == 0x0028) { # (
4115            ## XML5: Same as "anything else".
4116            !!!parse-error (type => 'no space before paren'); ## TODO: type
4117            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4118            !!!next-input-character;
4119            redo A;
4120          } elsif ($self->{nc} == -1) {
4121            ## XML5: No parse error.
4122            !!!parse-error (type => 'unclosed md'); ## TODO: type
4123            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4124            !!!next-input-character;
4125            !!!emit ($self->{ct}); # ATTLIST
4126            redo A;
4127          } else {
4128            ## XML5: Not defined yet.
4129            $self->{ca}->{name} .= chr $self->{nc};
4130            ## Stay in the state.
4131            !!!next-input-character;
4132            redo A;
4133          }
4134        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4135          if ($is_space->{$self->{nc}}) {
4136            ## Stay in the state.
4137            !!!next-input-character;
4138            redo A;
4139          } elsif ($self->{nc} == 0x003E) { # >
4140            ## XML5: Same as "anything else".
4141            !!!parse-error (type => 'no attr type'); ## TODO: type
4142            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4143            !!!next-input-character;
4144            !!!emit ($self->{ct}); # ATTLIST
4145            redo A;
4146          } elsif ($self->{nc} == 0x0028) { # (
4147            ## XML5: Same as "anything else".
4148            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4149            !!!next-input-character;
4150            redo A;
4151          } elsif ($self->{nc} == -1) {
4152            ## XML5: No parse error.
4153            !!!parse-error (type => 'unclosed md'); ## TODO: type
4154            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4155            !!!next-input-character;
4156            !!!emit ($self->{ct});
4157            redo A;
4158          } else {
4159            ## XML5: Not defined yet.
4160            $self->{ca}->{type} = chr $self->{nc};
4161            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4162            !!!next-input-character;
4163            redo A;
4164          }
4165        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4166          if ($is_space->{$self->{nc}}) {
4167            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4168            !!!next-input-character;
4169            redo A;
4170          } elsif ($self->{nc} == 0x0023) { # #
4171            ## XML5: Same as "anything else".
4172            !!!parse-error (type => 'no space before default value'); ## TODO: type
4173            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4174            !!!next-input-character;
4175            redo A;
4176          } elsif ($self->{nc} == 0x0022) { # "
4177            ## XML5: Same as "anything else".
4178            !!!parse-error (type => 'no space before default value'); ## TODO: type
4179            $self->{ca}->{value} = '';
4180            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4181            !!!next-input-character;
4182            redo A;
4183          } elsif ($self->{nc} == 0x0027) { # '
4184            ## XML5: Same as "anything else".
4185            !!!parse-error (type => 'no space before default value'); ## TODO: type
4186            $self->{ca}->{value} = '';
4187            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4188            !!!next-input-character;
4189            redo A;
4190          } elsif ($self->{nc} == 0x003E) { # >
4191            ## XML5: Same as "anything else".
4192            !!!parse-error (type => 'no attr default'); ## TODO: type
4193            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4194            !!!next-input-character;
4195            !!!emit ($self->{ct}); # ATTLIST
4196            redo A;
4197          } elsif ($self->{nc} == 0x0028) { # (
4198            ## XML5: Same as "anything else".
4199            !!!parse-error (type => 'no space before paren'); ## TODO: type
4200            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4201            !!!next-input-character;
4202            redo A;
4203          } elsif ($self->{nc} == -1) {
4204            ## XML5: No parse error.
4205            !!!parse-error (type => 'unclosed md'); ## TODO: type
4206            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4207            !!!next-input-character;
4208            !!!emit ($self->{ct});
4209            redo A;
4210          } else {
4211            ## XML5: Not defined yet.
4212            $self->{ca}->{type} .= chr $self->{nc};
4213            ## Stay in the state.
4214            !!!next-input-character;
4215            redo A;
4216          }
4217        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4218          if ($is_space->{$self->{nc}}) {
4219            ## Stay in the state.
4220            !!!next-input-character;
4221            redo A;
4222          } elsif ($self->{nc} == 0x0028) { # (
4223            ## XML5: Same as "anything else".
4224            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4225            !!!next-input-character;
4226            redo A;
4227          } elsif ($self->{nc} == 0x0023) { # #
4228            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4229            !!!next-input-character;
4230            redo A;
4231          } elsif ($self->{nc} == 0x0022) { # "
4232            ## XML5: Same as "anything else".
4233            $self->{ca}->{value} = '';
4234            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4235            !!!next-input-character;
4236            redo A;
4237          } elsif ($self->{nc} == 0x0027) { # '
4238            ## XML5: Same as "anything else".
4239            $self->{ca}->{value} = '';
4240            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4241            !!!next-input-character;
4242            redo A;
4243          } elsif ($self->{nc} == 0x003E) { # >
4244            ## XML5: Same as "anything else".
4245            !!!parse-error (type => 'no attr default'); ## TODO: type
4246            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247            !!!next-input-character;
4248            !!!emit ($self->{ct}); # ATTLIST
4249            redo A;
4250          } elsif ($self->{nc} == -1) {
4251            ## XML5: No parse error.
4252            !!!parse-error (type => 'unclosed md'); ## TODO: type
4253            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4254            !!!next-input-character;
4255            !!!emit ($self->{ct});
4256            redo A;
4257          } else {
4258            ## XML5: Switch to the "DOCTYPE bogus comment state".
4259            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4260            $self->{ca}->{value} = '';
4261            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4262            ## Reconsume.
4263            redo A;
4264          }
4265        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4266          if ($is_space->{$self->{nc}}) {
4267            ## Stay in the state.
4268            !!!next-input-character;
4269            redo A;
4270          } elsif ($self->{nc} == 0x007C) { # |
4271            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4272            ## Stay in the state.
4273            !!!next-input-character;
4274            redo A;
4275          } elsif ($self->{nc} == 0x0029) { # )
4276            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4277            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4278            !!!next-input-character;
4279            redo A;
4280          } elsif ($self->{nc} == 0x003E) { # >
4281            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4282            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4283            !!!next-input-character;
4284            !!!emit ($self->{ct}); # ATTLIST
4285            redo A;
4286          } elsif ($self->{nc} == -1) {
4287            ## XML5: No parse error.
4288            !!!parse-error (type => 'unclosed md'); ## TODO: type
4289            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4290            !!!next-input-character;
4291            !!!emit ($self->{ct});
4292            redo A;
4293          } else {
4294            push @{$self->{ca}->{tokens}}, chr $self->{nc};
4295            $self->{state} = ALLOWED_TOKEN_STATE;
4296            !!!next-input-character;
4297            redo A;
4298          }
4299        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4300          if ($is_space->{$self->{nc}}) {
4301            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4302            !!!next-input-character;
4303            redo A;
4304          } elsif ($self->{nc} == 0x007C) { # |
4305            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4306            !!!next-input-character;
4307            redo A;
4308          } elsif ($self->{nc} == 0x0029) { # )
4309            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4310            !!!next-input-character;
4311            redo A;
4312          } elsif ($self->{nc} == 0x003E) { # >
4313            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4314            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4315            !!!next-input-character;
4316            !!!emit ($self->{ct}); # ATTLIST
4317            redo A;
4318          } elsif ($self->{nc} == -1) {
4319            ## XML5: No parse error.
4320            !!!parse-error (type => 'unclosed md'); ## TODO: type
4321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4322            !!!next-input-character;
4323            !!!emit ($self->{ct});
4324            redo A;
4325          } else {
4326            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4327            ## Stay in the state.
4328            !!!next-input-character;
4329            redo A;
4330          }
4331        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4332          if ($is_space->{$self->{nc}}) {
4333            ## Stay in the state.
4334            !!!next-input-character;
4335            redo A;
4336          } elsif ($self->{nc} == 0x007C) { # |
4337            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4338            !!!next-input-character;
4339            redo A;
4340          } elsif ($self->{nc} == 0x0029) { # )
4341            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4342            !!!next-input-character;
4343            redo A;
4344          } elsif ($self->{nc} == 0x003E) { # >
4345            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4346            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4347            !!!next-input-character;
4348            !!!emit ($self->{ct}); # ATTLIST
4349            redo A;
4350          } elsif ($self->{nc} == -1) {
4351            ## XML5: No parse error.
4352            !!!parse-error (type => 'unclosed md'); ## TODO: type
4353            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4354            !!!next-input-character;
4355            !!!emit ($self->{ct});
4356            redo A;
4357          } else {
4358            !!!parse-error (type => 'space in allowed token', ## TODO: type
4359                            line => $self->{line_prev},
4360                            column => $self->{column_prev});
4361            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4362            $self->{state} = ALLOWED_TOKEN_STATE;
4363            !!!next-input-character;
4364            redo A;
4365          }
4366        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4367          if ($is_space->{$self->{nc}}) {
4368            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4369            !!!next-input-character;
4370            redo A;
4371          } elsif ($self->{nc} == 0x0023) { # #
4372            !!!parse-error (type => 'no space before default value'); ## TODO: type
4373            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4374            !!!next-input-character;
4375            redo A;
4376          } elsif ($self->{nc} == 0x0022) { # "
4377            !!!parse-error (type => 'no space before default value'); ## TODO: type
4378            $self->{ca}->{value} = '';
4379            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4380            !!!next-input-character;
4381            redo A;
4382          } elsif ($self->{nc} == 0x0027) { # '
4383            !!!parse-error (type => 'no space before default value'); ## TODO: type
4384            $self->{ca}->{value} = '';
4385            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4386            !!!next-input-character;
4387            redo A;
4388          } elsif ($self->{nc} == 0x003E) { # >
4389            !!!parse-error (type => 'no attr default'); ## TODO: type
4390            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4391            !!!next-input-character;
4392            !!!emit ($self->{ct}); # ATTLIST
4393            redo A;
4394          } elsif ($self->{nc} == -1) {
4395            !!!parse-error (type => 'unclosed md'); ## TODO: type
4396            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4397            !!!next-input-character;
4398            !!!emit ($self->{ct});
4399            redo A;
4400          } else {
4401            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4402            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4403            ## Reconsume.
4404            redo A;
4405          }
4406        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4407          if ($is_space->{$self->{nc}}) {
4408            ## Stay in the state.
4409            !!!next-input-character;
4410            redo A;
4411          } elsif ($self->{nc} == 0x0023) { # #
4412            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4413            !!!next-input-character;
4414            redo A;
4415          } elsif ($self->{nc} == 0x0022) { # "
4416            $self->{ca}->{value} = '';
4417            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4418            !!!next-input-character;
4419            redo A;
4420          } elsif ($self->{nc} == 0x0027) { # '
4421            $self->{ca}->{value} = '';
4422            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4423            !!!next-input-character;
4424            redo A;
4425          } elsif ($self->{nc} == 0x003E) { # >
4426            !!!parse-error (type => 'no attr default'); ## TODO: type
4427            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4428            !!!next-input-character;
4429            !!!emit ($self->{ct}); # ATTLIST
4430            redo A;
4431          } elsif ($self->{nc} == -1) {
4432            !!!parse-error (type => 'unclosed md'); ## TODO: type
4433            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4434            !!!next-input-character;
4435            !!!emit ($self->{ct});
4436            redo A;
4437          } else {
4438            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4439            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4440            ## Reconsume.
4441            redo A;
4442          }
4443        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4444          if ($is_space->{$self->{nc}}) {
4445            ## XML5: No parse error.
4446            !!!parse-error (type => 'no default type'); ## TODO: type
4447            $self->{state} = BOGUS_MD_STATE;
4448            ## Reconsume.
4449            redo A;
4450          } elsif ($self->{nc} == 0x0022) { # "
4451            ## XML5: Same as "anything else".
4452            $self->{ca}->{value} = '';
4453            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4454            !!!next-input-character;
4455            redo A;
4456          } elsif ($self->{nc} == 0x0027) { # '
4457            ## XML5: Same as "anything else".
4458            $self->{ca}->{value} = '';
4459            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4460            !!!next-input-character;
4461            redo A;
4462          } elsif ($self->{nc} == 0x003E) { # >
4463            ## XML5: Same as "anything else".
4464            !!!parse-error (type => 'no attr default'); ## TODO: type
4465            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4466            !!!next-input-character;
4467            !!!emit ($self->{ct}); # ATTLIST
4468            redo A;
4469          } elsif ($self->{nc} == -1) {
4470            ## XML5: No parse error.
4471            !!!parse-error (type => 'unclosed md'); ## TODO: type
4472            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4473            !!!next-input-character;
4474            !!!emit ($self->{ct});
4475            redo A;
4476          } else {
4477            $self->{ca}->{default} = chr $self->{nc};
4478            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4479            !!!next-input-character;
4480            redo A;
4481          }
4482        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4483          if ($is_space->{$self->{nc}}) {
4484            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4485            !!!next-input-character;
4486            redo A;
4487          } elsif ($self->{nc} == 0x0022) { # "
4488            ## XML5: Same as "anything else".
4489            !!!parse-error (type => 'no space before default value'); ## TODO: type
4490            $self->{ca}->{value} = '';
4491            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4492            !!!next-input-character;
4493            redo A;
4494          } elsif ($self->{nc} == 0x0027) { # '
4495            ## XML5: Same as "anything else".
4496            !!!parse-error (type => 'no space before default value'); ## TODO: type
4497            $self->{ca}->{value} = '';
4498            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4499            !!!next-input-character;
4500            redo A;
4501          } elsif ($self->{nc} == 0x003E) { # >
4502            ## XML5: Same as "anything else".
4503            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4504            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4505            !!!next-input-character;
4506            !!!emit ($self->{ct}); # ATTLIST
4507            redo A;
4508          } elsif ($self->{nc} == -1) {
4509            ## XML5: No parse error.
4510            !!!parse-error (type => 'unclosed md'); ## TODO: type
4511            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4512            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4513            !!!next-input-character;
4514            !!!emit ($self->{ct});
4515            redo A;
4516          } else {
4517            $self->{ca}->{default} .= chr $self->{nc};
4518            ## Stay in the state.
4519            !!!next-input-character;
4520            redo A;
4521          }
4522        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4523          if ($is_space->{$self->{nc}}) {
4524            ## Stay in the state.
4525            !!!next-input-character;
4526            redo A;
4527          } elsif ($self->{nc} == 0x0022) { # "
4528            $self->{ca}->{value} = '';
4529            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4530            !!!next-input-character;
4531            redo A;
4532          } elsif ($self->{nc} == 0x0027) { # '
4533            $self->{ca}->{value} = '';
4534            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4535            !!!next-input-character;
4536            redo A;
4537          } elsif ($self->{nc} == 0x003E) { # >
4538            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4539            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4540            !!!next-input-character;
4541            !!!emit ($self->{ct}); # ATTLIST
4542            redo A;
4543          } elsif ($self->{nc} == -1) {
4544            ## XML5: No parse error.
4545            !!!parse-error (type => 'unclosed md'); ## TODO: type
4546            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4547            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4548            !!!next-input-character;
4549            !!!emit ($self->{ct});
4550            redo A;
4551          } else {
4552            ## XML5: Not defined yet.
4553            if ($self->{ca}->{default} eq 'FIXED') {
4554              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4555            } else {
4556              push @{$self->{ct}->{attrdefs}}, $self->{ca};
4557              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4558            }
4559            ## Reconsume.
4560            redo A;
4561          }
4562        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4563          if ($is_space->{$self->{nc}} or
4564              $self->{nc} == -1 or
4565              $self->{nc} == 0x003E) { # >
4566            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4567            ## Reconsume.
4568            redo A;
4569          } else {
4570            !!!parse-error (type => 'no space before attr name'); ## TODO: type
4571            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4572            ## Reconsume.
4573            redo A;
4574          }
4575        } elsif ($self->{state} == NDATA_STATE) {
4576          ## ASCII case-insensitive
4577          if ($self->{nc} == [
4578                undef,
4579                0x0044, # D
4580                0x0041, # A
4581                0x0054, # T
4582              ]->[length $self->{kwd}] or
4583              $self->{nc} == [
4584                undef,
4585                0x0064, # d
4586                0x0061, # a
4587                0x0074, # t
4588              ]->[length $self->{kwd}]) {
4589            !!!cp (172.2);
4590            ## Stay in the state.
4591            $self->{kwd} .= chr $self->{nc};
4592            !!!next-input-character;
4593            redo A;
4594          } elsif ((length $self->{kwd}) == 4 and
4595                   ($self->{nc} == 0x0041 or # A
4596                    $self->{nc} == 0x0061)) { # a
4597            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4598              !!!cp (172.3);
4599              !!!parse-error (type => 'lowercase keyword', ## TODO: type
4600                              text => 'NDATA',
4601                              line => $self->{line_prev},
4602                              column => $self->{column_prev} - 4);
4603            } else {
4604              !!!cp (172.4);
4605            }
4606            $self->{state} = AFTER_NDATA_STATE;
4607            !!!next-input-character;
4608            redo A;
4609          } else {
4610            !!!parse-error (type => 'string after literal', ## TODO: type
4611                            line => $self->{line_prev},
4612                            column => $self->{column_prev} + 1
4613                                - length $self->{kwd});
4614            !!!cp (172.5);
4615            $self->{state} = BOGUS_MD_STATE;
4616            ## Reconsume.
4617            redo A;
4618          }
4619        } elsif ($self->{state} == AFTER_NDATA_STATE) {
4620          if ($is_space->{$self->{nc}}) {
4621            $self->{state} = BEFORE_NOTATION_NAME_STATE;
4622            !!!next-input-character;
4623            redo A;
4624          } elsif ($self->{nc} == 0x003E) { # >
4625            !!!parse-error (type => 'no notation name'); ## TODO: type
4626            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4627            !!!next-input-character;
4628            !!!emit ($self->{ct}); # ENTITY
4629            redo A;
4630          } elsif ($self->{nc} == -1) {
4631            !!!parse-error (type => 'unclosed md'); ## TODO: type
4632            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4633            !!!next-input-character;
4634            !!!emit ($self->{ct}); # ENTITY
4635            redo A;
4636          } else {
4637            !!!parse-error (type => 'string after literal', ## TODO: type
4638                            line => $self->{line_prev},
4639                            column => $self->{column_prev} + 1
4640                                - length $self->{kwd});
4641            $self->{state} = BOGUS_MD_STATE;
4642            ## Reconsume.
4643            redo A;
4644          }
4645        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4646          if ($is_space->{$self->{nc}}) {
4647            ## Stay in the state.
4648            !!!next-input-character;
4649            redo A;
4650          } elsif ($self->{nc} == 0x003E) { # >
4651            !!!parse-error (type => 'no notation name'); ## TODO: type
4652            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653            !!!next-input-character;
4654            !!!emit ($self->{ct}); # ENTITY
4655            redo A;
4656          } elsif ($self->{nc} == -1) {
4657            !!!parse-error (type => 'unclosed md'); ## TODO: type
4658            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4659            !!!next-input-character;
4660            !!!emit ($self->{ct}); # ENTITY
4661            redo A;
4662          } else {
4663            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4664            $self->{state} = NOTATION_NAME_STATE;
4665            !!!next-input-character;
4666            redo A;
4667          }
4668        } elsif ($self->{state} == NOTATION_NAME_STATE) {
4669          if ($is_space->{$self->{nc}}) {
4670            $self->{state} = AFTER_NOTATION_NAME_STATE;
4671            !!!next-input-character;
4672            redo A;
4673          } elsif ($self->{nc} == 0x003E) { # >
4674            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4675            !!!next-input-character;
4676            !!!emit ($self->{ct}); # ENTITY
4677            redo A;
4678          } elsif ($self->{nc} == -1) {
4679            !!!parse-error (type => 'unclosed md'); ## TODO: type
4680            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4681            !!!next-input-character;
4682            !!!emit ($self->{ct}); # ENTITY
4683            redo A;
4684          } else {
4685            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4686            ## Stay in the state.
4687            !!!next-input-character;
4688            redo A;
4689          }
4690        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4691          if ($self->{nc} == 0x0022) { # "
4692            $self->{state} = AFTER_NOTATION_NAME_STATE;
4693            !!!next-input-character;
4694            redo A;
4695          } elsif ($self->{nc} == 0x0026) { # &
4696            $self->{prev_state} = $self->{state};
4697            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4698            $self->{entity_add} = 0x0022; # "
4699            !!!next-input-character;
4700            redo A;
4701    ## TODO: %
4702          } elsif ($self->{nc} == -1) {
4703            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4704            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4705            ## Reconsume.
4706            !!!emit ($self->{ct}); # ENTITY
4707            redo A;
4708          } else {
4709            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4710            !!!next-input-character;
4711            redo A;
4712          }
4713        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4714          if ($self->{nc} == 0x0027) { # '
4715            $self->{state} = AFTER_NOTATION_NAME_STATE;
4716            !!!next-input-character;
4717            redo A;
4718          } elsif ($self->{nc} == 0x0026) { # &
4719            $self->{prev_state} = $self->{state};
4720            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4721            $self->{entity_add} = 0x0027; # '
4722            !!!next-input-character;
4723            redo A;
4724    ## TODO: %
4725          } elsif ($self->{nc} == -1) {
4726            !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4727            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4728            ## Reconsume.
4729            !!!emit ($self->{ct}); # ENTITY
4730            redo A;
4731          } else {
4732            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4733            !!!next-input-character;
4734            redo A;
4735          }
4736        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4737          ## TODO: XMLize
4738    
4739          if ($is_space->{$self->{nc}} or
4740              {
4741                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4742                $self->{entity_add} => 1,
4743              }->{$self->{nc}}) {
4744            ## Don't consume
4745            ## No error
4746            ## Return nothing.
4747            #
4748          } elsif ($self->{nc} == 0x0023) { # #
4749            $self->{ca} = $self->{ct};
4750            $self->{state} = ENTITY_HASH_STATE;
4751            $self->{kwd} = '#';
4752            !!!next-input-character;
4753            redo A;
4754          } elsif ((0x0041 <= $self->{nc} and
4755                    $self->{nc} <= 0x005A) or # A..Z
4756                   (0x0061 <= $self->{nc} and
4757                    $self->{nc} <= 0x007A)) { # a..z
4758            #
4759          } else {
4760            !!!parse-error (type => 'bare ero');
4761            ## Return nothing.
4762            #
4763          }
4764    
4765          $self->{ct}->{value} .= '&';
4766          $self->{state} = $self->{prev_state};
4767          ## Reconsume.
4768          redo A;
4769        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4770          if ($is_space->{$self->{nc}}) {
4771            ## Stay in the state.
4772            !!!next-input-character;
4773            redo A;
4774          } elsif ($self->{nc} == 0x003E) { # >
4775            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4776            !!!next-input-character;
4777            !!!emit ($self->{ct}); # ENTITY
4778            redo A;
4779          } elsif ($self->{nc} == -1) {
4780            !!!parse-error (type => 'unclosed md'); ## TODO: type
4781            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4782            !!!next-input-character;
4783            !!!emit ($self->{ct}); # ENTITY
4784            redo A;
4785          } else {
4786            !!!parse-error (type => 'string after notation name'); ## TODO: type
4787            $self->{state} = BOGUS_MD_STATE;
4788            ## Reconsume.
4789            redo A;
4790          }
4791        } elsif ($self->{state} == BOGUS_MD_STATE) {
4792          if ($self->{nc} == 0x003E) { # >
4793            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4794            !!!next-input-character;
4795            !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4796            redo A;
4797          } elsif ($self->{nc} == -1) {
4798            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4799            ## Reconsume.
4800            !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4801            redo A;
4802          } else {
4803            ## Stay in the state.
4804            !!!next-input-character;
4805            redo A;
4806          }
4807      } else {      } else {
4808        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
4809      }      }
# Line 3092  sub _get_next_token ($) { Line 4814  sub _get_next_token ($) {
4814    
4815  1;  1;
4816  ## $Date$  ## $Date$
4817                                    

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.19

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24