/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC revision 1.18 by wakaba, Sun Oct 19 06:14:57 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145  ## XML states  ## XML-only states
146  sub PI_STATE () { 51 }  sub PI_STATE () { 51 }
147  sub PI_TARGET_STATE () { 52 }  sub PI_TARGET_STATE () { 52 }
148  sub PI_TARGET_AFTER_STATE () { 53 }  sub PI_TARGET_AFTER_STATE () { 53 }
149  sub PI_DATA_STATE () { 54 }  sub PI_DATA_STATE () { 54 }
150  sub PI_AFTER_STATE () { 55 }  sub PI_AFTER_STATE () { 55 }
151  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub BOGUS_MD_STATE () { 91 }
187    
188  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
189  ## list and descriptions)  ## list and descriptions)
# Line 186  sub _initialize_tokenizer ($) { Line 249  sub _initialize_tokenizer ($) {
249    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
250    
251    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
252    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
253      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
254    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
255    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
256    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 221  sub _initialize_tokenizer ($) { Line 285  sub _initialize_tokenizer ($) {
285  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
286  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
287  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
288    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
289    
290  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
291  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
292  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 240  my $is_space = { Line 306  my $is_space = {
306    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
307    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
308    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
309    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
310    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
311    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
312  };  };
# Line 450  sub _get_next_token ($) { Line 516  sub _get_next_token ($) {
516            redo A;            redo A;
517          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
518            !!!cp (15.1);            !!!cp (15.1);
519            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
520            #            #
521          } else {          } else {
522            !!!cp (16);            !!!cp (16);
523              $self->{s_kwd} = '';
524            #            #
525          }          }
526    
527          ## reconsume          ## reconsume
528          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
529          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
530                    line => $self->{line_prev},                    line => $self->{line_prev},
531                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 570  sub _get_next_token ($) { Line 636  sub _get_next_token ($) {
636        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
637          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
638            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
639            $self->{s_kwd} = '';            $self->{kwd} = '';
640            ## Reconsume.            ## Reconsume.
641            redo A;            redo A;
642          } else {          } else {
# Line 673  sub _get_next_token ($) { Line 739  sub _get_next_token ($) {
739          redo A;          redo A;
740        }        }
741      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
742        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
743        if (length $ch) {        if (length $ch) {
744          my $CH = $ch;          my $CH = $ch;
745          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 681  sub _get_next_token ($) { Line 747  sub _get_next_token ($) {
747          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
748            !!!cp (24);            !!!cp (24);
749            ## Stay in the state.            ## Stay in the state.
750            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
751            !!!next-input-character;            !!!next-input-character;
752            redo A;            redo A;
753          } else {          } else {
# Line 690  sub _get_next_token ($) { Line 756  sub _get_next_token ($) {
756            $self->{s_kwd} = '';            $self->{s_kwd} = '';
757            ## Reconsume.            ## Reconsume.
758            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
759                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
760                      line => $self->{line_prev},                      line => $self->{line_prev},
761                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
762                     });                     });
763            redo A;            redo A;
764          }          }
# Line 708  sub _get_next_token ($) { Line 774  sub _get_next_token ($) {
774            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
775            $self->{s_kwd} = '';            $self->{s_kwd} = '';
776            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
777                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
778                      line => $self->{line_prev},                      line => $self->{line_prev},
779                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
780                     });                     });
781            redo A;            redo A;
782          } else {          } else {
# Line 719  sub _get_next_token ($) { Line 785  sub _get_next_token ($) {
785                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
786                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
787                   line => $self->{line_prev},                   line => $self->{line_prev},
788                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
789            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
790            ## Reconsume.            ## Reconsume.
791            redo A;            redo A;
# Line 1211  sub _get_next_token ($) { Line 1277  sub _get_next_token ($) {
1277          redo A;          redo A;
1278        }        }
1279      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1280        ## XML5: "Tag attribute value double quoted state".        ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1281          ## ATTLIST attribute value double quoted state".
1282                
1283        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1284          !!!cp (95);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1285          ## XML5: "Tag attribute name before state".            !!!cp (95.1);
1286          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1287              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1288              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1289            } else {
1290              !!!cp (95);
1291              ## XML5: "Tag attribute name before state".
1292              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1293            }
1294          !!!next-input-character;          !!!next-input-character;
1295          redo A;          redo A;
1296        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1237  sub _get_next_token ($) { Line 1311  sub _get_next_token ($) {
1311          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1312            !!!cp (97);            !!!cp (97);
1313            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1314    
1315              $self->{state} = DATA_STATE;
1316              $self->{s_kwd} = '';
1317              ## reconsume
1318              !!!emit ($self->{ct}); # start tag
1319              redo A;
1320          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1321            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1322            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1246  sub _get_next_token ($) { Line 1326  sub _get_next_token ($) {
1326              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1327              !!!cp (99);              !!!cp (99);
1328            }            }
1329    
1330              $self->{state} = DATA_STATE;
1331              $self->{s_kwd} = '';
1332              ## reconsume
1333              !!!emit ($self->{ct}); # end tag
1334              redo A;
1335            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1336              ## XML5: No parse error above; not defined yet.
1337              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1338              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1339              ## Reconsume.
1340              !!!emit ($self->{ct}); # ATTLIST
1341              redo A;
1342          } else {          } else {
1343            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1344          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1345        } else {        } else {
1346            ## XML5 [ATTLIST]: Not defined yet.
1347          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1348            !!!cp (100);            !!!cp (100);
1349            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1274  sub _get_next_token ($) { Line 1361  sub _get_next_token ($) {
1361          redo A;          redo A;
1362        }        }
1363      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1364        ## XML5: "Tag attribute value single quoted state".        ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1365          ## ATTLIST attribute value single quoted state".
1366    
1367        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1368          !!!cp (101);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1369          ## XML5: "Before attribute name state" (sic).            !!!cp (101.1);
1370          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            ## XML5: "DOCTYPE ATTLIST name after state".
1371              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1372              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1373            } else {
1374              !!!cp (101);
1375              ## XML5: "Before attribute name state" (sic).
1376              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1377            }
1378          !!!next-input-character;          !!!next-input-character;
1379          redo A;          redo A;
1380        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1300  sub _get_next_token ($) { Line 1395  sub _get_next_token ($) {
1395          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1396            !!!cp (103);            !!!cp (103);
1397            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1398    
1399              $self->{state} = DATA_STATE;
1400              $self->{s_kwd} = '';
1401              ## reconsume
1402              !!!emit ($self->{ct}); # start tag
1403              redo A;
1404          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1405            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1406            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1309  sub _get_next_token ($) { Line 1410  sub _get_next_token ($) {
1410              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1411              !!!cp (105);              !!!cp (105);
1412            }            }
1413    
1414              $self->{state} = DATA_STATE;
1415              $self->{s_kwd} = '';
1416              ## reconsume
1417              !!!emit ($self->{ct}); # end tag
1418              redo A;
1419            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1420              ## XML5: No parse error above; not defined yet.
1421              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1422              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1423              ## Reconsume.
1424              !!!emit ($self->{ct}); # ATTLIST
1425              redo A;
1426          } else {          } else {
1427            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1428          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1429        } else {        } else {
1430            ## XML5 [ATTLIST]: Not defined yet.
1431          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1432            !!!cp (106);            !!!cp (106);
1433            ## XML5: Not a parse error.            ## XML5: Not a parse error.
# Line 1340  sub _get_next_token ($) { Line 1448  sub _get_next_token ($) {
1448        ## XML5: "Tag attribute value unquoted state".        ## XML5: "Tag attribute value unquoted state".
1449    
1450        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1451          !!!cp (107);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1452          ## XML5: "Tag attribute name before state".            !!!cp (107.1);
1453          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
1454              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1455            } else {
1456              !!!cp (107);
1457              ## XML5: "Tag attribute name before state".
1458              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1459            }
1460          !!!next-input-character;          !!!next-input-character;
1461          redo A;          redo A;
1462        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
# Line 1363  sub _get_next_token ($) { Line 1477  sub _get_next_token ($) {
1477          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1478            !!!cp (109);            !!!cp (109);
1479            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1480    
1481              $self->{state} = DATA_STATE;
1482              $self->{s_kwd} = '';
1483              !!!next-input-character;
1484              !!!emit ($self->{ct}); # start tag
1485              redo A;
1486          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1487            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1488            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1372  sub _get_next_token ($) { Line 1492  sub _get_next_token ($) {
1492              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1493              !!!cp (111);              !!!cp (111);
1494            }            }
1495    
1496              $self->{state} = DATA_STATE;
1497              $self->{s_kwd} = '';
1498              !!!next-input-character;
1499              !!!emit ($self->{ct}); # end tag
1500              redo A;
1501            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1502              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1503              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1504              !!!next-input-character;
1505              !!!emit ($self->{ct}); # ATTLIST
1506              redo A;
1507          } else {          } else {
1508            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1509          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         !!!next-input-character;  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1510        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!parse-error (type => 'unclosed tag');  
1511          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1512            !!!cp (112);            !!!cp (112);
1513              !!!parse-error (type => 'unclosed tag');
1514            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1515    
1516              $self->{state} = DATA_STATE;
1517              $self->{s_kwd} = '';
1518              ## reconsume
1519              !!!emit ($self->{ct}); # start tag
1520              redo A;
1521          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1522              !!!parse-error (type => 'unclosed tag');
1523            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1524            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
1525              !!!cp (113);              !!!cp (113);
# Line 1396  sub _get_next_token ($) { Line 1528  sub _get_next_token ($) {
1528              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1529              !!!cp (114);              !!!cp (114);
1530            }            }
1531    
1532              $self->{state} = DATA_STATE;
1533              $self->{s_kwd} = '';
1534              ## reconsume
1535              !!!emit ($self->{ct}); # end tag
1536              redo A;
1537            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1538              !!!parse-error (type => 'unclosed md'); ## TODO: type
1539              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1540              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1541              ## Reconsume.
1542              !!!emit ($self->{ct}); # ATTLIST
1543              redo A;
1544          } else {          } else {
1545            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1546          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1547        } else {        } else {
1548          if ({          if ({
1549               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1548  sub _get_next_token ($) { Line 1686  sub _get_next_token ($) {
1686          redo A;          redo A;
1687        }        }
1688      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1689        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1690    
1691        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1692        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
1693                
1694        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1695          !!!cp (124);          if ($self->{in_subset}) {
1696          $self->{state} = DATA_STATE;            !!!cp (123);
1697          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1698            } else {
1699              !!!cp (124);
1700              $self->{state} = DATA_STATE;
1701              $self->{s_kwd} = '';
1702            }
1703          !!!next-input-character;          !!!next-input-character;
1704    
1705          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1706          redo A;          redo A;
1707        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1708          !!!cp (125);          if ($self->{in_subset}) {
1709          $self->{state} = DATA_STATE;            !!!cp (125.1);
1710          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1711            } else {
1712              !!!cp (125);
1713              $self->{state} = DATA_STATE;
1714              $self->{s_kwd} = '';
1715            }
1716          ## reconsume          ## reconsume
1717    
1718          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1581  sub _get_next_token ($) { Line 1729  sub _get_next_token ($) {
1729          redo A;          redo A;
1730        }        }
1731      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1732        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
1733                
1734        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1735          !!!cp (133);          !!!cp (133);
# Line 1593  sub _get_next_token ($) { Line 1741  sub _get_next_token ($) {
1741          ## ASCII case-insensitive.          ## ASCII case-insensitive.
1742          !!!cp (130);          !!!cp (130);
1743          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
1744          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
1745          !!!next-input-character;          !!!next-input-character;
1746          redo A;          redo A;
1747        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
# Line 1602  sub _get_next_token ($) { Line 1750  sub _get_next_token ($) {
1750                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1751          !!!cp (135.4);                          !!!cp (135.4);                
1752          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
1753          $self->{s_kwd} = '[';          $self->{kwd} = '[';
1754          !!!next-input-character;          !!!next-input-character;
1755          redo A;          redo A;
1756        } else {        } else {
# Line 1652  sub _get_next_token ($) { Line 1800  sub _get_next_token ($) {
1800              0x0054, # T              0x0054, # T
1801              0x0059, # Y              0x0059, # Y
1802              0x0050, # P              0x0050, # P
1803            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
1804            $self->{nc} == [            $self->{nc} == [
1805              undef,              undef,
1806              0x006F, # o              0x006F, # o
# Line 1660  sub _get_next_token ($) { Line 1808  sub _get_next_token ($) {
1808              0x0074, # t              0x0074, # t
1809              0x0079, # y              0x0079, # y
1810              0x0070, # p              0x0070, # p
1811            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
1812          !!!cp (131);          !!!cp (131);
1813          ## Stay in the state.          ## Stay in the state.
1814          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1815          !!!next-input-character;          !!!next-input-character;
1816          redo A;          redo A;
1817        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
1818                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1819                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1820          if ($self->{s_kwd} ne 'DOCTYP') {          if ($self->{is_xml} and
1821                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1822            !!!cp (129);            !!!cp (129);
1823            ## XML5: case-sensitive.            ## XML5: case-sensitive.
1824            !!!parse-error (type => 'lowercase keyword', ## TODO            !!!parse-error (type => 'lowercase keyword', ## TODO
# Line 1691  sub _get_next_token ($) { Line 1840  sub _get_next_token ($) {
1840          !!!cp (132);                  !!!cp (132);        
1841          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1842                          line => $self->{line_prev},                          line => $self->{line_prev},
1843                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1844          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1845          ## Reconsume.          ## Reconsume.
1846          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1847                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1848                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1849                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1850                                   };                                   };
1851          redo A;          redo A;
1852        }        }
# Line 1708  sub _get_next_token ($) { Line 1857  sub _get_next_token ($) {
1857              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
1858              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
1859              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
1860            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
1861          !!!cp (135.1);          !!!cp (135.1);
1862          ## Stay in the state.          ## Stay in the state.
1863          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1864          !!!next-input-character;          !!!next-input-character;
1865          redo A;          redo A;
1866        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
1867                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1868          if ($self->{is_xml} and          if ($self->{is_xml} and
1869              not $self->{tainted} and              not $self->{tainted} and
# Line 1739  sub _get_next_token ($) { Line 1888  sub _get_next_token ($) {
1888          !!!cp (135.3);          !!!cp (135.3);
1889          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1890                          line => $self->{line_prev},                          line => $self->{line_prev},
1891                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1892          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1893          ## Reconsume.          ## Reconsume.
1894          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1895                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1896                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1897                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1898                                   };                                   };
1899          redo A;          redo A;
1900        }        }
# Line 1756  sub _get_next_token ($) { Line 1905  sub _get_next_token ($) {
1905          !!!next-input-character;          !!!next-input-character;
1906          redo A;          redo A;
1907        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1908          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1909          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1910          $self->{s_kwd} = '';            !!!cp (138.1);
1911              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1912            } else {
1913              !!!cp (138);
1914              $self->{state} = DATA_STATE;
1915              $self->{s_kwd} = '';
1916            }
1917          !!!next-input-character;          !!!next-input-character;
1918    
1919          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1920    
1921          redo A;          redo A;
1922        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1923          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1924          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1925          $self->{s_kwd} = '';            !!!cp (139.1);
1926              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1927            } else {
1928              !!!cp (139);
1929              $self->{state} = DATA_STATE;
1930              $self->{s_kwd} = '';
1931            }
1932          ## reconsume          ## reconsume
1933    
1934          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1790  sub _get_next_token ($) { Line 1949  sub _get_next_token ($) {
1949          !!!next-input-character;          !!!next-input-character;
1950          redo A;          redo A;
1951        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1952          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1953          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1954          $self->{s_kwd} = '';            !!!cp (142.1);
1955              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1956            } else {
1957              !!!cp (142);
1958              $self->{state} = DATA_STATE;
1959              $self->{s_kwd} = '';
1960            }
1961          !!!next-input-character;          !!!next-input-character;
1962    
1963          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1964    
1965          redo A;          redo A;
1966        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1967          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1968          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1969          $self->{s_kwd} = '';            !!!cp (143.1);
1970              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1971            } else {
1972              !!!cp (143);
1973              $self->{state} = DATA_STATE;
1974              $self->{s_kwd} = '';
1975            }
1976          ## reconsume          ## reconsume
1977    
1978          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1818  sub _get_next_token ($) { Line 1987  sub _get_next_token ($) {
1987          redo A;          redo A;
1988        }        }
1989      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
1990          ## XML5: "Comment state" and "DOCTYPE comment state".
1991    
1992        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1993          !!!cp (145);          !!!cp (145);
1994          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
1995          !!!next-input-character;          !!!next-input-character;
1996          redo A;          redo A;
1997        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
1998          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1999          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2000          $self->{s_kwd} = '';            !!!cp (146.1);
2001              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2002            } else {
2003              !!!cp (146);
2004              $self->{state} = DATA_STATE;
2005              $self->{s_kwd} = '';
2006            }
2007          ## reconsume          ## reconsume
2008    
2009          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1845  sub _get_next_token ($) { Line 2021  sub _get_next_token ($) {
2021          redo A;          redo A;
2022        }        }
2023      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2024        ## XML5: "comment dash state".        ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2025    
2026        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2027          !!!cp (148);          !!!cp (148);
# Line 1853  sub _get_next_token ($) { Line 2029  sub _get_next_token ($) {
2029          !!!next-input-character;          !!!next-input-character;
2030          redo A;          redo A;
2031        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
2032          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2033          $self->{s_kwd} = '';          if ($self->{in_subset}) {
2034          $self->{state} = DATA_STATE;            !!!cp (149.1);
2035          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2036            } else {
2037              !!!cp (149);
2038              $self->{state} = DATA_STATE;
2039              $self->{s_kwd} = '';
2040            }
2041          ## reconsume          ## reconsume
2042    
2043          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1871  sub _get_next_token ($) { Line 2051  sub _get_next_token ($) {
2051          redo A;          redo A;
2052        }        }
2053      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2054          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2055    
2056        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2057          !!!cp (151);          if ($self->{in_subset}) {
2058          $self->{state} = DATA_STATE;            !!!cp (151.1);
2059          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2060            } else {
2061              !!!cp (151);
2062              $self->{state} = DATA_STATE;
2063              $self->{s_kwd} = '';
2064            }
2065          !!!next-input-character;          !!!next-input-character;
2066    
2067          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1891  sub _get_next_token ($) { Line 2078  sub _get_next_token ($) {
2078          !!!next-input-character;          !!!next-input-character;
2079          redo A;          redo A;
2080        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
2081          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2082          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2083          $self->{s_kwd} = '';            !!!cp (153.1);
2084              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2085            } else {
2086              !!!cp (153);
2087              $self->{state} = DATA_STATE;
2088              $self->{s_kwd} = '';
2089            }
2090          ## reconsume          ## reconsume
2091    
2092          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1919  sub _get_next_token ($) { Line 2111  sub _get_next_token ($) {
2111          redo A;          redo A;
2112        } else {        } else {
2113          !!!cp (156);          !!!cp (156);
2114            ## XML5: Unless EOF, switch to the bogus comment state.
2115          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
2116          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2117          ## reconsume          ## reconsume
2118          redo A;          redo A;
2119        }        }
2120      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2121          ## XML5: "DOCTYPE root name before state".
2122    
2123        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2124          !!!cp (157);          !!!cp (157);
2125          ## Stay in the state          ## Stay in the state
# Line 1932  sub _get_next_token ($) { Line 2127  sub _get_next_token ($) {
2127          redo A;          redo A;
2128        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2129          !!!cp (158);          !!!cp (158);
2130            ## XML5: No parse error.
2131          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2132          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2133          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 1950  sub _get_next_token ($) { Line 2146  sub _get_next_token ($) {
2146          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2147    
2148          redo A;          redo A;
2149          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2150            !!!cp (159.1);
2151            !!!parse-error (type => 'no DOCTYPE name');
2152            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2153            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2154            $self->{in_subset} = 1;
2155            !!!next-input-character;
2156            !!!emit ($self->{ct}); # DOCTYPE
2157            redo A;
2158        } else {        } else {
2159          !!!cp (160);          !!!cp (160);
2160          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 1959  sub _get_next_token ($) { Line 2164  sub _get_next_token ($) {
2164          redo A;          redo A;
2165        }        }
2166      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2167  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2168    
2169          ## ISSUE: Redundant "First," in the spec.
2170    
2171        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2172          !!!cp (161);          !!!cp (161);
2173          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 1985  sub _get_next_token ($) { Line 2193  sub _get_next_token ($) {
2193          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2194    
2195          redo A;          redo A;
2196          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2197            !!!cp (163.1);
2198            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2199            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2200            $self->{in_subset} = 1;
2201            !!!next-input-character;
2202            !!!emit ($self->{ct}); # DOCTYPE
2203            redo A;
2204        } else {        } else {
2205          !!!cp (164);          !!!cp (164);
2206          $self->{ct}->{name}          $self->{ct}->{name}
# Line 1994  sub _get_next_token ($) { Line 2210  sub _get_next_token ($) {
2210          redo A;          redo A;
2211        }        }
2212      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2213          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2214          ## state", but implemented differently.
2215    
2216        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2217          !!!cp (165);          !!!cp (165);
2218          ## Stay in the state          ## Stay in the state
2219          !!!next-input-character;          !!!next-input-character;
2220          redo A;          redo A;
2221        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2222          !!!cp (166);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2223          $self->{state} = DATA_STATE;            !!!cp (166);
2224          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2225              $self->{s_kwd} = '';
2226            } else {
2227              !!!cp (166.1);
2228              !!!parse-error (type => 'no md def'); ## TODO: type
2229              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2230            }
2231            
2232          !!!next-input-character;          !!!next-input-character;
2233            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2234          redo A;          redo A;
2235        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2236          !!!cp (167);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2237          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (167);
2238          $self->{state} = DATA_STATE;            !!!parse-error (type => 'unclosed DOCTYPE');
2239          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2240          ## reconsume            $self->{s_kwd} = '';
2241              $self->{ct}->{quirks} = 1;
2242          $self->{ct}->{quirks} = 1;          } else {
2243          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (167.12);
2244              !!!parse-error (type => 'unclosed md'); ## TODO: type
2245              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2246            }
2247            
2248            ## Reconsume.
2249            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2250          redo A;          redo A;
2251        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2252                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2253            !!!cp (167.1);
2254          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2255          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2256          !!!next-input-character;          !!!next-input-character;
2257          redo A;          redo A;
2258        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2259                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2260            !!!cp (167.2);
2261          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2262          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2263            !!!next-input-character;
2264            redo A;
2265    ## TODO: " and ' for ENTITY
2266          } elsif ($self->{is_xml} and
2267                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2268                   $self->{nc} == 0x005B) { # [
2269            !!!cp (167.3);
2270            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2271            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2272            $self->{in_subset} = 1;
2273          !!!next-input-character;          !!!next-input-character;
2274            !!!emit ($self->{ct}); # DOCTYPE
2275          redo A;          redo A;
2276        } else {        } else {
2277          !!!cp (180);          !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2278          !!!parse-error (type => 'string after DOCTYPE name');  
2279          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2280              !!!cp (180);
2281              $self->{ct}->{quirks} = 1;
2282              $self->{state} = BOGUS_DOCTYPE_STATE;
2283            } else {
2284              !!!cp (180.1);
2285              $self->{state} = BOGUS_MD_STATE;
2286            }
2287    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
2288          !!!next-input-character;          !!!next-input-character;
2289          redo A;          redo A;
2290        }        }
# Line 2048  sub _get_next_token ($) { Line 2296  sub _get_next_token ($) {
2296              0x0042, # B              0x0042, # B
2297              0x004C, # L              0x004C, # L
2298              0x0049, # I              0x0049, # I
2299            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2300            $self->{nc} == [            $self->{nc} == [
2301              undef,              undef,
2302              0x0075, # u              0x0075, # u
2303              0x0062, # b              0x0062, # b
2304              0x006C, # l              0x006C, # l
2305              0x0069, # i              0x0069, # i
2306            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2307          !!!cp (175);          !!!cp (175);
2308          ## Stay in the state.          ## Stay in the state.
2309          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2310          !!!next-input-character;          !!!next-input-character;
2311          redo A;          redo A;
2312        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2313                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2314                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2315          !!!cp (168);          if ($self->{is_xml} and
2316                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2317              !!!cp (168.1);
2318              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2319                              text => 'PUBLIC',
2320                              line => $self->{line_prev},
2321                              column => $self->{column_prev} - 4);
2322            } else {
2323              !!!cp (168);
2324            }
2325          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2326          !!!next-input-character;          !!!next-input-character;
2327          redo A;          redo A;
2328        } else {        } else {
2329          !!!cp (169);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2330                          line => $self->{line_prev},                          line => $self->{line_prev},
2331                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2332          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2333              !!!cp (169);
2334          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2335              $self->{state} = BOGUS_DOCTYPE_STATE;
2336            } else {
2337              !!!cp (169.1);
2338              $self->{state} = BOGUS_MD_STATE;
2339            }
2340          ## Reconsume.          ## Reconsume.
2341          redo A;          redo A;
2342        }        }
# Line 2087  sub _get_next_token ($) { Line 2348  sub _get_next_token ($) {
2348              0x0053, # S              0x0053, # S
2349              0x0054, # T              0x0054, # T
2350              0x0045, # E              0x0045, # E
2351            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2352            $self->{nc} == [            $self->{nc} == [
2353              undef,              undef,
2354              0x0079, # y              0x0079, # y
2355              0x0073, # s              0x0073, # s
2356              0x0074, # t              0x0074, # t
2357              0x0065, # e              0x0065, # e
2358            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2359          !!!cp (170);          !!!cp (170);
2360          ## Stay in the state.          ## Stay in the state.
2361          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2362          !!!next-input-character;          !!!next-input-character;
2363          redo A;          redo A;
2364        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2365                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2366                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2367          !!!cp (171);          if ($self->{is_xml} and
2368                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2369              !!!cp (171.1);
2370              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2371                              text => 'SYSTEM',
2372                              line => $self->{line_prev},
2373                              column => $self->{column_prev} - 4);
2374            } else {
2375              !!!cp (171);
2376            }
2377          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2378          !!!next-input-character;          !!!next-input-character;
2379          redo A;          redo A;
2380        } else {        } else {
2381          !!!cp (172);          !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
         !!!parse-error (type => 'string after DOCTYPE name',  
2382                          line => $self->{line_prev},                          line => $self->{line_prev},
2383                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2384          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2385              !!!cp (172);
2386          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
2387              $self->{state} = BOGUS_DOCTYPE_STATE;
2388            } else {
2389              !!!cp (172.1);
2390              $self->{state} = BOGUS_MD_STATE;
2391            }
2392          ## Reconsume.          ## Reconsume.
2393          redo A;          redo A;
2394        }        }
# Line 2137  sub _get_next_token ($) { Line 2411  sub _get_next_token ($) {
2411          !!!next-input-character;          !!!next-input-character;
2412          redo A;          redo A;
2413        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
         !!!cp (184);  
2414          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2415            
2416          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2417          $self->{s_kwd} = '';            !!!cp (184);
2418              $self->{state} = DATA_STATE;
2419              $self->{s_kwd} = '';
2420              $self->{ct}->{quirks} = 1;
2421            } else {
2422              !!!cp (184.1);
2423              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2424            }
2425            
2426          !!!next-input-character;          !!!next-input-character;
2427            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2428          redo A;          redo A;
2429        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2430          !!!cp (185);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2431          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (185);
2432              !!!parse-error (type => 'unclosed DOCTYPE');
2433          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2434          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2435              $self->{ct}->{quirks} = 1;
2436            } else {
2437              !!!cp (185.1);
2438              !!!parse-error (type => 'unclosed md'); ## TODO: type
2439              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2440            }
2441            
2442          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
2443          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2444            redo A;
2445          } elsif ($self->{is_xml} and
2446                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2447                   $self->{nc} == 0x005B) { # [
2448            !!!cp (186.1);
2449            !!!parse-error (type => 'no PUBLIC literal');
2450            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2451            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2452            $self->{in_subset} = 1;
2453            !!!next-input-character;
2454            !!!emit ($self->{ct}); # DOCTYPE
2455          redo A;          redo A;
2456        } else {        } else {
         !!!cp (186);  
2457          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
2458    
2459          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2460              !!!cp (186);
2461              $self->{ct}->{quirks} = 1;
2462              $self->{state} = BOGUS_DOCTYPE_STATE;
2463            } else {
2464              !!!cp (186.2);
2465              $self->{state} = BOGUS_MD_STATE;
2466            }
2467    
2468          !!!next-input-character;          !!!next-input-character;
2469          redo A;          redo A;
2470        }        }
# Line 2176  sub _get_next_token ($) { Line 2475  sub _get_next_token ($) {
2475          !!!next-input-character;          !!!next-input-character;
2476          redo A;          redo A;
2477        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (188);  
2478          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2479    
2480          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2481          $self->{s_kwd} = '';            !!!cp (188);
2482          !!!next-input-character;            $self->{state} = DATA_STATE;
2483              $self->{s_kwd} = '';
2484          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2485          !!!emit ($self->{ct}); # DOCTYPE          } else {
2486              !!!cp (188.1);
2487              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2488            }
2489    
2490            !!!next-input-character;
2491            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2492          redo A;          redo A;
2493        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (189);  
2494          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2495    
2496          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2497          $self->{s_kwd} = '';            !!!cp (189);
2498          ## reconsume            $self->{state} = DATA_STATE;
2499              $self->{s_kwd} = '';
2500          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2501            } else {
2502              !!!cp (189.1);
2503              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2504            }
2505            
2506            ## Reconsume.
2507          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2508          redo A;          redo A;
2509        } else {        } else {
2510          !!!cp (190);          !!!cp (190);
2511          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2512          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
2513                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2514    
# Line 2217  sub _get_next_token ($) { Line 2523  sub _get_next_token ($) {
2523          !!!next-input-character;          !!!next-input-character;
2524          redo A;          redo A;
2525        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (192);  
2526          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2527    
2528          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2529          $self->{s_kwd} = '';            !!!cp (192);
2530          !!!next-input-character;            $self->{state} = DATA_STATE;
2531              $self->{s_kwd} = '';
2532          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2533          !!!emit ($self->{ct}); # DOCTYPE          } else {
2534              !!!cp (192.1);
2535              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2536            }
2537    
2538            !!!next-input-character;
2539            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2540          redo A;          redo A;
2541        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (193);  
2542          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2543    
2544          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2545          $self->{s_kwd} = '';            !!!cp (193);
2546              $self->{state} = DATA_STATE;
2547              $self->{s_kwd} = '';
2548              $self->{ct}->{quirks} = 1;
2549            } else {
2550              !!!cp (193.1);
2551              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2552            }
2553          
2554          ## reconsume          ## reconsume
2555            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2556          redo A;          redo A;
2557        } else {        } else {
2558          !!!cp (194);          !!!cp (194);
2559          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2560          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
2561                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
2562    
# Line 2259  sub _get_next_token ($) { Line 2572  sub _get_next_token ($) {
2572          redo A;          redo A;
2573        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
2574          !!!cp (196);          !!!cp (196);
2575          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2576          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2577          !!!next-input-character;          !!!next-input-character;
2578          redo A;          redo A;
2579        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
2580          !!!cp (197);          !!!cp (197);
2581          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2582          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2583          !!!next-input-character;          !!!next-input-character;
2584          redo A;          redo A;
2585        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2586          !!!cp (198);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2587          $self->{state} = DATA_STATE;            if ($self->{is_xml}) {
2588          $self->{s_kwd} = '';              !!!cp (198.1);
2589                !!!parse-error (type => 'no SYSTEM literal');
2590              } else {
2591                !!!cp (198);
2592              }
2593              $self->{state} = DATA_STATE;
2594              $self->{s_kwd} = '';
2595            } else {
2596              if ($self->{ct}->{type} == NOTATION_TOKEN) {
2597                !!!cp (198.2);
2598              } else {
2599                !!!cp (198.3);
2600                !!!parse-error (type => 'no SYSTEM literal');            
2601              }
2602              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2603            }
2604            
2605          !!!next-input-character;          !!!next-input-character;
2606            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         !!!emit ($self->{ct}); # DOCTYPE  
   
2607          redo A;          redo A;
2608        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2609          !!!cp (199);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2610          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (199);
2611              !!!parse-error (type => 'unclosed DOCTYPE');
2612          $self->{state} = DATA_STATE;            
2613          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2614              $self->{s_kwd} = '';
2615              $self->{ct}->{quirks} = 1;
2616            } else {
2617              !!!parse-error (type => 'unclosed md'); ## TODO: type
2618              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2619            }
2620            
2621          ## reconsume          ## reconsume
2622            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2623          $self->{ct}->{quirks} = 1;          redo A;
2624          } elsif ($self->{is_xml} and
2625                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2626                   $self->{nc} == 0x005B) { # [
2627            !!!cp (200.1);
2628            !!!parse-error (type => 'no SYSTEM literal');
2629            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2630            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2631            $self->{in_subset} = 1;
2632            !!!next-input-character;
2633          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2634          redo A;          redo A;
2635        } else {        } else {
         !!!cp (200);  
2636          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
2637    
2638          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2639              !!!cp (200);
2640              $self->{ct}->{quirks} = 1;
2641              $self->{state} = BOGUS_DOCTYPE_STATE;
2642            } else {
2643              !!!cp (200.2);
2644              $self->{state} = BOGUS_MD_STATE;
2645            }
2646    
2647          !!!next-input-character;          !!!next-input-character;
2648          redo A;          redo A;
2649        }        }
# Line 2318  sub _get_next_token ($) { Line 2666  sub _get_next_token ($) {
2666          !!!next-input-character;          !!!next-input-character;
2667          redo A;          redo A;
2668        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (204);  
2669          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
2670          !!!next-input-character;          !!!next-input-character;
2671    
2672          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2673          !!!emit ($self->{ct}); # DOCTYPE            !!!cp (204);
2674              $self->{state} = DATA_STATE;
2675              $self->{s_kwd} = '';
2676              $self->{ct}->{quirks} = 1;
2677            } else {
2678              !!!cp (204.1);
2679              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2680            }
2681    
2682            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2683          redo A;          redo A;
2684        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2685          !!!cp (205);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2686          !!!parse-error (type => 'unclosed DOCTYPE');            !!!cp (205);
2687              !!!parse-error (type => 'unclosed DOCTYPE');
2688          $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
2689          $self->{s_kwd} = '';            $self->{s_kwd} = '';
2690              $self->{ct}->{quirks} = 1;
2691            } else {
2692              !!!cp (205.1);
2693              !!!parse-error (type => 'unclosed md'); ## TODO: type
2694              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2695            }
2696            
2697          ## reconsume          ## reconsume
2698            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2699            redo A;
2700          } elsif ($self->{is_xml} and
2701                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2702                   $self->{nc} == 0x005B) { # [
2703            !!!cp (206.1);
2704            !!!parse-error (type => 'no SYSTEM literal');
2705    
2706          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2707            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2708            $self->{in_subset} = 1;
2709            !!!next-input-character;
2710          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
   
2711          redo A;          redo A;
2712        } else {        } else {
         !!!cp (206);  
2713          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
2714    
2715          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2716              !!!cp (206);          
2717              $self->{ct}->{quirks} = 1;
2718              $self->{state} = BOGUS_DOCTYPE_STATE;
2719            } else {
2720              !!!cp (206.2);
2721              $self->{state} = BOGUS_MD_STATE;
2722            }
2723    
2724          !!!next-input-character;          !!!next-input-character;
2725          redo A;          redo A;
2726        }        }
# Line 2355  sub _get_next_token ($) { Line 2730  sub _get_next_token ($) {
2730          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2731          !!!next-input-character;          !!!next-input-character;
2732          redo A;          redo A;
2733        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
         !!!cp (208);  
2734          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2735    
2736          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2737          $self->{s_kwd} = '';            !!!cp (208);
2738              $self->{state} = DATA_STATE;
2739              $self->{s_kwd} = '';
2740              $self->{ct}->{quirks} = 1;
2741            } else {
2742              !!!cp (208.1);
2743              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2744            }
2745            
2746          !!!next-input-character;          !!!next-input-character;
2747            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2748          redo A;          redo A;
2749        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (209);  
2750          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2751    
2752          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2753          $self->{s_kwd} = '';            !!!cp (209);
2754              $self->{state} = DATA_STATE;
2755              $self->{s_kwd} = '';
2756              $self->{ct}->{quirks} = 1;
2757            } else {
2758              !!!cp (209.1);
2759              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2760            }
2761            
2762          ## reconsume          ## reconsume
2763            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2764          redo A;          redo A;
2765        } else {        } else {
2766          !!!cp (210);          !!!cp (210);
2767          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2768          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
2769                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2770    
# Line 2396  sub _get_next_token ($) { Line 2778  sub _get_next_token ($) {
2778          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2779          !!!next-input-character;          !!!next-input-character;
2780          redo A;          redo A;
2781        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2782          !!!cp (212);          !!!cp (212);
2783          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2784    
# Line 2409  sub _get_next_token ($) { Line 2791  sub _get_next_token ($) {
2791    
2792          redo A;          redo A;
2793        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (213);  
2794          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2795    
2796          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2797          $self->{s_kwd} = '';            !!!cp (213);
2798          ## reconsume            $self->{state} = DATA_STATE;
2799              $self->{s_kwd} = '';
2800          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
2801          !!!emit ($self->{ct}); # DOCTYPE          } else {
2802              !!!cp (213.1);
2803              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2804            }
2805    
2806            ## reconsume
2807            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2808          redo A;          redo A;
2809        } else {        } else {
2810          !!!cp (214);          !!!cp (214);
2811          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
2812          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
2813                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
2814    
# Line 2433  sub _get_next_token ($) { Line 2818  sub _get_next_token ($) {
2818        }        }
2819      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2820        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2821          !!!cp (215);          if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2822          ## Stay in the state            !!!cp (215.1);
2823              $self->{state} = BEFORE_NDATA_STATE;
2824            } else {
2825              !!!cp (215);
2826              ## Stay in the state
2827            }
2828          !!!next-input-character;          !!!next-input-character;
2829          redo A;          redo A;
2830        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2831          !!!cp (216);          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2832          $self->{state} = DATA_STATE;            !!!cp (216);
2833          $self->{s_kwd} = '';            $self->{state} = DATA_STATE;
2834              $self->{s_kwd} = '';
2835            } else {
2836              !!!cp (216.1);
2837              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2838            }
2839    
2840            !!!next-input-character;
2841            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2842            redo A;
2843          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2844                   ($self->{nc} == 0x004E or # N
2845                    $self->{nc} == 0x006E)) { # n
2846            !!!cp (216.2);
2847            !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2848            $self->{state} = NDATA_STATE;
2849            $self->{kwd} = chr $self->{nc};
2850          !!!next-input-character;          !!!next-input-character;
2851            redo A;
2852          } elsif ($self->{nc} == -1) {
2853            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2854              !!!cp (217);
2855              !!!parse-error (type => 'unclosed DOCTYPE');
2856              $self->{state} = DATA_STATE;
2857              $self->{s_kwd} = '';
2858              $self->{ct}->{quirks} = 1;
2859            } else {
2860              !!!cp (217.1);
2861              !!!parse-error (type => 'unclosed md'); ## TODO: type
2862              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2863            }
2864    
2865            ## reconsume
2866            !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2867            redo A;
2868          } elsif ($self->{is_xml} and
2869                   $self->{ct}->{type} == DOCTYPE_TOKEN and
2870                   $self->{nc} == 0x005B) { # [
2871            !!!cp (218.1);
2872            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2873            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2874            $self->{in_subset} = 1;
2875            !!!next-input-character;
2876          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2877            redo A;
2878          } else {
2879            !!!parse-error (type => 'string after SYSTEM literal');
2880    
2881            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2882              !!!cp (218);
2883              #$self->{ct}->{quirks} = 1;
2884              $self->{state} = BOGUS_DOCTYPE_STATE;
2885            } else {
2886              !!!cp (218.2);
2887              $self->{state} = BOGUS_MD_STATE;
2888            }
2889    
2890            !!!next-input-character;
2891            redo A;
2892          }
2893        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2894          if ($is_space->{$self->{nc}}) {
2895            !!!cp (218.3);
2896            ## Stay in the state.
2897            !!!next-input-character;
2898            redo A;
2899          } elsif ($self->{nc} == 0x003E) { # >
2900            !!!cp (218.4);
2901            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2902            !!!next-input-character;
2903            !!!emit ($self->{ct}); # ENTITY
2904            redo A;
2905          } elsif ($self->{nc} == 0x004E or # N
2906                   $self->{nc} == 0x006E) { # n
2907            !!!cp (218.5);
2908            $self->{state} = NDATA_STATE;
2909            $self->{kwd} = chr $self->{nc};
2910            !!!next-input-character;
2911          redo A;          redo A;
2912        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2913          !!!cp (217);          !!!cp (218.6);
2914          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed md'); ## TODO: type
2915          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
         $self->{s_kwd} = '';  
2916          ## reconsume          ## reconsume
2917            !!!emit ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         !!!emit ($self->{ct}); # DOCTYPE  
   
2918          redo A;          redo A;
2919        } else {        } else {
2920          !!!cp (218);          !!!cp (218.7);
2921          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
2922          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
2923          !!!next-input-character;          !!!next-input-character;
2924          redo A;          redo A;
2925        }        }
# Line 2476  sub _get_next_token ($) { Line 2933  sub _get_next_token ($) {
2933          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2934    
2935          redo A;          redo A;
2936          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2937            !!!cp (220.1);
2938            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2939            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2940            $self->{in_subset} = 1;
2941            !!!next-input-character;
2942            !!!emit ($self->{ct}); # DOCTYPE
2943            redo A;
2944        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2945          !!!cp (220);          !!!cp (220);
2946          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 2488  sub _get_next_token ($) { Line 2953  sub _get_next_token ($) {
2953        } else {        } else {
2954          !!!cp (221);          !!!cp (221);
2955          my $s = '';          my $s = '';
2956          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
2957    
2958          ## Stay in the state          ## Stay in the state
2959          !!!next-input-character;          !!!next-input-character;
# Line 2596  sub _get_next_token ($) { Line 3061  sub _get_next_token ($) {
3061        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
3062          !!!cp (999);          !!!cp (999);
3063          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
3064          $self->{s_kwd} = '#';          $self->{kwd} = '#';
3065          !!!next-input-character;          !!!next-input-character;
3066          redo A;          redo A;
3067        } elsif ((0x0041 <= $self->{nc} and        } elsif ((0x0041 <= $self->{nc} and
# Line 2606  sub _get_next_token ($) { Line 3071  sub _get_next_token ($) {
3071          !!!cp (998);          !!!cp (998);
3072          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
3073          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
3074          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3075          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
3076          $self->{entity__match} = 0;          $self->{entity__match} = 0;
3077          !!!next-input-character;          !!!next-input-character;
3078          redo A;          redo A;
# Line 2647  sub _get_next_token ($) { Line 3112  sub _get_next_token ($) {
3112            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
3113          !!!cp (995);          !!!cp (995);
3114          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
3115          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3116          !!!next-input-character;          !!!next-input-character;
3117          redo A;          redo A;
3118        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
3119                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
3120          !!!cp (994);          !!!cp (994);
3121          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
3122          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
3123          !!!next-input-character;          !!!next-input-character;
3124          redo A;          redo A;
3125        } else {        } else {
# Line 2690  sub _get_next_token ($) { Line 3155  sub _get_next_token ($) {
3155        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
3156            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
3157          !!!cp (1012);          !!!cp (1012);
3158          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
3159          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
3160                    
3161          ## Stay in the state.          ## Stay in the state.
3162          !!!next-input-character;          !!!next-input-character;
# Line 2707  sub _get_next_token ($) { Line 3172  sub _get_next_token ($) {
3172          #          #
3173        }        }
3174    
3175        my $code = $self->{s_kwd};        my $code = $self->{kwd};
3176        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3177        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3178        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2750  sub _get_next_token ($) { Line 3215  sub _get_next_token ($) {
3215          # 0..9, A..F, a..f          # 0..9, A..F, a..f
3216          !!!cp (990);          !!!cp (990);
3217          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
3218          $self->{s_kwd} = 0;          $self->{kwd} = 0;
3219          ## Reconsume.          ## Reconsume.
3220          redo A;          redo A;
3221        } else {        } else {
# Line 2768  sub _get_next_token ($) { Line 3233  sub _get_next_token ($) {
3233            $self->{s_kwd} = '';            $self->{s_kwd} = '';
3234            ## Reconsume.            ## Reconsume.
3235            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
3236                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
3237                      line => $self->{line_prev},                      line => $self->{line_prev},
3238                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
3239                     });                     });
3240            redo A;            redo A;
3241          } else {          } else {
3242            !!!cp (989);            !!!cp (989);
3243            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
3244            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
3245            $self->{s_kwd} = '';            $self->{s_kwd} = '';
3246            ## Reconsume.            ## Reconsume.
# Line 2786  sub _get_next_token ($) { Line 3251  sub _get_next_token ($) {
3251        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3252          # 0..9          # 0..9
3253          !!!cp (1002);          !!!cp (1002);
3254          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3255          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
3256          ## Stay in the state.          ## Stay in the state.
3257          !!!next-input-character;          !!!next-input-character;
3258          redo A;          redo A;
3259        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
3260                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
3261          !!!cp (1003);          !!!cp (1003);
3262          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3263          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
3264          ## Stay in the state.          ## Stay in the state.
3265          !!!next-input-character;          !!!next-input-character;
3266          redo A;          redo A;
3267        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
3268                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
3269          !!!cp (1004);          !!!cp (1004);
3270          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3271          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
3272          ## Stay in the state.          ## Stay in the state.
3273          !!!next-input-character;          !!!next-input-character;
3274          redo A;          redo A;
# Line 2820  sub _get_next_token ($) { Line 3285  sub _get_next_token ($) {
3285          #          #
3286        }        }
3287    
3288        my $code = $self->{s_kwd};        my $code = $self->{kwd};
3289        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3290        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3291        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2857  sub _get_next_token ($) { Line 3322  sub _get_next_token ($) {
3322          redo A;          redo A;
3323        }        }
3324      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3325        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
3326            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
3327            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
3328              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 2867  sub _get_next_token ($) { Line 3332  sub _get_next_token ($) {
3332              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
3333             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
3334          our $EntityChar;          our $EntityChar;
3335          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3336          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
3337            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3338              !!!cp (1020);              !!!cp (1020);
3339              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3340              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3341              !!!next-input-character;              !!!next-input-character;
3342              #              #
3343            } else {            } else {
3344              !!!cp (1021);              !!!cp (1021);
3345              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3346              $self->{entity__match} = -1;              $self->{entity__match} = -1;
3347              ## Stay in the state.              ## Stay in the state.
3348              !!!next-input-character;              !!!next-input-character;
# Line 2905  sub _get_next_token ($) { Line 3370  sub _get_next_token ($) {
3370          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
3371              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
3372            !!!cp (1024);            !!!cp (1024);
3373            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
3374            #            #
3375          } else {          } else {
3376            !!!cp (1025);            !!!cp (1025);
# Line 2917  sub _get_next_token ($) { Line 3382  sub _get_next_token ($) {
3382          !!!cp (1026);          !!!cp (1026);
3383          !!!parse-error (type => 'bare ero',          !!!parse-error (type => 'bare ero',
3384                          line => $self->{line_prev},                          line => $self->{line_prev},
3385                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
3386          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
3387          #          #
3388        }        }
3389        
# Line 2941  sub _get_next_token ($) { Line 3406  sub _get_next_token ($) {
3406                    data => $data,                    data => $data,
3407                    has_reference => $has_ref,                    has_reference => $has_ref,
3408                    line => $self->{line_prev},                    line => $self->{line_prev},
3409                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
3410                   });                   });
3411          redo A;          redo A;
3412        } else {        } else {
# Line 2957  sub _get_next_token ($) { Line 3422  sub _get_next_token ($) {
3422      ## XML-only states      ## XML-only states
3423    
3424      } elsif ($self->{state} == PI_STATE) {      } elsif ($self->{state} == PI_STATE) {
3425          ## XML5: "Pi state" and "DOCTYPE pi state".
3426    
3427        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
3428            $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"            $self->{nc} == 0x003F or # ?
3429            $self->{nc} == -1) {            $self->{nc} == -1) {
3430            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3431            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
3432            ## "DOCTYPE pi state": Parse error, switch to the "data
3433            ## state".
3434          !!!parse-error (type => 'bare pio', ## TODO: type          !!!parse-error (type => 'bare pio', ## TODO: type
3435                          line => $self->{line_prev},                          line => $self->{line_prev},
3436                          column => $self->{column_prev}                          column => $self->{column_prev}
# Line 2974  sub _get_next_token ($) { Line 3445  sub _get_next_token ($) {
3445                        };                        };
3446          redo A;          redo A;
3447        } else {        } else {
3448            ## XML5: "DOCTYPE pi state": Stay in the state.
3449          $self->{ct} = {type => PI_TOKEN,          $self->{ct} = {type => PI_TOKEN,
3450                         target => chr $self->{nc},                         target => chr $self->{nc},
3451                         data => '',                         data => '',
# Line 2991  sub _get_next_token ($) { Line 3463  sub _get_next_token ($) {
3463          redo A;          redo A;
3464        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3465          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3466          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3467          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3468            } else {
3469              $self->{state} = DATA_STATE;
3470              $self->{s_kwd} = '';
3471            }
3472          ## Reconsume.          ## Reconsume.
3473          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3474          redo A;          redo A;
# Line 3023  sub _get_next_token ($) { Line 3499  sub _get_next_token ($) {
3499          redo A;          redo A;
3500        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3501          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3502          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3503          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3504            } else {
3505              $self->{state} = DATA_STATE;
3506              $self->{s_kwd} = '';
3507            }
3508          ## Reprocess.          ## Reprocess.
3509          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3510          redo A;          redo A;
# Line 3038  sub _get_next_token ($) { Line 3518  sub _get_next_token ($) {
3518          redo A;          redo A;
3519        }        }
3520      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
3521          ## XML5: Part of "Pi after state".
3522    
3523        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3524          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3525          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3526            } else {
3527              $self->{state} = DATA_STATE;
3528              $self->{s_kwd} = '';
3529            }
3530          !!!next-input-character;          !!!next-input-character;
3531          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3532          redo A;          redo A;
# Line 3063  sub _get_next_token ($) { Line 3549  sub _get_next_token ($) {
3549          redo A;          redo A;
3550        }        }
3551      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3552        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3553    
3554        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3555          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3556          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3557            } else {
3558              $self->{state} = DATA_STATE;
3559              $self->{s_kwd} = '';
3560            }
3561          !!!next-input-character;          !!!next-input-character;
3562          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3563          redo A;          redo A;
# Line 3081  sub _get_next_token ($) { Line 3572  sub _get_next_token ($) {
3572          ## Reprocess.          ## Reprocess.
3573          redo A;          redo A;
3574        }        }
3575            
3576        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3577          if ($self->{nc} == 0x003C) { # <
3578            $self->{state} = DOCTYPE_TAG_STATE;
3579            !!!next-input-character;
3580            redo A;
3581          } elsif ($self->{nc} == 0x0025) { # %
3582            ## XML5: Not defined yet.
3583    
3584            ## TODO:
3585            !!!next-input-character;
3586            redo A;
3587          } elsif ($self->{nc} == 0x005D) { # ]
3588            delete $self->{in_subset};
3589            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3590            !!!next-input-character;
3591            redo A;
3592          } elsif ($is_space->{$self->{nc}}) {
3593            ## Stay in the state.
3594            !!!next-input-character;
3595            redo A;
3596          } elsif ($self->{nc} == -1) {
3597            !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3598            delete $self->{in_subset};
3599            $self->{state} = DATA_STATE;
3600            $self->{s_kwd} = '';
3601            ## Reconsume.
3602            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3603            redo A;
3604          } else {
3605            unless ($self->{internal_subset_tainted}) {
3606              ## XML5: No parse error.
3607              !!!parse-error (type => 'string in internal subset');
3608              $self->{internal_subset_tainted} = 1;
3609            }
3610            ## Stay in the state.
3611            !!!next-input-character;
3612            redo A;
3613          }
3614        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3615          if ($self->{nc} == 0x003E) { # >
3616            $self->{state} = DATA_STATE;
3617            $self->{s_kwd} = '';
3618            !!!next-input-character;
3619            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3620            redo A;
3621          } elsif ($self->{nc} == -1) {
3622            !!!parse-error (type => 'unclosed DOCTYPE');
3623            $self->{state} = DATA_STATE;
3624            $self->{s_kwd} = '';
3625            ## Reconsume.
3626            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3627            redo A;
3628          } else {
3629            ## XML5: No parse error and stay in the state.
3630            !!!parse-error (type => 'string after internal subset'); ## TODO: type
3631    
3632            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3633            !!!next-input-character;
3634            redo A;
3635          }
3636        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3637          if ($self->{nc} == 0x003E) { # >
3638            $self->{state} = DATA_STATE;
3639            $self->{s_kwd} = '';
3640            !!!next-input-character;
3641            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3642            redo A;
3643          } elsif ($self->{nc} == -1) {
3644            $self->{state} = DATA_STATE;
3645            $self->{s_kwd} = '';
3646            ## Reconsume.
3647            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3648            redo A;
3649          } else {
3650            ## Stay in the state.
3651            !!!next-input-character;
3652            redo A;
3653          }
3654        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3655          if ($self->{nc} == 0x0021) { # !
3656            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3657            !!!next-input-character;
3658            redo A;
3659          } elsif ($self->{nc} == 0x003F) { # ?
3660            $self->{state} = PI_STATE;
3661            !!!next-input-character;
3662            redo A;
3663          } elsif ($self->{nc} == -1) {
3664            !!!parse-error (type => 'bare stago');
3665            $self->{state} = DATA_STATE;
3666            $self->{s_kwd} = '';
3667            ## Reconsume.
3668            redo A;
3669          } else {
3670            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3671                            line => $self->{line_prev},
3672                            column => $self->{column_prev});
3673            $self->{state} = BOGUS_COMMENT_STATE;
3674            $self->{ct} = {type => COMMENT_TOKEN,
3675                           data => '',
3676                          }; ## NOTE: Will be discarded.
3677            !!!next-input-character;
3678            redo A;
3679          }
3680        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3681          ## XML5: "DOCTYPE markup declaration state".
3682          
3683          if ($self->{nc} == 0x002D) { # -
3684            $self->{state} = MD_HYPHEN_STATE;
3685            !!!next-input-character;
3686            redo A;
3687          } elsif ($self->{nc} == 0x0045 or # E
3688                   $self->{nc} == 0x0065) { # e
3689            $self->{state} = MD_E_STATE;
3690            $self->{kwd} = chr $self->{nc};
3691            !!!next-input-character;
3692            redo A;
3693          } elsif ($self->{nc} == 0x0041 or # A
3694                   $self->{nc} == 0x0061) { # a
3695            $self->{state} = MD_ATTLIST_STATE;
3696            $self->{kwd} = chr $self->{nc};
3697            !!!next-input-character;
3698            redo A;
3699          } elsif ($self->{nc} == 0x004E or # N
3700                   $self->{nc} == 0x006E) { # n
3701            $self->{state} = MD_NOTATION_STATE;
3702            $self->{kwd} = chr $self->{nc};
3703            !!!next-input-character;
3704            redo A;
3705          } else {
3706            #
3707          }
3708          
3709          ## XML5: No parse error.
3710          !!!parse-error (type => 'bogus comment',
3711                          line => $self->{line_prev},
3712                          column => $self->{column_prev} - 1);
3713          ## Reconsume.
3714          $self->{state} = BOGUS_COMMENT_STATE;
3715          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3716          redo A;
3717        } elsif ($self->{state} == MD_E_STATE) {
3718          if ($self->{nc} == 0x004E or # N
3719              $self->{nc} == 0x006E) { # n
3720            $self->{state} = MD_ENTITY_STATE;
3721            $self->{kwd} .= chr $self->{nc};
3722            !!!next-input-character;
3723            redo A;
3724          } elsif ($self->{nc} == 0x004C or # L
3725                   $self->{nc} == 0x006C) { # l
3726            ## XML5: <!ELEMENT> not supported.
3727            $self->{state} = MD_ELEMENT_STATE;
3728            $self->{kwd} .= chr $self->{nc};
3729            !!!next-input-character;
3730            redo A;
3731          } else {
3732            ## XML5: No parse error.
3733            !!!parse-error (type => 'bogus comment',
3734                            line => $self->{line_prev},
3735                            column => $self->{column_prev} - 2
3736                                + 1 * ($self->{nc} == -1));
3737            ## Reconsume.
3738            $self->{state} = BOGUS_COMMENT_STATE;
3739            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3740            redo A;
3741          }
3742        } elsif ($self->{state} == MD_ENTITY_STATE) {
3743          if ($self->{nc} == [
3744                undef,
3745                undef,
3746                0x0054, # T
3747                0x0049, # I
3748                0x0054, # T
3749              ]->[length $self->{kwd}] or
3750              $self->{nc} == [
3751                undef,
3752                undef,
3753                0x0074, # t
3754                0x0069, # i
3755                0x0074, # t
3756              ]->[length $self->{kwd}]) {
3757            ## Stay in the state.
3758            $self->{kwd} .= chr $self->{nc};
3759            !!!next-input-character;
3760            redo A;
3761          } elsif ((length $self->{kwd}) == 5 and
3762                   ($self->{nc} == 0x0059 or # Y
3763                    $self->{nc} == 0x0079)) { # y
3764            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3765              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3766                              text => 'ENTITY',
3767                              line => $self->{line_prev},
3768                              column => $self->{column_prev} - 4);
3769            }
3770            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3771                           line => $self->{line_prev},
3772                           column => $self->{column_prev} - 6};
3773            $self->{state} = DOCTYPE_MD_STATE;
3774            !!!next-input-character;
3775            redo A;
3776          } else {
3777            !!!parse-error (type => 'bogus comment',
3778                            line => $self->{line_prev},
3779                            column => $self->{column_prev} - 1
3780                                - (length $self->{kwd})
3781                                + 1 * ($self->{nc} == -1));
3782            $self->{state} = BOGUS_COMMENT_STATE;
3783            ## Reconsume.
3784            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3785            redo A;
3786          }
3787        } elsif ($self->{state} == MD_ELEMENT_STATE) {
3788          if ($self->{nc} == [
3789               undef,
3790               undef,
3791               0x0045, # E
3792               0x004D, # M
3793               0x0045, # E
3794               0x004E, # N
3795              ]->[length $self->{kwd}] or
3796              $self->{nc} == [
3797               undef,
3798               undef,
3799               0x0065, # e
3800               0x006D, # m
3801               0x0065, # e
3802               0x006E, # n
3803              ]->[length $self->{kwd}]) {
3804            ## Stay in the state.
3805            $self->{kwd} .= chr $self->{nc};
3806            !!!next-input-character;
3807            redo A;
3808          } elsif ((length $self->{kwd}) == 6 and
3809                   ($self->{nc} == 0x0054 or # T
3810                    $self->{nc} == 0x0074)) { # t
3811            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3812              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3813                              text => 'ELEMENT',
3814                              line => $self->{line_prev},
3815                              column => $self->{column_prev} - 5);
3816            }
3817            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3818                           line => $self->{line_prev},
3819                           column => $self->{column_prev} - 6};
3820            $self->{state} = DOCTYPE_MD_STATE;
3821            !!!next-input-character;
3822            redo A;
3823          } else {
3824            !!!parse-error (type => 'bogus comment',
3825                            line => $self->{line_prev},
3826                            column => $self->{column_prev} - 1
3827                                - (length $self->{kwd})
3828                                + 1 * ($self->{nc} == -1));
3829            $self->{state} = BOGUS_COMMENT_STATE;
3830            ## Reconsume.
3831            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3832            redo A;
3833          }
3834        } elsif ($self->{state} == MD_ATTLIST_STATE) {
3835          if ($self->{nc} == [
3836               undef,
3837               0x0054, # T
3838               0x0054, # T
3839               0x004C, # L
3840               0x0049, # I
3841               0x0053, # S
3842              ]->[length $self->{kwd}] or
3843              $self->{nc} == [
3844               undef,
3845               0x0074, # t
3846               0x0074, # t
3847               0x006C, # l
3848               0x0069, # i
3849               0x0073, # s
3850              ]->[length $self->{kwd}]) {
3851            ## Stay in the state.
3852            $self->{kwd} .= chr $self->{nc};
3853            !!!next-input-character;
3854            redo A;
3855          } elsif ((length $self->{kwd}) == 6 and
3856                   ($self->{nc} == 0x0054 or # T
3857                    $self->{nc} == 0x0074)) { # t
3858            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3859              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3860                              text => 'ATTLIST',
3861                              line => $self->{line_prev},
3862                              column => $self->{column_prev} - 5);
3863            }
3864            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3865                           attrdefs => [],
3866                           line => $self->{line_prev},
3867                           column => $self->{column_prev} - 6};
3868            $self->{state} = DOCTYPE_MD_STATE;
3869            !!!next-input-character;
3870            redo A;
3871          } else {
3872            !!!parse-error (type => 'bogus comment',
3873                            line => $self->{line_prev},
3874                            column => $self->{column_prev} - 1
3875                                 - (length $self->{kwd})
3876                                 + 1 * ($self->{nc} == -1));
3877            $self->{state} = BOGUS_COMMENT_STATE;
3878            ## Reconsume.
3879            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3880            redo A;
3881          }
3882        } elsif ($self->{state} == MD_NOTATION_STATE) {
3883          if ($self->{nc} == [
3884               undef,
3885               0x004F, # O
3886               0x0054, # T
3887               0x0041, # A
3888               0x0054, # T
3889               0x0049, # I
3890               0x004F, # O
3891              ]->[length $self->{kwd}] or
3892              $self->{nc} == [
3893               undef,
3894               0x006F, # o
3895               0x0074, # t
3896               0x0061, # a
3897               0x0074, # t
3898               0x0069, # i
3899               0x006F, # o
3900              ]->[length $self->{kwd}]) {
3901            ## Stay in the state.
3902            $self->{kwd} .= chr $self->{nc};
3903            !!!next-input-character;
3904            redo A;
3905          } elsif ((length $self->{kwd}) == 7 and
3906                   ($self->{nc} == 0x004E or # N
3907                    $self->{nc} == 0x006E)) { # n
3908            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3909              !!!parse-error (type => 'lowercase keyword', ## TODO: type
3910                              text => 'NOTATION',
3911                              line => $self->{line_prev},
3912                              column => $self->{column_prev} - 6);
3913            }
3914            $self->{ct} = {type => NOTATION_TOKEN, name => '',
3915                           line => $self->{line_prev},
3916                           column => $self->{column_prev} - 6};
3917            $self->{state} = DOCTYPE_MD_STATE;
3918            !!!next-input-character;
3919            redo A;
3920          } else {
3921            !!!parse-error (type => 'bogus comment',
3922                            line => $self->{line_prev},
3923                            column => $self->{column_prev} - 1
3924                                - (length $self->{kwd})
3925                                + 1 * ($self->{nc} == -1));
3926            $self->{state} = BOGUS_COMMENT_STATE;
3927            ## Reconsume.
3928            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3929            redo A;
3930          }
3931        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3932          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3933          ## "DOCTYPE NOTATION state".
3934    
3935          if ($is_space->{$self->{nc}}) {
3936            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3937            $self->{state} = BEFORE_MD_NAME_STATE;
3938            !!!next-input-character;
3939            redo A;
3940          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3941                   $self->{nc} == 0x0025) { # %
3942            ## XML5: Switch to the "DOCTYPE bogus comment state".
3943            !!!parse-error (type => 'no space before md name'); ## TODO: type
3944            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3945            !!!next-input-character;
3946            redo A;
3947          } elsif ($self->{nc} == -1) {
3948            !!!parse-error (type => 'unclosed md'); ## TODO: type
3949            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3950            ## Reconsume.
3951            redo A;
3952          } elsif ($self->{nc} == 0x003E) { # >
3953            ## XML5: Switch to the "DOCTYPE bogus comment state".
3954            !!!parse-error (type => 'no md name'); ## TODO: type
3955            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3956            !!!next-input-character;
3957            redo A;
3958          } else {
3959            ## XML5: Switch to the "DOCTYPE bogus comment state".
3960            !!!parse-error (type => 'no space before md name'); ## TODO: type
3961            $self->{state} = BEFORE_MD_NAME_STATE;
3962            redo A;
3963          }
3964        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3965          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3966          ## before state", "DOCTYPE ATTLIST name before state".
3967    
3968          if ($is_space->{$self->{nc}}) {
3969            ## Stay in the state.
3970            !!!next-input-character;
3971            redo A;
3972          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3973                   $self->{nc} == 0x0025) { # %
3974            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3975            !!!next-input-character;
3976            redo A;
3977          } elsif ($self->{nc} == 0x003E) { # >
3978            ## XML5: Same as "Anything else".
3979            !!!parse-error (type => 'no md name'); ## TODO: type
3980            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3981            !!!next-input-character;
3982            redo A;
3983          } elsif ($self->{nc} == -1) {
3984            !!!parse-error (type => 'unclosed md'); ## TODO: type
3985            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3986            ## Reconsume.
3987            redo A;
3988          } else {
3989            ## XML5: [ATTLIST] Not defined yet.
3990            $self->{ct}->{name} .= chr $self->{nc};
3991            $self->{state} = MD_NAME_STATE;
3992            !!!next-input-character;
3993            redo A;
3994          }
3995        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3996          if ($is_space->{$self->{nc}}) {
3997            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3998            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3999            $self->{state} = BEFORE_MD_NAME_STATE;
4000            !!!next-input-character;
4001            redo A;
4002          } elsif ($self->{nc} == 0x003E) { # >
4003            ## XML5: Same as "Anything else".
4004            !!!parse-error (type => 'no md name'); ## TODO: type
4005            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4006            !!!next-input-character;
4007            redo A;
4008          } elsif ($self->{nc} == -1) {
4009            !!!parse-error (type => 'unclosed md');
4010            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4011            ## Reconsume.
4012            redo A;
4013          } else {
4014            ## XML5: No parse error.
4015            !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4016            $self->{state} = BOGUS_COMMENT_STATE;
4017            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4018            ## Reconsume.
4019            redo A;
4020          }
4021        } elsif ($self->{state} == MD_NAME_STATE) {
4022          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4023          
4024          if ($is_space->{$self->{nc}}) {
4025            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4026              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4027            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4028              ## TODO: ...
4029              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4030            } else { # ENTITY/NOTATION
4031              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4032            }
4033            !!!next-input-character;
4034            redo A;
4035          } elsif ($self->{nc} == 0x003E) { # >
4036            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4037              #
4038            } else {
4039              !!!parse-error (type => 'no md def'); ## TODO: type
4040            }
4041            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4042            !!!next-input-character;
4043            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4044            redo A;
4045          } elsif ($self->{nc} == -1) {
4046            ## XML5: [ATTLIST] No parse error.
4047            !!!parse-error (type => 'unclosed md');
4048            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4049            ## Reconsume.
4050            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4051            redo A;
4052          } else {
4053            ## XML5: [ATTLIST] Not defined yet.
4054            $self->{ct}->{name} .= chr $self->{nc};
4055            ## Stay in the state.
4056            !!!next-input-character;
4057            redo A;
4058          }
4059        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4060          if ($is_space->{$self->{nc}}) {
4061            ## Stay in the state.
4062            !!!next-input-character;
4063            redo A;
4064          } elsif ($self->{nc} == 0x003E) { # >
4065            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4066            !!!next-input-character;
4067            !!!emit ($self->{ct}); # ATTLIST
4068            redo A;
4069          } elsif ($self->{nc} == -1) {
4070            ## XML5: No parse error.
4071            !!!parse-error (type => 'unclosed md'); ## TODO: type
4072            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4073            !!!emit ($self->{ct});
4074            redo A;
4075          } else {
4076            ## XML5: Not defined yet.
4077            $self->{ca} = {name => chr ($self->{nc}), # attrdef
4078                           tokens => [],
4079                           line => $self->{line}, column => $self->{column}};
4080            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4081            !!!next-input-character;
4082            redo A;
4083          }
4084        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4085          if ($is_space->{$self->{nc}}) {
4086            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4087            !!!next-input-character;
4088            redo A;
4089          } elsif ($self->{nc} == 0x003E) { # >
4090            ## XML5: Same as "anything else".
4091            !!!parse-error (type => 'no attr type'); ## TODO: type
4092            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4093            !!!next-input-character;
4094            !!!emit ($self->{ct}); # ATTLIST
4095            redo A;
4096          } elsif ($self->{nc} == 0x0028) { # (
4097            ## XML5: Same as "anything else".
4098            !!!parse-error (type => 'no space before paren'); ## TODO: type
4099            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4100            !!!next-input-character;
4101            redo A;
4102          } elsif ($self->{nc} == -1) {
4103            ## XML5: No parse error.
4104            !!!parse-error (type => 'unclosed md'); ## TODO: type
4105            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4106            !!!next-input-character;
4107            !!!emit ($self->{ct}); # ATTLIST
4108            redo A;
4109          } else {
4110            ## XML5: Not defined yet.
4111            $self->{ca}->{name} .= chr $self->{nc};
4112            ## Stay in the state.
4113            !!!next-input-character;
4114            redo A;
4115          }
4116        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4117          if ($is_space->{$self->{nc}}) {
4118            ## Stay in the state.
4119            !!!next-input-character;
4120            redo A;
4121          } elsif ($self->{nc} == 0x003E) { # >
4122            ## XML5: Same as "anything else".
4123            !!!parse-error (type => 'no attr type'); ## TODO: type
4124            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125            !!!next-input-character;
4126            !!!emit ($self->{ct}); # ATTLIST
4127            redo A;
4128          } elsif ($self->{nc} == 0x0028) { # (
4129            ## XML5: Same as "anything else".
4130            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4131            !!!next-input-character;
4132            redo A;
4133          } elsif ($self->{nc} == -1) {
4134            ## XML5: No parse error.
4135            !!!parse-error (type => 'unclosed md'); ## TODO: type
4136            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4137            !!!next-input-character;
4138            !!!emit ($self->{ct});
4139            redo A;
4140          } else {
4141            ## XML5: Not defined yet.
4142            $self->{ca}->{type} = chr $self->{nc};
4143            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4144            !!!next-input-character;
4145            redo A;
4146          }
4147        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4148          if ($is_space->{$self->{nc}}) {
4149            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4150            !!!next-input-character;
4151            redo A;
4152          } elsif ($self->{nc} == 0x0023) { # #
4153            ## XML5: Same as "anything else".
4154            !!!parse-error (type => 'no space before default value'); ## TODO: type
4155            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4156            !!!next-input-character;
4157            redo A;
4158          } elsif ($self->{nc} == 0x0022) { # "
4159            ## XML5: Same as "anything else".
4160            !!!parse-error (type => 'no space before default value'); ## TODO: type
4161            $self->{ca}->{value} = '';
4162            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4163            !!!next-input-character;
4164            redo A;
4165          } elsif ($self->{nc} == 0x0027) { # '
4166            ## XML5: Same as "anything else".
4167            !!!parse-error (type => 'no space before default value'); ## TODO: type
4168            $self->{ca}->{value} = '';
4169            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4170            !!!next-input-character;
4171            redo A;
4172          } elsif ($self->{nc} == 0x003E) { # >
4173            ## XML5: Same as "anything else".
4174            !!!parse-error (type => 'no attr default'); ## TODO: type
4175            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4176            !!!next-input-character;
4177            !!!emit ($self->{ct}); # ATTLIST
4178            redo A;
4179          } elsif ($self->{nc} == 0x0028) { # (
4180            ## XML5: Same as "anything else".
4181            !!!parse-error (type => 'no space before paren'); ## TODO: type
4182            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4183            !!!next-input-character;
4184            redo A;
4185          } elsif ($self->{nc} == -1) {
4186            ## XML5: No parse error.
4187            !!!parse-error (type => 'unclosed md'); ## TODO: type
4188            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4189            !!!next-input-character;
4190            !!!emit ($self->{ct});
4191            redo A;
4192          } else {
4193            ## XML5: Not defined yet.
4194            $self->{ca}->{type} .= chr $self->{nc};
4195            ## Stay in the state.
4196            !!!next-input-character;
4197            redo A;
4198          }
4199        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4200          if ($is_space->{$self->{nc}}) {
4201            ## Stay in the state.
4202            !!!next-input-character;
4203            redo A;
4204          } elsif ($self->{nc} == 0x0028) { # (
4205            ## XML5: Same as "anything else".
4206            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4207            !!!next-input-character;
4208            redo A;
4209          } elsif ($self->{nc} == 0x0023) { # #
4210            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4211            !!!next-input-character;
4212            redo A;
4213          } elsif ($self->{nc} == 0x0022) { # "
4214            ## XML5: Same as "anything else".
4215            $self->{ca}->{value} = '';
4216            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4217            !!!next-input-character;
4218            redo A;
4219          } elsif ($self->{nc} == 0x0027) { # '
4220            ## XML5: Same as "anything else".
4221            $self->{ca}->{value} = '';
4222            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4223            !!!next-input-character;
4224            redo A;
4225          } elsif ($self->{nc} == 0x003E) { # >
4226            ## XML5: Same as "anything else".
4227            !!!parse-error (type => 'no attr default'); ## TODO: type
4228            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4229            !!!next-input-character;
4230            !!!emit ($self->{ct}); # ATTLIST
4231            redo A;
4232          } elsif ($self->{nc} == -1) {
4233            ## XML5: No parse error.
4234            !!!parse-error (type => 'unclosed md'); ## TODO: type
4235            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4236            !!!next-input-character;
4237            !!!emit ($self->{ct});
4238            redo A;
4239          } else {
4240            ## XML5: Switch to the "DOCTYPE bogus comment state".
4241            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4242            $self->{ca}->{value} = '';
4243            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4244            ## Reconsume.
4245            redo A;
4246          }
4247        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4248          if ($is_space->{$self->{nc}}) {
4249            ## Stay in the state.
4250            !!!next-input-character;
4251            redo A;
4252          } elsif ($self->{nc} == 0x007C) { # |
4253            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4254            ## Stay in the state.
4255            !!!next-input-character;
4256            redo A;
4257          } elsif ($self->{nc} == 0x0029) { # )
4258            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4259            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4260            !!!next-input-character;
4261            redo A;
4262          } elsif ($self->{nc} == 0x003E) { # >
4263            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4264            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4265            !!!next-input-character;
4266            !!!emit ($self->{ct}); # ATTLIST
4267            redo A;
4268          } elsif ($self->{nc} == -1) {
4269            ## XML5: No parse error.
4270            !!!parse-error (type => 'unclosed md'); ## TODO: type
4271            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4272            !!!next-input-character;
4273            !!!emit ($self->{ct});
4274            redo A;
4275          } else {
4276            push @{$self->{ca}->{tokens}}, chr $self->{nc};
4277            $self->{state} = ALLOWED_TOKEN_STATE;
4278            !!!next-input-character;
4279            redo A;
4280          }
4281        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4282          if ($is_space->{$self->{nc}}) {
4283            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4284            !!!next-input-character;
4285            redo A;
4286          } elsif ($self->{nc} == 0x007C) { # |
4287            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4288            !!!next-input-character;
4289            redo A;
4290          } elsif ($self->{nc} == 0x0029) { # )
4291            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4292            !!!next-input-character;
4293            redo A;
4294          } elsif ($self->{nc} == 0x003E) { # >
4295            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4296            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4297            !!!next-input-character;
4298            !!!emit ($self->{ct}); # ATTLIST
4299            redo A;
4300          } elsif ($self->{nc} == -1) {
4301            ## XML5: No parse error.
4302            !!!parse-error (type => 'unclosed md'); ## TODO: type
4303            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4304            !!!next-input-character;
4305            !!!emit ($self->{ct});
4306            redo A;
4307          } else {
4308            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4309            ## Stay in the state.
4310            !!!next-input-character;
4311            redo A;
4312          }
4313        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4314          if ($is_space->{$self->{nc}}) {
4315            ## Stay in the state.
4316            !!!next-input-character;
4317            redo A;
4318          } elsif ($self->{nc} == 0x007C) { # |
4319            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4320            !!!next-input-character;
4321            redo A;
4322          } elsif ($self->{nc} == 0x0029) { # )
4323            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4324            !!!next-input-character;
4325            redo A;
4326          } elsif ($self->{nc} == 0x003E) { # >
4327            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4328            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4329            !!!next-input-character;
4330            !!!emit ($self->{ct}); # ATTLIST
4331            redo A;
4332          } elsif ($self->{nc} == -1) {
4333            ## XML5: No parse error.
4334            !!!parse-error (type => 'unclosed md'); ## TODO: type
4335            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4336            !!!next-input-character;
4337            !!!emit ($self->{ct});
4338            redo A;
4339          } else {
4340            !!!parse-error (type => 'space in allowed token', ## TODO: type
4341                            line => $self->{line_prev},
4342                            column => $self->{column_prev});
4343            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4344            $self->{state} = ALLOWED_TOKEN_STATE;
4345            !!!next-input-character;
4346            redo A;
4347          }
4348        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4349          if ($is_space->{$self->{nc}}) {
4350            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4351            !!!next-input-character;
4352            redo A;
4353          } elsif ($self->{nc} == 0x0023) { # #
4354            !!!parse-error (type => 'no space before default value'); ## TODO: type
4355            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4356            !!!next-input-character;
4357            redo A;
4358          } elsif ($self->{nc} == 0x0022) { # "
4359            !!!parse-error (type => 'no space before default value'); ## TODO: type
4360            $self->{ca}->{value} = '';
4361            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4362            !!!next-input-character;
4363            redo A;
4364          } elsif ($self->{nc} == 0x0027) { # '
4365            !!!parse-error (type => 'no space before default value'); ## TODO: type
4366            $self->{ca}->{value} = '';
4367            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4368            !!!next-input-character;
4369            redo A;
4370          } elsif ($self->{nc} == 0x003E) { # >
4371            !!!parse-error (type => 'no attr default'); ## TODO: type
4372            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4373            !!!next-input-character;
4374            !!!emit ($self->{ct}); # ATTLIST
4375            redo A;
4376          } elsif ($self->{nc} == -1) {
4377            !!!parse-error (type => 'unclosed md'); ## TODO: type
4378            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4379            !!!next-input-character;
4380            !!!emit ($self->{ct});
4381            redo A;
4382          } else {
4383            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4384            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4385            ## Reconsume.
4386            redo A;
4387          }
4388        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4389          if ($is_space->{$self->{nc}}) {
4390            ## Stay in the state.
4391            !!!next-input-character;
4392            redo A;
4393          } elsif ($self->{nc} == 0x0023) { # #
4394            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4395            !!!next-input-character;
4396            redo A;
4397          } elsif ($self->{nc} == 0x0022) { # "
4398            $self->{ca}->{value} = '';
4399            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4400            !!!next-input-character;
4401            redo A;
4402          } elsif ($self->{nc} == 0x0027) { # '
4403            $self->{ca}->{value} = '';
4404            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4405            !!!next-input-character;
4406            redo A;
4407          } elsif ($self->{nc} == 0x003E) { # >
4408            !!!parse-error (type => 'no attr default'); ## TODO: type
4409            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4410            !!!next-input-character;
4411            !!!emit ($self->{ct}); # ATTLIST
4412            redo A;
4413          } elsif ($self->{nc} == -1) {
4414            !!!parse-error (type => 'unclosed md'); ## TODO: type
4415            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4416            !!!next-input-character;
4417            !!!emit ($self->{ct});
4418            redo A;
4419          } else {
4420            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4421            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4422            ## Reconsume.
4423            redo A;
4424          }
4425        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4426          if ($is_space->{$self->{nc}}) {
4427            ## XML5: No parse error.
4428            !!!parse-error (type => 'no default type'); ## TODO: type
4429            $self->{state} = BOGUS_MD_STATE;
4430            ## Reconsume.
4431            redo A;
4432          } elsif ($self->{nc} == 0x0022) { # "
4433            ## XML5: Same as "anything else".
4434            $self->{ca}->{value} = '';
4435            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4436            !!!next-input-character;
4437            redo A;
4438          } elsif ($self->{nc} == 0x0027) { # '
4439            ## XML5: Same as "anything else".
4440            $self->{ca}->{value} = '';
4441            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4442            !!!next-input-character;
4443            redo A;
4444          } elsif ($self->{nc} == 0x003E) { # >
4445            ## XML5: Same as "anything else".
4446            !!!parse-error (type => 'no attr default'); ## TODO: type
4447            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4448            !!!next-input-character;
4449            !!!emit ($self->{ct}); # ATTLIST
4450            redo A;
4451          } elsif ($self->{nc} == -1) {
4452            ## XML5: No parse error.
4453            !!!parse-error (type => 'unclosed md'); ## TODO: type
4454            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4455            !!!next-input-character;
4456            !!!emit ($self->{ct});
4457            redo A;
4458          } else {
4459            $self->{ca}->{default} = chr $self->{nc};
4460            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4461            !!!next-input-character;
4462            redo A;
4463          }
4464        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4465          if ($is_space->{$self->{nc}}) {
4466            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4467            !!!next-input-character;
4468            redo A;
4469          } elsif ($self->{nc} == 0x0022) { # "
4470            ## XML5: Same as "anything else".
4471            !!!parse-error (type => 'no space before default value'); ## TODO: type
4472            $self->{ca}->{value} = '';
4473            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4474            !!!next-input-character;
4475            redo A;
4476          } elsif ($self->{nc} == 0x0027) { # '
4477            ## XML5: Same as "anything else".
4478            !!!parse-error (type => 'no space before default value'); ## TODO: type
4479            $self->{ca}->{value} = '';
4480            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4481            !!!next-input-character;
4482            redo A;
4483          } elsif ($self->{nc} == 0x003E) { # >
4484            ## XML5: Same as "anything else".
4485            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4486            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4487            !!!next-input-character;
4488            !!!emit ($self->{ct}); # ATTLIST
4489            redo A;
4490          } elsif ($self->{nc} == -1) {
4491            ## XML5: No parse error.
4492            !!!parse-error (type => 'unclosed md'); ## TODO: type
4493            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4494            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4495            !!!next-input-character;
4496            !!!emit ($self->{ct});
4497            redo A;
4498          } else {
4499            $self->{ca}->{default} .= chr $self->{nc};
4500            ## Stay in the state.
4501            !!!next-input-character;
4502            redo A;
4503          }
4504        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4505          if ($is_space->{$self->{nc}}) {
4506            ## Stay in the state.
4507            !!!next-input-character;
4508            redo A;
4509          } elsif ($self->{nc} == 0x0022) { # "
4510            $self->{ca}->{value} = '';
4511            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4512            !!!next-input-character;
4513            redo A;
4514          } elsif ($self->{nc} == 0x0027) { # '
4515            $self->{ca}->{value} = '';
4516            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4517            !!!next-input-character;
4518            redo A;
4519          } elsif ($self->{nc} == 0x003E) { # >
4520            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4521            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4522            !!!next-input-character;
4523            !!!emit ($self->{ct}); # ATTLIST
4524            redo A;
4525          } elsif ($self->{nc} == -1) {
4526            ## XML5: No parse error.
4527            !!!parse-error (type => 'unclosed md'); ## TODO: type
4528            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4529            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4530            !!!next-input-character;
4531            !!!emit ($self->{ct});
4532            redo A;
4533          } else {
4534            ## XML5: Not defined yet.
4535            if ($self->{ca}->{default} eq 'FIXED') {
4536              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4537            } else {
4538              push @{$self->{ct}->{attrdefs}}, $self->{ca};
4539              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4540            }
4541            ## Reconsume.
4542            redo A;
4543          }
4544        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4545          if ($is_space->{$self->{nc}} or
4546              $self->{nc} == -1 or
4547              $self->{nc} == 0x003E) { # >
4548            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4549            ## Reconsume.
4550            redo A;
4551          } else {
4552            !!!parse-error (type => 'no space before attr name'); ## TODO: type
4553            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4554            ## Reconsume.
4555            redo A;
4556          }
4557        } elsif ($self->{state} == NDATA_STATE) {
4558          ## ASCII case-insensitive
4559          if ($self->{nc} == [
4560                undef,
4561                0x0044, # D
4562                0x0041, # A
4563                0x0054, # T
4564              ]->[length $self->{kwd}] or
4565              $self->{nc} == [
4566                undef,
4567                0x0064, # d
4568                0x0061, # a
4569                0x0074, # t
4570              ]->[length $self->{kwd}]) {
4571            !!!cp (172.2);
4572            ## Stay in the state.
4573            $self->{kwd} .= chr $self->{nc};
4574            !!!next-input-character;
4575            redo A;
4576          } elsif ((length $self->{kwd}) == 4 and
4577                   ($self->{nc} == 0x0041 or # A
4578                    $self->{nc} == 0x0061)) { # a
4579            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4580              !!!cp (172.3);
4581              !!!parse-error (type => 'lowercase keyword', ## TODO: type
4582                              text => 'NDATA',
4583                              line => $self->{line_prev},
4584                              column => $self->{column_prev} - 4);
4585            } else {
4586              !!!cp (172.4);
4587            }
4588            $self->{state} = AFTER_NDATA_STATE;
4589            !!!next-input-character;
4590            redo A;
4591          } else {
4592            !!!parse-error (type => 'string after literal', ## TODO: type
4593                            line => $self->{line_prev},
4594                            column => $self->{column_prev} + 1
4595                                - length $self->{kwd});
4596            !!!cp (172.5);
4597            $self->{state} = BOGUS_MD_STATE;
4598            ## Reconsume.
4599            redo A;
4600          }
4601        } elsif ($self->{state} == AFTER_NDATA_STATE) {
4602          if ($is_space->{$self->{nc}}) {
4603            $self->{state} = BEFORE_NOTATION_NAME_STATE;
4604            !!!next-input-character;
4605            redo A;
4606          } elsif ($self->{nc} == 0x003E) { # >
4607            !!!parse-error (type => 'no notation name'); ## TODO: type
4608            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4609            !!!next-input-character;
4610            !!!emit ($self->{ct}); # ENTITY
4611            redo A;
4612          } elsif ($self->{nc} == -1) {
4613            !!!parse-error (type => 'unclosed md'); ## TODO: type
4614            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4615            !!!next-input-character;
4616            !!!emit ($self->{ct}); # ENTITY
4617            redo A;
4618          } else {
4619            !!!parse-error (type => 'string after literal', ## TODO: type
4620                            line => $self->{line_prev},
4621                            column => $self->{column_prev} + 1
4622                                - length $self->{kwd});
4623            $self->{state} = BOGUS_MD_STATE;
4624            ## Reconsume.
4625            redo A;
4626          }
4627        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4628          if ($is_space->{$self->{nc}}) {
4629            ## Stay in the state.
4630            !!!next-input-character;
4631            redo A;
4632          } elsif ($self->{nc} == 0x003E) { # >
4633            !!!parse-error (type => 'no notation name'); ## TODO: type
4634            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4635            !!!next-input-character;
4636            !!!emit ($self->{ct}); # ENTITY
4637            redo A;
4638          } elsif ($self->{nc} == -1) {
4639            !!!parse-error (type => 'unclosed md'); ## TODO: type
4640            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4641            !!!next-input-character;
4642            !!!emit ($self->{ct}); # ENTITY
4643            redo A;
4644          } else {
4645            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4646            $self->{state} = NOTATION_NAME_STATE;
4647            !!!next-input-character;
4648            redo A;
4649          }
4650        } elsif ($self->{state} == NOTATION_NAME_STATE) {
4651          if ($is_space->{$self->{nc}}) {
4652            $self->{state} = AFTER_NOTATION_NAME_STATE;
4653            !!!next-input-character;
4654            redo A;
4655          } elsif ($self->{nc} == 0x003E) { # >
4656            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4657            !!!next-input-character;
4658            !!!emit ($self->{ct}); # ENTITY
4659            redo A;
4660          } elsif ($self->{nc} == -1) {
4661            !!!parse-error (type => 'unclosed md'); ## TODO: type
4662            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4663            !!!next-input-character;
4664            !!!emit ($self->{ct}); # ENTITY
4665            redo A;
4666          } else {
4667            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4668            ## Stay in the state.
4669            !!!next-input-character;
4670            redo A;
4671          }
4672        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4673          if ($is_space->{$self->{nc}}) {
4674            ## Stay in the state.
4675            !!!next-input-character;
4676            redo A;
4677          } elsif ($self->{nc} == 0x003E) { # >
4678            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4679            !!!next-input-character;
4680            !!!emit ($self->{ct}); # ENTITY
4681            redo A;
4682          } elsif ($self->{nc} == -1) {
4683            !!!parse-error (type => 'unclosed md'); ## TODO: type
4684            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4685            !!!next-input-character;
4686            !!!emit ($self->{ct}); # ENTITY
4687            redo A;
4688          } else {
4689            !!!parse-error (type => 'string after notation name'); ## TODO: type
4690            $self->{state} = BOGUS_MD_STATE;
4691            ## Reconsume.
4692            redo A;
4693          }
4694    
4695    
4696        } elsif ($self->{state} == BOGUS_MD_STATE) {
4697          if ($self->{nc} == 0x003E) { # >
4698            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4699            !!!next-input-character;
4700            !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4701            redo A;
4702          } elsif ($self->{nc} == -1) {
4703            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4704            ## Reconsume.
4705            !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4706            redo A;
4707          } else {
4708            ## Stay in the state.
4709            !!!next-input-character;
4710            redo A;
4711          }
4712      } else {      } else {
4713        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
4714      }      }
# Line 3092  sub _get_next_token ($) { Line 4719  sub _get_next_token ($) {
4719    
4720  1;  1;
4721  ## $Date$  ## $Date$
4722                                    

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.18

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24