/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10 by wakaba, Wed Oct 15 08:51:02 2008 UTC revision 1.13 by wakaba, Thu Oct 16 03:39:57 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19    );    );
20        
21    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 28  BEGIN {
28        CHARACTER_TOKEN        CHARACTER_TOKEN
29        PI_TOKEN        PI_TOKEN
30        ABORT_TOKEN        ABORT_TOKEN
31          END_OF_DOCTYPE_TOKEN
32      )],      )],
33    );    );
34  }  }
35    
36    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
37    
38  ## Token types  ## Token types
39    
40  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
41  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
42  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
43  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
44  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
45  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
46  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
47  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
48    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only
49    
50    ## XML5: XML5 has "empty tag token".  In this implementation, it is
51    ## represented as a start tag token with $self->{self_closing} flag
52    ## set to true.
53    
54    ## XML5: XML5 has "short end tag token".  In this implementation, it
55    ## is represented as an end tag token with $token->{tag_name} set
56    ## to an empty string.
57    
58  package Whatpm::HTML;  package Whatpm::HTML;
59    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 127  sub HEXREF_HEX_STATE () { 48 }
127  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
128  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
129    
130  ## XML states  ## XML-only states
131  sub PI_STATE () { 51 }  sub PI_STATE () { 51 }
132  sub PI_TARGET_STATE () { 52 }  sub PI_TARGET_STATE () { 52 }
133  sub PI_TARGET_AFTER_STATE () { 53 }  sub PI_TARGET_AFTER_STATE () { 53 }
134  sub PI_DATA_STATE () { 54 }  sub PI_DATA_STATE () { 54 }
135  sub PI_AFTER_STATE () { 55 }  sub PI_AFTER_STATE () { 55 }
136  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
137    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
138    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
139    sub DOCTYPE_TAG_STATE () { 59 }
140    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 }
141    
142  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
143  ## list and descriptions)  ## list and descriptions)
# Line 186  sub _initialize_tokenizer ($) { Line 203  sub _initialize_tokenizer ($) {
203    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
204    
205    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
206    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
207      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
208    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
209    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
210    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 206  sub _initialize_tokenizer ($) { Line 224  sub _initialize_tokenizer ($) {
224    
225  ## A token has:  ## A token has:
226  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
227  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
228  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
229  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
230    ##   ->{target} (PI_TOKEN)
231  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
232  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
233  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 216  sub _initialize_tokenizer ($) { Line 235  sub _initialize_tokenizer ($) {
235  ##        ->{name}  ##        ->{name}
236  ##        ->{value}  ##        ->{value}
237  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
238  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
239    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
240  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
241    ##   ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Next attribute's index - 1.
242    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
243    
244  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
245  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
246  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 237  my $is_space = { Line 260  my $is_space = {
260    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
261    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
262    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
263    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
264    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
265    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
266  };  };
# Line 447  sub _get_next_token ($) { Line 470  sub _get_next_token ($) {
470            redo A;            redo A;
471          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
472            !!!cp (15.1);            !!!cp (15.1);
473            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
474            #            #
475          } else {          } else {
476            !!!cp (16);            !!!cp (16);
477              $self->{s_kwd} = '';
478            #            #
479          }          }
480    
481          ## reconsume          ## reconsume
482          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
483          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
484                    line => $self->{line_prev},                    line => $self->{line_prev},
485                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 567  sub _get_next_token ($) { Line 590  sub _get_next_token ($) {
590        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
591          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
592            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
593            $self->{s_kwd} = '';            $self->{kwd} = '';
594            ## Reconsume.            ## Reconsume.
595            redo A;            redo A;
596          } else {          } else {
# Line 670  sub _get_next_token ($) { Line 693  sub _get_next_token ($) {
693          redo A;          redo A;
694        }        }
695      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
696        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
697        if (length $ch) {        if (length $ch) {
698          my $CH = $ch;          my $CH = $ch;
699          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 678  sub _get_next_token ($) { Line 701  sub _get_next_token ($) {
701          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
702            !!!cp (24);            !!!cp (24);
703            ## Stay in the state.            ## Stay in the state.
704            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
705            !!!next-input-character;            !!!next-input-character;
706            redo A;            redo A;
707          } else {          } else {
# Line 687  sub _get_next_token ($) { Line 710  sub _get_next_token ($) {
710            $self->{s_kwd} = '';            $self->{s_kwd} = '';
711            ## Reconsume.            ## Reconsume.
712            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
713                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
714                      line => $self->{line_prev},                      line => $self->{line_prev},
715                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
716                     });                     });
717            redo A;            redo A;
718          }          }
# Line 705  sub _get_next_token ($) { Line 728  sub _get_next_token ($) {
728            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
729            $self->{s_kwd} = '';            $self->{s_kwd} = '';
730            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
731                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
732                      line => $self->{line_prev},                      line => $self->{line_prev},
733                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
734                     });                     });
735            redo A;            redo A;
736          } else {          } else {
# Line 716  sub _get_next_token ($) { Line 739  sub _get_next_token ($) {
739                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
740                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
741                   line => $self->{line_prev},                   line => $self->{line_prev},
742                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
743            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
744            ## Reconsume.            ## Reconsume.
745            redo A;            redo A;
# Line 798  sub _get_next_token ($) { Line 821  sub _get_next_token ($) {
821          redo A;          redo A;
822        }        }
823      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
824          ## XML5: "Tag attribute name before state".
825    
826        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
827          !!!cp (45);          !!!cp (45);
828          ## Stay in the state          ## Stay in the state
# Line 870  sub _get_next_token ($) { Line 895  sub _get_next_token ($) {
895               0x003D => 1, # =               0x003D => 1, # =
896              }->{$self->{nc}}) {              }->{$self->{nc}}) {
897            !!!cp (55);            !!!cp (55);
898              ## XML5: Not a parse error.
899            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
900          } else {          } else {
901            !!!cp (56);            !!!cp (56);
902              ## XML5: ":" raises a parse error and is ignored.
903          }          }
904          $self->{ca}          $self->{ca}
905              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 883  sub _get_next_token ($) { Line 910  sub _get_next_token ($) {
910          redo A;          redo A;
911        }        }
912      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
913          ## XML5: "Tag attribute name state".
914    
915        my $before_leave = sub {        my $before_leave = sub {
916          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
917              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 893  sub _get_next_token ($) { Line 922  sub _get_next_token ($) {
922            !!!cp (58);            !!!cp (58);
923            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
924              = $self->{ca};              = $self->{ca};
925              $self->{ca}->{index} = ++$self->{ct}->{last_index};
926          }          }
927        }; # $before_leave        }; # $before_leave
928    
# Line 909  sub _get_next_token ($) { Line 939  sub _get_next_token ($) {
939          !!!next-input-character;          !!!next-input-character;
940          redo A;          redo A;
941        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
942            if ($self->{is_xml}) {
943              !!!cp (60.1);
944              ## XML5: Not a parse error.
945              !!!parse-error (type => 'no attr value'); ## TODO: type
946            } else {
947              !!!cp (60.2);
948            }
949    
950          $before_leave->();          $before_leave->();
951          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
952            !!!cp (61);            !!!cp (61);
# Line 938  sub _get_next_token ($) { Line 976  sub _get_next_token ($) {
976          !!!next-input-character;          !!!next-input-character;
977          redo A;          redo A;
978        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
979          !!!cp (64);          if ($self->{is_xml}) {
980              !!!cp (64);
981              ## XML5: Not a parse error.
982              !!!parse-error (type => 'no attr value'); ## TODO: type
983            } else {
984              !!!cp (64.1);
985            }
986            
987          $before_leave->();          $before_leave->();
988          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
989          !!!next-input-character;          !!!next-input-character;
# Line 972  sub _get_next_token ($) { Line 1017  sub _get_next_token ($) {
1017          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1018              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1019            !!!cp (69);            !!!cp (69);
1020              ## XML5: Not a parse error.
1021            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1022          } else {          } else {
1023            !!!cp (70);            !!!cp (70);
# Line 982  sub _get_next_token ($) { Line 1028  sub _get_next_token ($) {
1028          redo A;          redo A;
1029        }        }
1030      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1031          ## XML5: "Tag attribute name after state".
1032          
1033        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1034          !!!cp (71);          !!!cp (71);
1035          ## Stay in the state          ## Stay in the state
# Line 993  sub _get_next_token ($) { Line 1041  sub _get_next_token ($) {
1041          !!!next-input-character;          !!!next-input-character;
1042          redo A;          redo A;
1043        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1044            if ($self->{is_xml}) {
1045              !!!cp (72.1);
1046              ## XML5: Not a parse error.
1047              !!!parse-error (type => 'no attr value'); ## TODO: type
1048            } else {
1049              !!!cp (72.2);
1050            }
1051    
1052          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1053            !!!cp (73);            !!!cp (73);
1054            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1026  sub _get_next_token ($) { Line 1082  sub _get_next_token ($) {
1082          !!!next-input-character;          !!!next-input-character;
1083          redo A;          redo A;
1084        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1085          !!!cp (77);          if ($self->{is_xml}) {
1086              !!!cp (77);
1087              ## XML5: Not a parse error.
1088              !!!parse-error (type => 'no attr value'); ## TODO: type
1089            } else {
1090              !!!cp (77.1);
1091            }
1092            
1093          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1094          !!!next-input-character;          !!!next-input-character;
1095          redo A;          redo A;
# Line 1055  sub _get_next_token ($) { Line 1118  sub _get_next_token ($) {
1118    
1119          redo A;          redo A;
1120        } else {        } else {
1121            if ($self->{is_xml}) {
1122              !!!cp (78.1);
1123              ## XML5: Not a parse error.
1124              !!!parse-error (type => 'no attr value'); ## TODO: type
1125            } else {
1126              !!!cp (78.2);
1127            }
1128    
1129          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1130              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1131            !!!cp (78);            !!!cp (78);
1132              ## XML5: Not a parse error.
1133            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1134          } else {          } else {
1135            !!!cp (82);            !!!cp (82);
# Line 1071  sub _get_next_token ($) { Line 1143  sub _get_next_token ($) {
1143          redo A;                  redo A;        
1144        }        }
1145      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1146          ## XML5: "Tag attribute value before state".
1147    
1148        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1149          !!!cp (83);          !!!cp (83);
1150          ## Stay in the state          ## Stay in the state
# Line 1142  sub _get_next_token ($) { Line 1216  sub _get_next_token ($) {
1216        } else {        } else {
1217          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1218            !!!cp (93);            !!!cp (93);
1219              ## XML5: Not a parse error.
1220            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1221            } elsif ($self->{is_xml}) {
1222              !!!cp (93.1);
1223              ## XML5: No parse error.
1224              !!!parse-error (type => 'unquoted attr value'); ## TODO
1225          } else {          } else {
1226            !!!cp (94);            !!!cp (94);
1227          }          }
# Line 1152  sub _get_next_token ($) { Line 1231  sub _get_next_token ($) {
1231          redo A;          redo A;
1232        }        }
1233      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1234          ## XML5: "Tag attribute value double quoted state".
1235          
1236        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1237          !!!cp (95);          !!!cp (95);
1238            ## XML5: "Tag attribute name before state".
1239          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1240          !!!next-input-character;          !!!next-input-character;
1241          redo A;          redo A;
1242        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1243          !!!cp (96);          !!!cp (96);
1244            ## XML5: Not defined yet.
1245    
1246          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1247          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1248          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1193  sub _get_next_token ($) { Line 1277  sub _get_next_token ($) {
1277    
1278          redo A;          redo A;
1279        } else {        } else {
1280          !!!cp (100);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1281              !!!cp (100);
1282              ## XML5: Not a parse error.
1283              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1284            } else {
1285              !!!cp (100.1);
1286            }
1287          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1288          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1289                                q["&],                                q["&<],
1290                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1291    
1292          ## Stay in the state          ## Stay in the state
# Line 1204  sub _get_next_token ($) { Line 1294  sub _get_next_token ($) {
1294          redo A;          redo A;
1295        }        }
1296      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1297          ## XML5: "Tag attribute value single quoted state".
1298    
1299        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1300          !!!cp (101);          !!!cp (101);
1301            ## XML5: "Before attribute name state" (sic).
1302          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303          !!!next-input-character;          !!!next-input-character;
1304          redo A;          redo A;
1305        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1306          !!!cp (102);          !!!cp (102);
1307            ## XML5: Not defined yet.
1308    
1309          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1310          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1311          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1245  sub _get_next_token ($) { Line 1340  sub _get_next_token ($) {
1340    
1341          redo A;          redo A;
1342        } else {        } else {
1343          !!!cp (106);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1344              !!!cp (106);
1345              ## XML5: Not a parse error.
1346              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1347            } else {
1348              !!!cp (106.1);
1349            }
1350          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1351          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1352                                q['&],                                q['&<],
1353                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1354    
1355          ## Stay in the state          ## Stay in the state
# Line 1256  sub _get_next_token ($) { Line 1357  sub _get_next_token ($) {
1357          redo A;          redo A;
1358        }        }
1359      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1360          ## XML5: "Tag attribute value unquoted state".
1361    
1362        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1363          !!!cp (107);          !!!cp (107);
1364            ## XML5: "Tag attribute name before state".
1365          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1366          !!!next-input-character;          !!!next-input-character;
1367          redo A;          redo A;
1368        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1369          !!!cp (108);          !!!cp (108);
1370    
1371            ## XML5: Not defined yet.
1372    
1373          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1374          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1375          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1326  sub _get_next_token ($) { Line 1433  sub _get_next_token ($) {
1433               0x003D => 1, # =               0x003D => 1, # =
1434              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1435            !!!cp (115);            !!!cp (115);
1436              ## XML5: Not a parse error.
1437            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1438          } else {          } else {
1439            !!!cp (116);            !!!cp (116);
# Line 1402  sub _get_next_token ($) { Line 1510  sub _get_next_token ($) {
1510          redo A;          redo A;
1511        }        }
1512      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1513          ## XML5: "Empty tag state".
1514    
1515        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1516          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
1517            !!!cp ('124.2');            !!!cp ('124.2');
# Line 1443  sub _get_next_token ($) { Line 1553  sub _get_next_token ($) {
1553          } else {          } else {
1554            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1555          }          }
1556            ## XML5: "Tag attribute name before state".
1557          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1558          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1559          ## Reconsume.          ## Reconsume.
# Line 1457  sub _get_next_token ($) { Line 1568  sub _get_next_token ($) {
1568          redo A;          redo A;
1569        }        }
1570      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
       ## (only happen if PCDATA state)  
   
1571        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1572        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
1573                
1574        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1575          !!!cp (124);          if ($self->{in_subset}) {
1576          $self->{state} = DATA_STATE;            !!!cp (123);
1577          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1578            } else {
1579              !!!cp (124);
1580              $self->{state} = DATA_STATE;
1581              $self->{s_kwd} = '';
1582            }
1583          !!!next-input-character;          !!!next-input-character;
1584    
1585          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1586          redo A;          redo A;
1587        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1588          !!!cp (125);          if ($self->{in_subset}) {
1589          $self->{state} = DATA_STATE;            !!!cp (125.1);
1590          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1591            } else {
1592              !!!cp (125);
1593              $self->{state} = DATA_STATE;
1594              $self->{s_kwd} = '';
1595            }
1596          ## reconsume          ## reconsume
1597    
1598          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1490  sub _get_next_token ($) { Line 1609  sub _get_next_token ($) {
1609          redo A;          redo A;
1610        }        }
1611      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1612        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state" and "DOCTYPE markup
1613          ## declaration state".
1614                
1615        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1616          !!!cp (133);          !!!cp (133);
# Line 1502  sub _get_next_token ($) { Line 1622  sub _get_next_token ($) {
1622          ## ASCII case-insensitive.          ## ASCII case-insensitive.
1623          !!!cp (130);          !!!cp (130);
1624          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
1625          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
1626          !!!next-input-character;          !!!next-input-character;
1627          redo A;          redo A;
1628        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
# Line 1511  sub _get_next_token ($) { Line 1631  sub _get_next_token ($) {
1631                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1632          !!!cp (135.4);                          !!!cp (135.4);                
1633          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
1634          $self->{s_kwd} = '[';          $self->{kwd} = '[';
1635          !!!next-input-character;          !!!next-input-character;
1636          redo A;          redo A;
1637        } else {        } else {
# Line 1561  sub _get_next_token ($) { Line 1681  sub _get_next_token ($) {
1681              0x0054, # T              0x0054, # T
1682              0x0059, # Y              0x0059, # Y
1683              0x0050, # P              0x0050, # P
1684            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
1685            $self->{nc} == [            $self->{nc} == [
1686              undef,              undef,
1687              0x006F, # o              0x006F, # o
# Line 1569  sub _get_next_token ($) { Line 1689  sub _get_next_token ($) {
1689              0x0074, # t              0x0074, # t
1690              0x0079, # y              0x0079, # y
1691              0x0070, # p              0x0070, # p
1692            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
1693          !!!cp (131);          !!!cp (131);
1694          ## Stay in the state.          ## Stay in the state.
1695          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1696          !!!next-input-character;          !!!next-input-character;
1697          redo A;          redo A;
1698        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
1699                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1700                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1701          if ($self->{s_kwd} ne 'DOCTYP') {          if ($self->{is_xml} and
1702                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1703            !!!cp (129);            !!!cp (129);
1704            ## XML5: case-sensitive.            ## XML5: case-sensitive.
1705            !!!parse-error (type => 'lowercase keyword', ## TODO            !!!parse-error (type => 'lowercase keyword', ## TODO
# Line 1600  sub _get_next_token ($) { Line 1721  sub _get_next_token ($) {
1721          !!!cp (132);                  !!!cp (132);        
1722          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1723                          line => $self->{line_prev},                          line => $self->{line_prev},
1724                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1725          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1726          ## Reconsume.          ## Reconsume.
1727          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1728                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1729                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1730                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1731                                   };                                   };
1732          redo A;          redo A;
1733        }        }
# Line 1617  sub _get_next_token ($) { Line 1738  sub _get_next_token ($) {
1738              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
1739              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
1740              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
1741            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
1742          !!!cp (135.1);          !!!cp (135.1);
1743          ## Stay in the state.          ## Stay in the state.
1744          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1745          !!!next-input-character;          !!!next-input-character;
1746          redo A;          redo A;
1747        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
1748                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1749          if ($self->{is_xml} and          if ($self->{is_xml} and
1750              not $self->{tainted} and              not $self->{tainted} and
# Line 1648  sub _get_next_token ($) { Line 1769  sub _get_next_token ($) {
1769          !!!cp (135.3);          !!!cp (135.3);
1770          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1771                          line => $self->{line_prev},                          line => $self->{line_prev},
1772                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1773          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1774          ## Reconsume.          ## Reconsume.
1775          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1776                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1777                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1778                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1779                                   };                                   };
1780          redo A;          redo A;
1781        }        }
# Line 1665  sub _get_next_token ($) { Line 1786  sub _get_next_token ($) {
1786          !!!next-input-character;          !!!next-input-character;
1787          redo A;          redo A;
1788        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1789          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1790          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1791          $self->{s_kwd} = '';            !!!cp (138.1);
1792              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1793            } else {
1794              !!!cp (138);
1795              $self->{state} = DATA_STATE;
1796              $self->{s_kwd} = '';
1797            }
1798          !!!next-input-character;          !!!next-input-character;
1799    
1800          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1801    
1802          redo A;          redo A;
1803        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1804          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1805          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1806          $self->{s_kwd} = '';            !!!cp (139.1);
1807              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1808            } else {
1809              !!!cp (139);
1810              $self->{state} = DATA_STATE;
1811              $self->{s_kwd} = '';
1812            }
1813          ## reconsume          ## reconsume
1814    
1815          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1699  sub _get_next_token ($) { Line 1830  sub _get_next_token ($) {
1830          !!!next-input-character;          !!!next-input-character;
1831          redo A;          redo A;
1832        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1833          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1834          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1835          $self->{s_kwd} = '';            !!!cp (142.1);
1836              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1837            } else {
1838              !!!cp (142);
1839              $self->{state} = DATA_STATE;
1840              $self->{s_kwd} = '';
1841            }
1842          !!!next-input-character;          !!!next-input-character;
1843    
1844          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1845    
1846          redo A;          redo A;
1847        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1848          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1849          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1850          $self->{s_kwd} = '';            !!!cp (143.1);
1851              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1852            } else {
1853              !!!cp (143);
1854              $self->{state} = DATA_STATE;
1855              $self->{s_kwd} = '';
1856            }
1857          ## reconsume          ## reconsume
1858    
1859          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1733  sub _get_next_token ($) { Line 1874  sub _get_next_token ($) {
1874          !!!next-input-character;          !!!next-input-character;
1875          redo A;          redo A;
1876        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
1877          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1878          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1879          $self->{s_kwd} = '';            !!!cp (146.1);
1880              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1881            } else {
1882              !!!cp (146);
1883              $self->{state} = DATA_STATE;
1884              $self->{s_kwd} = '';
1885            }
1886          ## reconsume          ## reconsume
1887    
1888          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1762  sub _get_next_token ($) { Line 1908  sub _get_next_token ($) {
1908          !!!next-input-character;          !!!next-input-character;
1909          redo A;          redo A;
1910        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
1911          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1912          $self->{s_kwd} = '';          if ($self->{in_subset}) {
1913          $self->{state} = DATA_STATE;            !!!cp (149.1);
1914          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1915            } else {
1916              !!!cp (149);
1917              $self->{state} = DATA_STATE;
1918              $self->{s_kwd} = '';
1919            }
1920          ## reconsume          ## reconsume
1921    
1922          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1781  sub _get_next_token ($) { Line 1931  sub _get_next_token ($) {
1931        }        }
1932      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
1933        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1934          !!!cp (151);          if ($self->{in_subset}) {
1935          $self->{state} = DATA_STATE;            !!!cp (151.1);
1936          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937            } else {
1938              !!!cp (151);
1939              $self->{state} = DATA_STATE;
1940              $self->{s_kwd} = '';
1941            }
1942          !!!next-input-character;          !!!next-input-character;
1943    
1944          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1800  sub _get_next_token ($) { Line 1955  sub _get_next_token ($) {
1955          !!!next-input-character;          !!!next-input-character;
1956          redo A;          redo A;
1957        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
1958          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1959          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1960          $self->{s_kwd} = '';            !!!cp (153.1);
1961              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1962            } else {
1963              !!!cp (153);
1964              $self->{state} = DATA_STATE;
1965              $self->{s_kwd} = '';
1966            }
1967          ## reconsume          ## reconsume
1968    
1969          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1828  sub _get_next_token ($) { Line 1988  sub _get_next_token ($) {
1988          redo A;          redo A;
1989        } else {        } else {
1990          !!!cp (156);          !!!cp (156);
1991            ## XML5: Unless EOF, switch to the bogus comment state.
1992          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
1993          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1994          ## reconsume          ## reconsume
1995          redo A;          redo A;
1996        }        }
1997      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1998          ## XML5: "DOCTYPE root name before state".
1999    
2000        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2001          !!!cp (157);          !!!cp (157);
2002          ## Stay in the state          ## Stay in the state
# Line 1841  sub _get_next_token ($) { Line 2004  sub _get_next_token ($) {
2004          redo A;          redo A;
2005        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2006          !!!cp (158);          !!!cp (158);
2007            ## XML5: No parse error.
2008          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2009          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2010          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 1859  sub _get_next_token ($) { Line 2023  sub _get_next_token ($) {
2023          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2024    
2025          redo A;          redo A;
2026          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2027            !!!cp (159.1);
2028            !!!parse-error (type => 'no DOCTYPE name');
2029            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2030            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2031            $self->{in_subset} = 1;
2032            !!!next-input-character;
2033            !!!emit ($self->{ct}); # DOCTYPE
2034            redo A;
2035        } else {        } else {
2036          !!!cp (160);          !!!cp (160);
2037          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 1868  sub _get_next_token ($) { Line 2041  sub _get_next_token ($) {
2041          redo A;          redo A;
2042        }        }
2043      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2044  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2045    
2046          ## ISSUE: Redundant "First," in the spec.
2047    
2048        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2049          !!!cp (161);          !!!cp (161);
2050          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 1894  sub _get_next_token ($) { Line 2070  sub _get_next_token ($) {
2070          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2071    
2072          redo A;          redo A;
2073          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2074            !!!cp (163.1);
2075            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2076            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2077            $self->{in_subset} = 1;
2078            !!!next-input-character;
2079            !!!emit ($self->{ct}); # DOCTYPE
2080            redo A;
2081        } else {        } else {
2082          !!!cp (164);          !!!cp (164);
2083          $self->{ct}->{name}          $self->{ct}->{name}
# Line 1903  sub _get_next_token ($) { Line 2087  sub _get_next_token ($) {
2087          redo A;          redo A;
2088        }        }
2089      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2090          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2091          ## state", but implemented differently.
2092    
2093        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2094          !!!cp (165);          !!!cp (165);
2095          ## Stay in the state          ## Stay in the state
# Line 1930  sub _get_next_token ($) { Line 2117  sub _get_next_token ($) {
2117          redo A;          redo A;
2118        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2119                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2120            !!!cp (167.1);
2121          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2122          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2123          !!!next-input-character;          !!!next-input-character;
2124          redo A;          redo A;
2125        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2126                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2127            !!!cp (167.2);
2128          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2129          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2130          !!!next-input-character;          !!!next-input-character;
2131          redo A;          redo A;
2132          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2133            !!!cp (167.3);
2134            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2135            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2136            $self->{in_subset} = 1;
2137            !!!next-input-character;
2138            !!!emit ($self->{ct}); # DOCTYPE
2139            redo A;
2140        } else {        } else {
2141          !!!cp (180);          !!!cp (180);
2142          !!!parse-error (type => 'string after DOCTYPE name');          !!!parse-error (type => 'string after DOCTYPE name');
# Line 1957  sub _get_next_token ($) { Line 2154  sub _get_next_token ($) {
2154              0x0042, # B              0x0042, # B
2155              0x004C, # L              0x004C, # L
2156              0x0049, # I              0x0049, # I
2157            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2158            $self->{nc} == [            $self->{nc} == [
2159              undef,              undef,
2160              0x0075, # u              0x0075, # u
2161              0x0062, # b              0x0062, # b
2162              0x006C, # l              0x006C, # l
2163              0x0069, # i              0x0069, # i
2164            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2165          !!!cp (175);          !!!cp (175);
2166          ## Stay in the state.          ## Stay in the state.
2167          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2168          !!!next-input-character;          !!!next-input-character;
2169          redo A;          redo A;
2170        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2171                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2172                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2173          !!!cp (168);          if ($self->{is_xml} and
2174                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2175              !!!cp (168.1);
2176              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2177                              text => 'PUBLIC',
2178                              line => $self->{line_prev},
2179                              column => $self->{column_prev} - 4);
2180            } else {
2181              !!!cp (168);
2182            }
2183          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2184          !!!next-input-character;          !!!next-input-character;
2185          redo A;          redo A;
# Line 1981  sub _get_next_token ($) { Line 2187  sub _get_next_token ($) {
2187          !!!cp (169);          !!!cp (169);
2188          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2189                          line => $self->{line_prev},                          line => $self->{line_prev},
2190                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2191          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2192    
2193          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 1996  sub _get_next_token ($) { Line 2202  sub _get_next_token ($) {
2202              0x0053, # S              0x0053, # S
2203              0x0054, # T              0x0054, # T
2204              0x0045, # E              0x0045, # E
2205            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2206            $self->{nc} == [            $self->{nc} == [
2207              undef,              undef,
2208              0x0079, # y              0x0079, # y
2209              0x0073, # s              0x0073, # s
2210              0x0074, # t              0x0074, # t
2211              0x0065, # e              0x0065, # e
2212            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2213          !!!cp (170);          !!!cp (170);
2214          ## Stay in the state.          ## Stay in the state.
2215          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2216          !!!next-input-character;          !!!next-input-character;
2217          redo A;          redo A;
2218        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2219                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2220                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2221          !!!cp (171);          if ($self->{is_xml} and
2222                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2223              !!!cp (171.1);
2224              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2225                              text => 'SYSTEM',
2226                              line => $self->{line_prev},
2227                              column => $self->{column_prev} - 4);
2228            } else {
2229              !!!cp (171);
2230            }
2231          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2232          !!!next-input-character;          !!!next-input-character;
2233          redo A;          redo A;
# Line 2020  sub _get_next_token ($) { Line 2235  sub _get_next_token ($) {
2235          !!!cp (172);          !!!cp (172);
2236          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2237                          line => $self->{line_prev},                          line => $self->{line_prev},
2238                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2239          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2240    
2241          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2069  sub _get_next_token ($) { Line 2284  sub _get_next_token ($) {
2284          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2285    
2286          redo A;          redo A;
2287          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2288            !!!cp (186.1);
2289            !!!parse-error (type => 'no PUBLIC literal');
2290            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2291            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2292            $self->{in_subset} = 1;
2293            !!!next-input-character;
2294            !!!emit ($self->{ct}); # DOCTYPE
2295            redo A;
2296        } else {        } else {
2297          !!!cp (186);          !!!cp (186);
2298          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
# Line 2179  sub _get_next_token ($) { Line 2403  sub _get_next_token ($) {
2403          !!!next-input-character;          !!!next-input-character;
2404          redo A;          redo A;
2405        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2406          !!!cp (198);          if ($self->{is_xml}) {
2407              !!!cp (198.1);
2408              !!!parse-error (type => 'no SYSTEM literal');
2409            } else {
2410              !!!cp (198);
2411            }
2412          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2413          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2414          !!!next-input-character;          !!!next-input-character;
# Line 2199  sub _get_next_token ($) { Line 2428  sub _get_next_token ($) {
2428          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2429    
2430          redo A;          redo A;
2431          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2432            !!!cp (200.1);
2433            !!!parse-error (type => 'no SYSTEM literal');
2434            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2435            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2436            $self->{in_subset} = 1;
2437            !!!next-input-character;
2438            !!!emit ($self->{ct}); # DOCTYPE
2439            redo A;
2440        } else {        } else {
2441          !!!cp (200);          !!!cp (200);
2442          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
# Line 2249  sub _get_next_token ($) { Line 2487  sub _get_next_token ($) {
2487          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2488    
2489          redo A;          redo A;
2490          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2491            !!!cp (206.1);
2492            !!!parse-error (type => 'no SYSTEM literal');
2493    
2494            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2495            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2496            $self->{in_subset} = 1;
2497            !!!next-input-character;
2498            !!!emit ($self->{ct}); # DOCTYPE
2499            redo A;
2500        } else {        } else {
2501          !!!cp (206);          !!!cp (206);
2502          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
# Line 2264  sub _get_next_token ($) { Line 2512  sub _get_next_token ($) {
2512          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2513          !!!next-input-character;          !!!next-input-character;
2514          redo A;          redo A;
2515        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2516          !!!cp (208);          !!!cp (208);
2517          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2518    
# Line 2305  sub _get_next_token ($) { Line 2553  sub _get_next_token ($) {
2553          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2554          !!!next-input-character;          !!!next-input-character;
2555          redo A;          redo A;
2556        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2557          !!!cp (212);          !!!cp (212);
2558          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2559    
# Line 2366  sub _get_next_token ($) { Line 2614  sub _get_next_token ($) {
2614          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2615    
2616          redo A;          redo A;
2617          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2618            !!!cp (218.1);
2619            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2620            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2621            $self->{in_subset} = 1;
2622            !!!next-input-character;
2623            !!!emit ($self->{ct}); # DOCTYPE
2624            redo A;
2625        } else {        } else {
2626          !!!cp (218);          !!!cp (218);
2627          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
# Line 2385  sub _get_next_token ($) { Line 2641  sub _get_next_token ($) {
2641          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2642    
2643          redo A;          redo A;
2644          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2645            !!!cp (220.1);
2646            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2647            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2648            $self->{in_subset} = 1;
2649            !!!next-input-character;
2650            !!!emit ($self->{ct}); # DOCTYPE
2651            redo A;
2652        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2653          !!!cp (220);          !!!cp (220);
2654          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 2397  sub _get_next_token ($) { Line 2661  sub _get_next_token ($) {
2661        } else {        } else {
2662          !!!cp (221);          !!!cp (221);
2663          my $s = '';          my $s = '';
2664          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
2665    
2666          ## Stay in the state          ## Stay in the state
2667          !!!next-input-character;          !!!next-input-character;
# Line 2505  sub _get_next_token ($) { Line 2769  sub _get_next_token ($) {
2769        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
2770          !!!cp (999);          !!!cp (999);
2771          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
2772          $self->{s_kwd} = '#';          $self->{kwd} = '#';
2773          !!!next-input-character;          !!!next-input-character;
2774          redo A;          redo A;
2775        } elsif ((0x0041 <= $self->{nc} and        } elsif ((0x0041 <= $self->{nc} and
# Line 2515  sub _get_next_token ($) { Line 2779  sub _get_next_token ($) {
2779          !!!cp (998);          !!!cp (998);
2780          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
2781          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
2782          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2783          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
2784          $self->{entity__match} = 0;          $self->{entity__match} = 0;
2785          !!!next-input-character;          !!!next-input-character;
2786          redo A;          redo A;
# Line 2556  sub _get_next_token ($) { Line 2820  sub _get_next_token ($) {
2820            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
2821          !!!cp (995);          !!!cp (995);
2822          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
2823          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2824          !!!next-input-character;          !!!next-input-character;
2825          redo A;          redo A;
2826        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
2827                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
2828          !!!cp (994);          !!!cp (994);
2829          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
2830          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
2831          !!!next-input-character;          !!!next-input-character;
2832          redo A;          redo A;
2833        } else {        } else {
# Line 2599  sub _get_next_token ($) { Line 2863  sub _get_next_token ($) {
2863        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
2864            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
2865          !!!cp (1012);          !!!cp (1012);
2866          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
2867          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2868                    
2869          ## Stay in the state.          ## Stay in the state.
2870          !!!next-input-character;          !!!next-input-character;
# Line 2616  sub _get_next_token ($) { Line 2880  sub _get_next_token ($) {
2880          #          #
2881        }        }
2882    
2883        my $code = $self->{s_kwd};        my $code = $self->{kwd};
2884        my $l = $self->{line_prev};        my $l = $self->{line_prev};
2885        my $c = $self->{column_prev};        my $c = $self->{column_prev};
2886        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2659  sub _get_next_token ($) { Line 2923  sub _get_next_token ($) {
2923          # 0..9, A..F, a..f          # 0..9, A..F, a..f
2924          !!!cp (990);          !!!cp (990);
2925          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
2926          $self->{s_kwd} = 0;          $self->{kwd} = 0;
2927          ## Reconsume.          ## Reconsume.
2928          redo A;          redo A;
2929        } else {        } else {
# Line 2677  sub _get_next_token ($) { Line 2941  sub _get_next_token ($) {
2941            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2942            ## Reconsume.            ## Reconsume.
2943            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2944                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
2945                      line => $self->{line_prev},                      line => $self->{line_prev},
2946                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
2947                     });                     });
2948            redo A;            redo A;
2949          } else {          } else {
2950            !!!cp (989);            !!!cp (989);
2951            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
2952            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2953            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2954            ## Reconsume.            ## Reconsume.
# Line 2695  sub _get_next_token ($) { Line 2959  sub _get_next_token ($) {
2959        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2960          # 0..9          # 0..9
2961          !!!cp (1002);          !!!cp (1002);
2962          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2963          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2964          ## Stay in the state.          ## Stay in the state.
2965          !!!next-input-character;          !!!next-input-character;
2966          redo A;          redo A;
2967        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
2968                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
2969          !!!cp (1003);          !!!cp (1003);
2970          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2971          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
2972          ## Stay in the state.          ## Stay in the state.
2973          !!!next-input-character;          !!!next-input-character;
2974          redo A;          redo A;
2975        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
2976                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
2977          !!!cp (1004);          !!!cp (1004);
2978          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2979          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
2980          ## Stay in the state.          ## Stay in the state.
2981          !!!next-input-character;          !!!next-input-character;
2982          redo A;          redo A;
# Line 2729  sub _get_next_token ($) { Line 2993  sub _get_next_token ($) {
2993          #          #
2994        }        }
2995    
2996        my $code = $self->{s_kwd};        my $code = $self->{kwd};
2997        my $l = $self->{line_prev};        my $l = $self->{line_prev};
2998        my $c = $self->{column_prev};        my $c = $self->{column_prev};
2999        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2766  sub _get_next_token ($) { Line 3030  sub _get_next_token ($) {
3030          redo A;          redo A;
3031        }        }
3032      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3033        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
3034            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
3035            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
3036              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 2776  sub _get_next_token ($) { Line 3040  sub _get_next_token ($) {
3040              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
3041             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
3042          our $EntityChar;          our $EntityChar;
3043          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3044          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
3045            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3046              !!!cp (1020);              !!!cp (1020);
3047              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3048              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3049              !!!next-input-character;              !!!next-input-character;
3050              #              #
3051            } else {            } else {
3052              !!!cp (1021);              !!!cp (1021);
3053              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3054              $self->{entity__match} = -1;              $self->{entity__match} = -1;
3055              ## Stay in the state.              ## Stay in the state.
3056              !!!next-input-character;              !!!next-input-character;
# Line 2814  sub _get_next_token ($) { Line 3078  sub _get_next_token ($) {
3078          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
3079              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
3080            !!!cp (1024);            !!!cp (1024);
3081            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
3082            #            #
3083          } else {          } else {
3084            !!!cp (1025);            !!!cp (1025);
# Line 2826  sub _get_next_token ($) { Line 3090  sub _get_next_token ($) {
3090          !!!cp (1026);          !!!cp (1026);
3091          !!!parse-error (type => 'bare ero',          !!!parse-error (type => 'bare ero',
3092                          line => $self->{line_prev},                          line => $self->{line_prev},
3093                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
3094          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
3095          #          #
3096        }        }
3097        
# Line 2850  sub _get_next_token ($) { Line 3114  sub _get_next_token ($) {
3114                    data => $data,                    data => $data,
3115                    has_reference => $has_ref,                    has_reference => $has_ref,
3116                    line => $self->{line_prev},                    line => $self->{line_prev},
3117                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
3118                   });                   });
3119          redo A;          redo A;
3120        } else {        } else {
# Line 2900  sub _get_next_token ($) { Line 3164  sub _get_next_token ($) {
3164          redo A;          redo A;
3165        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3166          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3167          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3168          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3169            } else {
3170              $self->{state} = DATA_STATE;
3171              $self->{s_kwd} = '';
3172            }
3173          ## Reconsume.          ## Reconsume.
3174          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3175          redo A;          redo A;
# Line 2932  sub _get_next_token ($) { Line 3200  sub _get_next_token ($) {
3200          redo A;          redo A;
3201        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3202          !!!parse-error (type => 'no pic'); ## TODO: type          !!!parse-error (type => 'no pic'); ## TODO: type
3203          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3204          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3205            } else {
3206              $self->{state} = DATA_STATE;
3207              $self->{s_kwd} = '';
3208            }
3209          ## Reprocess.          ## Reprocess.
3210          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3211          redo A;          redo A;
# Line 2948  sub _get_next_token ($) { Line 3220  sub _get_next_token ($) {
3220        }        }
3221      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
3222        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3223          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3224          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3225            } else {
3226              $self->{state} = DATA_STATE;
3227              $self->{s_kwd} = '';
3228            }
3229          !!!next-input-character;          !!!next-input-character;
3230          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3231          redo A;          redo A;
# Line 2974  sub _get_next_token ($) { Line 3250  sub _get_next_token ($) {
3250      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3251        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" in XML5
3252        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3253          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3254          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3255            } else {
3256              $self->{state} = DATA_STATE;
3257              $self->{s_kwd} = '';
3258            }
3259          !!!next-input-character;          !!!next-input-character;
3260          !!!emit ($self->{ct}); # pi          !!!emit ($self->{ct}); # pi
3261          redo A;          redo A;
# Line 2990  sub _get_next_token ($) { Line 3270  sub _get_next_token ($) {
3270          ## Reprocess.          ## Reprocess.
3271          redo A;          redo A;
3272        }        }
3273    
3274        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3275          if ($self->{nc} == 0x003C) { # <
3276            $self->{state} = DOCTYPE_TAG_STATE;
3277            !!!next-input-character;
3278            redo A;
3279          } elsif ($self->{nc} == 0x0025) { # %
3280            ## XML5: Not defined yet.
3281    
3282            ## TODO:
3283            !!!next-input-character;
3284            redo A;
3285          } elsif ($self->{nc} == 0x005D) { # ]
3286            delete $self->{in_subset};
3287            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3288            !!!next-input-character;
3289            redo A;
3290          } elsif ($is_space->{$self->{nc}}) {
3291            ## Stay in the state.
3292            !!!next-input-character;
3293            redo A;
3294          } elsif ($self->{nc} == -1) {
3295            !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3296            delete $self->{in_subset};
3297            $self->{state} = DATA_STATE;
3298            $self->{s_kwd} = '';
3299            ## Reconsume.
3300            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3301            redo A;
3302          } else {
3303            unless ($self->{internal_subset_tainted}) {
3304              ## XML5: No parse error.
3305              !!!parse-error (type => 'string in internal subset');
3306              $self->{internal_subset_tainted} = 1;
3307            }
3308            ## Stay in the state.
3309            !!!next-input-character;
3310            redo A;
3311          }
3312        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3313          if ($self->{nc} == 0x003E) { # >
3314            $self->{state} = DATA_STATE;
3315            $self->{s_kwd} = '';
3316            !!!next-input-character;
3317            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3318            redo A;
3319          } elsif ($self->{nc} == -1) {
3320            !!!parse-error (type => 'unclosed DOCTYPE');
3321            $self->{state} = DATA_STATE;
3322            $self->{s_kwd} = '';
3323            ## Reconsume.
3324            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3325            redo A;
3326          } else {
3327            ## XML5: No parse error and stay in the state.
3328            !!!parse-error (type => 'string after internal subset'); ## TODO: type
3329    
3330            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3331            !!!next-input-character;
3332            redo A;
3333          }
3334        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3335          if ($self->{nc} == 0x003E) { # >
3336            $self->{state} = DATA_STATE;
3337            $self->{s_kwd} = '';
3338            !!!next-input-character;
3339            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3340            redo A;
3341          } elsif ($self->{nc} == -1) {
3342            $self->{state} = DATA_STATE;
3343            $self->{s_kwd} = '';
3344            ## Reconsume.
3345            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3346            redo A;
3347          } else {
3348            ## Stay in the state.
3349            !!!next-input-character;
3350            redo A;
3351          }
3352        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3353          if ($self->{nc} == 0x0021) { # !
3354            $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
3355            !!!next-input-character;
3356            redo A;
3357          } elsif ($self->{nc} == 0x003F) { # ?
3358            $self->{state} = PI_STATE;
3359            !!!next-input-character;
3360            redo A;
3361          } elsif ($self->{nc} == -1) {
3362            !!!parse-error (type => 'bare stago');
3363            $self->{state} = DATA_STATE;
3364            $self->{s_kwd} = '';
3365            ## Reconsume.
3366            redo A;
3367          } else {
3368            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3369                            line => $self->{line_prev},
3370                            column => $self->{column_prev});
3371            $self->{state} = BOGUS_COMMENT_STATE;
3372            $self->{ct} = {type => COMMENT_TOKEN,
3373                           data => '',
3374                          }; ## NOTE: Will be discarded.
3375            !!!next-input-character;
3376            redo A;
3377          }
3378                    
3379      } else {      } else {
3380        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.13

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24