/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.10 by wakaba, Wed Oct 15 08:51:02 2008 UTC revision 1.12 by wakaba, Wed Oct 15 12:49:49 2008 UTC
# Line 31  BEGIN { Line 31  BEGIN {
31    );    );
32  }  }
33    
34    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
35    
36  ## Token types  ## Token types
37    
38  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
39  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
40  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
41  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
42  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
43  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
44  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
45  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
46    
47    ## XML5: XML5 has "empty tag token".  In this implementation, it is
48    ## represented as a start tag token with $self->{self_closing} flag
49    ## set to true.
50    
51    ## XML5: XML5 has "short end tag token".  In this implementation, it
52    ## is represented as an end tag token with $token->{tag_name} set
53    ## to an empty string.
54    
55  package Whatpm::HTML;  package Whatpm::HTML;
56    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 124  sub HEXREF_HEX_STATE () { 48 }
124  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
125  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
126    
127  ## XML states  ## XML-only states
128  sub PI_STATE () { 51 }  sub PI_STATE () { 51 }
129  sub PI_TARGET_STATE () { 52 }  sub PI_TARGET_STATE () { 52 }
130  sub PI_TARGET_AFTER_STATE () { 53 }  sub PI_TARGET_AFTER_STATE () { 53 }
131  sub PI_DATA_STATE () { 54 }  sub PI_DATA_STATE () { 54 }
132  sub PI_AFTER_STATE () { 55 }  sub PI_AFTER_STATE () { 55 }
133  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
134    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
135    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
136    
137  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
138  ## list and descriptions)  ## list and descriptions)
# Line 186  sub _initialize_tokenizer ($) { Line 198  sub _initialize_tokenizer ($) {
198    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
199    
200    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
201    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
202      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
203    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
204    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
205    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 206  sub _initialize_tokenizer ($) { Line 219  sub _initialize_tokenizer ($) {
219    
220  ## A token has:  ## A token has:
221  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
222  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
223  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
224  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
225    ##   ->{target} (PI_TOKEN)
226  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
227  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
228  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 216  sub _initialize_tokenizer ($) { Line 230  sub _initialize_tokenizer ($) {
230  ##        ->{name}  ##        ->{name}
231  ##        ->{value}  ##        ->{value}
232  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
233  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
234    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
235  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
236    ##   ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Next attribute's index - 1.
237    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
238    
239  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
240  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
241  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 237  my $is_space = { Line 255  my $is_space = {
255    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
256    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
257    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
258    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
259    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
260    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
261  };  };
# Line 447  sub _get_next_token ($) { Line 465  sub _get_next_token ($) {
465            redo A;            redo A;
466          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
467            !!!cp (15.1);            !!!cp (15.1);
468            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
469            #            #
470          } else {          } else {
471            !!!cp (16);            !!!cp (16);
472              $self->{s_kwd} = '';
473            #            #
474          }          }
475    
476          ## reconsume          ## reconsume
477          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
478          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
479                    line => $self->{line_prev},                    line => $self->{line_prev},
480                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 567  sub _get_next_token ($) { Line 585  sub _get_next_token ($) {
585        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
586          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
587            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
588            $self->{s_kwd} = '';            $self->{kwd} = '';
589            ## Reconsume.            ## Reconsume.
590            redo A;            redo A;
591          } else {          } else {
# Line 670  sub _get_next_token ($) { Line 688  sub _get_next_token ($) {
688          redo A;          redo A;
689        }        }
690      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
691        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
692        if (length $ch) {        if (length $ch) {
693          my $CH = $ch;          my $CH = $ch;
694          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 678  sub _get_next_token ($) { Line 696  sub _get_next_token ($) {
696          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
697            !!!cp (24);            !!!cp (24);
698            ## Stay in the state.            ## Stay in the state.
699            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
700            !!!next-input-character;            !!!next-input-character;
701            redo A;            redo A;
702          } else {          } else {
# Line 687  sub _get_next_token ($) { Line 705  sub _get_next_token ($) {
705            $self->{s_kwd} = '';            $self->{s_kwd} = '';
706            ## Reconsume.            ## Reconsume.
707            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
708                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
709                      line => $self->{line_prev},                      line => $self->{line_prev},
710                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
711                     });                     });
712            redo A;            redo A;
713          }          }
# Line 705  sub _get_next_token ($) { Line 723  sub _get_next_token ($) {
723            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
724            $self->{s_kwd} = '';            $self->{s_kwd} = '';
725            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
726                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
727                      line => $self->{line_prev},                      line => $self->{line_prev},
728                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
729                     });                     });
730            redo A;            redo A;
731          } else {          } else {
# Line 716  sub _get_next_token ($) { Line 734  sub _get_next_token ($) {
734                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
735                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
736                   line => $self->{line_prev},                   line => $self->{line_prev},
737                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
738            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
739            ## Reconsume.            ## Reconsume.
740            redo A;            redo A;
# Line 798  sub _get_next_token ($) { Line 816  sub _get_next_token ($) {
816          redo A;          redo A;
817        }        }
818      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
819          ## XML5: "Tag attribute name before state".
820    
821        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
822          !!!cp (45);          !!!cp (45);
823          ## Stay in the state          ## Stay in the state
# Line 870  sub _get_next_token ($) { Line 890  sub _get_next_token ($) {
890               0x003D => 1, # =               0x003D => 1, # =
891              }->{$self->{nc}}) {              }->{$self->{nc}}) {
892            !!!cp (55);            !!!cp (55);
893              ## XML5: Not a parse error.
894            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
895          } else {          } else {
896            !!!cp (56);            !!!cp (56);
897              ## XML5: ":" raises a parse error and is ignored.
898          }          }
899          $self->{ca}          $self->{ca}
900              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 883  sub _get_next_token ($) { Line 905  sub _get_next_token ($) {
905          redo A;          redo A;
906        }        }
907      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
908          ## XML5: "Tag attribute name state".
909    
910        my $before_leave = sub {        my $before_leave = sub {
911          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
912              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 893  sub _get_next_token ($) { Line 917  sub _get_next_token ($) {
917            !!!cp (58);            !!!cp (58);
918            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
919              = $self->{ca};              = $self->{ca};
920              $self->{ca}->{index} = ++$self->{ct}->{last_index};
921          }          }
922        }; # $before_leave        }; # $before_leave
923    
# Line 909  sub _get_next_token ($) { Line 934  sub _get_next_token ($) {
934          !!!next-input-character;          !!!next-input-character;
935          redo A;          redo A;
936        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
937            if ($self->{is_xml}) {
938              !!!cp (60.1);
939              ## XML5: Not a parse error.
940              !!!parse-error (type => 'no attr value'); ## TODO: type
941            } else {
942              !!!cp (60.2);
943            }
944    
945          $before_leave->();          $before_leave->();
946          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
947            !!!cp (61);            !!!cp (61);
# Line 938  sub _get_next_token ($) { Line 971  sub _get_next_token ($) {
971          !!!next-input-character;          !!!next-input-character;
972          redo A;          redo A;
973        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
974          !!!cp (64);          if ($self->{is_xml}) {
975              !!!cp (64);
976              ## XML5: Not a parse error.
977              !!!parse-error (type => 'no attr value'); ## TODO: type
978            } else {
979              !!!cp (64.1);
980            }
981            
982          $before_leave->();          $before_leave->();
983          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
984          !!!next-input-character;          !!!next-input-character;
# Line 972  sub _get_next_token ($) { Line 1012  sub _get_next_token ($) {
1012          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1013              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1014            !!!cp (69);            !!!cp (69);
1015              ## XML5: Not a parse error.
1016            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1017          } else {          } else {
1018            !!!cp (70);            !!!cp (70);
# Line 982  sub _get_next_token ($) { Line 1023  sub _get_next_token ($) {
1023          redo A;          redo A;
1024        }        }
1025      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1026          ## XML5: "Tag attribute name after state".
1027          
1028        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1029          !!!cp (71);          !!!cp (71);
1030          ## Stay in the state          ## Stay in the state
# Line 993  sub _get_next_token ($) { Line 1036  sub _get_next_token ($) {
1036          !!!next-input-character;          !!!next-input-character;
1037          redo A;          redo A;
1038        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1039            if ($self->{is_xml}) {
1040              !!!cp (72.1);
1041              ## XML5: Not a parse error.
1042              !!!parse-error (type => 'no attr value'); ## TODO: type
1043            } else {
1044              !!!cp (72.2);
1045            }
1046    
1047          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1048            !!!cp (73);            !!!cp (73);
1049            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1026  sub _get_next_token ($) { Line 1077  sub _get_next_token ($) {
1077          !!!next-input-character;          !!!next-input-character;
1078          redo A;          redo A;
1079        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1080          !!!cp (77);          if ($self->{is_xml}) {
1081              !!!cp (77);
1082              ## XML5: Not a parse error.
1083              !!!parse-error (type => 'no attr value'); ## TODO: type
1084            } else {
1085              !!!cp (77.1);
1086            }
1087            
1088          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1089          !!!next-input-character;          !!!next-input-character;
1090          redo A;          redo A;
# Line 1055  sub _get_next_token ($) { Line 1113  sub _get_next_token ($) {
1113    
1114          redo A;          redo A;
1115        } else {        } else {
1116            if ($self->{is_xml}) {
1117              !!!cp (78.1);
1118              ## XML5: Not a parse error.
1119              !!!parse-error (type => 'no attr value'); ## TODO: type
1120            } else {
1121              !!!cp (78.2);
1122            }
1123    
1124          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1125              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1126            !!!cp (78);            !!!cp (78);
1127              ## XML5: Not a parse error.
1128            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1129          } else {          } else {
1130            !!!cp (82);            !!!cp (82);
# Line 1071  sub _get_next_token ($) { Line 1138  sub _get_next_token ($) {
1138          redo A;                  redo A;        
1139        }        }
1140      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1141          ## XML5: "Tag attribute value before state".
1142    
1143        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1144          !!!cp (83);          !!!cp (83);
1145          ## Stay in the state          ## Stay in the state
# Line 1142  sub _get_next_token ($) { Line 1211  sub _get_next_token ($) {
1211        } else {        } else {
1212          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1213            !!!cp (93);            !!!cp (93);
1214              ## XML5: Not a parse error.
1215            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1216            } elsif ($self->{is_xml}) {
1217              !!!cp (93.1);
1218              ## XML5: No parse error.
1219              !!!parse-error (type => 'unquoted attr value'); ## TODO
1220          } else {          } else {
1221            !!!cp (94);            !!!cp (94);
1222          }          }
# Line 1152  sub _get_next_token ($) { Line 1226  sub _get_next_token ($) {
1226          redo A;          redo A;
1227        }        }
1228      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1229          ## XML5: "Tag attribute value double quoted state".
1230          
1231        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1232          !!!cp (95);          !!!cp (95);
1233            ## XML5: "Tag attribute name before state".
1234          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1235          !!!next-input-character;          !!!next-input-character;
1236          redo A;          redo A;
1237        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1238          !!!cp (96);          !!!cp (96);
1239            ## XML5: Not defined yet.
1240    
1241          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1242          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1243          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1193  sub _get_next_token ($) { Line 1272  sub _get_next_token ($) {
1272    
1273          redo A;          redo A;
1274        } else {        } else {
1275          !!!cp (100);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1276              !!!cp (100);
1277              ## XML5: Not a parse error.
1278              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1279            } else {
1280              !!!cp (100.1);
1281            }
1282          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1283          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1284                                q["&],                                q["&<],
1285                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1286    
1287          ## Stay in the state          ## Stay in the state
# Line 1204  sub _get_next_token ($) { Line 1289  sub _get_next_token ($) {
1289          redo A;          redo A;
1290        }        }
1291      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1292          ## XML5: "Tag attribute value single quoted state".
1293    
1294        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1295          !!!cp (101);          !!!cp (101);
1296            ## XML5: "Before attribute name state" (sic).
1297          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1298          !!!next-input-character;          !!!next-input-character;
1299          redo A;          redo A;
1300        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1301          !!!cp (102);          !!!cp (102);
1302            ## XML5: Not defined yet.
1303    
1304          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1305          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1306          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1245  sub _get_next_token ($) { Line 1335  sub _get_next_token ($) {
1335    
1336          redo A;          redo A;
1337        } else {        } else {
1338          !!!cp (106);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1339              !!!cp (106);
1340              ## XML5: Not a parse error.
1341              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1342            } else {
1343              !!!cp (106.1);
1344            }
1345          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1346          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1347                                q['&],                                q['&<],
1348                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1349    
1350          ## Stay in the state          ## Stay in the state
# Line 1256  sub _get_next_token ($) { Line 1352  sub _get_next_token ($) {
1352          redo A;          redo A;
1353        }        }
1354      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1355          ## XML5: "Tag attribute value unquoted state".
1356    
1357        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1358          !!!cp (107);          !!!cp (107);
1359            ## XML5: "Tag attribute name before state".
1360          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1361          !!!next-input-character;          !!!next-input-character;
1362          redo A;          redo A;
1363        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1364          !!!cp (108);          !!!cp (108);
1365    
1366            ## XML5: Not defined yet.
1367    
1368          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1369          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1370          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1326  sub _get_next_token ($) { Line 1428  sub _get_next_token ($) {
1428               0x003D => 1, # =               0x003D => 1, # =
1429              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1430            !!!cp (115);            !!!cp (115);
1431              ## XML5: Not a parse error.
1432            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1433          } else {          } else {
1434            !!!cp (116);            !!!cp (116);
# Line 1402  sub _get_next_token ($) { Line 1505  sub _get_next_token ($) {
1505          redo A;          redo A;
1506        }        }
1507      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1508          ## XML5: "Empty tag state".
1509    
1510        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1511          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
1512            !!!cp ('124.2');            !!!cp ('124.2');
# Line 1443  sub _get_next_token ($) { Line 1548  sub _get_next_token ($) {
1548          } else {          } else {
1549            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1550          }          }
1551            ## XML5: "Tag attribute name before state".
1552          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1553          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1554          ## Reconsume.          ## Reconsume.
# Line 1502  sub _get_next_token ($) { Line 1608  sub _get_next_token ($) {
1608          ## ASCII case-insensitive.          ## ASCII case-insensitive.
1609          !!!cp (130);          !!!cp (130);
1610          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
1611          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
1612          !!!next-input-character;          !!!next-input-character;
1613          redo A;          redo A;
1614        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
# Line 1511  sub _get_next_token ($) { Line 1617  sub _get_next_token ($) {
1617                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1618          !!!cp (135.4);                          !!!cp (135.4);                
1619          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
1620          $self->{s_kwd} = '[';          $self->{kwd} = '[';
1621          !!!next-input-character;          !!!next-input-character;
1622          redo A;          redo A;
1623        } else {        } else {
# Line 1561  sub _get_next_token ($) { Line 1667  sub _get_next_token ($) {
1667              0x0054, # T              0x0054, # T
1668              0x0059, # Y              0x0059, # Y
1669              0x0050, # P              0x0050, # P
1670            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
1671            $self->{nc} == [            $self->{nc} == [
1672              undef,              undef,
1673              0x006F, # o              0x006F, # o
# Line 1569  sub _get_next_token ($) { Line 1675  sub _get_next_token ($) {
1675              0x0074, # t              0x0074, # t
1676              0x0079, # y              0x0079, # y
1677              0x0070, # p              0x0070, # p
1678            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
1679          !!!cp (131);          !!!cp (131);
1680          ## Stay in the state.          ## Stay in the state.
1681          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1682          !!!next-input-character;          !!!next-input-character;
1683          redo A;          redo A;
1684        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
1685                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1686                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1687          if ($self->{s_kwd} ne 'DOCTYP') {          if ($self->{is_xml} and
1688                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1689            !!!cp (129);            !!!cp (129);
1690            ## XML5: case-sensitive.            ## XML5: case-sensitive.
1691            !!!parse-error (type => 'lowercase keyword', ## TODO            !!!parse-error (type => 'lowercase keyword', ## TODO
# Line 1600  sub _get_next_token ($) { Line 1707  sub _get_next_token ($) {
1707          !!!cp (132);                  !!!cp (132);        
1708          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1709                          line => $self->{line_prev},                          line => $self->{line_prev},
1710                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1711          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1712          ## Reconsume.          ## Reconsume.
1713          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1714                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1715                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1716                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1717                                   };                                   };
1718          redo A;          redo A;
1719        }        }
# Line 1617  sub _get_next_token ($) { Line 1724  sub _get_next_token ($) {
1724              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
1725              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
1726              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
1727            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
1728          !!!cp (135.1);          !!!cp (135.1);
1729          ## Stay in the state.          ## Stay in the state.
1730          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1731          !!!next-input-character;          !!!next-input-character;
1732          redo A;          redo A;
1733        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
1734                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1735          if ($self->{is_xml} and          if ($self->{is_xml} and
1736              not $self->{tainted} and              not $self->{tainted} and
# Line 1648  sub _get_next_token ($) { Line 1755  sub _get_next_token ($) {
1755          !!!cp (135.3);          !!!cp (135.3);
1756          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1757                          line => $self->{line_prev},                          line => $self->{line_prev},
1758                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1759          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1760          ## Reconsume.          ## Reconsume.
1761          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1762                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1763                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1764                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1765                                   };                                   };
1766          redo A;          redo A;
1767        }        }
# Line 1764  sub _get_next_token ($) { Line 1871  sub _get_next_token ($) {
1871        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1872          !!!cp (149);          !!!cp (149);
1873          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
         $self->{s_kwd} = '';  
1874          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1875          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1876          ## reconsume          ## reconsume
# Line 1828  sub _get_next_token ($) { Line 1934  sub _get_next_token ($) {
1934          redo A;          redo A;
1935        } else {        } else {
1936          !!!cp (156);          !!!cp (156);
1937            ## XML5: Unless EOF, switch to the bogus comment state.
1938          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
1939          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1940          ## reconsume          ## reconsume
1941          redo A;          redo A;
1942        }        }
1943      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1944          ## XML5: "DOCTYPE root name before state".
1945    
1946        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1947          !!!cp (157);          !!!cp (157);
1948          ## Stay in the state          ## Stay in the state
# Line 1841  sub _get_next_token ($) { Line 1950  sub _get_next_token ($) {
1950          redo A;          redo A;
1951        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1952          !!!cp (158);          !!!cp (158);
1953            ## XML5: No parse error.
1954          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1955          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1956          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 1859  sub _get_next_token ($) { Line 1969  sub _get_next_token ($) {
1969          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
1970    
1971          redo A;          redo A;
1972          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
1973            !!!cp (159.1);
1974            !!!parse-error (type => 'no DOCTYPE name');
1975            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1976            !!!next-input-character;
1977            redo A;
1978        } else {        } else {
1979          !!!cp (160);          !!!cp (160);
1980          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 1868  sub _get_next_token ($) { Line 1984  sub _get_next_token ($) {
1984          redo A;          redo A;
1985        }        }
1986      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1987  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
1988    
1989          ## ISSUE: Redundant "First," in the spec.
1990    
1991        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1992          !!!cp (161);          !!!cp (161);
1993          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 1894  sub _get_next_token ($) { Line 2013  sub _get_next_token ($) {
2013          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2014    
2015          redo A;          redo A;
2016          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2017            !!!cp (163.1);
2018            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2019            !!!next-input-character;
2020            redo A;
2021        } else {        } else {
2022          !!!cp (164);          !!!cp (164);
2023          $self->{ct}->{name}          $self->{ct}->{name}
# Line 1903  sub _get_next_token ($) { Line 2027  sub _get_next_token ($) {
2027          redo A;          redo A;
2028        }        }
2029      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2030          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2031          ## state", but implemented differently.
2032    
2033        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2034          !!!cp (165);          !!!cp (165);
2035          ## Stay in the state          ## Stay in the state
# Line 1930  sub _get_next_token ($) { Line 2057  sub _get_next_token ($) {
2057          redo A;          redo A;
2058        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2059                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2060            !!!cp (167.1);
2061          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2062          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2063          !!!next-input-character;          !!!next-input-character;
2064          redo A;          redo A;
2065        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2066                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2067            !!!cp (167.2);
2068          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2069          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2070            !!!next-input-character;
2071            redo A;
2072          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2073            !!!cp (167.3);
2074            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2075            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2076          !!!next-input-character;          !!!next-input-character;
2077          redo A;          redo A;
2078        } else {        } else {
# Line 1957  sub _get_next_token ($) { Line 2092  sub _get_next_token ($) {
2092              0x0042, # B              0x0042, # B
2093              0x004C, # L              0x004C, # L
2094              0x0049, # I              0x0049, # I
2095            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2096            $self->{nc} == [            $self->{nc} == [
2097              undef,              undef,
2098              0x0075, # u              0x0075, # u
2099              0x0062, # b              0x0062, # b
2100              0x006C, # l              0x006C, # l
2101              0x0069, # i              0x0069, # i
2102            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2103          !!!cp (175);          !!!cp (175);
2104          ## Stay in the state.          ## Stay in the state.
2105          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2106          !!!next-input-character;          !!!next-input-character;
2107          redo A;          redo A;
2108        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2109                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2110                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2111          !!!cp (168);          if ($self->{is_xml} and
2112                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2113              !!!cp (168.1);
2114              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2115                              text => 'PUBLIC',
2116                              line => $self->{line_prev},
2117                              column => $self->{column_prev} - 4);
2118            } else {
2119              !!!cp (168);
2120            }
2121          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2122          !!!next-input-character;          !!!next-input-character;
2123          redo A;          redo A;
# Line 1981  sub _get_next_token ($) { Line 2125  sub _get_next_token ($) {
2125          !!!cp (169);          !!!cp (169);
2126          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2127                          line => $self->{line_prev},                          line => $self->{line_prev},
2128                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2129          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2130    
2131          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 1996  sub _get_next_token ($) { Line 2140  sub _get_next_token ($) {
2140              0x0053, # S              0x0053, # S
2141              0x0054, # T              0x0054, # T
2142              0x0045, # E              0x0045, # E
2143            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2144            $self->{nc} == [            $self->{nc} == [
2145              undef,              undef,
2146              0x0079, # y              0x0079, # y
2147              0x0073, # s              0x0073, # s
2148              0x0074, # t              0x0074, # t
2149              0x0065, # e              0x0065, # e
2150            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2151          !!!cp (170);          !!!cp (170);
2152          ## Stay in the state.          ## Stay in the state.
2153          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2154          !!!next-input-character;          !!!next-input-character;
2155          redo A;          redo A;
2156        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2157                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2158                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2159          !!!cp (171);          if ($self->{is_xml} and
2160                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2161              !!!cp (171.1);
2162              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2163                              text => 'SYSTEM',
2164                              line => $self->{line_prev},
2165                              column => $self->{column_prev} - 4);
2166            } else {
2167              !!!cp (171);
2168            }
2169          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2170          !!!next-input-character;          !!!next-input-character;
2171          redo A;          redo A;
# Line 2020  sub _get_next_token ($) { Line 2173  sub _get_next_token ($) {
2173          !!!cp (172);          !!!cp (172);
2174          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2175                          line => $self->{line_prev},                          line => $self->{line_prev},
2176                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2177          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2178    
2179          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2069  sub _get_next_token ($) { Line 2222  sub _get_next_token ($) {
2222          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2223    
2224          redo A;          redo A;
2225          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2226            !!!cp (186.1);
2227            !!!parse-error (type => 'no PUBLIC literal');
2228            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2229            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2230            !!!next-input-character;
2231            redo A;
2232        } else {        } else {
2233          !!!cp (186);          !!!cp (186);
2234          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
# Line 2179  sub _get_next_token ($) { Line 2339  sub _get_next_token ($) {
2339          !!!next-input-character;          !!!next-input-character;
2340          redo A;          redo A;
2341        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2342          !!!cp (198);          if ($self->{is_xml}) {
2343              !!!cp (198.1);
2344              !!!parse-error (type => 'no SYSTEM literal');
2345            } else {
2346              !!!cp (198);
2347            }
2348          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2349          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2350          !!!next-input-character;          !!!next-input-character;
# Line 2199  sub _get_next_token ($) { Line 2364  sub _get_next_token ($) {
2364          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2365    
2366          redo A;          redo A;
2367          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2368            !!!cp (200.1);
2369            !!!parse-error (type => 'no SYSTEM literal');
2370            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2372            !!!next-input-character;
2373            redo A;
2374        } else {        } else {
2375          !!!cp (200);          !!!cp (200);
2376          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
# Line 2249  sub _get_next_token ($) { Line 2421  sub _get_next_token ($) {
2421          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2422    
2423          redo A;          redo A;
2424          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2425            !!!cp (206.1);
2426            !!!parse-error (type => 'no SYSTEM literal');
2427    
2428            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2429            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2430            !!!next-input-character;
2431            redo A;
2432        } else {        } else {
2433          !!!cp (206);          !!!cp (206);
2434          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
# Line 2264  sub _get_next_token ($) { Line 2444  sub _get_next_token ($) {
2444          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2445          !!!next-input-character;          !!!next-input-character;
2446          redo A;          redo A;
2447        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2448          !!!cp (208);          !!!cp (208);
2449          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2450    
# Line 2305  sub _get_next_token ($) { Line 2485  sub _get_next_token ($) {
2485          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2486          !!!next-input-character;          !!!next-input-character;
2487          redo A;          redo A;
2488        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2489          !!!cp (212);          !!!cp (212);
2490          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2491    
# Line 2366  sub _get_next_token ($) { Line 2546  sub _get_next_token ($) {
2546          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2547    
2548          redo A;          redo A;
2549          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2550            !!!cp (218.1);
2551            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2552            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2553            !!!next-input-character;
2554            redo A;
2555        } else {        } else {
2556          !!!cp (218);          !!!cp (218);
2557          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
# Line 2385  sub _get_next_token ($) { Line 2571  sub _get_next_token ($) {
2571          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2572    
2573          redo A;          redo A;
2574          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2575            if ($self->{ct}->{has_internal_subset}) { # DOCTYPE
2576              !!!cp (220.2);
2577              ## Stay in the state.
2578              !!!next-input-character;
2579              redo A;
2580            } else {
2581              !!!cp (220.1);
2582              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2583              $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2584              !!!next-input-character;
2585              redo A;
2586            }
2587        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2588          !!!cp (220);          !!!cp (220);
2589          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 2397  sub _get_next_token ($) { Line 2596  sub _get_next_token ($) {
2596        } else {        } else {
2597          !!!cp (221);          !!!cp (221);
2598          my $s = '';          my $s = '';
2599          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
2600    
2601          ## Stay in the state          ## Stay in the state
2602          !!!next-input-character;          !!!next-input-character;
# Line 2505  sub _get_next_token ($) { Line 2704  sub _get_next_token ($) {
2704        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
2705          !!!cp (999);          !!!cp (999);
2706          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
2707          $self->{s_kwd} = '#';          $self->{kwd} = '#';
2708          !!!next-input-character;          !!!next-input-character;
2709          redo A;          redo A;
2710        } elsif ((0x0041 <= $self->{nc} and        } elsif ((0x0041 <= $self->{nc} and
# Line 2515  sub _get_next_token ($) { Line 2714  sub _get_next_token ($) {
2714          !!!cp (998);          !!!cp (998);
2715          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
2716          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
2717          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2718          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
2719          $self->{entity__match} = 0;          $self->{entity__match} = 0;
2720          !!!next-input-character;          !!!next-input-character;
2721          redo A;          redo A;
# Line 2556  sub _get_next_token ($) { Line 2755  sub _get_next_token ($) {
2755            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
2756          !!!cp (995);          !!!cp (995);
2757          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
2758          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2759          !!!next-input-character;          !!!next-input-character;
2760          redo A;          redo A;
2761        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
2762                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
2763          !!!cp (994);          !!!cp (994);
2764          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
2765          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
2766          !!!next-input-character;          !!!next-input-character;
2767          redo A;          redo A;
2768        } else {        } else {
# Line 2599  sub _get_next_token ($) { Line 2798  sub _get_next_token ($) {
2798        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
2799            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
2800          !!!cp (1012);          !!!cp (1012);
2801          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
2802          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2803                    
2804          ## Stay in the state.          ## Stay in the state.
2805          !!!next-input-character;          !!!next-input-character;
# Line 2616  sub _get_next_token ($) { Line 2815  sub _get_next_token ($) {
2815          #          #
2816        }        }
2817    
2818        my $code = $self->{s_kwd};        my $code = $self->{kwd};
2819        my $l = $self->{line_prev};        my $l = $self->{line_prev};
2820        my $c = $self->{column_prev};        my $c = $self->{column_prev};
2821        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2659  sub _get_next_token ($) { Line 2858  sub _get_next_token ($) {
2858          # 0..9, A..F, a..f          # 0..9, A..F, a..f
2859          !!!cp (990);          !!!cp (990);
2860          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
2861          $self->{s_kwd} = 0;          $self->{kwd} = 0;
2862          ## Reconsume.          ## Reconsume.
2863          redo A;          redo A;
2864        } else {        } else {
# Line 2677  sub _get_next_token ($) { Line 2876  sub _get_next_token ($) {
2876            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2877            ## Reconsume.            ## Reconsume.
2878            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2879                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
2880                      line => $self->{line_prev},                      line => $self->{line_prev},
2881                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
2882                     });                     });
2883            redo A;            redo A;
2884          } else {          } else {
2885            !!!cp (989);            !!!cp (989);
2886            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
2887            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2888            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2889            ## Reconsume.            ## Reconsume.
# Line 2695  sub _get_next_token ($) { Line 2894  sub _get_next_token ($) {
2894        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2895          # 0..9          # 0..9
2896          !!!cp (1002);          !!!cp (1002);
2897          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2898          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2899          ## Stay in the state.          ## Stay in the state.
2900          !!!next-input-character;          !!!next-input-character;
2901          redo A;          redo A;
2902        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
2903                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
2904          !!!cp (1003);          !!!cp (1003);
2905          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2906          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
2907          ## Stay in the state.          ## Stay in the state.
2908          !!!next-input-character;          !!!next-input-character;
2909          redo A;          redo A;
2910        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
2911                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
2912          !!!cp (1004);          !!!cp (1004);
2913          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2914          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
2915          ## Stay in the state.          ## Stay in the state.
2916          !!!next-input-character;          !!!next-input-character;
2917          redo A;          redo A;
# Line 2729  sub _get_next_token ($) { Line 2928  sub _get_next_token ($) {
2928          #          #
2929        }        }
2930    
2931        my $code = $self->{s_kwd};        my $code = $self->{kwd};
2932        my $l = $self->{line_prev};        my $l = $self->{line_prev};
2933        my $c = $self->{column_prev};        my $c = $self->{column_prev};
2934        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2766  sub _get_next_token ($) { Line 2965  sub _get_next_token ($) {
2965          redo A;          redo A;
2966        }        }
2967      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
2968        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
2969            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
2970            ((0x0041 <= $self->{nc} and # A            ((0x0041 <= $self->{nc} and # A
2971              $self->{nc} <= 0x005A) or # Z              $self->{nc} <= 0x005A) or # Z
# Line 2776  sub _get_next_token ($) { Line 2975  sub _get_next_token ($) {
2975              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
2976             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
2977          our $EntityChar;          our $EntityChar;
2978          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2979          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
2980            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
2981              !!!cp (1020);              !!!cp (1020);
2982              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
2983              $self->{entity__match} = 1;              $self->{entity__match} = 1;
2984              !!!next-input-character;              !!!next-input-character;
2985              #              #
2986            } else {            } else {
2987              !!!cp (1021);              !!!cp (1021);
2988              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
2989              $self->{entity__match} = -1;              $self->{entity__match} = -1;
2990              ## Stay in the state.              ## Stay in the state.
2991              !!!next-input-character;              !!!next-input-character;
# Line 2814  sub _get_next_token ($) { Line 3013  sub _get_next_token ($) {
3013          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
3014              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
3015            !!!cp (1024);            !!!cp (1024);
3016            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
3017            #            #
3018          } else {          } else {
3019            !!!cp (1025);            !!!cp (1025);
# Line 2826  sub _get_next_token ($) { Line 3025  sub _get_next_token ($) {
3025          !!!cp (1026);          !!!cp (1026);
3026          !!!parse-error (type => 'bare ero',          !!!parse-error (type => 'bare ero',
3027                          line => $self->{line_prev},                          line => $self->{line_prev},
3028                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
3029          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
3030          #          #
3031        }        }
3032        
# Line 2850  sub _get_next_token ($) { Line 3049  sub _get_next_token ($) {
3049                    data => $data,                    data => $data,
3050                    has_reference => $has_ref,                    has_reference => $has_ref,
3051                    line => $self->{line_prev},                    line => $self->{line_prev},
3052                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
3053                   });                   });
3054          redo A;          redo A;
3055        } else {        } else {
# Line 2990  sub _get_next_token ($) { Line 3189  sub _get_next_token ($) {
3189          ## Reprocess.          ## Reprocess.
3190          redo A;          redo A;
3191        }        }
3192    
3193        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3194          if ($self->{nc} == 0x003C) { # <
3195            ## TODO:
3196            !!!next-input-character;
3197            redo A;
3198          } elsif ($self->{nc} == 0x0025) { # %
3199            ## XML5: Not defined yet.
3200    
3201            ## TODO:
3202            !!!next-input-character;
3203            redo A;
3204          } elsif ($self->{nc} == 0x005D) { # ]
3205            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3206            !!!next-input-character;
3207            redo A;
3208          } elsif ($is_space->{$self->{nc}}) {
3209            ## Stay in the state.
3210            !!!next-input-character;
3211            redo A;
3212          } elsif ($self->{nc} == -1) {
3213            !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3214            $self->{state} = DATA_STATE;
3215            $self->{s_kwd} = '';
3216            ## Reconsume.
3217            !!!emit ($self->{ct}); # DOCTYPE
3218            redo A;
3219          } else {
3220            unless ($self->{internal_subset_tainted}) {
3221              ## XML5: No parse error.
3222              !!!parse-error (type => 'string in internal subset');
3223              $self->{internal_subset_tainted} = 1;
3224            }
3225            ## Stay in the state.
3226            !!!next-input-character;
3227            redo A;
3228          }
3229        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3230          if ($self->{nc} == 0x003E) { # >
3231            $self->{state} = DATA_STATE;
3232            $self->{s_kwd} = '';
3233            !!!next-input-character;
3234            !!!emit ($self->{ct}); # DOCTYPE
3235            redo A;
3236          } elsif ($self->{nc} == -1) {
3237            !!!parse-error (type => 'unclosed DOCTYPE');
3238            $self->{state} = DATA_STATE;
3239            $self->{s_kwd} = '';
3240            ## Reconsume.
3241            !!!emit ($self->{ct}); # DOCTYPE
3242            redo A;
3243          } else {
3244            ## XML5: No parse error and stay in the state.
3245            !!!parse-error (type => 'string after internal subset'); ## TODO: type
3246    
3247            $self->{state} = BOGUS_DOCTYPE_STATE;
3248            !!!next-input-character;
3249            redo A;
3250          }
3251                    
3252      } else {      } else {
3253        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.12

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24