/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.9 by wakaba, Wed Oct 15 08:05:47 2008 UTC revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC
# Line 206  sub _initialize_tokenizer ($) { Line 206  sub _initialize_tokenizer ($) {
206    
207  ## A token has:  ## A token has:
208  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
209  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
210  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
211  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
212    ##   ->{target} (PI_TOKEN)
213  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
214  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
215  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 216  sub _initialize_tokenizer ($) { Line 217  sub _initialize_tokenizer ($) {
217  ##        ->{name}  ##        ->{name}
218  ##        ->{value}  ##        ->{value}
219  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
220  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
221    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
222  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
223    ##   ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Next attribute's index - 1.
224  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
225  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
226  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 437  sub _get_next_token ($) { Line 440  sub _get_next_token ($) {
440        !!!emit ($token);        !!!emit ($token);
441        redo A;        redo A;
442      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
443          ## XML5: "tag state".
444    
445        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
446          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
447            !!!cp (15);            !!!cp (15);
# Line 559  sub _get_next_token ($) { Line 564  sub _get_next_token ($) {
564        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
565        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
566    
567          ## XML5: "end tag state".
568    
569        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
570        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
571          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
# Line 600  sub _get_next_token ($) { Line 607  sub _get_next_token ($) {
607          !!!next-input-character;          !!!next-input-character;
608          redo A;          redo A;
609        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (31);  
610          !!!parse-error (type => 'empty end tag',          !!!parse-error (type => 'empty end tag',
611                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
612                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
613          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
614          $self->{s_kwd} = '';          $self->{s_kwd} = '';
615          !!!next-input-character;          if ($self->{is_xml}) {
616              !!!cp (31);
617              ## XML5: No parse error.
618              
619              ## NOTE: This parser raises a parse error, since it supports
620              ## XML1, not XML5.
621    
622              ## NOTE: A short end tag token.
623              my $ct = {type => END_TAG_TOKEN,
624                        tag_name => '',
625                        line => $self->{line_prev},
626                        column => $self->{column_prev} - 1,
627                       };
628              !!!next-input-character;
629              !!!emit ($ct);
630            } else {
631              !!!cp (31.1);
632              !!!next-input-character;
633            }
634          redo A;          redo A;
635        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
636          !!!cp (32);          !!!cp (32);
# Line 620  sub _get_next_token ($) { Line 644  sub _get_next_token ($) {
644                   });                   });
645    
646          redo A;          redo A;
647        } else {        } elsif (not $self->{is_xml} or
648                   $is_space->{$self->{nc}}) {
649          !!!cp (33);          !!!cp (33);
650          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag',
651                            line => $self->{line_prev}, # "<" of "</"
652                            column => $self->{column_prev} - 1);
653          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
654          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
655                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 635  sub _get_next_token ($) { Line 662  sub _get_next_token ($) {
662          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
663          ## "bogus comment state" entry.          ## "bogus comment state" entry.
664          redo A;          redo A;
665          } else {
666            ## XML5: "</:" is a parse error.
667            !!!cp (30.1);
668            $self->{ct} = {type => END_TAG_TOKEN,
669                           tag_name => chr ($self->{nc}),
670                           line => $l, column => $c};
671            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
672            !!!next-input-character;
673            redo A;
674        }        }
675      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
676        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
# Line 765  sub _get_next_token ($) { Line 801  sub _get_next_token ($) {
801          redo A;          redo A;
802        }        }
803      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
804          ## XML5: "Tag attribute name before state".
805    
806        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
807          !!!cp (45);          !!!cp (45);
808          ## Stay in the state          ## Stay in the state
# Line 837  sub _get_next_token ($) { Line 875  sub _get_next_token ($) {
875               0x003D => 1, # =               0x003D => 1, # =
876              }->{$self->{nc}}) {              }->{$self->{nc}}) {
877            !!!cp (55);            !!!cp (55);
878              ## XML5: Not a parse error.
879            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
880          } else {          } else {
881            !!!cp (56);            !!!cp (56);
882              ## XML5: ":" raises a parse error and is ignored.
883          }          }
884          $self->{ca}          $self->{ca}
885              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 850  sub _get_next_token ($) { Line 890  sub _get_next_token ($) {
890          redo A;          redo A;
891        }        }
892      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
893          ## XML5: "Tag attribute name state".
894    
895        my $before_leave = sub {        my $before_leave = sub {
896          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
897              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 860  sub _get_next_token ($) { Line 902  sub _get_next_token ($) {
902            !!!cp (58);            !!!cp (58);
903            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
904              = $self->{ca};              = $self->{ca};
905              $self->{ca}->{index} = ++$self->{ct}->{last_index};
906          }          }
907        }; # $before_leave        }; # $before_leave
908    
# Line 876  sub _get_next_token ($) { Line 919  sub _get_next_token ($) {
919          !!!next-input-character;          !!!next-input-character;
920          redo A;          redo A;
921        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
922            if ($self->{is_xml}) {
923              !!!cp (60.1);
924              ## XML5: Not a parse error.
925              !!!parse-error (type => 'no attr value'); ## TODO: type
926            } else {
927              !!!cp (60.2);
928            }
929    
930          $before_leave->();          $before_leave->();
931          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
932            !!!cp (61);            !!!cp (61);
# Line 905  sub _get_next_token ($) { Line 956  sub _get_next_token ($) {
956          !!!next-input-character;          !!!next-input-character;
957          redo A;          redo A;
958        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
959          !!!cp (64);          if ($self->{is_xml}) {
960              !!!cp (64);
961              ## XML5: Not a parse error.
962              !!!parse-error (type => 'no attr value'); ## TODO: type
963            } else {
964              !!!cp (64.1);
965            }
966            
967          $before_leave->();          $before_leave->();
968          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
969          !!!next-input-character;          !!!next-input-character;
# Line 939  sub _get_next_token ($) { Line 997  sub _get_next_token ($) {
997          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
998              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
999            !!!cp (69);            !!!cp (69);
1000              ## XML5: Not a parse error.
1001            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1002          } else {          } else {
1003            !!!cp (70);            !!!cp (70);
# Line 949  sub _get_next_token ($) { Line 1008  sub _get_next_token ($) {
1008          redo A;          redo A;
1009        }        }
1010      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1011          ## XML5: "Tag attribute name after state".
1012          
1013        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1014          !!!cp (71);          !!!cp (71);
1015          ## Stay in the state          ## Stay in the state
# Line 960  sub _get_next_token ($) { Line 1021  sub _get_next_token ($) {
1021          !!!next-input-character;          !!!next-input-character;
1022          redo A;          redo A;
1023        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1024            if ($self->{is_xml}) {
1025              !!!cp (72.1);
1026              ## XML5: Not a parse error.
1027              !!!parse-error (type => 'no attr value'); ## TODO: type
1028            } else {
1029              !!!cp (72.2);
1030            }
1031    
1032          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1033            !!!cp (73);            !!!cp (73);
1034            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 993  sub _get_next_token ($) { Line 1062  sub _get_next_token ($) {
1062          !!!next-input-character;          !!!next-input-character;
1063          redo A;          redo A;
1064        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1065          !!!cp (77);          if ($self->{is_xml}) {
1066              !!!cp (77);
1067              ## XML5: Not a parse error.
1068              !!!parse-error (type => 'no attr value'); ## TODO: type
1069            } else {
1070              !!!cp (77.1);
1071            }
1072            
1073          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1074          !!!next-input-character;          !!!next-input-character;
1075          redo A;          redo A;
# Line 1022  sub _get_next_token ($) { Line 1098  sub _get_next_token ($) {
1098    
1099          redo A;          redo A;
1100        } else {        } else {
1101            if ($self->{is_xml}) {
1102              !!!cp (78.1);
1103              ## XML5: Not a parse error.
1104              !!!parse-error (type => 'no attr value'); ## TODO: type
1105            } else {
1106              !!!cp (78.2);
1107            }
1108    
1109          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1110              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1111            !!!cp (78);            !!!cp (78);
1112              ## XML5: Not a parse error.
1113            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1114          } else {          } else {
1115            !!!cp (82);            !!!cp (82);
# Line 1038  sub _get_next_token ($) { Line 1123  sub _get_next_token ($) {
1123          redo A;                  redo A;        
1124        }        }
1125      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1126          ## XML5: "Tag attribute value before state".
1127    
1128        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1129          !!!cp (83);          !!!cp (83);
1130          ## Stay in the state          ## Stay in the state
# Line 1109  sub _get_next_token ($) { Line 1196  sub _get_next_token ($) {
1196        } else {        } else {
1197          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1198            !!!cp (93);            !!!cp (93);
1199              ## XML5: Not a parse error.
1200            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1201            } elsif ($self->{is_xml}) {
1202              !!!cp (93.1);
1203              ## XML5: No parse error.
1204              !!!parse-error (type => 'unquoted attr value'); ## TODO
1205          } else {          } else {
1206            !!!cp (94);            !!!cp (94);
1207          }          }
# Line 1119  sub _get_next_token ($) { Line 1211  sub _get_next_token ($) {
1211          redo A;          redo A;
1212        }        }
1213      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1214          ## XML5: "Tag attribute value double quoted state".
1215          
1216        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1217          !!!cp (95);          !!!cp (95);
1218            ## XML5: "Tag attribute name before state".
1219          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1220          !!!next-input-character;          !!!next-input-character;
1221          redo A;          redo A;
1222        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1223          !!!cp (96);          !!!cp (96);
1224            ## XML5: Not defined yet.
1225    
1226          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1227          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1228          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1160  sub _get_next_token ($) { Line 1257  sub _get_next_token ($) {
1257    
1258          redo A;          redo A;
1259        } else {        } else {
1260          !!!cp (100);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1261              !!!cp (100);
1262              ## XML5: Not a parse error.
1263              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1264            } else {
1265              !!!cp (100.1);
1266            }
1267          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1268          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1269                                q["&],                                q["&<],
1270                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1271    
1272          ## Stay in the state          ## Stay in the state
# Line 1171  sub _get_next_token ($) { Line 1274  sub _get_next_token ($) {
1274          redo A;          redo A;
1275        }        }
1276      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1277          ## XML5: "Tag attribute value single quoted state".
1278    
1279        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1280          !!!cp (101);          !!!cp (101);
1281            ## XML5: "Before attribute name state" (sic).
1282          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1283          !!!next-input-character;          !!!next-input-character;
1284          redo A;          redo A;
1285        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1286          !!!cp (102);          !!!cp (102);
1287            ## XML5: Not defined yet.
1288    
1289          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1290          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1291          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1212  sub _get_next_token ($) { Line 1320  sub _get_next_token ($) {
1320    
1321          redo A;          redo A;
1322        } else {        } else {
1323          !!!cp (106);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1324              !!!cp (106);
1325              ## XML5: Not a parse error.
1326              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1327            } else {
1328              !!!cp (106.1);
1329            }
1330          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1331          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1332                                q['&],                                q['&<],
1333                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1334    
1335          ## Stay in the state          ## Stay in the state
# Line 1223  sub _get_next_token ($) { Line 1337  sub _get_next_token ($) {
1337          redo A;          redo A;
1338        }        }
1339      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1340          ## XML5: "Tag attribute value unquoted state".
1341    
1342        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1343          !!!cp (107);          !!!cp (107);
1344            ## XML5: "Tag attribute name before state".
1345          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1346          !!!next-input-character;          !!!next-input-character;
1347          redo A;          redo A;
1348        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1349          !!!cp (108);          !!!cp (108);
1350    
1351            ## XML5: Not defined yet.
1352    
1353          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1354          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1355          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1293  sub _get_next_token ($) { Line 1413  sub _get_next_token ($) {
1413               0x003D => 1, # =               0x003D => 1, # =
1414              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1415            !!!cp (115);            !!!cp (115);
1416              ## XML5: Not a parse error.
1417            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1418          } else {          } else {
1419            !!!cp (116);            !!!cp (116);
# Line 1369  sub _get_next_token ($) { Line 1490  sub _get_next_token ($) {
1490          redo A;          redo A;
1491        }        }
1492      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1493          ## XML5: "Empty tag state".
1494    
1495        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1496          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
1497            !!!cp ('124.2');            !!!cp ('124.2');
# Line 1410  sub _get_next_token ($) { Line 1533  sub _get_next_token ($) {
1533          } else {          } else {
1534            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1535          }          }
1536            ## XML5: "Tag attribute name before state".
1537          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1538          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1539          ## Reconsume.          ## Reconsume.
# Line 1502  sub _get_next_token ($) { Line 1626  sub _get_next_token ($) {
1626                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1627                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
1628                                   };                                   };
1629          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1630          !!!next-input-character;          !!!next-input-character;
1631          redo A;          redo A;
1632        } else {        } else {
# Line 1545  sub _get_next_token ($) { Line 1669  sub _get_next_token ($) {
1669        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{s_kwd}) == 6 and
1670                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1671                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1672          !!!cp (129);          if ($self->{s_kwd} ne 'DOCTYP') {
1673              !!!cp (129);
1674              ## XML5: case-sensitive.
1675              !!!parse-error (type => 'lowercase keyword', ## TODO
1676                              text => 'DOCTYPE',
1677                              line => $self->{line_prev},
1678                              column => $self->{column_prev} - 5);
1679            } else {
1680              !!!cp (129.1);
1681            }
1682          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
1683          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
1684                                    quirks => 1,                                    quirks => 1,
# Line 1712  sub _get_next_token ($) { Line 1845  sub _get_next_token ($) {
1845          redo A;          redo A;
1846        }        }
1847      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1848          ## XML5: "comment dash state".
1849    
1850        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1851          !!!cp (148);          !!!cp (148);
1852          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 1747  sub _get_next_token ($) { Line 1882  sub _get_next_token ($) {
1882          redo A;          redo A;
1883        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
1884          !!!cp (152);          !!!cp (152);
1885            ## XML5: Not a parse error.
1886          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1887                          line => $self->{line_prev},                          line => $self->{line_prev},
1888                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1766  sub _get_next_token ($) { Line 1902  sub _get_next_token ($) {
1902          redo A;          redo A;
1903        } else {        } else {
1904          !!!cp (154);          !!!cp (154);
1905            ## XML5: Not a parse error.
1906          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1907                          line => $self->{line_prev},                          line => $self->{line_prev},
1908                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2361  sub _get_next_token ($) { Line 2498  sub _get_next_token ($) {
2498        ## NOTE: "CDATA section state" in the spec is jointly implemented        ## NOTE: "CDATA section state" in the spec is jointly implemented
2499        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2500        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
2501    
2502          ## XML5: "CDATA state".
2503                
2504        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2505          !!!cp (221.1);          !!!cp (221.1);
# Line 2377  sub _get_next_token ($) { Line 2516  sub _get_next_token ($) {
2516    
2517          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2518          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2519          !!!next-input-character;          ## Reconsume.
2520          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2521            !!!cp (221.2);            !!!cp (221.2);
2522            !!!emit ($self->{ct}); # character            !!!emit ($self->{ct}); # character
# Line 2400  sub _get_next_token ($) { Line 2539  sub _get_next_token ($) {
2539    
2540        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
2541      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2542          ## XML5: "CDATA bracket state".
2543    
2544        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2545          !!!cp (221.5);          !!!cp (221.5);
2546          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 2407  sub _get_next_token ($) { Line 2548  sub _get_next_token ($) {
2548          redo A;          redo A;
2549        } else {        } else {
2550          !!!cp (221.6);          !!!cp (221.6);
2551            ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
2552          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
2553          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2554          ## Reconsume.          ## Reconsume.
2555          redo A;          redo A;
2556        }        }
2557      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2558          ## XML5: "CDATA end state".
2559    
2560        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2561          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2562          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2435  sub _get_next_token ($) { Line 2579  sub _get_next_token ($) {
2579          !!!cp (221.11);          !!!cp (221.11);
2580          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
2581          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
2582          ## Reconsume.          ## Reconsume. ## XML5: Emit.
2583          redo A;          redo A;
2584        }        }
2585      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.11

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24