/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10 by wakaba, Wed Oct 15 08:51:02 2008 UTC revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC
# Line 206  sub _initialize_tokenizer ($) { Line 206  sub _initialize_tokenizer ($) {
206    
207  ## A token has:  ## A token has:
208  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
209  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
210  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
211  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
212    ##   ->{target} (PI_TOKEN)
213  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
214  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
215  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 216  sub _initialize_tokenizer ($) { Line 217  sub _initialize_tokenizer ($) {
217  ##        ->{name}  ##        ->{name}
218  ##        ->{value}  ##        ->{value}
219  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
220  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
221    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
222  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
223    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
224  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
225  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
226  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 798  sub _get_next_token ($) { Line 801  sub _get_next_token ($) {
801          redo A;          redo A;
802        }        }
803      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
804          ## XML5: "Tag attribute name before state".
805    
806        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
807          !!!cp (45);          !!!cp (45);
808          ## Stay in the state          ## Stay in the state
# Line 870  sub _get_next_token ($) { Line 875  sub _get_next_token ($) {
875               0x003D => 1, # =               0x003D => 1, # =
876              }->{$self->{nc}}) {              }->{$self->{nc}}) {
877            !!!cp (55);            !!!cp (55);
878              ## XML5: Not a parse error.
879            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
880          } else {          } else {
881            !!!cp (56);            !!!cp (56);
882              ## XML5: ":" raises a parse error and is ignored.
883          }          }
884          $self->{ca}          $self->{ca}
885              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 883  sub _get_next_token ($) { Line 890  sub _get_next_token ($) {
890          redo A;          redo A;
891        }        }
892      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
893          ## XML5: "Tag attribute name state".
894    
895        my $before_leave = sub {        my $before_leave = sub {
896          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
897              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 893  sub _get_next_token ($) { Line 902  sub _get_next_token ($) {
902            !!!cp (58);            !!!cp (58);
903            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
904              = $self->{ca};              = $self->{ca};
905              $self->{ca}->{index} = ++$self->{ct}->{last_index};
906          }          }
907        }; # $before_leave        }; # $before_leave
908    
# Line 909  sub _get_next_token ($) { Line 919  sub _get_next_token ($) {
919          !!!next-input-character;          !!!next-input-character;
920          redo A;          redo A;
921        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
922            if ($self->{is_xml}) {
923              !!!cp (60.1);
924              ## XML5: Not a parse error.
925              !!!parse-error (type => 'no attr value'); ## TODO: type
926            } else {
927              !!!cp (60.2);
928            }
929    
930          $before_leave->();          $before_leave->();
931          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
932            !!!cp (61);            !!!cp (61);
# Line 938  sub _get_next_token ($) { Line 956  sub _get_next_token ($) {
956          !!!next-input-character;          !!!next-input-character;
957          redo A;          redo A;
958        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
959          !!!cp (64);          if ($self->{is_xml}) {
960              !!!cp (64);
961              ## XML5: Not a parse error.
962              !!!parse-error (type => 'no attr value'); ## TODO: type
963            } else {
964              !!!cp (64.1);
965            }
966            
967          $before_leave->();          $before_leave->();
968          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
969          !!!next-input-character;          !!!next-input-character;
# Line 972  sub _get_next_token ($) { Line 997  sub _get_next_token ($) {
997          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
998              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
999            !!!cp (69);            !!!cp (69);
1000              ## XML5: Not a parse error.
1001            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1002          } else {          } else {
1003            !!!cp (70);            !!!cp (70);
# Line 982  sub _get_next_token ($) { Line 1008  sub _get_next_token ($) {
1008          redo A;          redo A;
1009        }        }
1010      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1011          ## XML5: "Tag attribute name after state".
1012          
1013        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1014          !!!cp (71);          !!!cp (71);
1015          ## Stay in the state          ## Stay in the state
# Line 993  sub _get_next_token ($) { Line 1021  sub _get_next_token ($) {
1021          !!!next-input-character;          !!!next-input-character;
1022          redo A;          redo A;
1023        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1024            if ($self->{is_xml}) {
1025              !!!cp (72.1);
1026              ## XML5: Not a parse error.
1027              !!!parse-error (type => 'no attr value'); ## TODO: type
1028            } else {
1029              !!!cp (72.2);
1030            }
1031    
1032          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1033            !!!cp (73);            !!!cp (73);
1034            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1026  sub _get_next_token ($) { Line 1062  sub _get_next_token ($) {
1062          !!!next-input-character;          !!!next-input-character;
1063          redo A;          redo A;
1064        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1065          !!!cp (77);          if ($self->{is_xml}) {
1066              !!!cp (77);
1067              ## XML5: Not a parse error.
1068              !!!parse-error (type => 'no attr value'); ## TODO: type
1069            } else {
1070              !!!cp (77.1);
1071            }
1072            
1073          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1074          !!!next-input-character;          !!!next-input-character;
1075          redo A;          redo A;
# Line 1055  sub _get_next_token ($) { Line 1098  sub _get_next_token ($) {
1098    
1099          redo A;          redo A;
1100        } else {        } else {
1101            if ($self->{is_xml}) {
1102              !!!cp (78.1);
1103              ## XML5: Not a parse error.
1104              !!!parse-error (type => 'no attr value'); ## TODO: type
1105            } else {
1106              !!!cp (78.2);
1107            }
1108    
1109          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1110              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1111            !!!cp (78);            !!!cp (78);
1112              ## XML5: Not a parse error.
1113            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1114          } else {          } else {
1115            !!!cp (82);            !!!cp (82);
# Line 1071  sub _get_next_token ($) { Line 1123  sub _get_next_token ($) {
1123          redo A;                  redo A;        
1124        }        }
1125      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1126          ## XML5: "Tag attribute value before state".
1127    
1128        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1129          !!!cp (83);          !!!cp (83);
1130          ## Stay in the state          ## Stay in the state
# Line 1142  sub _get_next_token ($) { Line 1196  sub _get_next_token ($) {
1196        } else {        } else {
1197          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1198            !!!cp (93);            !!!cp (93);
1199              ## XML5: Not a parse error.
1200            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1201            } elsif ($self->{is_xml}) {
1202              !!!cp (93.1);
1203              ## XML5: Not a parse error.
1204              !!!parse-error (type => 'unquoted attr value'); ## TODO
1205          } else {          } else {
1206            !!!cp (94);            !!!cp (94);
1207          }          }
# Line 1152  sub _get_next_token ($) { Line 1211  sub _get_next_token ($) {
1211          redo A;          redo A;
1212        }        }
1213      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1214          ## XML5: "Tag attribute value double quoted state".
1215          
1216        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1217          !!!cp (95);          !!!cp (95);
1218            ## XML5: "Tag attribute name before state".
1219          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1220          !!!next-input-character;          !!!next-input-character;
1221          redo A;          redo A;
1222        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1223          !!!cp (96);          !!!cp (96);
1224            ## XML5: Not defined yet.
1225    
1226          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1227          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1228          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1193  sub _get_next_token ($) { Line 1257  sub _get_next_token ($) {
1257    
1258          redo A;          redo A;
1259        } else {        } else {
1260          !!!cp (100);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1261              !!!cp (100);
1262              ## XML5: Not a parse error.
1263              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1264            } else {
1265              !!!cp (100.1);
1266            }
1267          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1268          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1269                                q["&],                                q["&<],
1270                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1271    
1272          ## Stay in the state          ## Stay in the state
# Line 1204  sub _get_next_token ($) { Line 1274  sub _get_next_token ($) {
1274          redo A;          redo A;
1275        }        }
1276      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1277          ## XML5: "Tag attribute value single quoted state".
1278    
1279        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1280          !!!cp (101);          !!!cp (101);
1281            ## XML5: "Before attribute name state" (sic).
1282          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1283          !!!next-input-character;          !!!next-input-character;
1284          redo A;          redo A;
1285        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1286          !!!cp (102);          !!!cp (102);
1287            ## XML5: Not defined yet.
1288    
1289          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1290          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1291          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1245  sub _get_next_token ($) { Line 1320  sub _get_next_token ($) {
1320    
1321          redo A;          redo A;
1322        } else {        } else {
1323          !!!cp (106);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1324              !!!cp (106);
1325              ## XML5: Not a parse error.
1326              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1327            } else {
1328              !!!cp (106.1);
1329            }
1330          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1331          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1332                                q['&],                                q['&<],
1333                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1334    
1335          ## Stay in the state          ## Stay in the state
# Line 1256  sub _get_next_token ($) { Line 1337  sub _get_next_token ($) {
1337          redo A;          redo A;
1338        }        }
1339      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1340          ## XML5: "Tag attribute value unquoted state".
1341    
1342        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1343          !!!cp (107);          !!!cp (107);
1344            ## XML5: "Tag attribute name before state".
1345          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1346          !!!next-input-character;          !!!next-input-character;
1347          redo A;          redo A;
1348        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1349          !!!cp (108);          !!!cp (108);
1350    
1351            ## XML5: Not defined yet.
1352    
1353          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1354          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1355          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1326  sub _get_next_token ($) { Line 1413  sub _get_next_token ($) {
1413               0x003D => 1, # =               0x003D => 1, # =
1414              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1415            !!!cp (115);            !!!cp (115);
1416              ## XML5: Not a parse error.
1417            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1418          } else {          } else {
1419            !!!cp (116);            !!!cp (116);
# Line 1402  sub _get_next_token ($) { Line 1490  sub _get_next_token ($) {
1490          redo A;          redo A;
1491        }        }
1492      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1493          ## XML5: "Empty tag state".
1494    
1495        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1496          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
1497            !!!cp ('124.2');            !!!cp ('124.2');
# Line 1443  sub _get_next_token ($) { Line 1533  sub _get_next_token ($) {
1533          } else {          } else {
1534            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1535          }          }
1536            ## XML5: "Tag attribute name before state".
1537          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1538          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1539          ## Reconsume.          ## Reconsume.

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.11

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24