| 216 |
|
|
| 217 |
## A token has: |
## A token has: |
| 218 |
## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN, |
## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN, |
| 219 |
## CHARACTER_TOKEN, or END_OF_FILE_TOKEN |
## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN |
| 220 |
## ->{name} (DOCTYPE_TOKEN) |
## ->{name} (DOCTYPE_TOKEN) |
| 221 |
## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN) |
## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN) |
| 222 |
|
## ->{target} (PI_TOKEN) |
| 223 |
## ->{pubid} (DOCTYPE_TOKEN) |
## ->{pubid} (DOCTYPE_TOKEN) |
| 224 |
## ->{sysid} (DOCTYPE_TOKEN) |
## ->{sysid} (DOCTYPE_TOKEN) |
| 225 |
## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag |
## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag |
| 227 |
## ->{name} |
## ->{name} |
| 228 |
## ->{value} |
## ->{value} |
| 229 |
## ->{has_reference} == 1 or 0 |
## ->{has_reference} == 1 or 0 |
| 230 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{index}: Index of the attribute in a tag. |
| 231 |
|
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN) |
| 232 |
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
| 233 |
|
## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1. |
| 234 |
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|. |
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|. |
| 235 |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
| 236 |
## while the token is pushed back to the stack. |
## while the token is pushed back to the stack. |
| 1061 |
redo A; |
redo A; |
| 1062 |
} |
} |
| 1063 |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) { |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) { |
| 1064 |
|
## XML5: "Tag attribute name before state". |
| 1065 |
|
|
| 1066 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1067 |
|
|
| 1068 |
## Stay in the state |
## Stay in the state |
| 1175 |
0x003D => 1, # = |
0x003D => 1, # = |
| 1176 |
}->{$self->{nc}}) { |
}->{$self->{nc}}) { |
| 1177 |
|
|
| 1178 |
|
## XML5: Not a parse error. |
| 1179 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1180 |
} else { |
} else { |
| 1181 |
|
|
| 1182 |
|
## XML5: ":" raises a parse error and is ignored. |
| 1183 |
} |
} |
| 1184 |
$self->{ca} |
$self->{ca} |
| 1185 |
= {name => chr ($self->{nc}), |
= {name => chr ($self->{nc}), |
| 1200 |
redo A; |
redo A; |
| 1201 |
} |
} |
| 1202 |
} elsif ($self->{state} == ATTRIBUTE_NAME_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_NAME_STATE) { |
| 1203 |
|
## XML5: "Tag attribute name state". |
| 1204 |
|
|
| 1205 |
my $before_leave = sub { |
my $before_leave = sub { |
| 1206 |
if (exists $self->{ct}->{attributes} # start tag or end tag |
if (exists $self->{ct}->{attributes} # start tag or end tag |
| 1207 |
->{$self->{ca}->{name}}) { # MUST |
->{$self->{ca}->{name}}) { # MUST |
| 1212 |
|
|
| 1213 |
$self->{ct}->{attributes}->{$self->{ca}->{name}} |
$self->{ct}->{attributes}->{$self->{ca}->{name}} |
| 1214 |
= $self->{ca}; |
= $self->{ca}; |
| 1215 |
|
$self->{ca}->{index} = ++$self->{ct}->{last_index}; |
| 1216 |
} |
} |
| 1217 |
}; # $before_leave |
}; # $before_leave |
| 1218 |
|
|
| 1249 |
|
|
| 1250 |
redo A; |
redo A; |
| 1251 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 1252 |
|
if ($self->{is_xml}) { |
| 1253 |
|
|
| 1254 |
|
## XML5: Not a parse error. |
| 1255 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1256 |
|
} else { |
| 1257 |
|
|
| 1258 |
|
} |
| 1259 |
|
|
| 1260 |
$before_leave->(); |
$before_leave->(); |
| 1261 |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
| 1262 |
|
|
| 1306 |
|
|
| 1307 |
redo A; |
redo A; |
| 1308 |
} elsif ($self->{nc} == 0x002F) { # / |
} elsif ($self->{nc} == 0x002F) { # / |
| 1309 |
|
if ($self->{is_xml}) { |
| 1310 |
|
|
| 1311 |
|
## XML5: Not a parse error. |
| 1312 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1313 |
|
} else { |
| 1314 |
|
|
| 1315 |
|
} |
| 1316 |
|
|
| 1317 |
$before_leave->(); |
$before_leave->(); |
| 1318 |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
| 1357 |
if ($self->{nc} == 0x0022 or # " |
if ($self->{nc} == 0x0022 or # " |
| 1358 |
$self->{nc} == 0x0027) { # ' |
$self->{nc} == 0x0027) { # ' |
| 1359 |
|
|
| 1360 |
|
## XML5: Not a parse error. |
| 1361 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1362 |
} else { |
} else { |
| 1363 |
|
|
| 1378 |
redo A; |
redo A; |
| 1379 |
} |
} |
| 1380 |
} elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) { |
} elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) { |
| 1381 |
|
## XML5: "Tag attribute name after state". |
| 1382 |
|
|
| 1383 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1384 |
|
|
| 1385 |
## Stay in the state |
## Stay in the state |
| 1411 |
|
|
| 1412 |
redo A; |
redo A; |
| 1413 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 1414 |
|
if ($self->{is_xml}) { |
| 1415 |
|
|
| 1416 |
|
## XML5: Not a parse error. |
| 1417 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1418 |
|
} else { |
| 1419 |
|
|
| 1420 |
|
} |
| 1421 |
|
|
| 1422 |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
| 1423 |
|
|
| 1424 |
$self->{last_stag_name} = $self->{ct}->{tag_name}; |
$self->{last_stag_name} = $self->{ct}->{tag_name}; |
| 1472 |
|
|
| 1473 |
redo A; |
redo A; |
| 1474 |
} elsif ($self->{nc} == 0x002F) { # / |
} elsif ($self->{nc} == 0x002F) { # / |
| 1475 |
|
if ($self->{is_xml}) { |
| 1476 |
|
|
| 1477 |
|
## XML5: Not a parse error. |
| 1478 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1479 |
|
} else { |
| 1480 |
|
|
| 1481 |
|
} |
| 1482 |
|
|
| 1483 |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
| 1484 |
|
|
| 1518 |
|
|
| 1519 |
redo A; |
redo A; |
| 1520 |
} else { |
} else { |
| 1521 |
|
if ($self->{is_xml}) { |
| 1522 |
|
|
| 1523 |
|
## XML5: Not a parse error. |
| 1524 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1525 |
|
} else { |
| 1526 |
|
|
| 1527 |
|
} |
| 1528 |
|
|
| 1529 |
if ($self->{nc} == 0x0022 or # " |
if ($self->{nc} == 0x0022 or # " |
| 1530 |
$self->{nc} == 0x0027) { # ' |
$self->{nc} == 0x0027) { # ' |
| 1531 |
|
|
| 1532 |
|
## XML5: Not a parse error. |
| 1533 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1534 |
} else { |
} else { |
| 1535 |
|
|
| 1553 |
redo A; |
redo A; |
| 1554 |
} |
} |
| 1555 |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) { |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) { |
| 1556 |
|
## XML5: "Tag attribute value before state". |
| 1557 |
|
|
| 1558 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1559 |
|
|
| 1560 |
## Stay in the state |
## Stay in the state |
| 1666 |
} else { |
} else { |
| 1667 |
if ($self->{nc} == 0x003D) { # = |
if ($self->{nc} == 0x003D) { # = |
| 1668 |
|
|
| 1669 |
|
## XML5: Not a parse error. |
| 1670 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
| 1671 |
|
} elsif ($self->{is_xml}) { |
| 1672 |
|
|
| 1673 |
|
## XML5: No parse error. |
| 1674 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO |
| 1675 |
} else { |
} else { |
| 1676 |
|
|
| 1677 |
} |
} |
| 1691 |
redo A; |
redo A; |
| 1692 |
} |
} |
| 1693 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
| 1694 |
|
## XML5: "Tag attribute value double quoted state". |
| 1695 |
|
|
| 1696 |
if ($self->{nc} == 0x0022) { # " |
if ($self->{nc} == 0x0022) { # " |
| 1697 |
|
|
| 1698 |
|
## XML5: "Tag attribute name before state". |
| 1699 |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1700 |
|
|
| 1701 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1711 |
redo A; |
redo A; |
| 1712 |
} elsif ($self->{nc} == 0x0026) { # & |
} elsif ($self->{nc} == 0x0026) { # & |
| 1713 |
|
|
| 1714 |
|
## XML5: Not defined yet. |
| 1715 |
|
|
| 1716 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1717 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1718 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1757 |
|
|
| 1758 |
redo A; |
redo A; |
| 1759 |
} else { |
} else { |
| 1760 |
|
if ($self->{is_xml} and $self->{nc} == 0x003C) { # < |
| 1761 |
|
|
| 1762 |
|
## XML5: Not a parse error. |
| 1763 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type |
| 1764 |
|
} else { |
| 1765 |
|
|
| 1766 |
|
} |
| 1767 |
$self->{ca}->{value} .= chr ($self->{nc}); |
$self->{ca}->{value} .= chr ($self->{nc}); |
| 1768 |
$self->{read_until}->($self->{ca}->{value}, |
$self->{read_until}->($self->{ca}->{value}, |
| 1769 |
q["&], |
q["&<], |
| 1770 |
length $self->{ca}->{value}); |
length $self->{ca}->{value}); |
| 1771 |
|
|
| 1772 |
## Stay in the state |
## Stay in the state |
| 1784 |
redo A; |
redo A; |
| 1785 |
} |
} |
| 1786 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
| 1787 |
|
## XML5: "Tag attribute value single quoted state". |
| 1788 |
|
|
| 1789 |
if ($self->{nc} == 0x0027) { # ' |
if ($self->{nc} == 0x0027) { # ' |
| 1790 |
|
|
| 1791 |
|
## XML5: "Before attribute name state" (sic). |
| 1792 |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1793 |
|
|
| 1794 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1804 |
redo A; |
redo A; |
| 1805 |
} elsif ($self->{nc} == 0x0026) { # & |
} elsif ($self->{nc} == 0x0026) { # & |
| 1806 |
|
|
| 1807 |
|
## XML5: Not defined yet. |
| 1808 |
|
|
| 1809 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1810 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1811 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1850 |
|
|
| 1851 |
redo A; |
redo A; |
| 1852 |
} else { |
} else { |
| 1853 |
|
if ($self->{is_xml} and $self->{nc} == 0x003C) { # < |
| 1854 |
|
|
| 1855 |
|
## XML5: Not a parse error. |
| 1856 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type |
| 1857 |
|
} else { |
| 1858 |
|
|
| 1859 |
|
} |
| 1860 |
$self->{ca}->{value} .= chr ($self->{nc}); |
$self->{ca}->{value} .= chr ($self->{nc}); |
| 1861 |
$self->{read_until}->($self->{ca}->{value}, |
$self->{read_until}->($self->{ca}->{value}, |
| 1862 |
q['&], |
q['&<], |
| 1863 |
length $self->{ca}->{value}); |
length $self->{ca}->{value}); |
| 1864 |
|
|
| 1865 |
## Stay in the state |
## Stay in the state |
| 1877 |
redo A; |
redo A; |
| 1878 |
} |
} |
| 1879 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) { |
| 1880 |
|
## XML5: "Tag attribute value unquoted state". |
| 1881 |
|
|
| 1882 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1883 |
|
|
| 1884 |
|
## XML5: "Tag attribute name before state". |
| 1885 |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
| 1886 |
|
|
| 1887 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1897 |
redo A; |
redo A; |
| 1898 |
} elsif ($self->{nc} == 0x0026) { # & |
} elsif ($self->{nc} == 0x0026) { # & |
| 1899 |
|
|
| 1900 |
|
|
| 1901 |
|
## XML5: Not defined yet. |
| 1902 |
|
|
| 1903 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1904 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1905 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1983 |
0x003D => 1, # = |
0x003D => 1, # = |
| 1984 |
}->{$self->{nc}}) { |
}->{$self->{nc}}) { |
| 1985 |
|
|
| 1986 |
|
## XML5: Not a parse error. |
| 1987 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
| 1988 |
} else { |
} else { |
| 1989 |
|
|
| 2100 |
redo A; |
redo A; |
| 2101 |
} |
} |
| 2102 |
} elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) { |
} elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) { |
| 2103 |
|
## XML5: "Empty tag state". |
| 2104 |
|
|
| 2105 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 2106 |
if ($self->{ct}->{type} == END_TAG_TOKEN) { |
if ($self->{ct}->{type} == END_TAG_TOKEN) { |
| 2107 |
|
|
| 2153 |
} else { |
} else { |
| 2154 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 2155 |
} |
} |
| 2156 |
|
## XML5: "Tag attribute name before state". |
| 2157 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2158 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2159 |
## Reconsume. |
## Reconsume. |