| 105 |
sub COMMENT_START_DASH_STATE () { 15 } |
sub COMMENT_START_DASH_STATE () { 15 } |
| 106 |
sub COMMENT_STATE () { 16 } |
sub COMMENT_STATE () { 16 } |
| 107 |
sub COMMENT_END_STATE () { 17 } |
sub COMMENT_END_STATE () { 17 } |
| 108 |
sub COMMENT_END_BANG_STATE () { 102 } ## LAST |
sub COMMENT_END_BANG_STATE () { 102 } |
| 109 |
|
sub COMMENT_END_SPACE_STATE () { 103 } ## LAST |
| 110 |
sub COMMENT_END_DASH_STATE () { 18 } |
sub COMMENT_END_DASH_STATE () { 18 } |
| 111 |
sub BOGUS_COMMENT_STATE () { 19 } |
sub BOGUS_COMMENT_STATE () { 19 } |
| 112 |
sub DOCTYPE_STATE () { 20 } |
sub DOCTYPE_STATE () { 20 } |
| 206 |
## Character reference mappings |
## Character reference mappings |
| 207 |
|
|
| 208 |
my $charref_map = { |
my $charref_map = { |
| 209 |
|
0x00 => 0xFFFD, # REPLACEMENT CHARACTER |
| 210 |
0x0D => 0x000A, |
0x0D => 0x000A, |
| 211 |
0x80 => 0x20AC, |
0x80 => 0x20AC, |
| 212 |
0x81 => 0xFFFD, |
0x81 => 0x0081, |
| 213 |
0x82 => 0x201A, |
0x82 => 0x201A, |
| 214 |
0x83 => 0x0192, |
0x83 => 0x0192, |
| 215 |
0x84 => 0x201E, |
0x84 => 0x201E, |
| 221 |
0x8A => 0x0160, |
0x8A => 0x0160, |
| 222 |
0x8B => 0x2039, |
0x8B => 0x2039, |
| 223 |
0x8C => 0x0152, |
0x8C => 0x0152, |
| 224 |
0x8D => 0xFFFD, |
0x8D => 0x008D, |
| 225 |
0x8E => 0x017D, |
0x8E => 0x017D, |
| 226 |
0x8F => 0xFFFD, |
0x8F => 0x008F, |
| 227 |
0x90 => 0xFFFD, |
0x90 => 0x0090, |
| 228 |
0x91 => 0x2018, |
0x91 => 0x2018, |
| 229 |
0x92 => 0x2019, |
0x92 => 0x2019, |
| 230 |
0x93 => 0x201C, |
0x93 => 0x201C, |
| 237 |
0x9A => 0x0161, |
0x9A => 0x0161, |
| 238 |
0x9B => 0x203A, |
0x9B => 0x203A, |
| 239 |
0x9C => 0x0153, |
0x9C => 0x0153, |
| 240 |
0x9D => 0xFFFD, |
0x9D => 0x009D, |
| 241 |
0x9E => 0x017E, |
0x9E => 0x017E, |
| 242 |
0x9F => 0x0178, |
0x9F => 0x0178, |
| 243 |
}; # $charref_map |
}; # $charref_map |
| 244 |
$charref_map->{$_} = 0xFFFD |
$charref_map->{$_} = $_ |
| 245 |
for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F, |
for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F, |
| 246 |
0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF |
0xD800..0xDFFF, 0xFDD0..0xFDEF, |
| 247 |
0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, |
0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, |
| 248 |
0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, |
0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, |
| 249 |
0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, |
0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, |
| 863 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 864 |
# reconsume |
# reconsume |
| 865 |
|
|
| 866 |
!!!emit ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 867 |
|
#!!!emit ($self->{ct}); # start tag or end tag |
| 868 |
|
|
| 869 |
redo A; |
redo A; |
| 870 |
} elsif ($self->{nc} == 0x002F) { # / |
} elsif ($self->{nc} == 0x002F) { # / |
| 945 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 946 |
# reconsume |
# reconsume |
| 947 |
|
|
| 948 |
!!!emit ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 949 |
|
#!!!emit ($self->{ct}); # start tag or end tag |
| 950 |
|
|
| 951 |
redo A; |
redo A; |
| 952 |
} else { |
} else { |
| 1072 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1073 |
# reconsume |
# reconsume |
| 1074 |
|
|
| 1075 |
!!!emit ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 1076 |
|
#!!!emit ($self->{ct}); # start tag or end tag |
| 1077 |
|
|
| 1078 |
redo A; |
redo A; |
| 1079 |
} else { |
} else { |
| 1180 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1181 |
# reconsume |
# reconsume |
| 1182 |
|
|
| 1183 |
!!!emit ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 1184 |
|
#!!!emit ($self->{ct}); # start tag or end tag |
| 1185 |
|
|
| 1186 |
redo A; |
redo A; |
| 1187 |
} else { |
} else { |
| 1280 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1281 |
## reconsume |
## reconsume |
| 1282 |
|
|
| 1283 |
!!!emit ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 1284 |
|
#!!!emit ($self->{ct}); # start tag or end tag |
| 1285 |
|
|
| 1286 |
redo A; |
redo A; |
| 1287 |
} else { |
} else { |
| 1362 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1363 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1364 |
## reconsume |
## reconsume |
| 1365 |
!!!emit ($self->{ct}); # end tag |
|
| 1366 |
|
## Discard the token. |
| 1367 |
|
#!!!emit ($self->{ct}); # end tag |
| 1368 |
|
|
| 1369 |
redo A; |
redo A; |
| 1370 |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
| 1371 |
## XML5: No parse error above; not defined yet. |
## XML5: No parse error above; not defined yet. |
| 1372 |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
| 1373 |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 1374 |
## Reconsume. |
## Reconsume. |
| 1375 |
!!!emit ($self->{ct}); # ATTLIST |
|
| 1376 |
|
## Discard the token. |
| 1377 |
|
#!!!emit ($self->{ct}); # ATTLIST |
| 1378 |
|
|
| 1379 |
redo A; |
redo A; |
| 1380 |
} else { |
} else { |
| 1381 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1444 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1445 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1446 |
## reconsume |
## reconsume |
| 1447 |
!!!emit ($self->{ct}); # start tag |
|
| 1448 |
|
## Discard the token. |
| 1449 |
|
#!!!emit ($self->{ct}); # start tag |
| 1450 |
|
|
| 1451 |
redo A; |
redo A; |
| 1452 |
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) { |
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) { |
| 1453 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST |
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST |
| 1462 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1463 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1464 |
## reconsume |
## reconsume |
| 1465 |
!!!emit ($self->{ct}); # end tag |
|
| 1466 |
|
## Discard the token. |
| 1467 |
|
#!!!emit ($self->{ct}); # end tag |
| 1468 |
|
|
| 1469 |
redo A; |
redo A; |
| 1470 |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
| 1471 |
## XML5: No parse error above; not defined yet. |
## XML5: No parse error above; not defined yet. |
| 1472 |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
| 1473 |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 1474 |
## Reconsume. |
## Reconsume. |
| 1475 |
!!!emit ($self->{ct}); # ATTLIST |
|
| 1476 |
|
## Discard the token. |
| 1477 |
|
#!!!emit ($self->{ct}); # ATTLIST |
| 1478 |
|
|
| 1479 |
redo A; |
redo A; |
| 1480 |
} else { |
} else { |
| 1481 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1570 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1571 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1572 |
## reconsume |
## reconsume |
| 1573 |
!!!emit ($self->{ct}); # start tag |
|
| 1574 |
|
## Discard the token. |
| 1575 |
|
#!!!emit ($self->{ct}); # start tag |
| 1576 |
|
|
| 1577 |
redo A; |
redo A; |
| 1578 |
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) { |
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) { |
| 1579 |
!!!parse-error (type => 'unclosed tag'); |
!!!parse-error (type => 'unclosed tag'); |
| 1589 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1590 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1591 |
## reconsume |
## reconsume |
| 1592 |
!!!emit ($self->{ct}); # end tag |
|
| 1593 |
|
## Discard the token. |
| 1594 |
|
#!!!emit ($self->{ct}); # end tag |
| 1595 |
|
|
| 1596 |
redo A; |
redo A; |
| 1597 |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
| 1598 |
!!!parse-error (type => 'unclosed md'); ## TODO: type |
!!!parse-error (type => 'unclosed md'); ## TODO: type |
| 1599 |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
| 1600 |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 1601 |
## Reconsume. |
## Reconsume. |
| 1602 |
!!!emit ($self->{ct}); # ATTLIST |
|
| 1603 |
|
## Discard the token. |
| 1604 |
|
#!!!emit ($self->{ct}); # ATTLIST |
| 1605 |
|
|
| 1606 |
redo A; |
redo A; |
| 1607 |
} else { |
} else { |
| 1608 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1682 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1683 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1684 |
## Reconsume. |
## Reconsume. |
| 1685 |
!!!emit ($self->{ct}); # start tag or end tag |
|
| 1686 |
|
## Discard the token. |
| 1687 |
|
#!!!emit ($self->{ct}); # start tag or end tag |
| 1688 |
|
|
| 1689 |
redo A; |
redo A; |
| 1690 |
} else { |
} else { |
| 1691 |
!!!cp ('124.1'); |
!!!cp ('124.1'); |
| 1742 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1743 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1744 |
## Reconsume. |
## Reconsume. |
| 1745 |
!!!emit ($self->{ct}); # start tag or end tag |
|
| 1746 |
|
## Discard the token. |
| 1747 |
|
#!!!emit ($self->{ct}); # start tag or end tag |
| 1748 |
|
|
| 1749 |
redo A; |
redo A; |
| 1750 |
} else { |
} else { |
| 1751 |
!!!cp ('124.4'); |
!!!cp ('124.4'); |
| 2155 |
} |
} |
| 2156 |
!!!next-input-character; |
!!!next-input-character; |
| 2157 |
redo A; |
redo A; |
| 2158 |
} elsif ($self->{nc} == 0x0021 and # ! |
} elsif ($self->{state} != COMMENT_END_BANG_STATE and |
| 2159 |
$self->{state} != COMMENT_END_BANG_STATE) { |
$is_space->{$self->{nc}}) { |
| 2160 |
|
!!!cp (152.1); |
| 2161 |
|
!!!parse-error (type => 'comment end space'); # XXX error type |
| 2162 |
|
$self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment |
| 2163 |
|
$self->{state} = COMMENT_END_SPACE_STATE; |
| 2164 |
|
!!!next-input-character; |
| 2165 |
|
redo A; |
| 2166 |
|
} elsif ($self->{state} != COMMENT_END_BANG_STATE and |
| 2167 |
|
$self->{nc} == 0x0021) { # ! |
| 2168 |
|
!!!cp (152.2); |
| 2169 |
!!!parse-error (type => 'comment end bang'); # XXX error type |
!!!parse-error (type => 'comment end bang'); # XXX error type |
| 2170 |
$self->{state} = COMMENT_END_BANG_STATE; |
$self->{state} = COMMENT_END_BANG_STATE; |
| 2171 |
!!!next-input-character; |
!!!next-input-character; |
| 2196 |
!!!next-input-character; |
!!!next-input-character; |
| 2197 |
redo A; |
redo A; |
| 2198 |
} |
} |
| 2199 |
|
} elsif ($self->{state} == COMMENT_END_SPACE_STATE) { |
| 2200 |
|
## XML5: Not exist. |
| 2201 |
|
|
| 2202 |
|
if ($self->{nc} == 0x003E) { # > |
| 2203 |
|
if ($self->{in_subset}) { |
| 2204 |
|
!!!cp (154.4); |
| 2205 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2206 |
|
} else { |
| 2207 |
|
!!!cp (154.5); |
| 2208 |
|
$self->{state} = DATA_STATE; |
| 2209 |
|
$self->{s_kwd} = ''; |
| 2210 |
|
} |
| 2211 |
|
!!!next-input-character; |
| 2212 |
|
|
| 2213 |
|
!!!emit ($self->{ct}); # comment |
| 2214 |
|
|
| 2215 |
|
redo A; |
| 2216 |
|
} elsif ($is_space->{$self->{nc}}) { |
| 2217 |
|
!!!cp (154.6); |
| 2218 |
|
$self->{ct}->{data} .= chr ($self->{nc}); # comment |
| 2219 |
|
## Stay in the state. |
| 2220 |
|
!!!next-input-character; |
| 2221 |
|
redo A; |
| 2222 |
|
} elsif ($self->{nc} == -1) { |
| 2223 |
|
!!!parse-error (type => 'unclosed comment'); |
| 2224 |
|
if ($self->{in_subset}) { |
| 2225 |
|
!!!cp (154.7); |
| 2226 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2227 |
|
} else { |
| 2228 |
|
!!!cp (154.8); |
| 2229 |
|
$self->{state} = DATA_STATE; |
| 2230 |
|
$self->{s_kwd} = ''; |
| 2231 |
|
} |
| 2232 |
|
## Reconsume. |
| 2233 |
|
|
| 2234 |
|
!!!emit ($self->{ct}); # comment |
| 2235 |
|
|
| 2236 |
|
redo A; |
| 2237 |
|
} else { |
| 2238 |
|
!!!cp (154.9); |
| 2239 |
|
$self->{ct}->{data} .= chr ($self->{nc}); # comment |
| 2240 |
|
$self->{state} = COMMENT_STATE; |
| 2241 |
|
!!!next-input-character; |
| 2242 |
|
redo A; |
| 2243 |
|
} |
| 2244 |
} elsif ($self->{state} == DOCTYPE_STATE) { |
} elsif ($self->{state} == DOCTYPE_STATE) { |
| 2245 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 2246 |
!!!cp (155); |
!!!cp (155); |