| 105 |
sub COMMENT_START_DASH_STATE () { 15 } |
sub COMMENT_START_DASH_STATE () { 15 } |
| 106 |
sub COMMENT_STATE () { 16 } |
sub COMMENT_STATE () { 16 } |
| 107 |
sub COMMENT_END_STATE () { 17 } |
sub COMMENT_END_STATE () { 17 } |
| 108 |
|
sub COMMENT_END_BANG_STATE () { 102 } |
| 109 |
|
sub COMMENT_END_SPACE_STATE () { 103 } ## LAST |
| 110 |
sub COMMENT_END_DASH_STATE () { 18 } |
sub COMMENT_END_DASH_STATE () { 18 } |
| 111 |
sub BOGUS_COMMENT_STATE () { 19 } |
sub BOGUS_COMMENT_STATE () { 19 } |
| 112 |
sub DOCTYPE_STATE () { 20 } |
sub DOCTYPE_STATE () { 20 } |
| 206 |
## Character reference mappings |
## Character reference mappings |
| 207 |
|
|
| 208 |
my $charref_map = { |
my $charref_map = { |
| 209 |
|
0x00 => 0xFFFD, # REPLACEMENT CHARACTER |
| 210 |
0x0D => 0x000A, |
0x0D => 0x000A, |
| 211 |
0x80 => 0x20AC, |
0x80 => 0x20AC, |
| 212 |
0x81 => 0xFFFD, |
0x81 => 0x0081, |
| 213 |
0x82 => 0x201A, |
0x82 => 0x201A, |
| 214 |
0x83 => 0x0192, |
0x83 => 0x0192, |
| 215 |
0x84 => 0x201E, |
0x84 => 0x201E, |
| 221 |
0x8A => 0x0160, |
0x8A => 0x0160, |
| 222 |
0x8B => 0x2039, |
0x8B => 0x2039, |
| 223 |
0x8C => 0x0152, |
0x8C => 0x0152, |
| 224 |
0x8D => 0xFFFD, |
0x8D => 0x008D, |
| 225 |
0x8E => 0x017D, |
0x8E => 0x017D, |
| 226 |
0x8F => 0xFFFD, |
0x8F => 0x008F, |
| 227 |
0x90 => 0xFFFD, |
0x90 => 0x0090, |
| 228 |
0x91 => 0x2018, |
0x91 => 0x2018, |
| 229 |
0x92 => 0x2019, |
0x92 => 0x2019, |
| 230 |
0x93 => 0x201C, |
0x93 => 0x201C, |
| 237 |
0x9A => 0x0161, |
0x9A => 0x0161, |
| 238 |
0x9B => 0x203A, |
0x9B => 0x203A, |
| 239 |
0x9C => 0x0153, |
0x9C => 0x0153, |
| 240 |
0x9D => 0xFFFD, |
0x9D => 0x009D, |
| 241 |
0x9E => 0x017E, |
0x9E => 0x017E, |
| 242 |
0x9F => 0x0178, |
0x9F => 0x0178, |
| 243 |
}; # $charref_map |
}; # $charref_map |
| 244 |
$charref_map->{$_} = 0xFFFD |
$charref_map->{$_} = $_ |
| 245 |
for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F, |
for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F, |
| 246 |
0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF |
0xD800..0xDFFF, 0xFDD0..0xFDEF, |
| 247 |
0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, |
0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, |
| 248 |
0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, |
0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, |
| 249 |
0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, |
0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, |
| 1103 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1104 |
# reconsume |
# reconsume |
| 1105 |
|
|
| 1106 |
return ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 1107 |
|
#return ($self->{ct}); # start tag or end tag |
| 1108 |
|
|
| 1109 |
redo A; |
redo A; |
| 1110 |
} elsif ($self->{nc} == 0x002F) { # / |
} elsif ($self->{nc} == 0x002F) { # / |
| 1245 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1246 |
# reconsume |
# reconsume |
| 1247 |
|
|
| 1248 |
return ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 1249 |
|
#return ($self->{ct}); # start tag or end tag |
| 1250 |
|
|
| 1251 |
redo A; |
redo A; |
| 1252 |
} else { |
} else { |
| 1253 |
if ({ |
if ({ |
| 1254 |
0x0022 => 1, # " |
0x0022 => 1, # " |
| 1255 |
0x0027 => 1, # ' |
0x0027 => 1, # ' |
| 1256 |
|
0x003C => 1, # < |
| 1257 |
0x003D => 1, # = |
0x003D => 1, # = |
| 1258 |
}->{$self->{nc}}) { |
}->{$self->{nc}}) { |
| 1259 |
|
|
| 1432 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1433 |
# reconsume |
# reconsume |
| 1434 |
|
|
| 1435 |
return ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 1436 |
|
#return ($self->{ct}); # start tag or end tag |
| 1437 |
|
|
| 1438 |
redo A; |
redo A; |
| 1439 |
} else { |
} else { |
| 1440 |
if ($self->{nc} == 0x0022 or # " |
if ({ |
| 1441 |
$self->{nc} == 0x0027) { # ' |
0x0022 => 1, # " |
| 1442 |
|
0x0027 => 1, # ' |
| 1443 |
|
0x003C => 1, # < |
| 1444 |
|
}->{$self->{nc}}) { |
| 1445 |
|
|
| 1446 |
## XML5: Not a parse error. |
## XML5: Not a parse error. |
| 1447 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1600 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1601 |
# reconsume |
# reconsume |
| 1602 |
|
|
| 1603 |
return ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 1604 |
|
#return ($self->{ct}); # start tag or end tag |
| 1605 |
|
|
| 1606 |
redo A; |
redo A; |
| 1607 |
} else { |
} else { |
| 1613 |
|
|
| 1614 |
} |
} |
| 1615 |
|
|
| 1616 |
if ($self->{nc} == 0x0022 or # " |
if ({ |
| 1617 |
$self->{nc} == 0x0027) { # ' |
0x0022 => 1, # " |
| 1618 |
|
0x0027 => 1, # ' |
| 1619 |
|
0x003C => 1, # < |
| 1620 |
|
}->{$self->{nc}}) { |
| 1621 |
|
|
| 1622 |
## XML5: Not a parse error. |
## XML5: Not a parse error. |
| 1623 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1750 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1751 |
## reconsume |
## reconsume |
| 1752 |
|
|
| 1753 |
return ($self->{ct}); # start tag or end tag |
## Discard the token. |
| 1754 |
|
#return ($self->{ct}); # start tag or end tag |
| 1755 |
|
|
| 1756 |
redo A; |
redo A; |
| 1757 |
} else { |
} else { |
| 1758 |
if ($self->{nc} == 0x003D) { # = |
if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, < |
| 1759 |
|
|
| 1760 |
## XML5: Not a parse error. |
## XML5: Not a parse error. |
| 1761 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
| 1831 |
} |
} |
| 1832 |
|
|
| 1833 |
redo A; |
redo A; |
| 1834 |
|
} elsif ($self->{is_xml} and |
| 1835 |
|
$is_space->{$self->{nc}}) { |
| 1836 |
|
|
| 1837 |
|
$self->{ca}->{value} .= ' '; |
| 1838 |
|
## Stay in the state. |
| 1839 |
|
|
| 1840 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1841 |
|
$self->{line_prev} = $self->{line}; |
| 1842 |
|
$self->{column_prev} = $self->{column}; |
| 1843 |
|
$self->{column}++; |
| 1844 |
|
$self->{nc} |
| 1845 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 1846 |
|
} else { |
| 1847 |
|
$self->{set_nc}->($self); |
| 1848 |
|
} |
| 1849 |
|
|
| 1850 |
|
redo A; |
| 1851 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 1852 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value'); |
| 1853 |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
| 1872 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1873 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1874 |
## reconsume |
## reconsume |
| 1875 |
return ($self->{ct}); # end tag |
|
| 1876 |
|
## Discard the token. |
| 1877 |
|
#return ($self->{ct}); # end tag |
| 1878 |
|
|
| 1879 |
redo A; |
redo A; |
| 1880 |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
| 1881 |
## XML5: No parse error above; not defined yet. |
## XML5: No parse error above; not defined yet. |
| 1882 |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
| 1883 |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 1884 |
## Reconsume. |
## Reconsume. |
| 1885 |
return ($self->{ct}); # ATTLIST |
|
| 1886 |
|
## Discard the token. |
| 1887 |
|
#return ($self->{ct}); # ATTLIST |
| 1888 |
|
|
| 1889 |
redo A; |
redo A; |
| 1890 |
} else { |
} else { |
| 1891 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1901 |
} |
} |
| 1902 |
$self->{ca}->{value} .= chr ($self->{nc}); |
$self->{ca}->{value} .= chr ($self->{nc}); |
| 1903 |
$self->{read_until}->($self->{ca}->{value}, |
$self->{read_until}->($self->{ca}->{value}, |
| 1904 |
q["&<], |
qq["&<\x09\x0C\x20], |
| 1905 |
length $self->{ca}->{value}); |
length $self->{ca}->{value}); |
| 1906 |
|
|
| 1907 |
## Stay in the state |
## Stay in the state |
| 1968 |
} |
} |
| 1969 |
|
|
| 1970 |
redo A; |
redo A; |
| 1971 |
|
} elsif ($self->{is_xml} and |
| 1972 |
|
$is_space->{$self->{nc}}) { |
| 1973 |
|
|
| 1974 |
|
$self->{ca}->{value} .= ' '; |
| 1975 |
|
## Stay in the state. |
| 1976 |
|
|
| 1977 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1978 |
|
$self->{line_prev} = $self->{line}; |
| 1979 |
|
$self->{column_prev} = $self->{column}; |
| 1980 |
|
$self->{column}++; |
| 1981 |
|
$self->{nc} |
| 1982 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 1983 |
|
} else { |
| 1984 |
|
$self->{set_nc}->($self); |
| 1985 |
|
} |
| 1986 |
|
|
| 1987 |
|
redo A; |
| 1988 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 1989 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value'); |
| 1990 |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
| 1994 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1995 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1996 |
## reconsume |
## reconsume |
| 1997 |
return ($self->{ct}); # start tag |
|
| 1998 |
|
## Discard the token. |
| 1999 |
|
#return ($self->{ct}); # start tag |
| 2000 |
|
|
| 2001 |
redo A; |
redo A; |
| 2002 |
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) { |
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) { |
| 2003 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST |
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST |
| 2012 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2013 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2014 |
## reconsume |
## reconsume |
| 2015 |
return ($self->{ct}); # end tag |
|
| 2016 |
|
## Discard the token. |
| 2017 |
|
#return ($self->{ct}); # end tag |
| 2018 |
|
|
| 2019 |
redo A; |
redo A; |
| 2020 |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
| 2021 |
## XML5: No parse error above; not defined yet. |
## XML5: No parse error above; not defined yet. |
| 2022 |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
| 2023 |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2024 |
## Reconsume. |
## Reconsume. |
| 2025 |
return ($self->{ct}); # ATTLIST |
|
| 2026 |
|
## Discard the token. |
| 2027 |
|
#return ($self->{ct}); # ATTLIST |
| 2028 |
|
|
| 2029 |
redo A; |
redo A; |
| 2030 |
} else { |
} else { |
| 2031 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 2041 |
} |
} |
| 2042 |
$self->{ca}->{value} .= chr ($self->{nc}); |
$self->{ca}->{value} .= chr ($self->{nc}); |
| 2043 |
$self->{read_until}->($self->{ca}->{value}, |
$self->{read_until}->($self->{ca}->{value}, |
| 2044 |
q['&<], |
qq['&<\x09\x0C\x20], |
| 2045 |
length $self->{ca}->{value}); |
length $self->{ca}->{value}); |
| 2046 |
|
|
| 2047 |
## Stay in the state |
## Stay in the state |
| 2180 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2181 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2182 |
## reconsume |
## reconsume |
| 2183 |
return ($self->{ct}); # start tag |
|
| 2184 |
|
## Discard the token. |
| 2185 |
|
#return ($self->{ct}); # start tag |
| 2186 |
|
|
| 2187 |
redo A; |
redo A; |
| 2188 |
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) { |
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) { |
| 2189 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag'); |
| 2199 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2200 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2201 |
## reconsume |
## reconsume |
| 2202 |
return ($self->{ct}); # end tag |
|
| 2203 |
|
## Discard the token. |
| 2204 |
|
#return ($self->{ct}); # end tag |
| 2205 |
|
|
| 2206 |
redo A; |
redo A; |
| 2207 |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { |
| 2208 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type |
| 2209 |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
push @{$self->{ct}->{attrdefs}}, $self->{ca}; |
| 2210 |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2211 |
## Reconsume. |
## Reconsume. |
| 2212 |
return ($self->{ct}); # ATTLIST |
|
| 2213 |
|
## Discard the token. |
| 2214 |
|
#return ($self->{ct}); # ATTLIST |
| 2215 |
|
|
| 2216 |
redo A; |
redo A; |
| 2217 |
} else { |
} else { |
| 2218 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 2222 |
0x0022 => 1, # " |
0x0022 => 1, # " |
| 2223 |
0x0027 => 1, # ' |
0x0027 => 1, # ' |
| 2224 |
0x003D => 1, # = |
0x003D => 1, # = |
| 2225 |
|
0x003C => 1, # < |
| 2226 |
}->{$self->{nc}}) { |
}->{$self->{nc}}) { |
| 2227 |
|
|
| 2228 |
## XML5: Not a parse error. |
## XML5: Not a parse error. |
| 2232 |
} |
} |
| 2233 |
$self->{ca}->{value} .= chr ($self->{nc}); |
$self->{ca}->{value} .= chr ($self->{nc}); |
| 2234 |
$self->{read_until}->($self->{ca}->{value}, |
$self->{read_until}->($self->{ca}->{value}, |
| 2235 |
q["'=& >], |
qq["'=& \x09\x0C>], |
| 2236 |
length $self->{ca}->{value}); |
length $self->{ca}->{value}); |
| 2237 |
|
|
| 2238 |
## Stay in the state |
## Stay in the state |
| 2332 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2333 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2334 |
## Reconsume. |
## Reconsume. |
| 2335 |
return ($self->{ct}); # start tag or end tag |
|
| 2336 |
|
## Discard the token. |
| 2337 |
|
#return ($self->{ct}); # start tag or end tag |
| 2338 |
|
|
| 2339 |
redo A; |
redo A; |
| 2340 |
} else { |
} else { |
| 2341 |
|
|
| 2402 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2403 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2404 |
## Reconsume. |
## Reconsume. |
| 2405 |
return ($self->{ct}); # start tag or end tag |
|
| 2406 |
|
## Discard the token. |
| 2407 |
|
#return ($self->{ct}); # start tag or end tag |
| 2408 |
|
|
| 2409 |
redo A; |
redo A; |
| 2410 |
} else { |
} else { |
| 2411 |
|
|
| 2980 |
|
|
| 2981 |
redo A; |
redo A; |
| 2982 |
} |
} |
| 2983 |
} elsif ($self->{state} == COMMENT_END_STATE) { |
} elsif ($self->{state} == COMMENT_END_STATE or |
| 2984 |
|
$self->{state} == COMMENT_END_BANG_STATE) { |
| 2985 |
## XML5: "Comment end state" and "DOCTYPE comment end state". |
## XML5: "Comment end state" and "DOCTYPE comment end state". |
| 2986 |
|
## (No comment end bang state.) |
| 2987 |
|
|
| 2988 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 2989 |
if ($self->{in_subset}) { |
if ($self->{in_subset}) { |
| 3010 |
|
|
| 3011 |
redo A; |
redo A; |
| 3012 |
} elsif ($self->{nc} == 0x002D) { # - |
} elsif ($self->{nc} == 0x002D) { # - |
| 3013 |
|
if ($self->{state} == COMMENT_END_BANG_STATE) { |
| 3014 |
|
|
| 3015 |
|
$self->{ct}->{data} .= '--!'; # comment |
| 3016 |
|
$self->{state} = COMMENT_END_DASH_STATE; |
| 3017 |
|
} else { |
| 3018 |
|
|
| 3019 |
|
## XML5: Not a parse error. |
| 3020 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
| 3021 |
|
line => $self->{line_prev}, |
| 3022 |
|
column => $self->{column_prev}); |
| 3023 |
|
$self->{ct}->{data} .= '-'; # comment |
| 3024 |
|
## Stay in the state |
| 3025 |
|
} |
| 3026 |
|
|
| 3027 |
## XML5: Not a parse error. |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3028 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
$self->{line_prev} = $self->{line}; |
| 3029 |
line => $self->{line_prev}, |
$self->{column_prev} = $self->{column}; |
| 3030 |
column => $self->{column_prev}); |
$self->{column}++; |
| 3031 |
$self->{ct}->{data} .= '-'; # comment |
$self->{nc} |
| 3032 |
## Stay in the state |
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3033 |
|
} else { |
| 3034 |
|
$self->{set_nc}->($self); |
| 3035 |
|
} |
| 3036 |
|
|
| 3037 |
|
redo A; |
| 3038 |
|
} elsif ($self->{state} != COMMENT_END_BANG_STATE and |
| 3039 |
|
$is_space->{$self->{nc}}) { |
| 3040 |
|
|
| 3041 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type |
| 3042 |
|
$self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment |
| 3043 |
|
$self->{state} = COMMENT_END_SPACE_STATE; |
| 3044 |
|
|
| 3045 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3046 |
|
$self->{line_prev} = $self->{line}; |
| 3047 |
|
$self->{column_prev} = $self->{column}; |
| 3048 |
|
$self->{column}++; |
| 3049 |
|
$self->{nc} |
| 3050 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3051 |
|
} else { |
| 3052 |
|
$self->{set_nc}->($self); |
| 3053 |
|
} |
| 3054 |
|
|
| 3055 |
|
redo A; |
| 3056 |
|
} elsif ($self->{state} != COMMENT_END_BANG_STATE and |
| 3057 |
|
$self->{nc} == 0x0021) { # ! |
| 3058 |
|
|
| 3059 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type |
| 3060 |
|
$self->{state} = COMMENT_END_BANG_STATE; |
| 3061 |
|
|
| 3062 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3063 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3080 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3081 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 3082 |
} |
} |
| 3083 |
## reconsume |
## Reconsume. |
| 3084 |
|
|
| 3085 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 3086 |
|
|
| 3087 |
redo A; |
redo A; |
| 3088 |
} else { |
} else { |
| 3089 |
|
|
| 3090 |
## XML5: Not a parse error. |
if ($self->{state} == COMMENT_END_BANG_STATE) { |
| 3091 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
$self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment |
| 3092 |
line => $self->{line_prev}, |
} else { |
| 3093 |
column => $self->{column_prev}); |
$self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment |
| 3094 |
$self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment |
} |
| 3095 |
$self->{state} = COMMENT_STATE; |
$self->{state} = COMMENT_STATE; |
| 3096 |
|
|
| 3097 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3106 |
|
|
| 3107 |
redo A; |
redo A; |
| 3108 |
} |
} |
| 3109 |
|
} elsif ($self->{state} == COMMENT_END_SPACE_STATE) { |
| 3110 |
|
## XML5: This state does not exist. |
| 3111 |
|
|
| 3112 |
|
if ($self->{nc} == 0x003E) { # > |
| 3113 |
|
if ($self->{in_subset}) { |
| 3114 |
|
|
| 3115 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3116 |
|
} else { |
| 3117 |
|
|
| 3118 |
|
$self->{state} = DATA_STATE; |
| 3119 |
|
$self->{s_kwd} = ''; |
| 3120 |
|
} |
| 3121 |
|
|
| 3122 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3123 |
|
$self->{line_prev} = $self->{line}; |
| 3124 |
|
$self->{column_prev} = $self->{column}; |
| 3125 |
|
$self->{column}++; |
| 3126 |
|
$self->{nc} |
| 3127 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3128 |
|
} else { |
| 3129 |
|
$self->{set_nc}->($self); |
| 3130 |
|
} |
| 3131 |
|
|
| 3132 |
|
|
| 3133 |
|
return ($self->{ct}); # comment |
| 3134 |
|
|
| 3135 |
|
redo A; |
| 3136 |
|
} elsif ($is_space->{$self->{nc}}) { |
| 3137 |
|
|
| 3138 |
|
$self->{ct}->{data} .= chr ($self->{nc}); # comment |
| 3139 |
|
## Stay in the state. |
| 3140 |
|
|
| 3141 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3142 |
|
$self->{line_prev} = $self->{line}; |
| 3143 |
|
$self->{column_prev} = $self->{column}; |
| 3144 |
|
$self->{column}++; |
| 3145 |
|
$self->{nc} |
| 3146 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3147 |
|
} else { |
| 3148 |
|
$self->{set_nc}->($self); |
| 3149 |
|
} |
| 3150 |
|
|
| 3151 |
|
redo A; |
| 3152 |
|
} elsif ($self->{nc} == -1) { |
| 3153 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 3154 |
|
if ($self->{in_subset}) { |
| 3155 |
|
|
| 3156 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3157 |
|
} else { |
| 3158 |
|
|
| 3159 |
|
$self->{state} = DATA_STATE; |
| 3160 |
|
$self->{s_kwd} = ''; |
| 3161 |
|
} |
| 3162 |
|
## Reconsume. |
| 3163 |
|
|
| 3164 |
|
return ($self->{ct}); # comment |
| 3165 |
|
|
| 3166 |
|
redo A; |
| 3167 |
|
} else { |
| 3168 |
|
|
| 3169 |
|
$self->{ct}->{data} .= chr ($self->{nc}); # comment |
| 3170 |
|
$self->{state} = COMMENT_STATE; |
| 3171 |
|
|
| 3172 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3173 |
|
$self->{line_prev} = $self->{line}; |
| 3174 |
|
$self->{column_prev} = $self->{column}; |
| 3175 |
|
$self->{column}++; |
| 3176 |
|
$self->{nc} |
| 3177 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3178 |
|
} else { |
| 3179 |
|
$self->{set_nc}->($self); |
| 3180 |
|
} |
| 3181 |
|
|
| 3182 |
|
redo A; |
| 3183 |
|
} |
| 3184 |
} elsif ($self->{state} == DOCTYPE_STATE) { |
} elsif ($self->{state} == DOCTYPE_STATE) { |
| 3185 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 3186 |
|
|
| 3197 |
} |
} |
| 3198 |
|
|
| 3199 |
redo A; |
redo A; |
| 3200 |
|
} elsif ($self->{nc} == -1) { |
| 3201 |
|
|
| 3202 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 3203 |
|
$self->{ct}->{quirks} = 1; |
| 3204 |
|
|
| 3205 |
|
$self->{state} = DATA_STATE; |
| 3206 |
|
## Reconsume. |
| 3207 |
|
return ($self->{ct}); # DOCTYPE (quirks) |
| 3208 |
|
|
| 3209 |
|
redo A; |
| 3210 |
} else { |
} else { |
| 3211 |
|
|
| 3212 |
## XML5: Unless EOF, switch to the bogus comment state. |
## XML5: Switch to the bogus comment state. |
| 3213 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name'); |
| 3214 |
$self->{state} = BEFORE_DOCTYPE_NAME_STATE; |
$self->{state} = BEFORE_DOCTYPE_NAME_STATE; |
| 3215 |
## reconsume |
## reconsume |
| 3254 |
return ($self->{ct}); # DOCTYPE (quirks) |
return ($self->{ct}); # DOCTYPE (quirks) |
| 3255 |
|
|
| 3256 |
redo A; |
redo A; |
| 3257 |
|
} elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z |
| 3258 |
|
|
| 3259 |
|
$self->{ct}->{name} # DOCTYPE |
| 3260 |
|
= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
| 3261 |
|
delete $self->{ct}->{quirks}; |
| 3262 |
|
$self->{state} = DOCTYPE_NAME_STATE; |
| 3263 |
|
|
| 3264 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3265 |
|
$self->{line_prev} = $self->{line}; |
| 3266 |
|
$self->{column_prev} = $self->{column}; |
| 3267 |
|
$self->{column}++; |
| 3268 |
|
$self->{nc} |
| 3269 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3270 |
|
} else { |
| 3271 |
|
$self->{set_nc}->($self); |
| 3272 |
|
} |
| 3273 |
|
|
| 3274 |
|
redo A; |
| 3275 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 3276 |
|
|
| 3277 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
| 3358 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3359 |
|
|
| 3360 |
redo A; |
redo A; |
| 3361 |
|
} elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z |
| 3362 |
|
|
| 3363 |
|
$self->{ct}->{name} # DOCTYPE |
| 3364 |
|
.= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
| 3365 |
|
delete $self->{ct}->{quirks}; |
| 3366 |
|
## Stay in the state. |
| 3367 |
|
|
| 3368 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3369 |
|
$self->{line_prev} = $self->{line}; |
| 3370 |
|
$self->{column_prev} = $self->{column}; |
| 3371 |
|
$self->{column}++; |
| 3372 |
|
$self->{nc} |
| 3373 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3374 |
|
} else { |
| 3375 |
|
$self->{set_nc}->($self); |
| 3376 |
|
} |
| 3377 |
|
|
| 3378 |
|
redo A; |
| 3379 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 3380 |
|
|
| 3381 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 3407 |
redo A; |
redo A; |
| 3408 |
} else { |
} else { |
| 3409 |
|
|
| 3410 |
$self->{ct}->{name} |
$self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE |
| 3411 |
.= chr ($self->{nc}); # DOCTYPE |
## Stay in the state. |
|
## Stay in the state |
|
| 3412 |
|
|
| 3413 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3414 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 5080 |
my $code = $self->{kwd}; |
my $code = $self->{kwd}; |
| 5081 |
my $l = $self->{line_prev}; |
my $l = $self->{line_prev}; |
| 5082 |
my $c = $self->{column_prev}; |
my $c = $self->{column_prev}; |
| 5083 |
if ($charref_map->{$code}) { |
if ((not $self->{is_xml} and $charref_map->{$code}) or |
| 5084 |
|
($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or |
| 5085 |
|
($self->{is_xml} and $code == 0x0000)) { |
| 5086 |
|
|
| 5087 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', |
| 5088 |
text => (sprintf 'U+%04X', $code), |
text => (sprintf 'U+%04X', $code), |
| 5235 |
my $code = $self->{kwd}; |
my $code = $self->{kwd}; |
| 5236 |
my $l = $self->{line_prev}; |
my $l = $self->{line_prev}; |
| 5237 |
my $c = $self->{column_prev}; |
my $c = $self->{column_prev}; |
| 5238 |
if ($charref_map->{$code}) { |
if ((not $self->{is_xml} and $charref_map->{$code}) or |
| 5239 |
|
($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or |
| 5240 |
|
($self->{is_xml} and $code == 0x0000)) { |
| 5241 |
|
|
| 5242 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', |
| 5243 |
text => (sprintf 'U+%04X', $code), |
text => (sprintf 'U+%04X', $code), |
| 5716 |
## XML5: Not defined yet. |
## XML5: Not defined yet. |
| 5717 |
|
|
| 5718 |
## TODO: |
## TODO: |
| 5719 |
|
|
| 5720 |
|
if (not $self->{stop_processing} and |
| 5721 |
|
not $self->{document}->xml_standalone) { |
| 5722 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type |
| 5723 |
|
level => $self->{level}->{info}); |
| 5724 |
|
$self->{stop_processing} = 1; |
| 5725 |
|
} |
| 5726 |
|
|
| 5727 |
|
|
| 5728 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5729 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 6158 |
} |
} |
| 6159 |
$self->{ct} = {type => ELEMENT_TOKEN, name => '', |
$self->{ct} = {type => ELEMENT_TOKEN, name => '', |
| 6160 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 6161 |
column => $self->{column_prev} - 6}; |
column => $self->{column_prev} - 7}; |
| 6162 |
$self->{state} = DOCTYPE_MD_STATE; |
$self->{state} = DOCTYPE_MD_STATE; |
| 6163 |
|
|
| 6164 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 6226 |
$self->{ct} = {type => ATTLIST_TOKEN, name => '', |
$self->{ct} = {type => ATTLIST_TOKEN, name => '', |
| 6227 |
attrdefs => [], |
attrdefs => [], |
| 6228 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 6229 |
column => $self->{column_prev} - 6}; |
column => $self->{column_prev} - 7}; |
| 6230 |
$self->{state} = DOCTYPE_MD_STATE; |
$self->{state} = DOCTYPE_MD_STATE; |
| 6231 |
|
|
| 6232 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 6295 |
} |
} |
| 6296 |
$self->{ct} = {type => NOTATION_TOKEN, name => '', |
$self->{ct} = {type => NOTATION_TOKEN, name => '', |
| 6297 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 6298 |
column => $self->{column_prev} - 6}; |
column => $self->{column_prev} - 8}; |
| 6299 |
$self->{state} = DOCTYPE_MD_STATE; |
$self->{state} = DOCTYPE_MD_STATE; |
| 6300 |
|
|
| 6301 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |