2 |
use strict; |
use strict; |
3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
4 |
|
|
5 |
## This is an early version of an HTML parser. |
## ISSUE: |
6 |
|
## var doc = implementation.createDocument (null, null, null); |
7 |
|
## doc.write (''); |
8 |
|
## alert (doc.compatMode); |
9 |
|
|
10 |
my $permitted_slash_tag_name = { |
my $permitted_slash_tag_name = { |
11 |
base => 1, |
base => 1, |
158 |
# $self->{next_input_character} |
# $self->{next_input_character} |
159 |
!!!next-input-character; |
!!!next-input-character; |
160 |
$self->{token} = []; |
$self->{token} = []; |
161 |
|
# $self->{escape} |
162 |
} # _initialize_tokenizer |
} # _initialize_tokenizer |
163 |
|
|
164 |
## A token has: |
## A token has: |
165 |
## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment', |
## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment', |
166 |
## 'character', or 'end-of-file' |
## 'character', or 'end-of-file' |
167 |
## ->{name} (DOCTYPE, start tag (tagname), end tag (tagname)) |
## ->{name} (DOCTYPE, start tag (tag name), end tag (tag name)) |
168 |
## ISSUE: the spec needs s/tagname/tag name/ |
## ->{public_identifier} (DOCTYPE) |
169 |
## ->{error} == 1 or 0 (DOCTYPE) |
## ->{system_identifier} (DOCTYPE) |
170 |
|
## ->{correct} == 1 or 0 (DOCTYPE) |
171 |
## ->{attributes} isa HASH (start tag, end tag) |
## ->{attributes} isa HASH (start tag, end tag) |
172 |
## ->{data} (comment, character) |
## ->{data} (comment, character) |
173 |
|
|
|
## Macros |
|
|
## Macros MUST be preceded by three EXCLAMATION MARKs. |
|
|
## emit ($token) |
|
|
## Emits the specified token. |
|
|
|
|
174 |
## Emitted token MUST immediately be handled by the tree construction state. |
## Emitted token MUST immediately be handled by the tree construction state. |
175 |
|
|
176 |
## Before each step, UA MAY check to see if either one of the scripts in |
## Before each step, UA MAY check to see if either one of the scripts in |
1103 |
## Stay in the state |
## Stay in the state |
1104 |
!!!next-input-character; |
!!!next-input-character; |
1105 |
redo A; |
redo A; |
|
} elsif (0x0061 <= $self->{next_input_character} and |
|
|
$self->{next_input_character} <= 0x007A) { # a..z |
|
|
## ISSUE: "Set the token's name name to the" in the spec |
|
|
$self->{current_token} = {type => 'DOCTYPE', |
|
|
name => chr ($self->{next_input_character} - 0x0020), |
|
|
error => 1}; |
|
|
$self->{state} = 'DOCTYPE name'; |
|
|
!!!next-input-character; |
|
|
redo A; |
|
1106 |
} elsif ($self->{next_input_character} == 0x003E) { # > |
} elsif ($self->{next_input_character} == 0x003E) { # > |
1107 |
!!!parse-error (type => 'no DOCTYPE name'); |
!!!parse-error (type => 'no DOCTYPE name'); |
1108 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1109 |
!!!next-input-character; |
!!!next-input-character; |
1110 |
|
|
1111 |
!!!emit ({type => 'DOCTYPE', name => '', error => 1}); |
!!!emit ({type => 'DOCTYPE'}); # incorrect |
1112 |
|
|
1113 |
redo A; |
redo A; |
1114 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
1116 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1117 |
## reconsume |
## reconsume |
1118 |
|
|
1119 |
!!!emit ({type => 'DOCTYPE', name => '', error => 1}); |
!!!emit ({type => 'DOCTYPE'}); # incorrect |
1120 |
|
|
1121 |
redo A; |
redo A; |
1122 |
} else { |
} else { |
1123 |
$self->{current_token} = {type => 'DOCTYPE', |
$self->{current_token} |
1124 |
name => chr ($self->{next_input_character}), |
= {type => 'DOCTYPE', |
1125 |
error => 1}; |
name => chr ($self->{next_input_character}), |
1126 |
|
correct => 1}; |
1127 |
## ISSUE: "Set the token's name name to the" in the spec |
## ISSUE: "Set the token's name name to the" in the spec |
1128 |
$self->{state} = 'DOCTYPE name'; |
$self->{state} = 'DOCTYPE name'; |
1129 |
!!!next-input-character; |
!!!next-input-character; |
1130 |
redo A; |
redo A; |
1131 |
} |
} |
1132 |
} elsif ($self->{state} eq 'DOCTYPE name') { |
} elsif ($self->{state} eq 'DOCTYPE name') { |
1133 |
|
## ISSUE: Redundant "First," in the spec. |
1134 |
if ($self->{next_input_character} == 0x0009 or # HT |
if ($self->{next_input_character} == 0x0009 or # HT |
1135 |
$self->{next_input_character} == 0x000A or # LF |
$self->{next_input_character} == 0x000A or # LF |
1136 |
$self->{next_input_character} == 0x000B or # VT |
$self->{next_input_character} == 0x000B or # VT |
1137 |
$self->{next_input_character} == 0x000C or # FF |
$self->{next_input_character} == 0x000C or # FF |
1138 |
$self->{next_input_character} == 0x0020) { # SP |
$self->{next_input_character} == 0x0020) { # SP |
|
$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE |
|
1139 |
$self->{state} = 'after DOCTYPE name'; |
$self->{state} = 'after DOCTYPE name'; |
1140 |
!!!next-input-character; |
!!!next-input-character; |
1141 |
redo A; |
redo A; |
1142 |
} elsif ($self->{next_input_character} == 0x003E) { # > |
} elsif ($self->{next_input_character} == 0x003E) { # > |
|
$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE |
|
1143 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1144 |
!!!next-input-character; |
!!!next-input-character; |
1145 |
|
|
1147 |
undef $self->{current_token}; |
undef $self->{current_token}; |
1148 |
|
|
1149 |
redo A; |
redo A; |
|
} elsif (0x0061 <= $self->{next_input_character} and |
|
|
$self->{next_input_character} <= 0x007A) { # a..z |
|
|
$self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE |
|
|
#$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); |
|
|
## Stay in the state |
|
|
!!!next-input-character; |
|
|
redo A; |
|
1150 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
1151 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
|
$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE |
|
1152 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1153 |
## reconsume |
## reconsume |
1154 |
|
|
1155 |
!!!emit ($self->{current_token}); |
delete $self->{current_token}->{correct}; |
1156 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1157 |
undef $self->{current_token}; |
undef $self->{current_token}; |
1158 |
|
|
1159 |
redo A; |
redo A; |
1160 |
} else { |
} else { |
1161 |
$self->{current_token}->{name} |
$self->{current_token}->{name} |
1162 |
.= chr ($self->{next_input_character}); # DOCTYPE |
.= chr ($self->{next_input_character}); # DOCTYPE |
|
#$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); |
|
1163 |
## Stay in the state |
## Stay in the state |
1164 |
!!!next-input-character; |
!!!next-input-character; |
1165 |
redo A; |
redo A; |
1186 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1187 |
## reconsume |
## reconsume |
1188 |
|
|
1189 |
|
delete $self->{current_token}->{correct}; |
1190 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1191 |
|
undef $self->{current_token}; |
1192 |
|
|
1193 |
|
redo A; |
1194 |
|
} elsif ($self->{next_input_character} == 0x0050 or # P |
1195 |
|
$self->{next_input_character} == 0x0070) { # p |
1196 |
|
!!!next-input-character; |
1197 |
|
if ($self->{next_input_character} == 0x0055 or # U |
1198 |
|
$self->{next_input_character} == 0x0075) { # u |
1199 |
|
!!!next-input-character; |
1200 |
|
if ($self->{next_input_character} == 0x0042 or # B |
1201 |
|
$self->{next_input_character} == 0x0062) { # b |
1202 |
|
!!!next-input-character; |
1203 |
|
if ($self->{next_input_character} == 0x004C or # L |
1204 |
|
$self->{next_input_character} == 0x006C) { # l |
1205 |
|
!!!next-input-character; |
1206 |
|
if ($self->{next_input_character} == 0x0049 or # I |
1207 |
|
$self->{next_input_character} == 0x0069) { # i |
1208 |
|
!!!next-input-character; |
1209 |
|
if ($self->{next_input_character} == 0x0043 or # C |
1210 |
|
$self->{next_input_character} == 0x0063) { # c |
1211 |
|
$self->{state} = 'before DOCTYPE public identifier'; |
1212 |
|
!!!next-input-character; |
1213 |
|
redo A; |
1214 |
|
} |
1215 |
|
} |
1216 |
|
} |
1217 |
|
} |
1218 |
|
} |
1219 |
|
|
1220 |
|
# |
1221 |
|
} elsif ($self->{next_input_character} == 0x0053 or # S |
1222 |
|
$self->{next_input_character} == 0x0073) { # s |
1223 |
|
!!!next-input-character; |
1224 |
|
if ($self->{next_input_character} == 0x0059 or # Y |
1225 |
|
$self->{next_input_character} == 0x0079) { # y |
1226 |
|
!!!next-input-character; |
1227 |
|
if ($self->{next_input_character} == 0x0053 or # S |
1228 |
|
$self->{next_input_character} == 0x0073) { # s |
1229 |
|
!!!next-input-character; |
1230 |
|
if ($self->{next_input_character} == 0x0054 or # T |
1231 |
|
$self->{next_input_character} == 0x0074) { # t |
1232 |
|
!!!next-input-character; |
1233 |
|
if ($self->{next_input_character} == 0x0045 or # E |
1234 |
|
$self->{next_input_character} == 0x0065) { # e |
1235 |
|
!!!next-input-character; |
1236 |
|
if ($self->{next_input_character} == 0x004D or # M |
1237 |
|
$self->{next_input_character} == 0x006D) { # m |
1238 |
|
$self->{state} = 'before DOCTYPE system identifier'; |
1239 |
|
!!!next-input-character; |
1240 |
|
redo A; |
1241 |
|
} |
1242 |
|
} |
1243 |
|
} |
1244 |
|
} |
1245 |
|
} |
1246 |
|
|
1247 |
|
# |
1248 |
|
} else { |
1249 |
|
!!!next-input-character; |
1250 |
|
# |
1251 |
|
} |
1252 |
|
|
1253 |
|
!!!parse-error (type => 'string after DOCTYPE name'); |
1254 |
|
$self->{state} = 'bogus DOCTYPE'; |
1255 |
|
# next-input-character is already done |
1256 |
|
redo A; |
1257 |
|
} elsif ($self->{state} eq 'before DOCTYPE public identifier') { |
1258 |
|
if ({ |
1259 |
|
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1, |
1260 |
|
#0x000D => 1, # HT, LF, VT, FF, SP, CR |
1261 |
|
}->{$self->{next_input_character}}) { |
1262 |
|
## Stay in the state |
1263 |
|
!!!next-input-character; |
1264 |
|
redo A; |
1265 |
|
} elsif ($self->{next_input_character} eq 0x0022) { # " |
1266 |
|
$self->{current_token}->{public_identifier} = ''; # DOCTYPE |
1267 |
|
$self->{state} = 'DOCTYPE public identifier (double-quoted)'; |
1268 |
|
!!!next-input-character; |
1269 |
|
redo A; |
1270 |
|
} elsif ($self->{next_input_character} eq 0x0027) { # ' |
1271 |
|
$self->{current_token}->{public_identifier} = ''; # DOCTYPE |
1272 |
|
$self->{state} = 'DOCTYPE public identifier (single-quoted)'; |
1273 |
|
!!!next-input-character; |
1274 |
|
redo A; |
1275 |
|
} elsif ($self->{next_input_character} eq 0x003E) { # > |
1276 |
|
!!!parse-error (type => 'no PUBLIC literal'); |
1277 |
|
|
1278 |
|
$self->{state} = 'data'; |
1279 |
|
!!!next-input-character; |
1280 |
|
|
1281 |
|
delete $self->{current_token}->{correct}; |
1282 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1283 |
|
undef $self->{current_token}; |
1284 |
|
|
1285 |
|
redo A; |
1286 |
|
} elsif ($self->{next_input_character} == -1) { |
1287 |
|
!!!parse-error (type => 'unclosed DOCTYPE'); |
1288 |
|
|
1289 |
|
$self->{state} = 'data'; |
1290 |
|
## reconsume |
1291 |
|
|
1292 |
|
delete $self->{current_token}->{correct}; |
1293 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1294 |
|
undef $self->{current_token}; |
1295 |
|
|
1296 |
|
redo A; |
1297 |
|
} else { |
1298 |
|
!!!parse-error (type => 'string after PUBLIC'); |
1299 |
|
$self->{state} = 'bogus DOCTYPE'; |
1300 |
|
!!!next-input-character; |
1301 |
|
redo A; |
1302 |
|
} |
1303 |
|
} elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') { |
1304 |
|
if ($self->{next_input_character} == 0x0022) { # " |
1305 |
|
$self->{state} = 'after DOCTYPE public identifier'; |
1306 |
|
!!!next-input-character; |
1307 |
|
redo A; |
1308 |
|
} elsif ($self->{next_input_character} == -1) { |
1309 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
1310 |
|
|
1311 |
|
$self->{state} = 'data'; |
1312 |
|
## reconsume |
1313 |
|
|
1314 |
|
delete $self->{current_token}->{correct}; |
1315 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1316 |
|
undef $self->{current_token}; |
1317 |
|
|
1318 |
|
redo A; |
1319 |
|
} else { |
1320 |
|
$self->{current_token}->{public_identifier} # DOCTYPE |
1321 |
|
.= chr $self->{next_input_character}; |
1322 |
|
## Stay in the state |
1323 |
|
!!!next-input-character; |
1324 |
|
redo A; |
1325 |
|
} |
1326 |
|
} elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') { |
1327 |
|
if ($self->{next_input_character} == 0x0027) { # ' |
1328 |
|
$self->{state} = 'after DOCTYPE public identifier'; |
1329 |
|
!!!next-input-character; |
1330 |
|
redo A; |
1331 |
|
} elsif ($self->{next_input_character} == -1) { |
1332 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
1333 |
|
|
1334 |
|
$self->{state} = 'data'; |
1335 |
|
## reconsume |
1336 |
|
|
1337 |
|
delete $self->{current_token}->{correct}; |
1338 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1339 |
|
undef $self->{current_token}; |
1340 |
|
|
1341 |
|
redo A; |
1342 |
|
} else { |
1343 |
|
$self->{current_token}->{public_identifier} # DOCTYPE |
1344 |
|
.= chr $self->{next_input_character}; |
1345 |
|
## Stay in the state |
1346 |
|
!!!next-input-character; |
1347 |
|
redo A; |
1348 |
|
} |
1349 |
|
} elsif ($self->{state} eq 'after DOCTYPE public identifier') { |
1350 |
|
if ({ |
1351 |
|
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1, |
1352 |
|
#0x000D => 1, # HT, LF, VT, FF, SP, CR |
1353 |
|
}->{$self->{next_input_character}}) { |
1354 |
|
## Stay in the state |
1355 |
|
!!!next-input-character; |
1356 |
|
redo A; |
1357 |
|
} elsif ($self->{next_input_character} == 0x0022) { # " |
1358 |
|
$self->{current_token}->{system_identifier} = ''; # DOCTYPE |
1359 |
|
$self->{state} = 'DOCTYPE system identifier (double-quoted)'; |
1360 |
|
!!!next-input-character; |
1361 |
|
redo A; |
1362 |
|
} elsif ($self->{next_input_character} == 0x0027) { # ' |
1363 |
|
$self->{current_token}->{system_identifier} = ''; # DOCTYPE |
1364 |
|
$self->{state} = 'DOCTYPE system identifier (single-quoted)'; |
1365 |
|
!!!next-input-character; |
1366 |
|
redo A; |
1367 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
1368 |
|
$self->{state} = 'data'; |
1369 |
|
!!!next-input-character; |
1370 |
|
|
1371 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1372 |
|
undef $self->{current_token}; |
1373 |
|
|
1374 |
|
redo A; |
1375 |
|
} elsif ($self->{next_input_character} == -1) { |
1376 |
|
!!!parse-error (type => 'unclosed DOCTYPE'); |
1377 |
|
|
1378 |
|
$self->{state} = 'data'; |
1379 |
|
## reconsume |
1380 |
|
|
1381 |
|
delete $self->{current_token}->{correct}; |
1382 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1383 |
|
undef $self->{current_token}; |
1384 |
|
|
1385 |
|
redo A; |
1386 |
|
} else { |
1387 |
|
!!!parse-error (type => 'string after PUBLIC literal'); |
1388 |
|
$self->{state} = 'bogus DOCTYPE'; |
1389 |
|
!!!next-input-character; |
1390 |
|
redo A; |
1391 |
|
} |
1392 |
|
} elsif ($self->{state} eq 'before DOCTYPE system identifier') { |
1393 |
|
if ({ |
1394 |
|
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1, |
1395 |
|
#0x000D => 1, # HT, LF, VT, FF, SP, CR |
1396 |
|
}->{$self->{next_input_character}}) { |
1397 |
|
## Stay in the state |
1398 |
|
!!!next-input-character; |
1399 |
|
redo A; |
1400 |
|
} elsif ($self->{next_input_character} == 0x0022) { # " |
1401 |
|
$self->{current_token}->{system_identifier} = ''; # DOCTYPE |
1402 |
|
$self->{state} = 'DOCTYPE system identifier (double-quoted)'; |
1403 |
|
!!!next-input-character; |
1404 |
|
redo A; |
1405 |
|
} elsif ($self->{next_input_character} == 0x0027) { # ' |
1406 |
|
$self->{current_token}->{system_identifier} = ''; # DOCTYPE |
1407 |
|
$self->{state} = 'DOCTYPE system identifier (single-quoted)'; |
1408 |
|
!!!next-input-character; |
1409 |
|
redo A; |
1410 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
1411 |
|
!!!parse-error (type => 'no SYSTEM literal'); |
1412 |
|
$self->{state} = 'data'; |
1413 |
|
!!!next-input-character; |
1414 |
|
|
1415 |
|
delete $self->{current_token}->{correct}; |
1416 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1417 |
|
undef $self->{current_token}; |
1418 |
|
|
1419 |
|
redo A; |
1420 |
|
} elsif ($self->{next_input_character} == -1) { |
1421 |
|
!!!parse-error (type => 'unclosed DOCTYPE'); |
1422 |
|
|
1423 |
|
$self->{state} = 'data'; |
1424 |
|
## reconsume |
1425 |
|
|
1426 |
|
delete $self->{current_token}->{correct}; |
1427 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1428 |
|
undef $self->{current_token}; |
1429 |
|
|
1430 |
|
redo A; |
1431 |
|
} else { |
1432 |
|
!!!parse-error (type => 'string after PUBLIC literal'); |
1433 |
|
$self->{state} = 'bogus DOCTYPE'; |
1434 |
|
!!!next-input-character; |
1435 |
|
redo A; |
1436 |
|
} |
1437 |
|
} elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') { |
1438 |
|
if ($self->{next_input_character} == 0x0022) { # " |
1439 |
|
$self->{state} = 'after DOCTYPE system identifier'; |
1440 |
|
!!!next-input-character; |
1441 |
|
redo A; |
1442 |
|
} elsif ($self->{next_input_character} == -1) { |
1443 |
|
!!!parse-error (type => 'unclosed SYSTEM literal'); |
1444 |
|
|
1445 |
|
$self->{state} = 'data'; |
1446 |
|
## reconsume |
1447 |
|
|
1448 |
|
delete $self->{current_token}->{correct}; |
1449 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1450 |
|
undef $self->{current_token}; |
1451 |
|
|
1452 |
|
redo A; |
1453 |
|
} else { |
1454 |
|
$self->{current_token}->{system_identifier} # DOCTYPE |
1455 |
|
.= chr $self->{next_input_character}; |
1456 |
|
## Stay in the state |
1457 |
|
!!!next-input-character; |
1458 |
|
redo A; |
1459 |
|
} |
1460 |
|
} elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') { |
1461 |
|
if ($self->{next_input_character} == 0x0027) { # ' |
1462 |
|
$self->{state} = 'after DOCTYPE system identifier'; |
1463 |
|
!!!next-input-character; |
1464 |
|
redo A; |
1465 |
|
} elsif ($self->{next_input_character} == -1) { |
1466 |
|
!!!parse-error (type => 'unclosed SYSTEM literal'); |
1467 |
|
|
1468 |
|
$self->{state} = 'data'; |
1469 |
|
## reconsume |
1470 |
|
|
1471 |
|
delete $self->{current_token}->{correct}; |
1472 |
!!!emit ($self->{current_token}); # DOCTYPE |
!!!emit ($self->{current_token}); # DOCTYPE |
1473 |
undef $self->{current_token}; |
undef $self->{current_token}; |
1474 |
|
|
1475 |
redo A; |
redo A; |
1476 |
} else { |
} else { |
1477 |
!!!parse-error (type => 'string after DOCTYPE name'); |
$self->{current_token}->{system_identifier} # DOCTYPE |
1478 |
$self->{current_token}->{error} = 1; # DOCTYPE |
.= chr $self->{next_input_character}; |
1479 |
|
## Stay in the state |
1480 |
|
!!!next-input-character; |
1481 |
|
redo A; |
1482 |
|
} |
1483 |
|
} elsif ($self->{state} eq 'after DOCTYPE system identifier') { |
1484 |
|
if ({ |
1485 |
|
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1, |
1486 |
|
#0x000D => 1, # HT, LF, VT, FF, SP, CR |
1487 |
|
}->{$self->{next_input_character}}) { |
1488 |
|
## Stay in the state |
1489 |
|
!!!next-input-character; |
1490 |
|
redo A; |
1491 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
1492 |
|
$self->{state} = 'data'; |
1493 |
|
!!!next-input-character; |
1494 |
|
|
1495 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1496 |
|
undef $self->{current_token}; |
1497 |
|
|
1498 |
|
redo A; |
1499 |
|
} elsif ($self->{next_input_character} == -1) { |
1500 |
|
!!!parse-error (type => 'unclosed DOCTYPE'); |
1501 |
|
|
1502 |
|
$self->{state} = 'data'; |
1503 |
|
## reconsume |
1504 |
|
|
1505 |
|
delete $self->{current_token}->{correct}; |
1506 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
1507 |
|
undef $self->{current_token}; |
1508 |
|
|
1509 |
|
redo A; |
1510 |
|
} else { |
1511 |
|
!!!parse-error (type => 'string after SYSTEM literal'); |
1512 |
$self->{state} = 'bogus DOCTYPE'; |
$self->{state} = 'bogus DOCTYPE'; |
1513 |
!!!next-input-character; |
!!!next-input-character; |
1514 |
redo A; |
redo A; |
1518 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1519 |
!!!next-input-character; |
!!!next-input-character; |
1520 |
|
|
1521 |
|
delete $self->{current_token}->{correct}; |
1522 |
!!!emit ($self->{current_token}); # DOCTYPE |
!!!emit ($self->{current_token}); # DOCTYPE |
1523 |
undef $self->{current_token}; |
undef $self->{current_token}; |
1524 |
|
|
1528 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1529 |
## reconsume |
## reconsume |
1530 |
|
|
1531 |
|
delete $self->{current_token}->{correct}; |
1532 |
!!!emit ($self->{current_token}); # DOCTYPE |
!!!emit ($self->{current_token}); # DOCTYPE |
1533 |
undef $self->{current_token}; |
undef $self->{current_token}; |
1534 |
|
|
1696 |
$self->{document}->strict_error_checking (0); |
$self->{document}->strict_error_checking (0); |
1697 |
## TODO: Turn mutation events off # MUST |
## TODO: Turn mutation events off # MUST |
1698 |
## TODO: Turn loose Document option (manakai extension) on |
## TODO: Turn loose Document option (manakai extension) on |
1699 |
## TODO: Mark the Document as an HTML document # MUST |
$self->{document}->manakai_is_html (1); # MUST |
1700 |
} # _initialize_tree_constructor |
} # _initialize_tree_constructor |
1701 |
|
|
1702 |
sub _terminate_tree_constructor ($) { |
sub _terminate_tree_constructor ($) { |
1736 |
|
|
1737 |
sub _tree_construction_initial ($) { |
sub _tree_construction_initial ($) { |
1738 |
my $self = shift; |
my $self = shift; |
1739 |
B: { |
INITIAL: { |
1740 |
if ($token->{type} eq 'DOCTYPE') { |
if ($token->{type} eq 'DOCTYPE') { |
1741 |
if ($token->{error}) { |
## NOTE: Conformance checkers MAY, instead of reporting "not HTML5" |
1742 |
## ISSUE: The spec currently leaves this case undefined. |
## error, switch to a conformance checking mode for another |
1743 |
!!!parse-error (type => 'bogus DOCTYPE'); |
## language. |
1744 |
} |
my $doctype_name = $token->{name}; |
1745 |
my $doctype = $self->{document}->create_document_type_definition |
$doctype_name = '' unless defined $doctype_name; |
1746 |
($token->{name}); |
$doctype_name =~ tr/a-z/A-Z/; |
1747 |
$self->{document}->append_child ($doctype); |
if (not defined $token->{name} or # <!DOCTYPE> |
1748 |
#$phase = 'root element'; |
defined $token->{public_identifier} or |
1749 |
!!!next-token; |
defined $token->{system_identifier}) { |
1750 |
#redo B; |
!!!parse-error (type => 'not HTML5'); |
1751 |
return; |
} elsif ($doctype_name ne 'HTML') { |
1752 |
} elsif ({ |
## ISSUE: ASCII case-insensitive? (in fact it does not matter) |
1753 |
comment => 1, |
!!!parse-error (type => 'not HTML5'); |
1754 |
'start tag' => 1, |
} |
1755 |
'end tag' => 1, |
|
1756 |
'end-of-file' => 1, |
my $doctype = $self->{document}->create_document_type_definition |
1757 |
}->{$token->{type}}) { |
($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)? |
1758 |
## ISSUE: The spec currently leaves this case undefined. |
$doctype->public_id ($token->{public_identifier}) |
1759 |
!!!parse-error (type => 'missing DOCTYPE'); |
if defined $token->{public_identifier}; |
1760 |
#$phase = 'root element'; |
$doctype->system_id ($token->{system_identifier}) |
1761 |
## reprocess |
if defined $token->{system_identifier}; |
1762 |
#redo B; |
## NOTE: Other DocumentType attributes are null or empty lists. |
1763 |
return; |
## ISSUE: internalSubset = null?? |
1764 |
} elsif ($token->{type} eq 'character') { |
$self->{document}->append_child ($doctype); |
1765 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
|
1766 |
$self->{document}->manakai_append_text ($1); |
if (not $token->{correct} or $doctype_name ne 'HTML') { |
1767 |
## ISSUE: DOM3 Core does not allow Document > Text |
$self->{document}->manakai_compat_mode ('quirks'); |
1768 |
unless (length $token->{data}) { |
} elsif (defined $token->{public_identifier}) { |
1769 |
## Stay in the phase |
my $pubid = $token->{public_identifier}; |
1770 |
!!!next-token; |
$pubid =~ tr/a-z/A-z/; |
1771 |
redo B; |
if ({ |
1772 |
|
"+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1, |
1773 |
|
"-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1, |
1774 |
|
"-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1, |
1775 |
|
"-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1, |
1776 |
|
"-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1, |
1777 |
|
"-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1, |
1778 |
|
"-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1, |
1779 |
|
"-//IETF//DTD HTML 2.0 STRICT//EN" => 1, |
1780 |
|
"-//IETF//DTD HTML 2.0//EN" => 1, |
1781 |
|
"-//IETF//DTD HTML 2.1E//EN" => 1, |
1782 |
|
"-//IETF//DTD HTML 3.0//EN" => 1, |
1783 |
|
"-//IETF//DTD HTML 3.0//EN//" => 1, |
1784 |
|
"-//IETF//DTD HTML 3.2 FINAL//EN" => 1, |
1785 |
|
"-//IETF//DTD HTML 3.2//EN" => 1, |
1786 |
|
"-//IETF//DTD HTML 3//EN" => 1, |
1787 |
|
"-//IETF//DTD HTML LEVEL 0//EN" => 1, |
1788 |
|
"-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1, |
1789 |
|
"-//IETF//DTD HTML LEVEL 1//EN" => 1, |
1790 |
|
"-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1, |
1791 |
|
"-//IETF//DTD HTML LEVEL 2//EN" => 1, |
1792 |
|
"-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1, |
1793 |
|
"-//IETF//DTD HTML LEVEL 3//EN" => 1, |
1794 |
|
"-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1, |
1795 |
|
"-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1, |
1796 |
|
"-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1, |
1797 |
|
"-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1, |
1798 |
|
"-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1, |
1799 |
|
"-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1, |
1800 |
|
"-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1, |
1801 |
|
"-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1, |
1802 |
|
"-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1, |
1803 |
|
"-//IETF//DTD HTML STRICT//EN" => 1, |
1804 |
|
"-//IETF//DTD HTML STRICT//EN//2.0" => 1, |
1805 |
|
"-//IETF//DTD HTML STRICT//EN//3.0" => 1, |
1806 |
|
"-//IETF//DTD HTML//EN" => 1, |
1807 |
|
"-//IETF//DTD HTML//EN//2.0" => 1, |
1808 |
|
"-//IETF//DTD HTML//EN//3.0" => 1, |
1809 |
|
"-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1, |
1810 |
|
"-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1, |
1811 |
|
"-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1, |
1812 |
|
"-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1, |
1813 |
|
"-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1, |
1814 |
|
"-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1, |
1815 |
|
"-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1, |
1816 |
|
"-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1, |
1817 |
|
"-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1, |
1818 |
|
"-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1, |
1819 |
|
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1, |
1820 |
|
"-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1, |
1821 |
|
"-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1, |
1822 |
|
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1, |
1823 |
|
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1, |
1824 |
|
"-//W3C//DTD HTML 3 1995-03-24//EN" => 1, |
1825 |
|
"-//W3C//DTD HTML 3.2 DRAFT//EN" => 1, |
1826 |
|
"-//W3C//DTD HTML 3.2 FINAL//EN" => 1, |
1827 |
|
"-//W3C//DTD HTML 3.2//EN" => 1, |
1828 |
|
"-//W3C//DTD HTML 3.2S DRAFT//EN" => 1, |
1829 |
|
"-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1, |
1830 |
|
"-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1, |
1831 |
|
"-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1, |
1832 |
|
"-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1, |
1833 |
|
"-//W3C//DTD W3 HTML//EN" => 1, |
1834 |
|
"-//W3O//DTD W3 HTML 3.0//EN" => 1, |
1835 |
|
"-//W3O//DTD W3 HTML 3.0//EN//" => 1, |
1836 |
|
"-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1, |
1837 |
|
"-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1, |
1838 |
|
"-//WEBTECHS//DTD MOZILLA HTML//EN" => 1, |
1839 |
|
"-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1, |
1840 |
|
"HTML" => 1, |
1841 |
|
}->{$pubid}) { |
1842 |
|
$self->{document}->manakai_compat_mode ('quirks'); |
1843 |
|
} elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or |
1844 |
|
$pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") { |
1845 |
|
if (defined $token->{system_identifier}) { |
1846 |
|
$self->{document}->manakai_compat_mode ('quirks'); |
1847 |
|
} else { |
1848 |
|
$self->{document}->manakai_compat_mode ('limited quirks'); |
1849 |
} |
} |
1850 |
|
} elsif ($pubid eq "-//W3C//DTD XHTML 1.0 Frameset//EN" or |
1851 |
|
$pubid eq "-//W3C//DTD XHTML 1.0 Transitional//EN") { |
1852 |
|
$self->{document}->manakai_compat_mode ('limited quirks'); |
1853 |
|
} |
1854 |
|
} |
1855 |
|
if (defined $token->{system_identifier}) { |
1856 |
|
my $sysid = $token->{system_identifier}; |
1857 |
|
$sysid =~ tr/A-Z/a-z/; |
1858 |
|
if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") { |
1859 |
|
$self->{document}->manakai_compat_mode ('quirks'); |
1860 |
} |
} |
|
## ISSUE: The spec currently leaves this case undefined. |
|
|
!!!parse-error (type => 'missing DOCTYPE'); |
|
|
#$phase = 'root element'; |
|
|
## reprocess |
|
|
#redo B; |
|
|
return; |
|
|
} else { |
|
|
die "$0: $token->{type}: Unknown token"; |
|
1861 |
} |
} |
1862 |
} # B |
|
1863 |
|
## Go to the root element phase. |
1864 |
|
!!!next-token; |
1865 |
|
return; |
1866 |
|
} elsif ({ |
1867 |
|
'start tag' => 1, |
1868 |
|
'end tag' => 1, |
1869 |
|
'end-of-file' => 1, |
1870 |
|
}->{$token->{type}}) { |
1871 |
|
!!!parse-error (type => 'no DOCTYPE'); |
1872 |
|
$self->{document}->manakai_compat_mode ('quirks'); |
1873 |
|
## Go to the root element phase |
1874 |
|
## reprocess |
1875 |
|
return; |
1876 |
|
} elsif ($token->{type} eq 'character') { |
1877 |
|
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D |
1878 |
|
## Ignore the token |
1879 |
|
unless (length $token->{data}) { |
1880 |
|
## Stay in the phase |
1881 |
|
!!!next-token; |
1882 |
|
redo INITIAL; |
1883 |
|
} |
1884 |
|
} |
1885 |
|
|
1886 |
|
!!!parse-error (type => 'no DOCTYPE'); |
1887 |
|
$self->{document}->manakai_compat_mode ('quirks'); |
1888 |
|
## Go to the root element phase |
1889 |
|
## reprocess |
1890 |
|
return; |
1891 |
|
} elsif ($token->{type} eq 'comment') { |
1892 |
|
my $comment = $self->{document}->create_comment ($token->{data}); |
1893 |
|
$self->{document}->append_child ($comment); |
1894 |
|
|
1895 |
|
## Stay in the phase. |
1896 |
|
!!!next-token; |
1897 |
|
redo INITIAL; |
1898 |
|
} else { |
1899 |
|
die "$0: $token->{type}: Unknown token"; |
1900 |
|
} |
1901 |
|
} # INITIAL |
1902 |
} # _tree_construction_initial |
} # _tree_construction_initial |
1903 |
|
|
1904 |
sub _tree_construction_root_element ($) { |
sub _tree_construction_root_element ($) { |
5084 |
## Step 1 # MUST |
## Step 1 # MUST |
5085 |
my $this_doc = $node->owner_document; |
my $this_doc = $node->owner_document; |
5086 |
my $doc = $this_doc->implementation->create_document; |
my $doc = $this_doc->implementation->create_document; |
5087 |
## TODO: Mark as HTML document |
$doc->manakai_is_html (1); |
5088 |
my $p = $class->new; |
my $p = $class->new; |
5089 |
$p->{document} = $doc; |
$p->{document} = $doc; |
5090 |
|
|