/[suikacvs]/markup/html/scripting-parser/parser.html
Suika

Contents of /markup/html/scripting-parser/parser.html

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (show annotations) (download) (as text)
Sun Apr 27 09:16:11 2008 UTC (17 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.8: +16 -11 lines
File MIME type: text/html
Attribute parsing bug fixed

1 <!DOCTYPE HTML>
2 <html lang=en>
3 <head>
4 <title>Live Scripting HTML Parser</title>
5 <style>
6 h1, h2 {
7 margin: 0;
8 font-size: 100%;
9 }
10 p, pre {
11 margin: 0;
12 }
13 textarea {
14 width: 100%;
15 -width: 99%;
16 height: 10em;
17 }
18 output {
19 display: block;
20 font-family: monospace;
21 white-space: -moz-pre-wrap;
22 white-space: pre-wrap;
23 }
24 </style>
25 <script>
// Handle of the pending debounce timer, or 0 when none has been scheduled.
var delayedUpdater = 0;

// Requests a re-parse of the source text.  Any request that is still
// pending is cancelled first, so that |update2| runs once, 100ms after
// the most recent call.
function update () {
  var pending = delayedUpdater;
  delayedUpdater = 0;
  if (pending) {
    clearTimeout (pending);
  }
  delayedUpdater = setTimeout (update2, 100);
} // update
35
// Re-parses the text of |document.sourceElement| if it differs from the
// text that was parsed last time: refreshes the two links, clears the
// log area, runs a fresh parser and logs a dump of the resulting tree.
function update2 () {
  var source = document.sourceElement.value;
  if (source == document.previousSourceText) {
    return;
  }
  document.previousSourceText = source;

  var encoded = encodeURIComponent (source);
  document.links['permalink'].href = location.pathname + '?s=' + encoded;
  document.links['ldvlink'].href
      = 'http://software.hixie.ch/utilities/js/live-dom-viewer/?' + encoded;

  document.logElement.textContent = '';
  var parser = new Parser (new InputStream (source));
  // The document is picked up before parsing starts.
  var tree = parser.doc;
  parser.parse ();
  log (dumpTree (tree, ''));
} // update2
53
// Current nesting depth of the log; one space is prepended per level.
var logIndentLevel = 0;

// Appends |s|, indented by the current nesting depth and terminated by a
// newline, to |document.logElement| as a new text node.
function log (s) {
  var line = s + "\n";
  var depth = logIndentLevel;
  while (depth-- > 0) {
    line = ' ' + line;
  }
  document.logElement.appendChild (document.createTextNode (line));
} // log
61
// A mutable holder for the text that still has to be tokenised.
//   s - The initial text; kept in the |s| property.
function InputStream (s) {
  this.s = s;
} // InputStream
65
// Creates a parser reading from the input stream |i|.
//   i   - The input stream object.
//   doc - Optional document to populate.  When omitted, a new JSDocument
//         bound to this parser is created and flagged as an HTML document.
function Parser (i, doc) {
  this.parseMode = 'pcdata';

  var target = doc;
  if (!target) {
    target = new JSDocument (this);
    target.manakaiIsHTML = true;
  }

  this.doc = target;
  // The stack of open elements starts with the document itself.
  this.openElements = [target];
  this.input = i;
  this.scriptsExecutedAfterParsing = [];
} // Parser
77
// Removes the next token from the front of the input stream and returns
// it.  The returned object has a |type| that is one of:
//   'char'      - |value| is a run of text;
//   'start-tag' - |value| is the lowercased tag name and |attrs| maps
//                 lowercased attribute names to their values ('' when the
//                 attribute has no value);
//   'end-tag'   - |value| is the lowercased content of the end tag;
//   'abort'     - nothing more can be consumed before the insertion point;
//   'eof'       - the input stream is empty.
// |this.insertionPoint| is decreased by the number of characters consumed.
// When it is undefined (or NaN) no comparison against it succeeds, so the
// whole input is available.
Parser.prototype.getNextToken = function () {
  var p = this;
  var i = this.input;
  var token;

  // A tag can only be consumed when it lies wholly before the insertion
  // point.  A tag that was cut off by the end of the input and that ends
  // exactly at the insertion point might still be extended later.
  function mustWaitForTag (s) {
    return p.insertionPoint < s.length ||
        (p.insertionPoint <= s.length && s.charAt (s.length - 1) != '>');
  }

  if (this.parseMode == 'script') {
    if (p.insertionPoint <= 0) {
      return {type: 'abort'};
    }

    // Text up to the next "<", cut at the insertion point if necessary.
    i.s = i.s.replace (/^([^<]+)/, function (s, t) {
      if (0 < p.insertionPoint && p.insertionPoint < t.length) {
        var ip = p.insertionPoint;
        token = {type: 'char', value: t.substring (0, ip)};
        p.insertionPoint = 0;
        return t.substring (ip, t.length);
      }
      token = {type: 'char', value: t};
      p.insertionPoint -= t.length;
      return '';
    });
    if (token) return token;

    // A complete end tag of the script element.
    i.s = i.s.replace (/^<\/[Ss][Cc][Rr][Ii][Pp][Tt]>/, function (s) {
      if (p.insertionPoint < s.length) {
        token = {type: 'abort'};
        return s;
      }
      token = {type: 'end-tag', value: 'script'};
      p.insertionPoint -= s.length;
      return '';
    });
    if (token) return token;

    // What has been seen so far might be the beginning of an end tag.
    // NOTE(review): the candidate name is cut at |insertionPoint| while
    // 'script' is cut two characters earlier, so this only matches when
    // the candidate name is short enough - confirm that this is intended.
    if (p.insertionPoint < '</script'.length) {
      var m = i.s.match (/^<\/([SCRIPTscript]+)/);
      if (m) {
        var v = m[1].substring (0, p.insertionPoint).toLowerCase ();
        if (v == 'script'.substring (0, p.insertionPoint - '</'.length)) {
          return {type: 'abort'};
        }
      }
    }

    // Any other "<" is just text.
    i.s = i.s.replace (/^</, function (s) {
      token = {type: 'char', value: s};
      p.insertionPoint -= s.length;
      return '';
    });
    if (token) return token;
    return {type: 'eof'};
  }

  // End tag.
  i.s = i.s.replace (/^<\/([^>]+)(?:>|$)/, function (s, e) {
    if (mustWaitForTag (s)) {
      token = {type: 'abort'};
      return s;
    }
    token = {type: 'end-tag', value: e.toLowerCase ()};
    p.insertionPoint -= s.length;
    return '';
  });
  if (token) return token;

  // Start tag.
  i.s = i.s.replace (/^<([^>]+)(?:>|$)/, function (s, e) {
    if (mustWaitForTag (s)) {
      token = {type: 'abort'};
      return s;
    }
    var tagName;
    var attrs = {};
    e = e.replace (/^[\S]+/, function (v) {
      tagName = v.toLowerCase ();
      return '';
    });
    // Strip attribute specifications one by one from the front.
    while (true) {
      var matched = false;
      e = e.replace (/^\s*([^\s=]+)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^"'\s]*)))?/,
      function (x, attrName, attrValue1, attrValue2, attrValue3) {
        // Groups that did not take part in the match are undefined, so
        // an attribute without a value ends up with the empty string.
        var value = attrValue1 || attrValue2 || attrValue3 || '';
        value = value.replace (/&quot;/g, '"').replace (/&apos;/g, "'")
            .replace (/&amp;/g, '&');
        attrs[attrName.toLowerCase ()] = value;
        matched = true;
        return '';
      });
      if (!matched) break;
    }
    if (e.length) {
      log ('Broken start tag: "' + e + '"');
    }
    token = {type: 'start-tag', value: tagName, attrs: attrs};
    p.insertionPoint -= s.length;
    return '';
  });
  if (token) return token;

  if (p.insertionPoint <= 0) {
    return {type: 'abort'};
  }

  // Text up to the next "<", cut at the insertion point if necessary.
  i.s = i.s.replace (/^[^<]+/, function (s) {
    if (p.insertionPoint < s.length) {
      var ip = p.insertionPoint;
      token = {type: 'char', value: s.substring (0, ip)};
      p.insertionPoint = 0;
      return s.substring (ip, s.length);
    }
    token = {type: 'char', value: s};
    p.insertionPoint -= s.length;
    return '';
  });
  if (token) return token;

  // Anything else (a "<" that does not start a tag) is a single character.
  i.s = i.s.replace (/^[\s\S]/, function (s) {
    token = {type: 'char', value: s};
    p.insertionPoint -= s.length;
    return '';
  });
  if (token) return token;
  return {type: 'eof'};
}; // getNextToken
197
// Runs the simplified tree construction stage: pulls tokens from
// |getNextToken| and builds the tree below |this.doc| until the input is
// exhausted ('eof') or the tokeniser has to stop ('abort').
//
// Script elements are handled by following the numbered steps of the
// specification, which the comments below quote.  When the end of the
// input is reached, the scripts queued in |scriptsExecutedAfterParsing|
// are executed and the final events are logged.  On 'abort', and when a
// reentrant call has to leave a pending script to the outermost parser,
// the method returns early.
Parser.prototype.parse = function () {
  logIndentLevel++;
  log ('parse: start');

  while (true) {
    var token = this.getNextToken ();
    log ('token: ' + token.type + ' "' + token.value + '"');

    if (token.type == 'start-tag') {
      if (token.value == 'script') {
        // 1. Create an element for the token in the HTML namespace.
        var el = new JSElement (this.doc, token.value);
        if (token.attrs.async != null) el.async = true;
        if (token.attrs.defer != null) el.defer = true;
        if (token.attrs.src != null) el.src = token.attrs.src;

        // 2. Mark the element as being "parser-inserted".
        el.manakaiParserInserted = true;

        // 3. Switch the tokeniser's content model flag to the CDATA state.
        this.parseMode = 'script';

        // 4.1. Collect all the character tokens.
        while (true) {
          var scriptToken = this.getNextToken ();
          log ('token: ' + scriptToken.type + ' "' + scriptToken.value + '"');

          if (scriptToken.type == 'char') {
            // 5. Append a single Text node to the script element node.
            el.manakaiAppendText (scriptToken.value);

          // 4.2. Until it returns a token that is not a character token, or
          // until it stops tokenising.
          } else if (scriptToken.type == 'eof' ||
                     (scriptToken.type == 'end-tag' &&
                      scriptToken.value == 'script') ||
                     scriptToken.type == 'abort') {
            // 6. Switched back to the PCDATA state.
            this.parseMode = 'pcdata';

            // 7.1. If the next token is not an end tag token with ...
            if (scriptToken.type != 'end-tag') {
              // 7.2. This is a parse error.
              log ('Parse error: no </' + 'script>');

              // 7.3. Mark the script element as "already executed".
              el.manakaiAlreadyExecuted = true;
            } else {
              // 7.4. Ignore it.
              //
            }
            break;
          }
        }

        // 8.1. If the parser were originally created for the ...
        if (this.fragmentParsingMode) {
          // 8.2. Mark the script element as "already executed" and ...
          // (This has to be the same flag that |appendChild| tests.)
          el.manakaiAlreadyExecuted = true;
          continue;
        }

        // 9.1. Let the old insertion point have the same value as the ...
        var oldInsertionPoint = this.insertionPoint;
        // 9.2. Let the insertion point be just before the next input ...
        this.setInsertionPoint (0);

        // 10. Append the new element to the current node.
        //     (Appending a script element may run it.)
        this.openElements[this.openElements.length - 1].appendChild (el);

        // 11. Let the insertion point have the value of the old ...
        //     The current value is added to account for the text that the
        //     script has inserted before the old insertion point.
        oldInsertionPoint += this.insertionPoint;
        this.setInsertionPoint (oldInsertionPoint);

        // 12. If there is a script that will execute as soon as ...
        while (this.scriptExecutedWhenParserResumes) {
          // 12.1. If the tree construction stage is being called reentrantly
          if (this.reentrant) {
            log ('parse: abort (reentrance)');
            logIndentLevel--;
            return;

          // 12.2. Otherwise
          } else {
            // 1.
            var script = this.scriptExecutedWhenParserResumes;
            this.scriptExecutedWhenParserResumes = null;

            // 2. Pause until the script has completed loading.
            //

            // 3. Let the insertion point to just before the next input char.
            this.setInsertionPoint (0);

            // 4. Execute the script.
            executeScript (this.doc, script);

            // 5. Let the insertion point be undefined again.
            this.setInsertionPoint (undefined);

            // 6. If there is once again a script that will execute ...
            //
          }
        }
      } else {
        // Any other element is appended and becomes the current node.
        var child = new JSElement (this.doc, token.value);
        this.openElements[this.openElements.length - 1].appendChild (child);
        this.openElements.push (child);
      }
    } else if (token.type == 'end-tag') {
      // Only an end tag that matches the current node closes it.
      if (this.openElements[this.openElements.length - 1].localName ==
          token.value) {
        this.openElements.pop ();
      } else {
        log ('parse error: unmatched end tag: ' + token.value);
      }
    } else if (token.type == 'char') {
      this.openElements[this.openElements.length - 1].manakaiAppendText
          (token.value);
    } else if (token.type == 'eof') {
      break;
    } else if (token.type == 'abort') {
      log ('parse: abort');
      logIndentLevel--;
      return;
    }
  }

  log ('stop parsing');

  // readyState = 'interactive'

  // "When a script completes loading" rules start applying.

  // TODO: Handles "list of scripts that will execute as soon as possible"
  // and "list of scripts that will execute asynchronously"

  // Handle "list of scripts that will execute when the document has finished
  // parsing".
  var list = this.scriptsExecutedAfterParsing;
  while (list.length > 0) {
    // TODO: break unless completed loading

    // Step 1.
    //

    // Step 2. and Step 3.
    log ('Executing a |defer|red script...');
    executeScript (this.doc, list.shift ());

    // Step 4.
  }

  log ('DOMContentLoaded event fired');

  // Everything that "delays the load event" has completed:
  // readyState = 'complete'
  log ('load event fired');

  logIndentLevel--;
}; // parse
359
// Sets the insertion point and logs the change.
//   ip - Offset into the remaining input (|this.input.s|).  |undefined|,
//        |null| and NaN all make the insertion point undefined.
// For an offset inside the input, the log shows the (up to ten)
// characters that follow the insertion point.
Parser.prototype.setInsertionPoint = function (ip) {
  if (ip == null || isNaN (ip)) {
    log ('insertion point: set to undefined');
    this.insertionPoint = undefined;
    return;
  }

  var rest = this.input.s;
  if (ip == rest.length) {
    log ('insertion point: end of file');
  } else {
    log ('insertion point: set to ' + ip +
         ' (before "' + rest.substring (ip, ip + 10) + '")');
  }
  this.insertionPoint = ip;
}; // setInsertionPoint
373
// A minimal document node.
//   p - The parser bound to the document; kept in |_parser|.
function JSDocument (p) {
  // Children are kept in a plain array, in document order.
  this.childNodes = [];
  this._parser = p;
} // JSDocument
378
// A minimal element node.
//   doc       - The document that owns the element.
//   localName - The element's local name.
function JSElement (doc, localName) {
  this.localName = localName;
  this.ownerDocument = doc;
  // Children are kept in a plain array, in document order.
  this.childNodes = [];
} // JSElement
384
// Appends the node |e| as the last child of this node and returns |e|.
// Appending a script element additionally performs the "running a
// script" steps: depending on its flags, the script is skipped, queued
// on the parser, or executed at once.
JSDocument.prototype.appendChild = JSElement.prototype.appendChild =
function (e) {
  this.childNodes.push (e);
  e.parentNode = this;

  if (e.localName != 'script') {
    return e;
  }

  logIndentLevel++;
  log ('Running a script: start');

  // A document node has no |ownerDocument|; it is its own document.
  var doc = this.ownerDocument || this;
  var p = doc._parser;

  // 1. Script type
  // 2.1. - 2.3. Scripting disabled, XML innerHTML, unsupported language:
  //      not implemented.

  // 2.4. If the script element has its "already executed" flag set
  if (e.manakaiAlreadyExecuted) {
    // 2.5. Abort these steps at this point.
    log ('Running a script: aborted');
    logIndentLevel--;
    return e;
  }

  // 3. Set the element's "already executed" flag.
  e.manakaiAlreadyExecuted = true;

  // 4. If the element has a src attribute, then a load for ...
  // TODO: load an external resource

  // 5. The first of the following options:
  if (/* TODO: If the document is still being parsed && */
      e.defer && !e.async) {
    // 5.1. Executed when the document has finished parsing.
    p.scriptsExecutedAfterParsing.push (e);
    log ('Running a script: aborted (defer)');
  } else if (e.async) {
    // TODO: asynchronous scripts, with or without a src attribute
  } else if (e.src != null) {
    if (e.manakaiParserInserted) {
      // Executed as soon as the parser resumes.
      if (p.scriptExecutedWhenParserResumes) {
        log ('Error: There is a script that will execute as soon as the parser resumes.');
      }
      p.scriptExecutedWhenParserResumes = e;
      log ('Running a script: aborted (src)');
    } else {
      // TODO: external script that was not inserted by the parser
    }
  } else {
    executeScript (doc, e); // even if other scripts are already executing.
  }

  log ('Running a script: end');
  logIndentLevel--;
  return e;
}; // appendChild
450
// Executes the script element |e| on behalf of the document |doc|.
// The code is taken from the resource named by |e.src| when that is set,
// and from the element's own text otherwise.  If the external resource
// cannot be obtained, an error is logged and nothing is run.
function executeScript (doc, e) {
  log ('executing a script block: start');

  var code;
  if (e.src == null) {
    code = e.text;
  } else {
    code = getExternalScript (e.src);

    // If the load resulted in an error, then ... firing an error event ...
    if (code == null) {
      log ('error event fired at the script element');
      return;
    }

    log ('External script loaded: "' + code + '"');
  }

  // If the load was successful
  log ('load event fired at the script element');

  // Scripting is enabled, Document.designMode is disabled,
  // Document is the active document in its browsing context
  parseAndRunScript (doc, code);

  log ('executing a script block: end');
} // executeScript
481
// Returns the script text that the URI |uri| stands for, or |null| when
// it cannot be obtained (the reason is logged).
//
// Only |javascript:| URIs whose body is a single string literal, in
// single or double quotes, are supported; the result is the content of
// that literal.  An empty literal yields the empty string, which is a
// successfully loaded (empty) script rather than an error.
function getExternalScript (uri) {
  if (!uri.match (/^javascript:/i)) {
    log ('URI scheme not supported: <' + uri + '>');
    return null;
  }

  var m = uri.match (/^javascript:\s*(?:'([^']*)'|"([^"]*)")\s*$/i);
  if (!m) {
    log ('Complex javascript: URI is not supported: <' + uri + '>');
    return null;
  }

  // Exactly one of the two groups took part in the match; the other one
  // is undefined (or empty in some old engines).
  return m[1] || m[2] || '';
} // getExternalScript
502
// Interprets the script text |s| against the document |doc|.
//
// The only statement that is understood is a call of |document.write|
// whose arguments are string literals in single or double quotes
// (without escape sequences), followed by a semicolon.  White space is
// allowed after every argument.  Statements are executed one after
// another by calling |doc.write|; at the first piece of text that is not
// such a statement a script parse error is logged and the rest is
// skipped.
function parseAndRunScript (doc, s) {
  var statement = /^\s*document\.write\s*\(((?:'[^']*'|"[^"]*")\s*(?:,\s*(?:'[^']*'|"[^"]*")\s*)*)\)\s*;\s*/;

  while (true) {
    var matched = false;
    s = s.replace (statement, function (all, argumentList) {
      matched = true;
      var literals = argumentList.match (/'[^']*'|"[^"]*"/g) || [];
      var args = [];
      for (var k = 0; k < literals.length; k++) {
        // Drop the surrounding quotes.
        args.push (literals[k].substring (1, literals[k].length - 1));
      }
      doc.write.apply (doc, args);
      return '';
    });
    if (s == '') break;
    if (!matched) {
      log ('Script parse error: "' + s + '"');
      break;
    }
  }
} // parseAndRunScript
523
// A minimal text node.
//   data - The character data of the node; kept in |data|.
function JSText (data) {
  this.data = data;
} // JSText
527
// Appends the text |s| to this node: the text is added to the last child
// when that child is a text node, and becomes a new text node otherwise.
JSDocument.prototype.manakaiAppendText =
JSElement.prototype.manakaiAppendText =
function (s) {
  var children = this.childNodes;
  var last = children.length > 0 ? children[children.length - 1] : null;
  if (last instanceof JSText) {
    last.data += s;
  } else {
    children.push (new JSText (s));
  }
}; // manakaiAppendText
538
// Implements |document.open ([type [, replace]])|.
//
// The call is ignored while the document is being parsed by a parser
// that was not created by a script and that has a defined insertion
// point.  Otherwise the document is emptied and bound to a new,
// script-created parser with an empty input stream whose insertion
// point is at the end of that stream.
//
// Returns the document itself.
JSDocument.prototype.open = function () {
  // Two or fewer arguments

  // Step 1.
  // The type argument defaults to text/html.  Nothing else is supported,
  // so the argument is not looked at.

  // Step 2.
  var replace = arguments[1] == 'replace';

  // Step 3.
  // The insertion point belongs to the parser, not to its input stream.
  // A NaN value counts as undefined, as it does in |write|.
  var current = this._parser;
  if (current &&
      !current.scriptCreated &&
      current.insertionPoint != null &&
      !isNaN (current.insertionPoint)) {
    log ('document.open () in parsing mode is ignored');
    return this;
  }

  // Step 4.
  log ('onbeforeunload event fired');
  log ('onunload event fired');

  // Step 5.
  if (this._parser) {
    // Discard the parser.
  }

  // Step 6.
  log ('document cleared by document.open ()');
  this.childNodes = [];

  // Step 7.
  this._parser = new Parser (new InputStream (''), this);
  this._parser.scriptCreated = true;

  // Step 8.
  this.manakaiIsHTML = true;

  // Step 9.
  // If not text/html, ...

  // Step 10.
  if (!replace) {
    // History
  }

  // Step 11.
  this._parser.setInsertionPoint (this._parser.input.s.length);

  // Step 12.
  return this;
}; // document.open
590
// Implements |document.write (string...)|.
//
// The arguments are concatenated and inserted into the parser's input
// stream just before the insertion point, which is then moved behind the
// inserted text.  Unless a script is waiting for the parser to resume,
// the parser is run reentrantly to process the inserted text before the
// method returns.  If there is no insertion point, the document is
// opened first.
JSDocument.prototype.write = function () {
  logIndentLevel++;

  var p = this._parser;

  // 1. If the insertion point is undefined, the open() method must be ...
  if (p.insertionPoint == null || isNaN (p.insertionPoint)) {
    this.open ();
    p = this._parser;
  }

  // 2. ... inserted into the input stream just before the insertion point.
  var s = Array.prototype.join.call (arguments, '');
  log ('document.write: insert "' + s + '"' +
       ' before "' +
       p.input.s.substring (p.insertionPoint, p.insertionPoint + 10) + '"');
  p.input.s = p.input.s.substring (0, p.insertionPoint) + s
      + p.input.s.substring (p.insertionPoint, p.input.s.length);
  p.insertionPoint += s.length;

  // 3. If there is a script that will execute as soon as the parser resumes
  if (p.scriptExecutedWhenParserResumes) {
    log ('document.write: processed later (there is an unprocessed <script src>)');
    logIndentLevel--;
    return;
  }

  // 4. Process the characters that were inserted, ...
  var originalReentrant = p.reentrant;
  p.reentrant = true;
  p.parse ();
  p.reentrant = originalReentrant;
  // TODO: "Abort the processing of any nested invocations of the tokeniser,
  // yielding control back to the caller." (<script> parsing).  Do we need
  // to do something here?

  // 5. Return
  log ('document.write: return');

  logIndentLevel--;
  return;
}; // document.write
633
// The |text| attribute of an element: the data of all of its text node
// children, concatenated in order.  Other children are skipped.
JSElement.prototype.__defineGetter__ ('text', function () {
  var children = this.childNodes;
  var text = '';
  for (var k = 0; k < children.length; k++) {
    var child = children[k];
    if (!(child instanceof JSText)) {
      continue;
    }
    text += child.data;
  }
  return text;
});
643
// Serialises the children of the node |n|, recursively, one item per
// line.  Every line starts with "| " followed by |indent|.  An element
// contributes its name, then its async, defer and src properties (when
// set), then its own children with a longer indent.  A text node
// contributes its data in double quotes.  Anything else is converted to
// a string.
function dumpTree (n, indent) {
  var pieces = [];
  var prefix = '| ' + indent;
  var children = n.childNodes;

  for (var k = 0; k < children.length; k++) {
    var child = children[k];
    if (child instanceof JSElement) {
      pieces.push (prefix + child.localName + '\n');
      if (child.async) pieces.push (prefix + ' async=""\n');
      if (child.defer) pieces.push (prefix + ' defer=""\n');
      if (child.src != null) {
        pieces.push (prefix + ' src="' + child.src + '"\n');
      }
      pieces.push (dumpTree (child, indent + ' '));
    } else if (child instanceof JSText) {
      pieces.push (prefix + '"' + child.data + '"\n');
    } else {
      pieces.push (prefix + child + '\n');
    }
  }

  return pieces.join ('');
} // dumpTree
664 </script>
665 </head>
<body onload="
  // Remember the source editor, which the updater reads from.
  document.sourceElement = document.getElementsByTagName ('textarea')[0];

  // Fill the editor from the |s| parameter of the query, if there is one.
  // Parameters are separated by semicolons; names and values are
  // percent-decoded.
  var query = location.search;
  if (query != null) {
    var params = query.substring (1).split (/;/);
    for (var k = 0; k < params.length; k++) {
      var pair = params[k].split (/=/, 2);
      var key = decodeURIComponent (pair[0]);
      var value = decodeURIComponent (pair[1] || '');
      if (key == 's') {
        document.sourceElement.value = value;
      }
    }
  }

  // Remember the log area, then parse the initial text.
  document.logElement = document.getElementsByTagName ('output')[0];
  update ();
">
685 <h1>Live Scripting <abbr title="Hypertext Markup Language">HTML</abbr>
686 Parser</h1>
687
688 <h2>Markup to test
689 (<a href=data:, id=permalink rel=bookmark>permalink</a>,
690 <a href="http://software.hixie.ch/utilities/js/live-dom-viewer/"
691 id=ldvlink>Live <abbr title="Document Object Model">DOM</abbr>
692 Viewer</a>)</h2>
693 <p>
694 <textarea onkeydown=" update () " onchange=" update () " oninput=" update () ">&lt;html>
695 &lt;head>&lt;/head>&lt;body>
696 &lt;p>
697 &lt;script>
698 document.write ('aaaaaaa&lt;/p>&lt;script>document.write("cccccc");&lt;/', 'script>bbbbbb');
699 &lt;/script>
700 &lt;p>
701 </textarea>
702
703 <h2>Log</h2>
704 <p><output></output>
705
706 <h2>Note</h2>
707
708 <p>This is a <em>simplified</em> implementation of
709 <a href="http://www.whatwg.org/specs/web-apps/current-work/#parsing">HTML5
710 Parsing Algorithm</a>. It only implements the script-related part of the
711 algorithm. In particular, this parser:
712 <ul>
713 <li>Does not support <code>DOCTYPE</code> and comment tokens.
714 <li>Does not support entities except for <code>&amp;quot;</code>,
715 <code>&amp;apos;</code>, and <code>&amp;amp;</code> in <code>script</code>
716 <code>src</code> attribute value.
717 <li>Does not support omissions of start or end tags, the <abbr>AAA</abbr>
718 algorithm, and so on.
719 <li>Does not raise parse errors for invalid attribute specifications in start
720 or end tags.
721 <li>Does not support CDATA/PCDATA elements other than <code>script</code>.
722 <li>Does not support <code>&lt;!--</code>..<code>--></code> parsing rule
723 in <code>script</code> element.
724 <li>Does not support foreign (SVG or MathML) elements.
725 <li>Only supports <code>script</code> <code>type</code>
726 <code>text/javascript</code>. <code>type</code> and <code>language</code>
727 attributes are ignored.
728 <li>Only supports <code>document.write</code>.
729 The script code must match the regular expression
730 <code>^\s*(?:document\.write\s*\(<var>v</var>\s*(?:,\s*<var>v</var>\s*)*\)\s*;\s*)*$</code>
731 where <var>v</var> is <code>"[^"]*"|'[^']*'</code>.
732 <li>Only supports <code>javascript:</code>
733 <abbr title="Uniform Resource Identifiers">URI</abbr> scheme in the
734 <code>src</code> attribute of the <code>script</code> element. In addition,
735 the <abbr title="Uniform Resource Identifiers">URI</abbr> must conform to
736 the regular expression <code>^javascript:\s*(?:"[^"]*"|'[^']*')\s*$</code>.
737 </ul>
738
739 <p>For some reason, this parser does not work in browsers that do
740 not support JavaScript 1.5.
741
742 </body>
743 </html>

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24