/[suikacvs]/markup/html/scripting-parser/parser.html
Suika

Diff of /markup/html/scripting-parser/parser.html

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4 by wakaba, Sun Apr 20 12:19:13 2008 UTC revision 1.9 by wakaba, Sun Apr 27 09:16:11 2008 UTC
# Line 1  Line 1 
1  <!DOCTYPE HTML>  <!DOCTYPE HTML>
2  <html lang=en>  <html lang=en>
3  <head>  <head>
4  <title>Demo of HTML5 Parsing Algorithm with Scripting Enabled</title>  <title>Live Scripting HTML Parser</title>
5  <style>  <style>
6      h1, h2 {
7        margin: 0;
8        font-size: 100%;
9      }
10      p, pre {
11        margin: 0;
12      }
13    textarea {    textarea {
14       display: block;      width: 100%;
15       width: 80%;      -width: 99%;
16       margin-left: auto;      height: 10em;
      margin-right: auto;  
      min-height: 20em;  
17    }    }
18    output {    output {
19      display: block;      display: block;
# Line 18  Line 23 
23    }    }
24  </style>  </style>
25  <script>  <script>
26      var delayedUpdater = 0;
27    
28    function update () {    function update () {
29      document.logElement.textContent = '';      if (delayedUpdater) {
30      var p = new Parser (new InputStream (document.sourceElement.value));        clearTimeout (delayedUpdater);
31      var doc = p.doc;        delayedUpdater = 0;
32      p.parse ();      }
33      log (dumpTree (doc, ''));      delayedUpdater = setTimeout (update2, 100);
34    } // update    } // update
35    
36      function update2 () {
37        var v = document.sourceElement.value;
38        if (v != document.previousSourceText) {
39          document.previousSourceText = v;
40          document.links['permalink'].href
41              = location.pathname + '?s=' + encodeURIComponent (v);
42          document.links['ldvlink'].href
43              = 'http://software.hixie.ch/utilities/js/live-dom-viewer/?'
44              + encodeURIComponent (v);
45    
46          document.logElement.textContent = '';
47          var p = new Parser (new InputStream (v));
48          var doc = p.doc;
49          p.parse ();
50          log (dumpTree (doc, ''));
51        }
52      } // update2
53    
54      var logIndentLevel = 0;
55    function log (s) {    function log (s) {
56        for (var i = 0; i < logIndentLevel; i++) {
57          s = '  ' + s;
58        }
59      document.logElement.appendChild (document.createTextNode (s + "\n"));      document.logElement.appendChild (document.createTextNode (s + "\n"));
60    } // log    } // log
61    
# Line 42  Line 71 
71      }      }
72      this.doc = doc;      this.doc = doc;
73      this.openElements = [doc];      this.openElements = [doc];
74      this.in = i;      this.input = i;
75      this.scriptsExecutedAfterParsing = [];      this.scriptsExecutedAfterParsing = [];
76    } // Parser    } // Parser
77    
78    Parser.prototype.getNextToken = function () {    Parser.prototype.getNextToken = function () {
79      var p = this;      var p = this;
80      var i = this.in;      var i = this.input;
81      if (this.parseMode == 'script') {      if (this.parseMode == 'script') {
82        var token;        var token;
83        if (p.insertionPoint <= 0) {        if (p.insertionPoint <= 0) {
# Line 77  Line 106 
106          return '';          return '';
107        });        });
108        if (token) return token;        if (token) return token;
109          var m;
110          if ((p.insertionPoint < '</script'.length) &&
111              (m = i.s.match (/^<\/([SCRIPTscript]+)/))) {
112            var v = m[1].substring (0, p.insertionPoint).toLowerCase ();
113            if (v == 'script'.substring (0, p.insertionPoint - '</'.length)) {
114              return {type: 'abort'};
115            }
116          }
117        i.s = i.s.replace (/^</,        i.s = i.s.replace (/^</,
118        function (s) {        function (s) {
119          token = {type: 'char', value: s};          token = {type: 'char', value: s};
# Line 88  Line 125 
125      }      }
126    
127      var token;      var token;
128      i.s = i.s.replace (/^<\/([^>]+)>/, function (s, e) {      i.s = i.s.replace (/^<\/([^>]+)(?:>|$)/, function (s, e) {
129        if (p.insertionPoint < s.length) {        if (p.insertionPoint < s.length ||
130              (p.insertionPoint <= s.length &&
131               s.substring (s.length - 1, 1) != '>')) {
132          token = {type: 'abort'};          token = {type: 'abort'};
133          return s;          return s;
134        }        }
# Line 98  Line 137 
137        return '';        return '';
138      });      });
139      if (token) return token;      if (token) return token;
140      i.s = i.s.replace (/^<([^>]+)>/, function (s, e) {      i.s = i.s.replace (/^<([^>]+)(?:>|$)/, function (s, e) {
141        if (p.insertionPoint < s.length) {        if (p.insertionPoint < s.length ||
142              (p.insertionPoint <= s.length &&
143               s.substring (s.length - 1, 1) != '>')) {
144          token = {type: 'abort'};          token = {type: 'abort'};
145          return s;          return s;
146        }        }
# Line 109  Line 150 
150          tagName = v.toLowerCase ();          tagName = v.toLowerCase ();
151          return '';          return '';
152        });        });
153        e = e.replace (/^\s*(\S+)\s*(?:=\s*"([^"]*)"|'([^']*)'|([^"']+))?/,        while (true) {
154        function (x, attrName, attrValue1, attrValue2, attrValue3) {          var m = false;
155          attrs[attrName] = attrValue1 || attrValue2 || attrValue3;          e = e.replace (/^\s*([^\s=]+)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^"'\s]*)))?/,
156          return '';          function (x, attrName, attrValue1, attrValue2, attrValue3) {
157        });            v = attrValue1 || attrValue2 || attrValue3;
158              v = v.replace (/&quot;/g, '"').replace (/&apos;/g, "'")
159                  .replace (/&amp;/g, '&');
160              attrs[attrName.toLowerCase ()] = v;
161              m = true;
162              return '';
163            });
164            if (!m) break;
165          }
166          if (e.length) {
167            log ('Broken start tag: "' + e + '"');
168          }
169        token = {type: 'start-tag', value: tagName, attrs: attrs};        token = {type: 'start-tag', value: tagName, attrs: attrs};
170        p.insertionPoint -= s.length;        p.insertionPoint -= s.length;
171        return '';        return '';
# Line 144  Line 196 
196    } // getNextToken    } // getNextToken
197    
198    Parser.prototype.parse = function () {    Parser.prototype.parse = function () {
199      log ('start parsing');      logIndentLevel++;
200        log ('parse: start');
201    
202      while (true) {      while (true) {
203        var token = this.getNextToken ();        var token = this.getNextToken ();
# Line 212  Line 265 
265            this.openElements[this.openElements.length - 1].appendChild (el);            this.openElements[this.openElements.length - 1].appendChild (el);
266    
267            // 11. Let the insertion point have the value of the old ...            // 11. Let the insertion point have the value of the old ...
268    
269              oldInsertionPoint += this.insertionPoint;
270            this.setInsertionPoint (oldInsertionPoint);            this.setInsertionPoint (oldInsertionPoint);
271    
272            // 12. If there is a script that will execute as soon as ...            // 12. If there is a script that will execute as soon as ...
273                        while (this.scriptExecutedWhenParserResumes) {
274                // 12.1. If the tree construction stage is being called reentrantly
275                if (this.reentrant) {
276                  log ('parse: abort (reentrance)');
277                  logIndentLevel--;
278                  return;
279    
280                // 12.2. Otherwise
281                } else {
282                  // 1.
283                  var script = this.scriptExecutedWhenParserResumes;
284                  this.scriptExecutedWhenParserResumes = null;
285    
286                  // 2. Pause until the script has completed loading.
287                  //
288    
289                  // 3. Let the insertion point to just before the next input char.
290                  this.setInsertionPoint (0);
291    
292                  // 4. Execute the script.
293                  executeScript (this.doc, script);
294    
295                  // 5. Let the insertion point be undefined again.
296                  this.setInsertionPoint (undefined);
297    
298                  // 6. If there is once again a script that will execute ...
299                  //
300                }
301              }
302          } else {          } else {
303            var el = new JSElement (this.doc, token.value);            var el = new JSElement (this.doc, token.value);
304            this.openElements[this.openElements.length - 1].appendChild (el);            this.openElements[this.openElements.length - 1].appendChild (el);
# Line 236  Line 318 
318          break;          break;
319        } else if (token.type == 'abort') {        } else if (token.type == 'abort') {
320          log ('parse: abort');          log ('parse: abort');
321            logIndentLevel--;
322          return;          return;
323        }        }
324      }      }
# Line 270  Line 353 
353      // "delays tha load event" things has completed:      // "delays tha load event" things has completed:
354      // readyState = 'complete'      // readyState = 'complete'
355      log ('load event fired');      log ('load event fired');
356    
357        logIndentLevel--;
358    } // parse    } // parse
359    
360    Parser.prototype.setInsertionPoint = function (ip) {    Parser.prototype.setInsertionPoint = function (ip) {
361      if (ip == undefined || ip == null || isNaN (ip)) {      if (ip == undefined || ip == null || isNaN (ip)) {
362        log ('insertion point: set to undefined');        log ('insertion point: set to undefined');
363        this.insertionPoint = undefined;        this.insertionPoint = undefined;
364      } else if (ip == this.in.s.length) {      } else if (ip == this.input.s.length) {
365        log ('insertion point: end of file');        log ('insertion point: end of file');
366        this.insertionPoint = ip;        this.insertionPoint = ip;
367      } else {      } else {
368        log ('insertion point: set to ' + ip +        log ('insertion point: set to ' + ip +
369             ' (before "' + this.in.s.substring (0, 10) + '")');             ' (before "' + this.input.s.substring (0, 10) + '")');
370        this.insertionPoint = ip;        this.insertionPoint = ip;
371      }      }
372    }; // setInsertionPoint    }; // setInsertionPoint
# Line 303  Line 388 
388      e.parentNode = this;      e.parentNode = this;
389    
390      if (e.localName == 'script') {      if (e.localName == 'script') {
391          logIndentLevel++;
392        log ('Running a script: start');        log ('Running a script: start');
393    
394        var doc = this.ownerDocument || this;        var doc = this.ownerDocument || this;
# Line 321  Line 407 
407        if (e.manakaiAlreadyExecuted) {        if (e.manakaiAlreadyExecuted) {
408          // 2.5. Abort these steps at this point.          // 2.5. Abort these steps at this point.
409          log ('Running a script: aborted');          log ('Running a script: aborted');
410            logIndentLevel--;
411          return e;          return e;
412        }        }
413    
# Line 343  Line 430 
430                   /* && list of scripts that will execute asynchronously is not empty */) {                   /* && list of scripts that will execute asynchronously is not empty */) {
431          // TODO          // TODO
432        } else if (e.src != null && e.manakaiParserInserted) {        } else if (e.src != null && e.manakaiParserInserted) {
433          // TODO          if (p.scriptExecutedWhenParserResumes) {
434              log ('Error: There is a script that will execute as soon as the parser resumes.');
435            }
436            p.scriptExecutedWhenParserResumes = e;
437            log ('Running a script: aborted (src)');
438        } else if (e.src != null) {        } else if (e.src != null) {
439          // TODO          // TODO
440        } else {        } else {
# Line 351  Line 442 
442        }        }
443    
444        log ('Running a script: end');        log ('Running a script: end');
445          logIndentLevel--;
446      }      }
447    
448      return e;      return e;
# Line 359  Line 451 
451    function executeScript (doc, e) {    function executeScript (doc, e) {
452      log ('executing a script block: start');      log ('executing a script block: start');
453    
454      // If the load resulted in an error, then ... firing an error event ...      var s;
455        if (e.src != null) {
456          s = getExternalScript (e.src);
457    
458          // If the load resulted in an error, then ... firing an error event ...
459          if (s == null) {
460            log ('error event fired at the script element');
461            return;
462          }
463    
464          log ('External script loaded: "' + s + '"');
465        } else {
466          s = e.text;
467        }
468    
469      // If the load was successful      // If the load was successful
470      log ('load event fired at the script element');      log ('load event fired at the script element');
# Line 368  Line 473 
473      // Scripting is enabled, Document.designMode is disabled,      // Scripting is enabled, Document.designMode is disabled,
474      // Document is the active document in its browsing context      // Document is the active document in its browsing context
475    
       var s;  
       if (e.src != null) {  
         // TODO: from external file  
       } else {  
         s = e.text;  
       }  
   
476        parseAndRunScript (doc, s);        parseAndRunScript (doc, s);
477      }      }
478    
479      log ('executing a script block: end');      log ('executing a script block: end');
480    } // executeScript    } // executeScript
481    
482      function getExternalScript (uri) {
483        if (uri.match (/^javascript:/i)) {
484          var m;
485          if (m = uri.match (/^javascript:\s*(?:'([^']*)'|"([^"]+)")\s*$/i)) {
486            if (m[1]) {
487              return m[1];
488            } else if (m[2]) {
489              return m[2];
490            } else {
491              return null;
492            }
493          } else {
494            log ('Complex javascript: URI is not supported: <' + uri + '>');
495            return null;
496          }
497        } else {
498          log ('URI scheme not supported: <' + uri + '>');
499          return null;
500        }
501      } // getExternalScript
502    
503    function parseAndRunScript (doc, s) {    function parseAndRunScript (doc, s) {
504      while (true) {      while (true) {
505        var matched = false;        var matched = false;
# Line 429  Line 548 
548      // Step 3.      // Step 3.
549      if (this._parser &&      if (this._parser &&
550          !this._parser.scriptCreated &&          !this._parser.scriptCreated &&
551          this._parser.in.insertionPoint != undefined) {          this._parser.input.insertionPoint != undefined) {
552        log ('document.open () in parsing mode is ignored');        log ('document.open () in parsing mode is ignored');
553        return this;        return this;
554      }      }
# Line 463  Line 582 
582      }      }
583    
584      // Step 11.      // Step 11.
585      this._parser.setInsertionPoint (this._parser.in.s.length);      this._parser.setInsertionPoint (this._parser.input.s.length);
586    
587      // Step 12.      // Step 12.
588      return this;      return this;
589    }; // document.open    }; // document.open
590    
591    JSDocument.prototype.write = function () {    JSDocument.prototype.write = function () {
592        logIndentLevel++;
593    
594      var p = this._parser;      var p = this._parser;
595    
596      // 1. If the insertion point is undefined, the open() method must be ...      // 1. If the insertion point is undefined, the open() method must be ...
# Line 481  Line 602 
602      // 2. ... inserted into the input stream just before the insertion point.      // 2. ... inserted into the input stream just before the insertion point.
603      var s = Array.join (arguments, '');      var s = Array.join (arguments, '');
604      log ('document.write: insert "' + s + '"' +      log ('document.write: insert "' + s + '"' +
605           ' before "' + p.in.s.substring (p.insertionPoint, p.insertionPoint + 10) + '"');           ' before "' +
606      p.in.s = p.in.s.substring (0, p.insertionPoint) + s           p.input.s.substring (p.insertionPoint, p.insertionPoint + 10) + '"');
607          + p.in.s.substring (p.insertionPoint, p.in.s.length);      p.input.s = p.input.s.substring (0, p.insertionPoint) + s
608            + p.input.s.substring (p.insertionPoint, p.input.s.length);
609      p.insertionPoint += s.length;      p.insertionPoint += s.length;
610    
611      // 3. If there is a script that will execute as soon as the parser resumes      // 3. If there is a script that will execute as soon as the parser resumes
612      // TODO      if (p.scriptExecutedAfterParserResumes) {
613          log ('document.write: processed later (there is an unprocessed <script src>)');
614          logIndentLevel--;
615          return;
616        }
617    
618      // 4. Process the characters that were inserted, ...      // 4. Process the characters that were inserted, ...
619        var originalReentrant = p.reentrant;
620        p.reentrant = true;
621      p.parse ();      p.parse ();
622        p.reentrant = originalReentrant;
623        // TODO: "Abort the processing of any nested invokations of the tokeniser,
624        // yielding control back to the caller." (<script> parsing).  Do we need
625        // to do something here?
626    
627      // 5. Return      // 5. Return
628      log ('document.write: return');      log ('document.write: return');
629    
630        logIndentLevel--;
631      return;      return;
632    }; // document.write    }; // document.write
633    
# Line 515  Line 649 
649          r += '| ' + indent + node.localName + '\n';          r += '| ' + indent + node.localName + '\n';
650          if (node.async) r += '| ' + indent + '  async=""\n';          if (node.async) r += '| ' + indent + '  async=""\n';
651          if (node.defer) r += '| ' + indent + '  defer=""\n';          if (node.defer) r += '| ' + indent + '  defer=""\n';
652          if (node.src) r += '| ' + indent + '  src="' + node.src + '"\n';          if (node.src != null) {
653              r += '| ' + indent + '  src="' + node.src + '"\n';
654            }
655          r += dumpTree (node, indent + '  ');          r += dumpTree (node, indent + '  ');
656        } else if (node instanceof JSText) {        } else if (node instanceof JSText) {
657          r += '| ' + indent + '"' + node.data + '"\n';          r += '| ' + indent + '"' + node.data + '"\n';
# Line 529  Line 665 
665  </head>  </head>
666  <body onload="  <body onload="
667    document.sourceElement = document.getElementsByTagName ('textarea')[0];    document.sourceElement = document.getElementsByTagName ('textarea')[0];
668    
669      var q = location.search;
670      if (q != null) {
671        q = q.substring (1).split (/;/);
672        for (var i = 0; i < q.length; i++) {
673          var v = q[i].split (/=/, 2);
674          v[0] = decodeURIComponent (v[0]);
675          v[1] = decodeURIComponent (v[1] || '');
676          if (v[0] == 's') {
677            document.sourceElement.value = v[1];
678          }
679        }
680      }
681    
682    document.logElement = document.getElementsByTagName ('output')[0];    document.logElement = document.getElementsByTagName ('output')[0];
683    update ();    update ();
684  ">  ">
685    <h1>Live Scripting <abbr title="Hypertext Markup Language">HTML</abbr>
686    Parser</h1>
687    
688  <textarea onchange=" update () ">&lt;html>  <h2>Markup to test
689    (<a href=data:, id=permalink rel=bookmark>permalink</a>,
690    <a href="http://software.hixie.ch/utilities/js/live-dom-viewer/"
691        id=ldvlink>Live <abbr title="Document Object Model">DOM</abbr>
692        Viewer</a>)</h2>
693    <p>
694    <textarea onkeydown=" update () " onchange=" update () " oninput=" update () ">&lt;html>
695  &lt;head>&lt;/head>&lt;body>  &lt;head>&lt;/head>&lt;body>
696  &lt;p>  &lt;p>
697  &lt;script>  &lt;script>
# Line 542  document.write ('aaaaaaa&lt;/p>&lt;scrip Line 700  document.write ('aaaaaaa&lt;/p>&lt;scrip
700  &lt;p>  &lt;p>
701  </textarea>  </textarea>
702    
703  <output></output>  <h2>Log</h2>
704    <p><output></output>
705    
706    <h2>Note</h2>
707    
708    <p>This is a <em>simplified</em> implementation of the
709    <a href="http://www.whatwg.org/specs/web-apps/current-work/#parsing">HTML5
710    Parsing Algorithm</a>.  It only implements the script-related part of the
711    algorithm.  In particular, this parser:
712    <ul>
713    <li>Does not support <code>DOCTYPE</code> and comment tokens.
714    <li>Does not support entities except for <code>&amp;quot;</code>,
715    <code>&amp;apos;</code>, and <code>&amp;amp;</code> in <code>script</code>
716    <code>src</code> attribute value.
717    <li>Does not support omissions of start or end tags, the <abbr>AAA</abbr>
718    algorithm, and so on.
719    <li>Does not raise parse errors for invalid attribute specifications in start
720    or end tags.
721    <li>Does not support CDATA/PCDATA elements other than <code>script</code>.
722    <li>Does not support <code>&lt;!--</code>..<code>--></code> parsing rule
723    in <code>script</code> element.
724    <li>Does not support foreign (SVG or MathML) elements.
725    <li>Only supports <code>script</code> <code>type</code>
726    <code>text/javascript</code>.  <code>type</code> and <code>language</code>
727    attributes are ignored.
728    <li>Only supports <code>document.write</code>.
729    The script code must match the regular expression
730    <code>^\s*(?:document\.write\s*\(<var>v</var>\s*(?:,\s*<var>v</var>\s*)*\)\s*;\s*)*$</code>
731    where <var>v</var> is <code>"[^"]*"|'[^']*'</code>.
732    <li>Only supports <code>javascript:</code>
733    <abbr title="Uniform Resource Identifiers">URI</abbr> scheme in the
734    <code>src</code> attribute of the <code>script</code> element.  In addition,
735    the <abbr title="Uniform Resource Identifiers">URI</abbr> must conform to
736    the regular expression <code>^javascript:\s*(?:"[^"]*"|'[^']*')\s*$</code>.
737    </ul>
738    
739    <p>For some reason, this parser does not work in browsers that do
740    not support JavaScript 1.5.
741    
742  </body>  </body>
743  </html>  </html>

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.9

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24