/[suikacvs]/markup/html/scripting-parser/parser.html
Suika

Diff of /markup/html/scripting-parser/parser.html

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8 by wakaba, Sun Apr 27 08:56:34 2008 UTC revision 1.14 by wakaba, Tue Apr 29 02:50:00 2008 UTC
# Line 2  Line 2 
2  <html lang=en>  <html lang=en>
3  <head>  <head>
4  <title>Live Scripting HTML Parser</title>  <title>Live Scripting HTML Parser</title>
5    <link rel=author href="http://suika.fam.cx/~wakaba/who?">
6    <link rel=license href="http://suika.fam.cx/c/gnu/gpl"
7        title="GNU GPL2 or later">
8  <style>  <style>
9    h1, h2 {    h1 {
10        margin: 0;
11        font-size: 150%;
12      }
13      h2 {
14      margin: 0;      margin: 0;
15      font-size: 100%;      font-size: 100%;
16    }    }
17    p, pre {    p {
18      margin: 0;      margin: 0 1em;
19    }    }
20    textarea {    textarea {
21      width: 100%;      width: 100%;
# Line 47  Line 54 
54        var p = new Parser (new InputStream (v));        var p = new Parser (new InputStream (v));
55        var doc = p.doc;        var doc = p.doc;
56        p.parse ();        p.parse ();
57          
58        log (dumpTree (doc, ''));        log (dumpTree (doc, ''));
59          
60          if (p.hasAsyncScript) {
61            log ('Some script codes are executed asynchronously; it means that the document might be rendered in different ways depending on the network condition and other factors');
62          }
63      }      }
64    } // update2    } // update2
65    
# Line 73  Line 85 
85      this.openElements = [doc];      this.openElements = [doc];
86      this.input = i;      this.input = i;
87      this.scriptsExecutedAfterParsing = [];      this.scriptsExecutedAfterParsing = [];
88        this.scriptsExecutedSoon = [];
89        this.scriptsExecutedAsynchronously = [];
90    } // Parser    } // Parser
91    
92    Parser.prototype.getNextToken = function () {    Parser.prototype.getNextToken = function () {
93      var p = this;      var p = this;
94      var i = this.input;      var i = this.input;
95      if (this.parseMode == 'script') {      if (this.parseMode == 'cdata') {
96          var tagName = this.endTagName;
97        var token;        var token;
98        if (p.insertionPoint <= 0) {        if (p.insertionPoint <= 0) {
99          return {type: 'abort'};          return {type: 'abort'};
# Line 96  Line 111 
111          return '';          return '';
112        });        });
113        if (token) return token;        if (token) return token;
114        i.s = i.s.replace (/^<\/[Ss][Cc][Rr][Ii][Pp][Tt]>/, function (s) {        var pattern = new RegExp ('^</' + tagName + '>', 'i');
115          i.s = i.s.replace (pattern, function (s) {
116          if (p.insertionPoint < s.length) {          if (p.insertionPoint < s.length) {
117            token = {type: 'abort'};            token = {type: 'abort'};
118            return s;            return s;
119          }          }
120          token = {type: 'end-tag', value: 'script'};          token = {type: 'end-tag', value: tagName};
121          p.insertionPoint -= s.length;          p.insertionPoint -= s.length;
122          return '';          return '';
123        });        });
124        if (token) return token;        if (token) return token;
125        var m;        var m;
126        if ((p.insertionPoint < '</script'.length) &&        if ((p.insertionPoint < ('</' + tagName).length) &&
127            (m = i.s.match (/^<\/([SCRIPTscript]+)/))) {            (m = i.s.match (/^<\/([A-Za-z]+)/))) {
128          var v = m[1].substring (0, p.insertionPoint).toLowerCase ();          var v = m[1].substring (0, p.insertionPoint).toLowerCase ();
129          if (v == 'script'.substring (0, p.insertionPoint - '</'.length)) {          if (v == tagName.substring (0, p.insertionPoint - '</'.length)) {
130            return {type: 'abort'};            return {type: 'abort'};
131          }          }
132        }        }
# Line 150  Line 166 
166          tagName = v.toLowerCase ();          tagName = v.toLowerCase ();
167          return '';          return '';
168        });        });
169        e = e.replace (/^\s*([^\s=]+)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^"']+)))?/,        while (true) {
170        function (x, attrName, attrValue1, attrValue2, attrValue3) {          var m = false;
171          v = attrValue1 || attrValue2 || attrValue3;          e = e.replace (/^\s*([^\s=]+)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^"'\s]*)))?/,
172          v = v.replace (/&quot;/g, '"').replace (/&apos;/g, "'")          function (x, attrName, attrValue1, attrValue2, attrValue3) {
173              .replace (/&amp;/g, '&');            v = attrValue1 || attrValue2 || attrValue3;
174          attrs[attrName.toLowerCase ()] = v;            v = v.replace (/&quot;/g, '"').replace (/&apos;/g, "'")
175          return '';                .replace (/&amp;/g, '&');
176        });            attrs[attrName.toLowerCase ()] = v;
177              m = true;
178              return '';
179            });
180            if (!m) break;
181          }
182        if (e.length) {        if (e.length) {
183          log ('Broken start tag: "' + e + '"');          log ('Broken start tag: "' + e + '"');
184        }        }
# Line 210  Line 231 
231            el.manakaiParserInserted = true;            el.manakaiParserInserted = true;
232    
233            // 3. Switch the tokeniser's content model flag to the CDATA state.            // 3. Switch the tokeniser's content model flag to the CDATA state.
234            this.parseMode = 'script';            this.parseMode = 'cdata';
235              this.endTagName = 'script';
236    
237            // 4.1. Collect all the character tokens.            // 4.1. Collect all the character tokens.
238            while (true) {            while (true) {
# Line 224  Line 246 
246              // 4.2. Until it returns a token that is not a character token, or              // 4.2. Until it returns a token that is not a character token, or
247              // until it stops tokenising.              // until it stops tokenising.
248              } else if (token.type == 'eof' ||              } else if (token.type == 'eof' ||
249                         (token.type == 'end-tag' && token.value == 'script') ||                         token.type == 'end-tag' ||
250                         token.type == 'abort') {                         token.type == 'abort') {
251                // 6. Switched back to the PCDATA state.                // 6. Switched back to the PCDATA state.
252                this.parseMode = 'pcdata';                this.parseMode = 'pcdata';
253    
254                // 7.1. If the next token is not an end tag token with ...                // 7.1. If the next token is not an end tag token with ...
255                if (token.type != 'end-tag') {                if (!(token.type == 'end-tag' && token.value == 'script')) {
256                  // 7.2. This is a parse error.                  // 7.2. This is a parse error.
257                  log ('Parse error: no </' + 'script>');                  log ('Parse error: no </' + 'script>');
258    
# Line 294  Line 316 
316                //                //
317              }              }
318            }            }
319            } else if (token.value == 'style' ||
320                       token.value == 'noscript' ||
321                       token.value == 'xmp') {
322              // 1. Create an element for the token in the HTML namespace.
323              var el = new JSElement (this.doc, token.value);
324    
325              // 2. Append the new element to the current node.
326              this.openElements[this.openElements.length - 1].appendChild (el);
327    
328              // 3. Switch the tokeniser's content model flag to the CDATA state.
329              this.parseMode = 'cdata';
330              this.endTagName = token.value;
331    
332              // 4.1. Collect all the character tokens.
333              while (true) {
334                var token = this.getNextToken ();
335                log ('token: ' + token.type + ' "' + token.value + '"');
336    
337                if (token.type == 'char') {
338                  // 5. Append a single Text node to the script element node.
339                  el.manakaiAppendText (token.value);
340    
341                // 4.2. Until it returns a token that is not a character token, or
342                // until it stops tokenising.
343                } else if (token.type == 'eof' ||
344                           token.type == 'end-tag' ||
345                           token.type == 'abort') {
346                  // 6. Switched back to the PCDATA state.
347                  this.parseMode = 'pcdata';
348    
349                  // 7.1. If the next token is not an end tag token with ...
350                  if (!(token.type == 'end-tag' &&
351                        token.value == this.endTagName)) {
352                    // 7.2. This is a parse error.
353                    log ('Parse error: no </' + this.endTagName + '>');
354    
355                    // 7.3. Mark the script element as "already executed".
356                    el.manakaiAlreadyExecuted = true;
357                  } else {
358                    // 7.4. Ignore it.
359                    //
360                  }
361                  break;
362                }
363              }
364          } else {          } else {
365            var el = new JSElement (this.doc, token.value);            var el = new JSElement (this.doc, token.value);
366            this.openElements[this.openElements.length - 1].appendChild (el);            this.openElements[this.openElements.length - 1].appendChild (el);
# Line 324  Line 391 
391    
392      // "When a script completes loading" rules start applying.      // "When a script completes loading" rules start applying.
393    
394      // TODO: Handles "list of scripts that will execute as soon as possible"      while (this.scriptsExecutedSoon.length > 0 ||
395      // and "list of scripts that will execute asynchronously"             this.scriptsExecutedAsynchronously.length > 0) {
396          // Handle "list of scripts that will execute as soon as possible".
397          while (this.scriptsExecutedSoon.length > 0) {
398            var e = this.scriptsExecutedSoon.shift ();
399      
400            // If it has completed loading
401            log ('Execute an external script not inserted by parser...');
402            executeScript (this.doc, e);
403    
404            // NOTE: It MAY be executed before the end of the parsing, according
405            // to the spec.
406            this.hasAsyncScript = true;
407          }
408    
409          // Handle "list of scripts that will execute asynchronously".
410          while (this.scriptsExecutedAsynchronously.length > 0) {
411            var e = this.scriptsExecutedAsynchronously.shift ();
412    
413            // Step 1.
414            // We assume that all scripts have been loaded at this time.
415      
416            // Step 2.
417            log ('Execute an asynchronous script...');
418            executeScript (this.doc, e);
419    
420            // Step 3.
421            //
422    
423            // Step 4.
424            //
425    
426            this.hasAsyncScript = true;
427          }
428        }
429    
430      // Handle "list of scripts that will execute when the document has finished      // Handle "list of scripts that will execute when the document has finished
431      // parsing".      // parsing".
# Line 345  Line 445 
445    
446      log ('DOMContentLoaded event fired');      log ('DOMContentLoaded event fired');
447    
448      // "delays tha load event" things has completed:      // "delays the load event" things has completed:
449      // readyState = 'complete'      // readyState = 'complete'
450      log ('load event fired');      log ('load event fired');
451    
# Line 420  Line 520 
520          p.scriptsExecutedAfterParsing.push (e);          p.scriptsExecutedAfterParsing.push (e);
521          log ('Running a script: aborted (defer)');          log ('Running a script: aborted (defer)');
522        } else if (e.async && e.src != null) {        } else if (e.async && e.src != null) {
523          // TODO          p.scriptsExecutedAsynchronously.push (e);
524        } else if (e.async && e.src == null          log ('Running a script: aborted (async src)');
525                   /* && list of scripts that will execute asynchronously is not empty */) {        } else if (e.async && e.src == null &&
526          // TODO                   p.scriptsExecutedAsynchronously.length > 0) {
527            p.scriptsExecutedAsynchronously.push (e);
528            log ('Running a script: aborted (async)');
529            // ISSUE: What is the difference with the case above?
530        } else if (e.src != null && e.manakaiParserInserted) {        } else if (e.src != null && e.manakaiParserInserted) {
531          if (p.scriptExecutedWhenParserResumes) {          if (p.scriptExecutedWhenParserResumes) {
532            log ('Error: There is a script that will execute as soon as the parser resumes.');            log ('Error: There is a script that will execute as soon as the parser resumes.');
533          }          }
534          p.scriptExecutedWhenParserResumes = e;          p.scriptExecutedWhenParserResumes = e;
535          log ('Running a script: aborted (src)');          log ('Running a script: aborted (src parser-inserted)');
536        } else if (e.src != null) {        } else if (e.src != null) {
537          // TODO          p.scriptsExecutedSoon.push (e);
538            log ('Running a script: aborted (src)');
539        } else {        } else {
540          executeScript (doc, e); // even if other scripts are already executing.          executeScript (doc, e); // even if other scripts are already executing.
541        }        }
# Line 479  Line 583 
583        var m;        var m;
584        if (m = uri.match (/^javascript:\s*(?:'([^']*)'|"([^"]+)")\s*$/i)) {        if (m = uri.match (/^javascript:\s*(?:'([^']*)'|"([^"]+)")\s*$/i)) {
585          if (m[1]) {          if (m[1]) {
586            return m[1];            return unescapeJSLiteral (m[1]);
587          } else if (m[2]) {          } else if (m[2]) {
588            return m[2];            return unescapeJSLiteral (m[2]);
589          } else {          } else {
590            return null;            return null;
591          }          }
# Line 502  Line 606 
606          matched = true;          matched = true;
607          var args = [];          var args = [];
608          t.replace (/('[^']*'|"[^"]*")/g, function (s, v) {          t.replace (/('[^']*'|"[^"]*")/g, function (s, v) {
609            args.push (v.substring (1, v.length - 1));            args.push (unescapeJSLiteral (v.substring (1, v.length - 1)));
610            return '';            return '';
611          });          });
612          doc.write.apply (doc, args);          doc.write.apply (doc, args);
613          return '';          return '';
614        });        });
615          s = s.replace (/^\s*var\s+s\s*=\s*document\.createElement\s*\(\s*['"]script['"]\s*\)\s*;\s*s\.src\s*=\s*(?:'(javascript:[^']*)'|"(javascript:[^"]*)")\s*;\s*document\.documentElement\.appendChild\s*\(\s*s\s*\)\s*;\s*/,
616          function (s, t, u) {
617            matched = true;
618            var args = [unescapeJSLiteral (t ? t : u)];
619            doc._insertExternalScript.apply (doc, args);
620            return '';
621          });
622        if (s == '') break;        if (s == '') break;
623        if (!matched) {        if (!matched) {
624          log ('Script parse error: "' + s + '"');          log ('Script parse error: "' + s + '"');
# Line 516  Line 627 
627      }      }
628    } // parseAndRunScript    } // parseAndRunScript
629    
630      function unescapeJSLiteral (s) {
           // Expands the |\uHHHH| escapes (backslash, "u", exactly four
           // hexadecimal digits) found in |s| and returns the resulting string.
           // The visible callers pass the content of a quoted string literal
           // with the surrounding quotes already removed.  Any other backslash
           // sequence is left exactly as written.
631        return s.replace (/\\u([0-9A-Fa-f]{4})/g, function (t, v) {
             // |v| holds the four hexadecimal digits; the '0x' prefix makes
             // |parseInt| read them as a base 16 number.
632          return String.fromCharCode (parseInt ('0x' + v));
633        });
634      } // unescapeJSLiteral
635    
636    function JSText (data) {    function JSText (data) {
637      this.data = data;      this.data = data;
638    } // JSText    } // JSText
# Line 626  Line 743 
743      return;      return;
744    }; // document.write    }; // document.write
745    
746      JSDocument.prototype._insertExternalScript = function (uri) {
           // Creates a |script| element for this document, sets its |src| to
           // |uri|, and appends it to the document element.  |parseAndRunScript|
           // invokes this for the |document.createElement ("script")| ...
           // |appendChild (s)| statement form.
747        var s = new JSElement (this, 'script');
748        s.src = uri;
           // NOTE(review): the |documentElement| getter returns |null| when the
           // document has no element child; that case is not guarded here.
749        this.documentElement.appendChild (s);
750      }; // _insertExternalScript
751    
752      JSDocument.prototype.__defineGetter__ ('documentElement', function () {
           // Getter for |documentElement|: scans |childNodes| in order and
           // returns the first child that is a |JSElement|.
753        var cn = this.childNodes;
754        for (var i = 0; i < cn.length; i++) {
755          if (cn[i] instanceof JSElement) {
756            return cn[i]
757          }
758        }
           // No element child was found.
759        return null;
760      });
761    
762    JSElement.prototype.__defineGetter__ ('text', function () {    JSElement.prototype.__defineGetter__ ('text', function () {
763      var r = '';      var r = '';
764      for (var i = 0; i < this.childNodes.length; i++) {      for (var i = 0; i < this.childNodes.length; i++) {
# Line 644  Line 777 
777          r += '| ' + indent + node.localName + '\n';          r += '| ' + indent + node.localName + '\n';
778          if (node.async) r += '| ' + indent + '  async=""\n';          if (node.async) r += '| ' + indent + '  async=""\n';
779          if (node.defer) r += '| ' + indent + '  defer=""\n';          if (node.defer) r += '| ' + indent + '  defer=""\n';
780          if (node.src) r += '| ' + indent + '  src="' + node.src + '"\n';          if (node.src != null) {
781              r += '| ' + indent + '  src="' + node.src + '"\n';
782            }
783          r += dumpTree (node, indent + '  ');          r += dumpTree (node, indent + '  ');
784        } else if (node instanceof JSText) {        } else if (node instanceof JSText) {
785          r += '| ' + indent + '"' + node.data + '"\n';          r += '| ' + indent + '"' + node.data + '"\n';
# Line 693  document.write ('aaaaaaa&lt;/p>&lt;scrip Line 828  document.write ('aaaaaaa&lt;/p>&lt;scrip
828  &lt;p>  &lt;p>
829  </textarea>  </textarea>
830    
831  <h2>Log</h2>  <h2 id=log>Log</h2>
832  <p><output></output>  <p><output></output>
833    
834  <h2>Note</h2>  <h2 id=notes>Notes</h2>
835    
836  <p>This is a <em>simplified</em> implementation of  <p>This is a <em>simplified</em> implementation of
837  <a href="http://www.whatwg.org/specs/web-apps/current-work/#parsing">HTML5  <a href="http://www.whatwg.org/specs/web-apps/current-work/#parsing">HTML5
# Line 711  algorithm.  Especially, this parser: Line 846  algorithm.  Especially, this parser:
846  algorithm, and so on.  algorithm, and so on.
847  <li>Does not raise parse errors for invalid attribute specifications in start  <li>Does not raise parse errors for invalid attribute specifications in start
848  or end tags.  or end tags.
849  <li>Does not support CDATA/PCDATA element other than <code>script</code>.  <li>Does not support RCDATA elements (<code>title</code> and
850    <code>textarea</code>).
851    <li>Does not strip the first newline in <code>pre</code> elements.
852  <li>Does not support <code>&lt;!--</code>..<code>--></code> parsing rule  <li>Does not support <code>&lt;!--</code>..<code>--></code> parsing rule
853  in <code>script</code> element.  in <code>script</code> element.
854  <li>Does not support foreign (SVG or MathML) elements.  <li>Does not support foreign (SVG or MathML) elements.
855  <li>Only supports <code>script</code> <code>type</code>  <li>Only supports <code>script</code> <code>type</code>
856  <code>text/javascript</code>.  <code>type</code> and <code>language</code>  <code>text/javascript</code>.  <code>type</code> and <code>language</code>
857  attributes are ignored.  attributes are ignored.
858  <li>Only supports <code>document.write</code>.  <li>Only supports a limited set of statements.  A script must consist of zero or more
859  The script code must be match to the regular expression  statements similar to the following ones, possibly
860  <code>^\s*(?:document\.write\s*\(<var>v</var>\s*(?:,\s*<var>v</var>\s*)*\)\s*;\s*)*$</code>  preceded, followed, or separated by white space characters:
861  where <var>v</var> is <code>"[^"]*"|'[^']*'</code>.    <ul>
862      <li><code>document.write ("<var>string</var>", ["<var>string</var>", ...]);</code>.
863      <li><code>var s = document.createElement ("script");
864                s.src = "<var>string</var>";
865                document.documentElement.appendChild (s);</code>
866      </ul>
867    Note that strings may be delimited by <code>'</code>s instead of
868    <code>"</code>s.
869  <li>Only supports <code>javascript:</code>  <li>Only supports <code>javascript:</code>
870  <abbr title="Uniform Resource Identifiers">URI</abbr> scheme in the  <abbr title="Uniform Resource Identifiers">URI</abbr> scheme in the
871  <code>src</code> attribute of the <code>script</code> element.  In addition,  <code>src</code> attribute of the <code>script</code> element.  In addition,
872  the <abbr title="Uniform Resource Identifiers">URI</abbr> must conform to  the <abbr title="Uniform Resource Identifiers">URI</abbr> must conform to
873  the regular expression <code>^javascript:\s*(?:"[^"]*"|'[^']*')\s*$</code>.  the regular expression <code>^javascript:\s*(?:"[^"]*"|'[^']*')\s*$</code>.
874    <li>Only supports <code>\u<var>HHHH</var></code> escapes in JavaScript
875    string literals.
876    <li>Does not handle the <i>stop parsing</i> phase correctly if the document is
877    replaced by a <code>document.open ()</code> call.  In other words, delayed
878    (deferred or asynchronous) script executions and event firings might be
879    treated in a wrong way if a <code>document.open ()</code> invocation
880    is implicitly done by <code>document.write ()</code> in a delayed script.
881  </ul>  </ul>
882    
883  <p>For some reason, this parser does not work in browsers that do  <p>For some reason, this parser does not work in browsers that do
884  not support JavaScript 1.5.  not support JavaScript 1.5.
885    
886  <!-- TODO: multiple attributes are not supported yet -->  <!-- TODO: |src| attribute value should refer to the value at the time
887    when it is inserted into the document, not the value when the script is
888    executed.  Currently it does not matter, since we don't allow dynamic
889    modification to the |src| content/DOM attribute value yet. -->
890    
891  </body>  </body>
 </html>  
892    </html>
893    <!-- $Date$ -->
894    <!--
895    
896    Copyright 2008 Wakaba <w@suika.fam.cx>
897    
898    This program is free software; you can redistribute it and/or
899    modify it under the terms of the GNU General Public License
900    as published by the Free Software Foundation; either version 2
901    of the License, or (at your option) any later version.
902    
903    This program is distributed in the hope that it will be useful,
904    but WITHOUT ANY WARRANTY; without even the implied warranty of
905    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
906    GNU General Public License for more details.
907    
908    You should have received a copy of the GNU General Public License
909    along with this program; if not, write to the Free Software
910    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
911    
912    -->

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.14

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24