/[suikacvs]/markup/html/scripting-parser/parser.html
Suika

Diff of /markup/html/scripting-parser/parser.html

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11 by wakaba, Sun Apr 27 10:44:36 2008 UTC revision 1.17 by wakaba, Fri May 16 10:29:25 2008 UTC
# Line 2  Line 2 
2  <html lang=en>  <html lang=en>
3  <head>  <head>
4  <title>Live Scripting HTML Parser</title>  <title>Live Scripting HTML Parser</title>
5    <link rel=author href="http://suika.fam.cx/~wakaba/who?">
6    <link rel=license href="http://suika.fam.cx/c/gnu/gpl"
7        title="GNU GPL2 or later">
8  <style>  <style>
9    h1, h2 {    h1 {
10        margin: 0;
11        font-size: 150%;
12      }
13      h2 {
14      margin: 0;      margin: 0;
15      font-size: 100%;      font-size: 100%;
16    }    }
17    p, pre {    p {
18      margin: 0;      margin: 0 1em;
19    }    }
20    textarea {    textarea {
21      width: 100%;      width: 100%;
# Line 58  Line 65 
65    
66    var logIndentLevel = 0;    var logIndentLevel = 0;
67    function log (s) {    function log (s) {
68        var indent = '';
69      for (var i = 0; i < logIndentLevel; i++) {      for (var i = 0; i < logIndentLevel; i++) {
70        s = '  ' + s;        indent += '  ';
71      }      }
72        s = indent + s.replace (/\n/g, "\n" + indent);
73      document.logElement.appendChild (document.createTextNode (s + "\n"));      document.logElement.appendChild (document.createTextNode (s + "\n"));
74    } // log    } // log
75    
# Line 74  Line 83 
83        doc = new JSDocument (this);        doc = new JSDocument (this);
84        doc.manakaiIsHTML = true;        doc.manakaiIsHTML = true;
85      }      }
86        this.nextToken = [];
87      this.doc = doc;      this.doc = doc;
88      this.openElements = [doc];      this.openElements = [doc];
89      this.input = i;      this.input = i;
90      this.scriptsExecutedAfterParsing = [];      this.scriptsExecutedAfterParsing = [];
91      this.scriptsExecutedSoon = [];      this.scriptsExecutedSoon = [];
92        this.scriptsExecutedAsynchronously = [];
93    } // Parser    } // Parser
94    
95    Parser.prototype.getNextToken = function () {    Parser.prototype.getNextToken = function () {
96        if (this.nextToken.length) {
97          return this.nextToken.shift ();
98        }
99    
100      var p = this;      var p = this;
101      var i = this.input;      var i = this.input;
102      if (this.parseMode == 'script') {      if (this.parseMode == 'cdata') {
103          var tagName = this.endTagName;
104        var token;        var token;
105        if (p.insertionPoint <= 0) {        if (p.insertionPoint <= 0) {
106          return {type: 'abort'};          return {type: 'abort'};
# Line 102  Line 118 
118          return '';          return '';
119        });        });
120        if (token) return token;        if (token) return token;
121        i.s = i.s.replace (/^<\/[Ss][Cc][Rr][Ii][Pp][Tt]>/, function (s) {        var pattern = new RegExp ('^</' + tagName + '>', 'i');
122          i.s = i.s.replace (pattern, function (s) {
123          if (p.insertionPoint < s.length) {          if (p.insertionPoint < s.length) {
124            token = {type: 'abort'};            token = {type: 'abort'};
125            return s;            return s;
126          }          }
127          token = {type: 'end-tag', value: 'script'};          token = {type: 'end-tag', value: tagName};
128          p.insertionPoint -= s.length;          p.insertionPoint -= s.length;
129          return '';          return '';
130        });        });
131        if (token) return token;        if (token) return token;
132        var m;        var m;
133        if ((p.insertionPoint < '</script'.length) &&        if ((p.insertionPoint < ('</' + tagName).length) &&
134            (m = i.s.match (/^<\/([SCRIPTscript]+)/))) {            (m = i.s.match (/^<\/([A-Za-z]+)/))) {
135          var v = m[1].substring (0, p.insertionPoint).toLowerCase ();          var v = m[1].substring (0, p.insertionPoint).toLowerCase ();
136          if (v == 'script'.substring (0, p.insertionPoint - '</'.length)) {          if (v == tagName.substring (0, p.insertionPoint - '</'.length)) {
137            return {type: 'abort'};            return {type: 'abort'};
138          }          }
139        }        }
# Line 134  Line 151 
151      i.s = i.s.replace (/^<\/([^>]+)(?:>|$)/, function (s, e) {      i.s = i.s.replace (/^<\/([^>]+)(?:>|$)/, function (s, e) {
152        if (p.insertionPoint < s.length ||        if (p.insertionPoint < s.length ||
153            (p.insertionPoint <= s.length &&            (p.insertionPoint <= s.length &&
154             s.substring (s.length - 1, 1) != '>')) {             s.substring (s.length - 1, s.length) != '>')) {
155          token = {type: 'abort'};          token = {type: 'abort'};
156          return s;          return s;
157        }        }
# Line 146  Line 163 
163      i.s = i.s.replace (/^<([^>]+)(?:>|$)/, function (s, e) {      i.s = i.s.replace (/^<([^>]+)(?:>|$)/, function (s, e) {
164        if (p.insertionPoint < s.length ||        if (p.insertionPoint < s.length ||
165            (p.insertionPoint <= s.length &&            (p.insertionPoint <= s.length &&
166             s.substring (s.length - 1, 1) != '>')) {             s.substring (s.length - 1, s.length) != '>')) {
167          token = {type: 'abort'};          token = {type: 'abort'};
168          return s;          return s;
169        }        }
# Line 209  Line 226 
226        var token = this.getNextToken ();        var token = this.getNextToken ();
227        log ('token: ' + token.type + ' "' + token.value + '"');        log ('token: ' + token.type + ' "' + token.value + '"');
228    
229          if (this.cdataEndTagRequired) {
230            // Generic CDATA parsing algorithm
231    
232            if (token.type != 'abort') {
233              // 7.
234              if (token.type == 'end-tag' && token.value == this.endTagName) {
235                // 7.1. Ignores it.
236                //
237              } else {
238                // 7.2. Parse error.
239                log ('Parse error: no </' + this.endTagName + '>');
240                this.nextToken.unshift (token);
241              }
242              this.cdataEndTagRequired = false;
243              continue;
244            }
245          }
246    
247        if (token.type == 'start-tag') {        if (token.type == 'start-tag') {
248          if (token.value == 'script') {          if (token.value == 'script') {
249            // 1. Create an element for the token in the HTML namespace.            // 1. Create an element for the token in the HTML namespace.
# Line 221  Line 256 
256            el.manakaiParserInserted = true;            el.manakaiParserInserted = true;
257    
258            // 3. Switch the tokeniser's content model flag to the CDATA state.            // 3. Switch the tokeniser's content model flag to the CDATA state.
259            this.parseMode = 'script';            this.parseMode = 'cdata';
260              this.endTagName = 'script';
261    
262            // 4.1. Collect all the character tokens.            // 4.1. Collect all the character tokens.
263            while (true) {            while (true) {
# Line 235  Line 271 
271              // 4.2. Until it returns a token that is not a character token, or              // 4.2. Until it returns a token that is not a character token, or
272              // until it stops tokenising.              // until it stops tokenising.
273              } else if (token.type == 'eof' ||              } else if (token.type == 'eof' ||
274                         (token.type == 'end-tag' && token.value == 'script') ||                         token.type == 'end-tag' ||
275                         token.type == 'abort') {                         token.type == 'abort') {
276                // 6. Switched back to the PCDATA state.                // 6. Switched back to the PCDATA state.
277                this.parseMode = 'pcdata';                this.parseMode = 'pcdata';
278    
279                // 7.1. If the next token is not an end tag token with ...                // 7.1. If the next token is not an end tag token with ...
280                if (token.type != 'end-tag') {                if (!(token.type == 'end-tag' && token.value == 'script')) {
281                  // 7.2. This is a parse error.                  // 7.2. This is a parse error.
282                  log ('Parse error: no </' + 'script>');                  log ('Parse error: no </' + 'script>');
283                    this.nextToken.unshift (token);
284    
285                  // 7.3. Mark the script element as "already executed".                  // 7.3. Mark the script element as "already executed".
286                  el.manakaiAlreadyExecuted = true;                  el.manakaiAlreadyExecuted = true;
# Line 305  Line 342 
342                //                //
343              }              }
344            }            }
345            } else if (token.value == 'style' ||
346                       token.value == 'noscript' ||
347                       token.value == 'xmp') {
348              // 1. Create an element for the token in the HTML namespace.
349              var el = new JSElement (this.doc, token.value);
350    
351              // 2. Append the new element to the current node.
352              this.openElements[this.openElements.length - 1].appendChild (el);
353    
354              // 3. Switch the tokeniser's content model flag to the CDATA state.
355              this.parseMode = 'cdata';
356              this.endTagName = token.value;
357    
358              // 4.1. Collect all the character tokens.
359              while (true) {
360                var token = this.getNextToken ();
361                log ('token: ' + token.type + ' "' + token.value + '"');
362    
363                if (token.type == 'char') {
364                  // 5. Append a single Text node to the script element node.
365                  el.manakaiAppendText (token.value);
366    
367                // 4.2. Until it returns a token that is not a character token, or
368                // until it stops tokenising.
369                } else if (token.type == 'eof' ||
370                           token.type == 'end-tag' ||
371                           token.type == 'abort') {
372                  // 6. Switched back to the PCDATA state.
373                  this.parseMode = 'pcdata';
374    
375                  if (token.type == 'abort') {
376                    this.cdataEndTagRequired = true;
377                    break;
378                  }
379    
380                  // 7.1. If the next token is not an end tag token with ...
381                  if (!(token.type == 'end-tag' &&
382                        token.value == this.endTagName)) {
383                    // 7.2. This is a parse error.
384                    log ('Parse error: no </' + this.endTagName + '>');
385                    this.nextToken.unshift (token);
386    
387                    // 7.3. Mark the script element as "already executed".
388                    el.manakaiAlreadyExecuted = true;
389                  } else {
390                    // 7.4. Ignore it.
391                    //
392                  }
393                  break;
394                }
395              }
396          } else {          } else {
397            var el = new JSElement (this.doc, token.value);            var el = new JSElement (this.doc, token.value);
398            this.openElements[this.openElements.length - 1].appendChild (el);            this.openElements[this.openElements.length - 1].appendChild (el);
# Line 335  Line 423 
423    
424      // "When a script completes loading" rules start applying.      // "When a script completes loading" rules start applying.
425    
426      // List of scripts that will execute as soon as possible      while (this.scriptsExecutedSoon.length > 0 ||
427      for (var i = 0; i < this.scriptsExecutedSoon.length; i++) {             this.scriptsExecutedAsynchronously.length > 0) {
428        var e = this.scriptsExecutedSoon[i];        // Handle "list of scripts that will execute as soon as possible".
429          while (this.scriptsExecutedSoon.length > 0) {
430            var e = this.scriptsExecutedSoon.shift ();
431      
432            // If it has completed loading
433            log ('Execute an external script not inserted by parser...');
434            executeScript (this.doc, e);
435    
436            // NOTE: It MAY be executed before the end of the parsing, according
437            // to the spec.
438            this.hasAsyncScript = true;
439          }
440    
441          // Handle "list of scripts that will execute asynchronously".
442          while (this.scriptsExecutedAsynchronously.length > 0) {
443            var e = this.scriptsExecutedAsynchronously.shift ();
444    
445            // Step 1.
446            // We assume that all scripts have been loaded at this time.
447      
448            // Step 2.
449            log ('Execute an asynchronous script...');
450            executeScript (this.doc, e);
451    
452            // Step 3.
453            //
454    
455            // Step 4.
456            //
457    
458        // If it has completed loading          this.hasAsyncScript = true;
459        log ('Execute an external script not inserted by parser...');        }
       executeScript (this.doc, e);  
   
       // NOTE: It MAY be executed before the end of the parsing, according  
       // to the spec.  
       this.hasAsyncScript = true;  
460      }      }
461    
     // TODO: Handles  
     // "list of scripts that will execute asynchronously"  
   
462      // Handle "list of scripts that will execute when the document has finished      // Handle "list of scripts that will execute when the document has finished
463      // parsing".      // parsing".
464      var list = this.scriptsExecutedAfterParsing;      var list = this.scriptsExecutedAfterParsing;
# Line 369  Line 477 
477    
478      log ('DOMContentLoaded event fired');      log ('DOMContentLoaded event fired');
479    
480      // "delays tha load event" things has completed:      // "delays the load event" things has completed:
481      // readyState = 'complete'      // readyState = 'complete'
482      log ('load event fired');      log ('load event fired');
483    
# Line 425  Line 533 
533        // 2.4. If the script element has its "already executed" flag set        // 2.4. If the script element has its "already executed" flag set
534        if (e.manakaiAlreadyExecuted) {        if (e.manakaiAlreadyExecuted) {
535          // 2.5. Abort these steps at this point.          // 2.5. Abort these steps at this point.
536          log ('Running a script: aborted');          log ('Running a script: aborted (already executed)');
537          logIndentLevel--;          logIndentLevel--;
538          return e;          return e;
539        }        }
# Line 444  Line 552 
552          p.scriptsExecutedAfterParsing.push (e);          p.scriptsExecutedAfterParsing.push (e);
553          log ('Running a script: aborted (defer)');          log ('Running a script: aborted (defer)');
554        } else if (e.async && e.src != null) {        } else if (e.async && e.src != null) {
555          // TODO          p.scriptsExecutedAsynchronously.push (e);
556        } else if (e.async && e.src == null          log ('Running a script: aborted (async src)');
557                   /* && list of scripts that will execute asynchronously is not empty */) {        } else if (e.async && e.src == null &&
558          // TODO                   p.scriptsExecutedAsynchronously.length > 0) {
559            p.scriptsExecutedAsynchronously.push (e);
560            log ('Running a script: aborted (async)');
561            // ISSUE: What is the difference with the case above?
562        } else if (e.src != null && e.manakaiParserInserted) {        } else if (e.src != null && e.manakaiParserInserted) {
563          if (p.scriptExecutedWhenParserResumes) {          if (p.scriptExecutedWhenParserResumes) {
564            log ('Error: There is a script that will execute as soon as the parser resumes.');            log ('Error: There is a script that will execute as soon as the parser resumes.');
# Line 487  Line 598 
598      }      }
599    
600      // If the load was successful      // If the load was successful
     log ('load event fired at the script element');  
601    
602      if (true) {      if (true) {
603      // Scripting is enabled, Document.designMode is disabled,      // Scripting is enabled, Document.designMode is disabled,
# Line 496  Line 606 
606        parseAndRunScript (doc, s);        parseAndRunScript (doc, s);
607      }      }
608    
609        log ('load event fired at the script element');
610    
611      log ('executing a script block: end');      log ('executing a script block: end');
612    } // executeScript    } // executeScript
613    
# Line 533  Line 645 
645          doc.write.apply (doc, args);          doc.write.apply (doc, args);
646          return '';          return '';
647        });        });
648        s = s.replace (/^\s*var\s+s\s*=\s*document\.createElement\s*\(\s*['"]script['"]\s*\)\s*;\s*s\.src\s*=\s*(?:'(javascript:[^']*)'|"(javascript:[^"]*)")\s*;\s*document\.documentElement\.appendChild\s*\(\s*s\s*\)\s*;\s*/,        var noDocumentElement = false;
649          s = s.replace (/^\s*var\s+s\s*=\s*document\.createElement\s*\(\s*['"]script['"]\s*\)\s*;\s*s\.src\s*=\s*(?:'([^']*)'|"([^"]*)")\s*;\s*document\.documentElement\.appendChild\s*\(\s*s\s*\)\s*;\s*/,
650        function (s, t, u) {        function (s, t, u) {
651          matched = true;          matched = true;
652          var args = [unescapeJSLiteral (t ? t : u)];          var args = [unescapeJSLiteral (t ? t : u)];
653          doc._insertExternalScript.apply (doc, args);          noDocumentElement = !doc._insertExternalScript.apply (doc, args);
654            return '';
655          });
656          if (noDocumentElement) {
657            log ('Script error: documentElement is null');
658            break;
659          }
660          s = s.replace (/^\s*w\s*\(\s*document\.documentElement\.innerHTML\s*\)\s*;\s*/,
661          function (s, t) {
662            matched = true;
663            log (dumpTree (doc, ''));
664          return '';          return '';
665        });        });
666        if (s == '') break;        if (s == '') break;
# Line 622  Line 745 
745    }; // document.open    }; // document.open
746    
747    JSDocument.prototype.write = function () {    JSDocument.prototype.write = function () {
748        log ('document.write: start');
749      logIndentLevel++;      logIndentLevel++;
750    
751      var p = this._parser;      var p = this._parser;
# Line 645  Line 769 
769      if (p.scriptExecutedAfterParserResumes) {      if (p.scriptExecutedAfterParserResumes) {
770        log ('document.write: processed later (there is an unprocessed <script src>)');        log ('document.write: processed later (there is an unprocessed <script src>)');
771        logIndentLevel--;        logIndentLevel--;
772          log ('document.write: return');
773        return;        return;
774      }      }
775    
# Line 658  Line 783 
783      // to do something here?      // to do something here?
784    
785      // 5. Return      // 5. Return
786        logIndentLevel--;
787      log ('document.write: return');      log ('document.write: return');
788    
     logIndentLevel--;  
789      return;      return;
790    }; // document.write    }; // document.write
791    
792    JSDocument.prototype._insertExternalScript = function (uri) {    JSDocument.prototype._insertExternalScript = function (uri) {
793      var s = new JSElement (this, 'script');      var s = new JSElement (this, 'script');
794      s.src = uri;      s.src = uri;
795      this.documentElement.appendChild (s);      if (this.documentElement) {
796          this.documentElement.appendChild (s);
797          return true;
798        } else {
799          return false;
800        }
801    }; // _insertExternalScript    }; // _insertExternalScript
802    
803    JSDocument.prototype.__defineGetter__ ('documentElement', function () {    JSDocument.prototype.__defineGetter__ ('documentElement', function () {
# Line 767  algorithm.  Especially, this parser: Line 897  algorithm.  Especially, this parser:
897  algorithm, and so on.  algorithm, and so on.
898  <li>Does not raise parse errors for invalid attribute specifications in start  <li>Does not raise parse errors for invalid attribute specifications in start
899  or end tags.  or end tags.
900  <li>Does not support CDATA/PCDATA element other than <code>script</code>.  <li>Does not support RCDATA elements (<code>title</code> and
901    <code>textarea</code>).
902    <li>Does not strip the first newline in <code>pre</code>,
903    <code>listing</code>, and <code>textarea</code> elements.
904  <li>Does not support <code>&lt;!--</code>..<code>--></code> parsing rule  <li>Does not support <code>&lt;!--</code>..<code>--></code> parsing rule
905  in <code>script</code> element.  in CDATA/RCDATA elements.
906  <li>Does not support foreign (SVG or MathML) elements.  <li>Does not support foreign (SVG or MathML) elements.
907  <li>Only supports <code>script</code> <code>type</code>  <li>Only supports <code>script</code> <code>type</code>
908  <code>text/javascript</code>.  <code>type</code> and <code>language</code>  <code>text/javascript</code>.  <code>type</code> and <code>language</code>
# Line 782  introduced, followed, or separated by wh Line 915  introduced, followed, or separated by wh
915    <li><code>var s = document.createElement ("script");    <li><code>var s = document.createElement ("script");
916              s.src = "<var>string</var>";              s.src = "<var>string</var>";
917              document.documentElement.appendChild (s);</code>              document.documentElement.appendChild (s);</code>
918      <li><code>w (document.documentElement.innerHTML);</code> (This statement
919      can be used to dump the document, even when the document has no
920      document element.  The output format is the tree dump format used
921      in html5lib test data, not <abbr>HTML</abbr>.)
922    </ul>    </ul>
923  Note that strings may be delimited by <code>'</code>s instead of  Note that strings may be delimited by <code>'</code>s instead of
924  <code>"</code>s.  <code>"</code>s.
# Line 792  the <abbr title="Uniform Resource Identi Line 929  the <abbr title="Uniform Resource Identi
929  the regular expression <code>^javascript:\s*(?:"[^"]*"|'[^']*')\s*$</code>.  the regular expression <code>^javascript:\s*(?:"[^"]*"|'[^']*')\s*$</code>.
930  <li>Only supports <code>\u<var>HHHH</var></code> escapes in JavaScript  <li>Only supports <code>\u<var>HHHH</var></code> escapes in JavaScript
931  string literals.  string literals.
932    <li>Does not handle the <i>stop parsing</i> phase correctly if the document is
933    replaced by a <code>document.open ()</code> call.  In other words, delayed
934    (deferred or asynchronous) script executions and event firings might be
935    treated in a wrong way if a <code>document.open ()</code> invocation
936    is implicitly done by <code>document.write ()</code> in a delayed script.
937  </ul>  </ul>
938    
939  <p>For some reason, this parser does not work in browsers that do  <p>For some reason, this parser does not work in browsers that do
940  not support JavaScript 1.5.  not support JavaScript 1.5.
941    
942  <!-- TODO: license -->  <!-- TODO: |src| attribute value should refer to the value at the time
943    when it is inserted into the document, not the value when the script is
944    executed.  Currently it does not matter, since we don't allow dynamic
945    modification to the |src| content/DOM attribute value yet. -->
946    
947  </body>  </body>
 </html>  
948    </html>
949    <!-- $Date$ -->
950    <!--
951    
952    Copyright 2008 Wakaba <w@suika.fam.cx>
953    
954    This program is free software; you can redistribute it and/or
955    modify it under the terms of the GNU General Public License
956    as published by the Free Software Foundation; either version 2
957    of the License, or (at your option) any later version.
958    
959    This program is distributed in the hope that it will be useful,
960    but WITHOUT ANY WARRANTY; without even the implied warranty of
961    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
962    GNU General Public License for more details.
963    
964    You should have received a copy of the GNU General Public License
965    along with this program; if not, write to the Free Software
966    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
967    
968    -->

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.17

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24