| 1 |
wakaba |
1.1 |
<!DOCTYPE HTML> |
| 2 |
|
|
<html lang=en> |
| 3 |
|
|
<head> |
| 4 |
|
|
<title>Demo of HTML5 Parsing Algorithm with Scripting Enabled</title> |
| 5 |
|
|
<style> |
| 6 |
|
|
textarea { |
| 7 |
|
|
display: block; |
| 8 |
|
|
width: 80%; |
| 9 |
|
|
margin-left: auto; |
| 10 |
|
|
margin-right: auto; |
| 11 |
|
|
min-height: 20em; |
| 12 |
|
|
} |
| 13 |
|
|
output { |
| 14 |
|
|
display: block; |
| 15 |
|
|
font-family: monospace; |
| 16 |
|
|
white-space: pre; |
| 17 |
|
|
} |
| 18 |
|
|
</style> |
| 19 |
|
|
<script> |
| 20 |
|
|
// Re-runs the demo: empties the log element, parses the current contents
// of the source element, and writes the resulting tree dump to the log.
// Relies on document.sourceElement and document.logElement having been
// assigned before it is called.
function update () {
  var logTarget = document.logElement;
  logTarget.textContent = '';

  var parser = new Parser ();
  var input = new InputStream (document.sourceElement.value);
  parser.parse (input);

  var treeDump = dumpTree (parser.doc, '');
  log (treeDump);
} // update
| 26 |
|
|
|
| 27 |
|
|
// Appends one line of text (s followed by a newline) to the log element
// as a new text node.
function log (s) {
  var line = document.createTextNode (s + "\n");
  document.logElement.appendChild (line);
} // log
| 30 |
|
|
|
| 31 |
|
|
// Wraps the text still to be tokenized.  The tokenizer consumes input by
// rewriting the |s| property, so |s| always holds the unread remainder.
//
// s: the source text.  null and undefined are treated as empty input and
// any other non-string value is converted to a string, because the
// tokenizer calls string methods on |s| and would otherwise throw.
function InputStream (s) {
  this.s = s == null ? '' : String (s);
} // InputStream
| 34 |
|
|
|
| 35 |
|
|
// Parser state: the current tokenizer mode (initially 'pcdata'), the
// document being built, and the stack of open elements, whose bottom
// entry is the document itself.
function Parser () {
  var root = new JSDocument ();

  this.parseMode = 'pcdata';
  this.doc = root;
  this.openElements = [root];
} // Parser
| 40 |
|
|
|
| 41 |
|
|
// Removes the next token from the front of the input and returns it.
//
// i: an input object whose |s| property holds the unread text; |s| is
//    replaced by the remainder after the consumed token.
//
// Returns one of:
//   {type: 'start-tag', value: <lowercased text between "<" and ">">}
//   {type: 'end-tag',   value: <lowercased text between "</" and ">">}
//   {type: 'char',      value: <text>}
//   {type: 'eof'}
//
// When this.parseMode is 'script', everything up to the first script end
// tag (matched case-insensitively) is a single 'char' token and the end
// tag itself is the next 'end-tag' token.  If no script end tag remains,
// 'eof' is returned and the input is left untouched.
Parser.prototype.getNextToken = function (i) {
  var match;

  if (this.parseMode === 'script') {
    var scriptEndTag = /<\/[Ss][Cc][Rr][Ii][Pp][Tt]>/;
    var endTagAt = i.s.search (scriptEndTag);

    if (endTagAt > 0) {
      // Script text precedes the end tag; leave the end tag in the input
      // so that the next call reports it.
      var scriptText = i.s.substring (0, endTagAt);
      i.s = i.s.substring (endTagAt);
      return {type: 'char', value: scriptText};
    }

    if (endTagAt === 0) {
      // The input starts with the end tag (for example an empty script
      // element).  It must be reported right away rather than being
      // treated as script text that runs up to some later end tag.
      i.s = i.s.replace (scriptEndTag, '');
      return {type: 'end-tag', value: 'script'};
    }

    return {type: 'eof'};
  }

  // End tag.
  match = /^<\/([^>]+)>/.exec (i.s);
  if (match) {
    i.s = i.s.substring (match[0].length);
    return {type: 'end-tag', value: match[1].toLowerCase ()};
  }

  // Start tag; the value is everything between the angle brackets.
  match = /^<([^>]+)>/.exec (i.s);
  if (match) {
    i.s = i.s.substring (match[0].length);
    return {type: 'start-tag', value: match[1].toLowerCase ()};
  }

  // A run of text up to the next "<", or failing that a single character
  // (a "<" that does not start a tag).
  match = /^(?:[^<]+|[\s\S])/.exec (i.s);
  if (match) {
    i.s = i.s.substring (match[0].length);
    return {type: 'char', value: match[0]};
  }

  return {type: 'eof'};
} // getNextToken
| 81 |
|
|
|
| 82 |
|
|
// Consumes tokens from the input until 'eof' and builds the tree under
// this.doc, logging every token along the way.
//
// i: the input object handed to getNextToken.
//
// - A start tag creates an element, appends it to the current node (the
//   top of this.openElements) and pushes it onto the stack.
// - A 'script' start tag is special: the tokenizer is switched to script
//   mode, the following character tokens become the text content of the
//   script element, and the element is appended to the current node once
//   its end tag (or end of input) is reached.  It is never pushed onto
//   the stack.
// - Character tokens are appended as text to the current node.
// - An end tag pops the current node if the names match; otherwise a
//   parse error is logged and the token is ignored.
Parser.prototype.parse = function (i) {
  log ('start parsing');

  while (true) {
    var token = this.getNextToken (i);
    log ('token: ' + token.type + ' "' + token.value + '"');

    if (token.type === 'eof') {
      break;
    }

    var current = this.openElements[this.openElements.length - 1];

    if (token.type === 'start-tag') {
      var el = new JSElement (token.value);

      if (token.value === 'script') {
        this.parseMode = 'script';

        // Collect the script text until the matching end tag or the end
        // of the input, whichever comes first.
        while (true) {
          var scriptToken = this.getNextToken (i);
          log ('token: ' + scriptToken.type + ' "' + scriptToken.value + '"');

          if (scriptToken.type === 'char') {
            el.manakaiAppendText (scriptToken.value);
          } else if (scriptToken.type === 'eof' ||
                     (scriptToken.type === 'end-tag' &&
                      scriptToken.value === 'script')) {
            this.parseMode = 'pcdata';
            break;
          }
        }

        current.appendChild (el);
      } else {
        current.appendChild (el);
        this.openElements.push (el);
      }
    } else if (token.type === 'char') {
      // Text outside script elements belongs to the current node.
      current.manakaiAppendText (token.value);
    } else if (token.type === 'end-tag') {
      if (current.localName === token.value) {
        this.openElements.pop ();
      } else {
        log ('parse error: unmatched end tag: ' + token.value);
      }
    }
  }

  log ('stop parsing');
} // parse
| 126 |
|
|
|
| 127 |
|
|
// Root node of the tree built by the parser.  It starts with an empty
// list of children.
function JSDocument () {
  var children = [];
  this.childNodes = children;
} // JSDocument
| 130 |
|
|
|
| 131 |
|
|
// Element node.  Stores the given name as |localName| and starts with an
// empty list of children.
function JSElement (localName) {
  var children = [];
  this.localName = localName;
  this.childNodes = children;
} // JSElement
| 135 |
|
|
|
| 136 |
|
|
// Shared by documents and elements: adds node |e| as the last child of
// this node, records this node as its parent, and returns |e|.
JSDocument.prototype.appendChild = JSElement.prototype.appendChild =
function (e) {
  var children = this.childNodes;
  children[children.length] = e;
  e.parentNode = this;
  return e;
}; // appendChild
| 142 |
|
|
|
| 143 |
|
|
// Text node.  |data| holds the character data of the node.
function JSText (data) {
  var content = data;
  this.data = content;
} // JSText
| 146 |
|
|
|
| 147 |
|
|
// Shared by documents and elements: appends the string |s| to the text
// content at the end of this node.  If the last child is already a text
// node, |s| is added to its data; otherwise a new text node is created,
// linked to this node through |parentNode| (as appendChild does for the
// nodes it adds), and appended as the last child.
JSDocument.prototype.manakaiAppendText =
JSElement.prototype.manakaiAppendText =
function (s) {
  var children = this.childNodes;
  var last = children.length > 0 ? children[children.length - 1] : null;

  if (last instanceof JSText) {
    last.data += s;
    return;
  }

  var text = new JSText (s);
  text.parentNode = this;
  children.push (text);
}; // manakaiAppendText
| 157 |
|
|
|
| 158 |
|
|
// Serializes the children of node |n| as text, one line per node, each
// line starting with '| ' followed by |indent|.  Elements are shown by
// their local name and are followed by the dump of their own children
// at one more level of indent; text nodes are shown as their data in
// double quotes; anything else is shown through string conversion.
// Returns the concatenated lines, or '' when |n| has no children.
function dumpTree (n, indent) {
  var prefix = '| ' + indent;
  var children = n.childNodes;
  var pieces = [];

  for (var k = 0; k < children.length; k++) {
    var child = children[k];

    if (child instanceof JSElement) {
      pieces.push (prefix + child.localName + '\n');
      pieces.push (dumpTree (child, indent + ' '));
      continue;
    }

    if (child instanceof JSText) {
      pieces.push (prefix + '"' + child.data + '"\n');
      continue;
    }

    pieces.push (prefix + child + '\n');
  }

  return pieces.join ('');
} // dumpTree
| 173 |
|
|
</script> |
| 174 |
|
|
</head> |
| 175 |
|
|
<body onload=" |
| 176 |
|
|
document.sourceElement = document.getElementsByTagName ('textarea')[0]; |
| 177 |
|
|
document.logElement = document.getElementsByTagName ('output')[0]; |
| 178 |
|
|
update (); |
| 179 |
|
|
"> |
| 180 |
|
|
|
| 181 |
|
|
<textarea onchange=" update () "><html> |
| 182 |
|
|
<head></head><body> |
| 183 |
|
|
<p> |
| 184 |
|
|
<script> |
| 185 |
|
|
document.write ('aaaaaaa</p>\n<script>\ndocument.write("cccccc")\n</', 'script>\nbbbbbb'); |
| 186 |
|
|
</script> |
| 187 |
|
|
<p> |
| 188 |
|
|
</textarea> |
| 189 |
|
|
|
| 190 |
|
|
<output></output> |
| 191 |
|
|
|
| 192 |
|
|
</body> |
| 193 |
|
|
</html> |