From c4c424473c3f9c4f1728ea94687838f6ba5d699c Mon Sep 17 00:00:00 2001 From: adon Date: Thu, 8 Oct 2015 14:40:26 +0800 Subject: [PATCH 1/2] Avoided string split() and join() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `string.split(‘’)` was found very slow on large string - used an equiv. string-based manipulation --- src/context-parser.js | 145 ++++++++++-------------------------------- 1 file changed, 32 insertions(+), 113 deletions(-) diff --git a/src/context-parser.js b/src/context-parser.js index eeacd05..3c32648 100644 --- a/src/context-parser.js +++ b/src/context-parser.js @@ -524,14 +524,18 @@ Parser.prototype._getNextState = function (state, i, endsWithEOF) { }; /** -* @function Parser._convertString2Array +* @function Parser._inputSplice +* @param {integer} start - Index at which to start changing the this.input +* @param {integer} deleteCount - An integer indicating the number of characters to remove from this.input +* @param {string} addChars - A string to insert into the this.input * * @description -* Convert the immutable this.input to array type for Strict Context Parser processing (lazy conversion). -* +* This provides a similar splice interface to act on this.input as if it's an array. 
In addition, it updates the new length to this.inputLen */ -Parser.prototype._convertString2Array = function () { - if (typeof this.input === "string") this.input = this.input.split(''); +Parser.prototype._inputSplice = function(start, deleteCount, addChars) { + var str = this.input; + this.input = str.substr(0, start) + addChars + str.substr(start + deleteCount); + this.inputLen += addChars.length - deleteCount; }; /** @@ -567,20 +571,9 @@ Parser.prototype.fork = function() { */ Parser.prototype.contextualize = function (input, endsWithEOF) { FastParser.prototype.contextualize.call(this, input, endsWithEOF); - return this.getModifiedInput(); + return this.input; }; -/** - * @function Parser#getModifiedInput - * - * @description - * Get the modified input due to Strict Context Parser processing. - * - */ -Parser.prototype.getModifiedInput = function() { - // TODO: it is not defensive enough, should use Array.isArray, but need polyfill - return (typeof this.input === "string")? this.input:this.input.join(''); -}; /** * @function Parser#setCurrentState @@ -675,12 +668,8 @@ Parser.prototype.getLastState = function() { */ function ConvertBogusCommentToComment(i) { - // for lazy conversion - this._convertString2Array(); - // convert !--. 
i.e., from <* to this.on('preCanonicalize', PreCanonicalizeConvertBogusCommentEndTag); @@ -691,12 +680,8 @@ function PreCanonicalizeConvertBogusCommentEndTag(state, i, endsWithEOF) { // remove itself from the listener list this.off('preCanonicalize', PreCanonicalizeConvertBogusCommentEndTag); - // for lazy conversion - this._convertString2Array(); - // convert [>] to [-]-> - this.input.splice(i, 0, '-', '-'); - this.inputLen += 2; + this._inputSplice(i, 0, '--'); this.emit(this.listeners.bogusCommentCoverted, [state, i, endsWithEOF]); } @@ -730,44 +715,34 @@ function Canonicalize(state, i, endsWithEOF) { // batch replacement of NULL with \uFFFD would violate the spec // - for example, NULL is untouched in CDATA section state if (chr === '\x00' && statesRequiringNullReplacement[state]) { - // for lazy conversion - this._convertString2Array(); - this.input[i] = '\uFFFD'; + this._inputSplice(i, 1, '\uFFFD'); } // encode < into < for [<]* (* is non-alpha) in STATE_DATA, [<]% and [<]! in STATE_RCDATA and STATE_RAWTEXT else if ((potentialState === htmlState.STATE_TAG_OPEN && nextPotentialState === htmlState.STATE_DATA) || // [<]*, where * is non-alpha ((state === htmlState.STATE_RCDATA || state === htmlState.STATE_RAWTEXT) && // in STATE_RCDATA and STATE_RAWTEXT chr === '<' && (nextChr === '%' || nextChr === '!'))) { // [<]% or [<]! - // for lazy conversion - this._convertString2Array(); - // [<]*, [<]%, [<]! 
- this.input.splice(i, 1, '&', 'l', 't', ';'); - this.inputLen += 3; + this._inputSplice(i, 1, '<'); } // enforce // + convert bogus comment or unknown doctype to the standard html comment else if (potentialState === htmlState.STATE_MARKUP_DECLARATION_OPEN) { // <[!]*** reCanonicalizeNeeded = false; - // for lazy conversion - this._convertString2Array(); - // context-parser treats the doctype and [CDATA[ as resulting into STATE_BOGUS_COMMENT // so, we need our algorithm here to extract and check the next 7 characters - var commentKey = this.input.slice(i + 1, i + 8).join(''); + var commentKey = this.input.slice(i + 1, i + 8); // enforce if (commentKey.toLowerCase() === 'doctype') { // ' - if (this.input.slice(i + 8, i + 14).join('').toLowerCase() !== ' html>') { + if (this.input.slice(i + 8, i + 14).toLowerCase() !== ' html>') { // replace <[!]doctype xxxx> with <[!]--!doctype xxxx--> ConvertBogusCommentToComment.call(this, i); this.once('bogusCommentCoverted', function (state, i) { - [].splice.apply(this.input, [i + 3, 0].concat(''.split(''))); - this.inputLen += 15; + this._inputSplice(i + 3, 0, ''); }); reCanonicalizeNeeded = true; @@ -809,10 +784,7 @@ function Canonicalize(state, i, endsWithEOF) { // htmlState.STATE_BEFORE_ATTRIBUTE_NAME, // become ], or become ] else if (potentialState === htmlState.STATE_BEFORE_ATTRIBUTE_VALUE && // only from STATE_ATTRIBUTE_NAME or STATE_AFTER_ATTRIBUTE_NAME nextPotentialState === htmlState.STATE_DATA) { // or - // for lazy conversion - this._convertString2Array(); - - this.input.splice(i, 1); - this.inputLen--; + this._inputSplice(i, 1, ''); } // insert a space for ] with @@ -966,11 +889,7 @@ function Canonicalize(state, i, endsWithEOF) { // remove IE conditional comments function DisableIEConditionalComments(state, i){ if (state === htmlState.STATE_COMMENT && this.input[i] === ']' && this.input[i+1] === '>') { - // for lazy conversion - this._convertString2Array(); - - this.input.splice(i + 1, 0, ' '); - this.inputLen++; + 
this._inputSplice(i + 1, 0, ' '); } } From 5a27daa9c2e167381610c18e8e076adfd28ddba0 Mon Sep 17 00:00:00 2001 From: adon Date: Thu, 8 Oct 2015 15:00:31 +0800 Subject: [PATCH 2/2] enhanced benchmark scripts - tests the performance with different configurations --- bin/benchmark | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/bin/benchmark b/bin/benchmark index b5efe84..a477852 100755 --- a/bin/benchmark +++ b/bin/benchmark @@ -20,10 +20,17 @@ var allParsers = parsers.concat([ ['HtmlParser2', 'htmlparser2', 'Parser', 'end'], ]); -console.log("Usage: benchmark [all]"); +var selfParsers = [ + ['Context Parser (FastParser)', '../src/context-parser', 'FastParser', 'contextualize', {}], + ['Context Parser (Parser)', '../src/context-parser', 'Parser', 'contextualize', {}], + ['Context Parser (Parser,stateTracking=Off)', '../src/context-parser', 'Parser', 'contextualize', {enableStateTracking: false}], + ['Context Parser (Parser,stateTracking=Off,canonicalization=On)', '../src/context-parser', 'Parser', 'contextualize', {enableStateTracking: false, enableCanonicalization: true}] +]; + +console.log("Usage: benchmark [all|self]"); if ( process.argv.length > 2 ) { - parsers = allParsers; + parsers = process.argv[2] === 'all' ? allParsers : selfParsers; /* argv[0] is node and argv[1] is this script, so the first CLI argument is argv[2] */ } parsers.forEach(function(parser) { @@ -33,13 +40,14 @@ parsers.forEach(function(parser) { var classname = parser[1]; var name = parser[2]; var method = parser[3]; + var config = parser[4]; try { if ( name || method ) { var Parser = name ? require(classname)[name] : require(classname); start = +new Date(); for(var i=0; i<10; i++) { - var parser = new Parser(); + var parser = config ? new Parser(config) : new Parser(); parser[method](html); } end = +new Date();