From 8e9745fc7dde562a21d93d340e971206a443199f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 27 Aug 2011 15:01:27 +0200 Subject: [PATCH 001/450] Restructured code, improved readability, deleted junk --- CHANGELOG | 38 -- json2.js | 482 --------------------- lib/DefaultHandler.js | 135 ++++++ lib/DomUtils.js | 94 +++++ lib/ElementType.js | 9 + lib/Parser.js | 397 ++++++++++++++++++ lib/RssHandler.js | 112 +++++ lib/htmlparser.js | 827 +------------------------------------ lib/htmlparser.min.js | 22 - lib/node-htmlparser.js | 6 - lib/node-htmlparser.min.js | 6 - profile.js | 63 --- runtests.min.html | 108 ----- runtests.min.js | 75 ---- snippet.js | 15 - utils_example.js | 35 -- 16 files changed, 752 insertions(+), 1672 deletions(-) delete mode 100644 CHANGELOG delete mode 100644 json2.js create mode 100644 lib/DefaultHandler.js create mode 100644 lib/DomUtils.js create mode 100644 lib/ElementType.js create mode 100644 lib/Parser.js create mode 100644 lib/RssHandler.js delete mode 100644 lib/htmlparser.min.js delete mode 100644 lib/node-htmlparser.js delete mode 100644 lib/node-htmlparser.min.js delete mode 100644 profile.js delete mode 100644 runtests.min.html delete mode 100644 runtests.min.js delete mode 100644 snippet.js delete mode 100644 utils_example.js diff --git a/CHANGELOG b/CHANGELOG deleted file mode 100644 index c262712..0000000 --- a/CHANGELOG +++ /dev/null @@ -1,38 +0,0 @@ -v1.8.0 - * - -v1.7.3 - * Renamed node-htmlparser.* to htmlparser.* and created shims for people still expecting node-htmlparser.* - -v1.7.2 - * Document position feature fixed to work correctly with chunked parsing - -v1.7.1 - * Document position feature disabled until it works correctly with chunked parsing - -v1.7.0 - * Empty tag checking switch to being case insensitive [fgnass] - * Added feature to include document position (row, col) in element data [fgnass] - * Added parser option "includeLocation" to enable document position data - -v1.6.4 - * Fixed 'prevElement' error [Swizec] - -v1.6.3 - * Updated to support being an npm package - * Fixed DomUtils.testElement() - -v1.6.1 - * Optimized DomUtils by up to 2-3x - -v1.6.0 - * Added support for RSS/Atom feeds - -v1.5.0 - * Added DefaultHandler option "enforceEmptyTags" so that XML can be parsed correctly - -v1.4.2 - * Added tests for parsing XML with namespaces - -v1.4.1 - * Added minified version diff --git a/json2.js b/json2.js deleted file mode 100644 index a1a3b17..0000000 --- a/json2.js +++ /dev/null @@ -1,482 +0,0 @@ -/* - http://www.JSON.org/json2.js - 2010-03-20 - - Public Domain. - - NO WARRANTY EXPRESSED OR IMPLIED. USE AT YOUR OWN RISK. - - See http://www.JSON.org/js.html - - - This code should be minified before deployment. - See http://javascript.crockford.com/jsmin.html - - USE YOUR OWN COPY. IT IS EXTREMELY UNWISE TO LOAD CODE FROM SERVERS YOU DO - NOT CONTROL. - - - This file creates a global JSON object containing two methods: stringify - and parse. - - JSON.stringify(value, replacer, space) - value any JavaScript value, usually an object or array. - - replacer an optional parameter that determines how object - values are stringified for objects. It can be a - function or an array of strings. - - space an optional parameter that specifies the indentation - of nested structures. If it is omitted, the text will - be packed without extra whitespace. If it is a number, - it will specify the number of spaces to indent at each - level. If it is a string (such as '\t' or ' '), - it contains the characters used to indent at each level. - - This method produces a JSON text from a JavaScript value. - - When an object value is found, if the object contains a toJSON - method, its toJSON method will be called and the result will be - stringified. A toJSON method does not serialize: it returns the - value represented by the name/value pair that should be serialized, - or undefined if nothing should be serialized. The toJSON method - will be passed the key associated with the value, and this will be - bound to the value - - For example, this would serialize Dates as ISO strings. - - Date.prototype.toJSON = function (key) { - function f(n) { - // Format integers to have at least two digits. - return n < 10 ? '0' + n : n; - } - - return this.getUTCFullYear() + '-' + - f(this.getUTCMonth() + 1) + '-' + - f(this.getUTCDate()) + 'T' + - f(this.getUTCHours()) + ':' + - f(this.getUTCMinutes()) + ':' + - f(this.getUTCSeconds()) + 'Z'; - }; - - You can provide an optional replacer method. It will be passed the - key and value of each member, with this bound to the containing - object. The value that is returned from your method will be - serialized. If your method returns undefined, then the member will - be excluded from the serialization. - - If the replacer parameter is an array of strings, then it will be - used to select the members to be serialized. It filters the results - such that only members with keys listed in the replacer array are - stringified. - - Values that do not have JSON representations, such as undefined or - functions, will not be serialized. Such values in objects will be - dropped; in arrays they will be replaced with null. You can use - a replacer function to replace those with JSON values. - JSON.stringify(undefined) returns undefined. - - The optional space parameter produces a stringification of the - value that is filled with line breaks and indentation to make it - easier to read. - - If the space parameter is a non-empty string, then that string will - be used for indentation. If the space parameter is a number, then - the indentation will be that many spaces. - - Example: - - text = JSON.stringify(['e', {pluribus: 'unum'}]); - // text is '["e",{"pluribus":"unum"}]' - - - text = JSON.stringify(['e', {pluribus: 'unum'}], null, '\t'); - // text is '[\n\t"e",\n\t{\n\t\t"pluribus": "unum"\n\t}\n]' - - text = JSON.stringify([new Date()], function (key, value) { - return this[key] instanceof Date ? - 'Date(' + this[key] + ')' : value; - }); - // text is '["Date(---current time---)"]' - - - JSON.parse(text, reviver) - This method parses a JSON text to produce an object or array. - It can throw a SyntaxError exception. - - The optional reviver parameter is a function that can filter and - transform the results. It receives each of the keys and values, - and its return value is used instead of the original value. - If it returns what it received, then the structure is not modified. - If it returns undefined then the member is deleted. - - Example: - - // Parse the text. Values that look like ISO date strings will - // be converted to Date objects. - - myData = JSON.parse(text, function (key, value) { - var a; - if (typeof value === 'string') { - a = -/^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}(?:\.\d*)?)Z$/.exec(value); - if (a) { - return new Date(Date.UTC(+a[1], +a[2] - 1, +a[3], +a[4], - +a[5], +a[6])); - } - } - return value; - }); - - myData = JSON.parse('["Date(09/09/2001)"]', function (key, value) { - var d; - if (typeof value === 'string' && - value.slice(0, 5) === 'Date(' && - value.slice(-1) === ')') { - d = new Date(value.slice(5, -1)); - if (d) { - return d; - } - } - return value; - }); - - - This is a reference implementation. You are free to copy, modify, or - redistribute. -*/ - -/*jslint evil: true, strict: false */ - -/*members "", "\b", "\t", "\n", "\f", "\r", "\"", JSON, "\\", apply, - call, charCodeAt, getUTCDate, getUTCFullYear, getUTCHours, - getUTCMinutes, getUTCMonth, getUTCSeconds, hasOwnProperty, join, - lastIndex, length, parse, prototype, push, replace, slice, stringify, - test, toJSON, toString, valueOf -*/ - - -// Create a JSON object only if one does not already exist. We create the -// methods in a closure to avoid creating global variables. - -if (!this.JSON) { - this.JSON = {}; -} - -(function () { - - function f(n) { - // Format integers to have at least two digits. - return n < 10 ? '0' + n : n; - } - - if (typeof Date.prototype.toJSON !== 'function') { - - Date.prototype.toJSON = function (key) { - - return isFinite(this.valueOf()) ? - this.getUTCFullYear() + '-' + - f(this.getUTCMonth() + 1) + '-' + - f(this.getUTCDate()) + 'T' + - f(this.getUTCHours()) + ':' + - f(this.getUTCMinutes()) + ':' + - f(this.getUTCSeconds()) + 'Z' : null; - }; - - String.prototype.toJSON = - Number.prototype.toJSON = - Boolean.prototype.toJSON = function (key) { - return this.valueOf(); - }; - } - - var cx = /[\u0000\u00ad\u0600-\u0604\u070f\u17b4\u17b5\u200c-\u200f\u2028-\u202f\u2060-\u206f\ufeff\ufff0-\uffff]/g, - escapable = /[\\\"\x00-\x1f\x7f-\x9f\u00ad\u0600-\u0604\u070f\u17b4\u17b5\u200c-\u200f\u2028-\u202f\u2060-\u206f\ufeff\ufff0-\uffff]/g, - gap, - indent, - meta = { // table of character substitutions - '\b': '\\b', - '\t': '\\t', - '\n': '\\n', - '\f': '\\f', - '\r': '\\r', - '"' : '\\"', - '\\': '\\\\' - }, - rep; - - - function quote(string) { - -// If the string contains no control characters, no quote characters, and no -// backslash characters, then we can safely slap some quotes around it. -// Otherwise we must also replace the offending characters with safe escape -// sequences. - - escapable.lastIndex = 0; - return escapable.test(string) ? - '"' + string.replace(escapable, function (a) { - var c = meta[a]; - return typeof c === 'string' ? c : - '\\u' + ('0000' + a.charCodeAt(0).toString(16)).slice(-4); - }) + '"' : - '"' + string + '"'; - } - - - function str(key, holder) { - -// Produce a string from holder[key]. - - var i, // The loop counter. - k, // The member key. - v, // The member value. - length, - mind = gap, - partial, - value = holder[key]; - -// If the value has a toJSON method, call it to obtain a replacement value. - - if (value && typeof value === 'object' && - typeof value.toJSON === 'function') { - value = value.toJSON(key); - } - -// If we were called with a replacer function, then call the replacer to -// obtain a replacement value. - - if (typeof rep === 'function') { - value = rep.call(holder, key, value); - } - -// What happens next depends on the value's type. - - switch (typeof value) { - case 'string': - return quote(value); - - case 'number': - -// JSON numbers must be finite. Encode non-finite numbers as null. - - return isFinite(value) ? String(value) : 'null'; - - case 'boolean': - case 'null': - -// If the value is a boolean or null, convert it to a string. Note: -// typeof null does not produce 'null'. The case is included here in -// the remote chance that this gets fixed someday. - - return String(value); - -// If the type is 'object', we might be dealing with an object or an array or -// null. - - case 'object': - -// Due to a specification blunder in ECMAScript, typeof null is 'object', -// so watch out for that case. - - if (!value) { - return 'null'; - } - -// Make an array to hold the partial results of stringifying this object value. - - gap += indent; - partial = []; - -// Is the value an array? - - if (Object.prototype.toString.apply(value) === '[object Array]') { - -// The value is an array. Stringify every element. Use null as a placeholder -// for non-JSON values. - - length = value.length; - for (i = 0; i < length; i += 1) { - partial[i] = str(i, value) || 'null'; - } - -// Join all of the elements together, separated with commas, and wrap them in -// brackets. - - v = partial.length === 0 ? '[]' : - gap ? '[\n' + gap + - partial.join(',\n' + gap) + '\n' + - mind + ']' : - '[' + partial.join(',') + ']'; - gap = mind; - return v; - } - -// If the replacer is an array, use it to select the members to be stringified. - - if (rep && typeof rep === 'object') { - length = rep.length; - for (i = 0; i < length; i += 1) { - k = rep[i]; - if (typeof k === 'string') { - v = str(k, value); - if (v) { - partial.push(quote(k) + (gap ? ': ' : ':') + v); - } - } - } - } else { - -// Otherwise, iterate through all of the keys in the object. - - for (k in value) { - if (Object.hasOwnProperty.call(value, k)) { - v = str(k, value); - if (v) { - partial.push(quote(k) + (gap ? ': ' : ':') + v); - } - } - } - } - -// Join all of the member texts together, separated with commas, -// and wrap them in braces. - - v = partial.length === 0 ? '{}' : - gap ? '{\n' + gap + partial.join(',\n' + gap) + '\n' + - mind + '}' : '{' + partial.join(',') + '}'; - gap = mind; - return v; - } - } - -// If the JSON object does not yet have a stringify method, give it one. - - if (typeof JSON.stringify !== 'function') { - JSON.stringify = function (value, replacer, space) { - -// The stringify method takes a value and an optional replacer, and an optional -// space parameter, and returns a JSON text. The replacer can be a function -// that can replace values, or an array of strings that will select the keys. -// A default replacer method can be provided. Use of the space parameter can -// produce text that is more easily readable. - - var i; - gap = ''; - indent = ''; - -// If the space parameter is a number, make an indent string containing that -// many spaces. - - if (typeof space === 'number') { - for (i = 0; i < space; i += 1) { - indent += ' '; - } - -// If the space parameter is a string, it will be used as the indent string. - - } else if (typeof space === 'string') { - indent = space; - } - -// If there is a replacer, it must be a function or an array. -// Otherwise, throw an error. - - rep = replacer; - if (replacer && typeof replacer !== 'function' && - (typeof replacer !== 'object' || - typeof replacer.length !== 'number')) { - throw new Error('JSON.stringify'); - } - -// Make a fake root object containing our value under the key of ''. -// Return the result of stringifying the value. - - return str('', {'': value}); - }; - } - - -// If the JSON object does not yet have a parse method, give it one. - - if (typeof JSON.parse !== 'function') { - JSON.parse = function (text, reviver) { - -// The parse method takes a text and an optional reviver function, and returns -// a JavaScript value if the text is a valid JSON text. - - var j; - - function walk(holder, key) { - -// The walk method is used to recursively walk the resulting structure so -// that modifications can be made. - - var k, v, value = holder[key]; - if (value && typeof value === 'object') { - for (k in value) { - if (Object.hasOwnProperty.call(value, k)) { - v = walk(value, k); - if (v !== undefined) { - value[k] = v; - } else { - delete value[k]; - } - } - } - } - return reviver.call(holder, key, value); - } - - -// Parsing happens in four stages. In the first stage, we replace certain -// Unicode characters with escape sequences. JavaScript handles many characters -// incorrectly, either silently deleting them, or treating them as line endings. - - text = String(text); - cx.lastIndex = 0; - if (cx.test(text)) { - text = text.replace(cx, function (a) { - return '\\u' + - ('0000' + a.charCodeAt(0).toString(16)).slice(-4); - }); - } - -// In the second stage, we run the text against regular expressions that look -// for non-JSON patterns. We are especially concerned with '()' and 'new' -// because they can cause invocation, and '=' because it can cause mutation. -// But just to be safe, we want to reject all unexpected forms. - -// We split the second stage into 4 regexp operations in order to work around -// crippling inefficiencies in IE's and Safari's regexp engines. First we -// replace the JSON backslash pairs with '@' (a non-JSON character). Second, we -// replace all simple value tokens with ']' characters. Third, we delete all -// open brackets that follow a colon or comma or that begin the text. Finally, -// we look to see that the remaining characters are only whitespace or ']' or -// ',' or ':' or '{' or '}'. If that is so, then the text is safe for eval. - - if (/^[\],:{}\s]*$/. -test(text.replace(/\\(?:["\\\/bfnrt]|u[0-9a-fA-F]{4})/g, '@'). -replace(/"[^"\\\n\r]*"|true|false|null|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?/g, ']'). -replace(/(?:^|:|,)(?:\s*\[)+/g, ''))) { - -// In the third stage we use the eval function to compile the text into a -// JavaScript structure. The '{' operator is subject to a syntactic ambiguity -// in JavaScript: it can begin a block or an object literal. We wrap the text -// in parens to eliminate the ambiguity. - - j = eval('(' + text + ')'); - -// In the optional fourth stage, we recursively walk the new structure, passing -// each name/value pair to a reviver function for possible transformation. - - return typeof reviver === 'function' ? - walk({'': j}, '') : j; - } - -// If the text is not JSON parseable, then a SyntaxError is thrown. - - throw new SyntaxError('JSON.parse'); - }; - } -}()); diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js new file mode 100644 index 0000000..1536f6e --- /dev/null +++ b/lib/DefaultHandler.js @@ -0,0 +1,135 @@ +var ElementType = require("./ElementType.js"); + +function DefaultHandler (callback, options) { + this.dom = []; + this._done = false; + this._tagStack = []; + this._options = options ? options : { }; + if (this._options.ignoreWhitespace === undefined) + this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes + if (this._options.verbose === undefined) + this._options.verbose = true; //Keep data property for tags and raw property for all + if (this._options.enforceEmptyTags === undefined) + this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec + if ((typeof callback) === "function") + this._callback = callback; +} + +//**"Static"**// +//HTML Tags that shouldn't contain child nodes +DefaultHandler._emptyTags = { + area: 1 + , base: 1 + , basefont: 1 + , br: 1 + , col: 1 + , frame: 1 + , hr: 1 + , img: 1 + , input: 1 + , isindex: 1 + , link: 1 + , meta: 1 + , param: 1 + , embed: 1 +}; +//Regex to detect whitespace only text nodes +DefaultHandler.reWhitespace = /^\s*$/; + +//**Public**// +//Methods// +//Resets the handler back to starting state +DefaultHandler.prototype.reset = function() { + this.dom = []; + this._done = false; + this._tagStack = []; + this._tagStack.last = function() { + return(this.length ? this[this.length - 1] : null); + }; +}; +//Signals the handler that parsing is done +DefaultHandler.prototype.done = function() { + this._done = true; + this.handleCallback(null); +}; +DefaultHandler.prototype.writeText = function(element) { + if (this._options.ignoreWhitespace) + if (DefaultHandler.reWhitespace.test(element.data)) + return; + this.handleElement(element); +}; + +//Methods// +DefaultHandler.prototype.error = +DefaultHandler.prototype.handleCallback = function(error) { + if ((typeof this._callback) !== "function") + if (error) + throw error; + else + return; + this._callback(error, this.dom); +}; + +DefaultHandler.prototype.isEmptyTag = function(element) { + var name = element.name.toLowerCase(); + if (name.charAt(0) === '/') { + name = name.substring(1); + } + return this._options.enforceEmptyTags && !!DefaultHandler._emptyTags[name]; +}; + +DefaultHandler.prototype.writeTag = DefaultHandler.prototype.writeDirective = DefaultHandler.prototype.writeComment = +DefaultHandler.prototype.handleElement = function(element) { + if (this._done) + this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()")); + if (!this._options.verbose) { + //element.raw = null; //FIXME: Not clean + //FIXME: Serious performance problem using delete + delete element.raw; + if (element.type === "tag" || element.type === "script" || element.type === "style") + delete element.data; + } + if (!this._tagStack.last()) { //There are no parent elements + //If the element can be a container, add it to the tag stack and the top level list + if (element.type !== ElementType.Text && element.type !== ElementType.Comment && element.type !== ElementType.Directive) { + if (element.name.charAt(0) !== "/") { //Ignore closing tags that obviously don't have an opening tag + this.dom.push(element); + if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children + this._tagStack.push(element); + } + } + } + else //Otherwise just add to the top level list + this.dom.push(element); + } + else { //There are parent elements + //If the element can be a container, add it as a child of the element + //on top of the tag stack and then add it to the tag stack + if (element.type !== ElementType.Text && element.type !== ElementType.Comment && element.type !== ElementType.Directive) { + if (element.name.charAt(0) === "/") { + //This is a closing tag, scan the tagStack to find the matching opening tag + //and pop the stack up to the opening tag's parent + var baseName = element.name.substring(1); + if (!this.isEmptyTag(element)) { + var pos = this._tagStack.length - 1; + while (pos > -1 && this._tagStack[pos--].name !== baseName) { } + if (pos > -1 || this._tagStack[0].name === baseName) + while (pos < this._tagStack.length - 1) + this._tagStack.pop(); + } + } + else { //This is not a closing tag + if (!this._tagStack.last().children) + this._tagStack.last().children = []; + this._tagStack.last().children.push(element); + if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children + this._tagStack.push(element); + } + } + else { //This is not a container element + if (!this._tagStack.last().children) + this._tagStack.last().children = []; + this._tagStack.last().children.push(element); + } + } +}; \ No newline at end of file diff --git a/lib/DomUtils.js b/lib/DomUtils.js new file mode 100644 index 0000000..f930ba0 --- /dev/null +++ b/lib/DomUtils.js @@ -0,0 +1,94 @@ +var DomUtils = { + testElement: function(options, element) { + if (!element) { + return false; + } + + for (var key in options) { + if (key === "tag_name") { + if (element.type !== "tag" && element.type !== "script" && element.type !== "style") { + return false; + } + if (!options.tag_name(element.name)) { + return false; + } + } else if (key === "tag_type") { + if (!options.tag_type(element.type)) { + return false; + } + } else if (key === "tag_contains") { + if (element.type !== "text" && element.type !== "comment" && element.type !== "directive") { + return false; + } + if (!options.tag_contains(element.data)) { + return false; + } + } else { + if (!element.attribs || !options[key](element.attribs[key])) { + return false; + } + } + } + + return true; + } + + , getElements: function(options, currentElement, recurse, limit) { + recurse = (recurse === undefined || recurse === null) || !!recurse; + limit = isNaN(parseInt(limit, 10)) ? -1 : parseInt(limit, 10); + + if (!currentElement) { + return([]); + } + + var found = []; + var elementList; + + function getTest (checkVal) { + return(function (value) { return(value === checkVal); }); + } + for (var key in options) { + if ((typeof options[key]) !== "function") { + options[key] = getTest(options[key]); + } + } + + if (DomUtils.testElement(options, currentElement)) { + found.push(currentElement); + } + + if (limit >= 0 && found.length >= limit) { + return(found); + } + + if (recurse && currentElement.children) { + elementList = currentElement.children; + } else if (currentElement instanceof Array) { + elementList = currentElement; + } else { + return(found); + } + + for (var i = 0; i < elementList.length; i++) { + found = found.concat(DomUtils.getElements(options, elementList[i], recurse, limit)); + if (limit >= 0 && found.length >= limit) { + break; + } + } + + return(found); + } + + , getElementById: function(id, currentElement, recurse) { + var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1); + return(result.length ? result[0] : null); + } + + , getElementsByTagName: function(name, currentElement, recurse, limit) { + return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit)); + } + + , getElementsByTagType: function(type, currentElement, recurse, limit) { + return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit)); + } +}; diff --git a/lib/ElementType.js b/lib/ElementType.js new file mode 100644 index 0000000..09d3d9f --- /dev/null +++ b/lib/ElementType.js @@ -0,0 +1,9 @@ +//Types of elements found in the DOM +var ElementType = { + Text: "text" //Plain text + , Directive: "directive" //Special tag + , Comment: "comment" //Special tag + , Script: "script" //Special tag + , Style: "style" //Special tag + , Tag: "tag" //Any tag that isn't special +}; \ No newline at end of file diff --git a/lib/Parser.js b/lib/Parser.js new file mode 100644 index 0000000..ecb7be3 --- /dev/null +++ b/lib/Parser.js @@ -0,0 +1,397 @@ +var ElementType = require("./ElementType.js"); + +function Parser (handler, options) { + this._options = options ? options : { }; + if (this._options.includeLocation === undefined) { + this._options.includeLocation = false; //Do not track element position in document by default + } + + this.validateHandler(handler); + this._handler = handler; + + this._buffer = ""; + this._done = false; + this._elements = []; + this._elementsCurrent = 0; + this._current = 0; + this._next = 0; + this._location = { + row: 0 + , col: 0 + , charOffset: 0 + , inBuffer: 0 + }; + this._parseState = ElementType.Text; + this._prevTagSep = ''; + this._tagStack = []; +} + +//**"Static"**// +//Regular expressions used for cleaning up and parsing (stateless) +Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace +Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents +Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on +Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element + +//Regular expressions used for parsing (stateful) +Parser._reAttrib = //Find attributes in a tag + /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; +Parser._reTags = /[<\>]/g; //Find tag markers + +//**Public**// +//Methods// +//Parses a complete HTML and pushes it to the handler +Parser.prototype.parseComplete = function(data) { + this.reset(); + this.parseChunk(data); + this.done(); +}; + +//Parses a piece of an HTML document +Parser.prototype.parseChunk = function(data) { + if (this._done) + this.handleError(new Error("Attempted to parse chunk after parsing already done")); + this._buffer += data; //FIXME: this can be a bottleneck + this.parseTags(); +}; + +//Tells the parser that the HTML being parsed is complete +Parser.prototype.done = function() { + if (this._done) + return; + this._done = true; + + //Push any unparsed text into a final element in the element list + if (this._buffer.length) { + var rawData = this._buffer; + this._buffer = ""; + var element = { + raw: rawData + , data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") + , type: this._parseState + }; + if (this._parseState === ElementType.Tag || this._parseState === ElementType.Script || this._parseState === ElementType.Style) + element.name = this.parseTagName(element.data); + this.parseAttribs(element); + this._elements.push(element); + } + + this.writeHandler(); + this._handler.done(); +}; + +//Resets the parser to a blank state, ready to parse a new HTML document +Parser.prototype.reset = function() { + this._buffer = ""; + this._done = false; + this._elements = []; + this._elementsCurrent = 0; + this._current = 0; + this._next = 0; + this._location = { + row: 0 + , col: 0 + , charOffset: 0 + , inBuffer: 0 + }; + this._parseState = ElementType.Text; + this._prevTagSep = ''; + this._tagStack = []; + this._handler.reset(); +}; + +//**Private**// +//Methods// +//Takes an array of elements and parses any found attributes +Parser.prototype.parseTagAttribs = function(elements) { + var idxEnd = elements.length; + var idx = 0; + + while (idx < idxEnd) { + var element = elements[idx++]; + if (element.type === ElementType.Tag || element.type === ElementType.Script || element.type === ElementType.style) + this.parseAttribs(element); + } + + return(elements); +}; + +//Takes an element and adds an "attribs" property for any element attributes found +Parser.prototype.parseAttribs = function(element) { + //Only parse attributes for tags + if (element.type !== ElementType.Script && element.type !== ElementType.Style && element.type !== ElementType.Tag) + return; + + var tagName = element.data.split(Parser._reWhitespace, 1)[0]; + var attribRaw = element.data.substring(tagName.length); + if (attribRaw.length < 1) + return; + + var match; + Parser._reAttrib.lastIndex = 0; + while (match = Parser._reAttrib.exec(attribRaw)) { + if (element.attribs === undefined) + element.attribs = {}; + + if (typeof match[1] === "string" && match[1].length) { + element.attribs[match[1]] = match[2]; + } else if (typeof match[3] === "string" && match[3].length) { + element.attribs[match[3].toString()] = match[4].toString(); + } else if (typeof match[5] === "string" && match[5].length) { + element.attribs[match[5]] = match[6]; + } else if (typeof match[7] === "string" && match[7].length) { + element.attribs[match[7]] = match[7]; + } + } +}; + +//Extracts the base tag name from the data value of an element +Parser.prototype.parseTagName = function(data) { + if (data === null || data === "") + return(""); + var match = Parser._reTagName.exec(data); + if (!match) + return(""); + return((match[1] ? "/" : "") + match[2]); +}; + +//Parses through HTML text and returns an array of found elements +//I admit, this function is rather large but splitting up had an noticeable impact on speed +Parser.prototype.parseTags = function() { + var bufferEnd = this._buffer.length - 1; + while (Parser._reTags.test(this._buffer)) { + this._next = Parser._reTags.lastIndex - 1; + var tagSep = this._buffer.charAt(this._next); //The currently found tag marker + var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse + + //A new element to eventually be appended to the element list + var element = { + raw: rawData + , data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") + , type: this._parseState + }; + + var elementName = this.parseTagName(element.data), prevElement, rawLen; + + //This section inspects the current tag stack and modifies the current + //element if we're actually parsing a special area (script/comment/style tag) + if (this._tagStack.length) { //We're parsing inside a script/comment/style tag + if (this._tagStack[this._tagStack.length - 1] === ElementType.Script) { //We're currently in a script tag + if (elementName === "/script") //Actually, we're no longer in a script tag, so pop it off the stack + this._tagStack.pop(); + else { //Not a closing script tag + if (element.raw.indexOf("!--") !== 0) { //Make sure we're not in a comment + //All data from here to script close is now a text element + element.type = ElementType.Text; + //If the previous element is text, append the current text to it + if (this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text) { + prevElement = this._elements[this._elements.length - 1]; + prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; + element.raw = element.data = ""; //This causes the current element to not be added to the element list + } + } + } + } + else if (this._tagStack[this._tagStack.length - 1] === ElementType.Style) { //We're currently in a style tag + if (elementName === "/style") //Actually, we're no longer in a style tag, so pop it off the stack + this._tagStack.pop(); + else { + if (element.raw.indexOf("!--") !== 0) { //Make sure we're not in a comment + //All data from here to style close is now a text element + element.type = ElementType.Text; + //If the previous element is text, append the current text to it + if (this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text) { + prevElement = this._elements[this._elements.length - 1]; + if (element.raw !== "") { + prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; + element.raw = element.data = ""; //This causes the current element to not be added to the element list + } else { //Element is empty, so just append the last tag marker found + prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep; + } + } else { //The previous element was not text + if (element.raw !== "") { + element.raw = element.data = element.raw; + } + } + } + } + } + else if (this._tagStack[this._tagStack.length - 1] === ElementType.Comment) { //We're currently in a comment tag + rawLen = element.raw.length; + if (element.raw.charAt(rawLen - 2) === "-" && element.raw.charAt(rawLen - 1) === "-" && tagSep === ">") { + //Actually, we're no longer in a style tag, so pop it off the stack + this._tagStack.pop(); + //If the previous element is a comment, append the current text to it + if (this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Comment) { + prevElement = this._elements[this._elements.length - 1]; + prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, ""); + element.raw = element.data = ""; //This causes the current element to not be added to the element list + element.type = ElementType.Text; + } + else //Previous element not a comment + element.type = ElementType.Comment; //Change the current element's type to a comment + } + else { //Still in a comment tag + element.type = ElementType.Comment; + //If the previous element is a comment, append the current text to it + if (this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Comment) { + prevElement = this._elements[this._elements.length - 1]; + prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep; + element.raw = element.data = ""; //This causes the current element to not be added to the element list + element.type = ElementType.Text; + } + else + element.raw = element.data = element.raw + tagSep; + } + } + } + + //Processing of non-special tags + if (element.type === ElementType.Tag) { + element.name = elementName; + + if (element.raw.indexOf("!--") === 0) { //This tag is really comment + element.type = ElementType.Comment; + delete element.name; + rawLen = element.raw.length; + //Check if the comment is terminated in the current element + if (element.raw.charAt(rawLen - 1) === "-" && element.raw.charAt(rawLen - 2) === "-" && tagSep === ">") + element.raw = element.data = element.raw.replace(Parser._reTrimComment, ""); + else { //It's not so push the comment onto the tag stack + element.raw += tagSep; + this._tagStack.push(ElementType.Comment); + } + } + else if (element.raw.indexOf("!") === 0 || element.raw.indexOf("?") === 0) { + element.type = ElementType.Directive; + //TODO: what about CDATA? + } + else if (element.name === "script") { + element.type = ElementType.Script; + //Special tag, push onto the tag stack if not terminated + if (element.data.charAt(element.data.length - 1) !== "/") + this._tagStack.push(ElementType.Script); + } + else if (element.name === "/script") + element.type = ElementType.Script; + else if (element.name === "style") { + element.type = ElementType.Style; + //Special tag, push onto the tag stack if not terminated + if (element.data.charAt(element.data.length - 1) !== "/") + this._tagStack.push(ElementType.Style); + } + else if (element.name === "/style") + element.type = ElementType.Style; + if (element.name && element.name.charAt(0) === "/") + element.data = element.name; + } + + //Add all tags and non-empty text elements to the element list + if (element.raw !== "" || element.type !== ElementType.Text) { + if (this._options.includeLocation && !element.location) { + element.location = this.getLocation(element.type === ElementType.Tag); + } + this.parseAttribs(element); + this._elements.push(element); + //If tag self-terminates, add an explicit, separate closing tag + if ( + element.type !== ElementType.Text + && + element.type !== ElementType.Comment + && + element.type !== ElementType.Directive + && + element.data.charAt(element.data.length - 1) === "/" + ) + this._elements.push({ + raw: "/" + element.name + , data: "/" + element.name + , name: "/" + element.name + , type: element.type + }); + } + this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text; + this._current = this._next + 1; + this._prevTagSep = tagSep; + } + + if (this._options.includeLocation) { + this.getLocation(); + this._location.row += this._location.inBuffer; + this._location.inBuffer = 0; + this._location.charOffset = 0; + } + this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : ""; + this._current = 0; + + this.writeHandler(); +}; + +Parser.prototype.getLocation = function(startTag) { + var c, + l = this._location, + end = this._current - (startTag ? 1 : 0), + chunk = startTag && l.charOffset === 0 && this._current === 0; + + for (; l.charOffset < end; l.charOffset++) { + c = this._buffer.charAt(l.charOffset); + if (c === '\n') { + l.inBuffer++; + l.col = 0; + } else if (c !== '\r') { + l.col++; + } + } + return { + line: l.row + l.inBuffer + 1 + , col: l.col + (chunk ? 0: 1) + }; +}; + +//Checks the handler to make it is an object with the right "interface" +Parser.prototype.validateHandler = function(handler) { + if ((typeof handler) !== "object") + throw new Error("Handler is not an object"); + if ((typeof handler.reset) !== "function") + throw new Error("Handler method 'reset' is invalid"); + if ((typeof handler.done) !== "function") + throw new Error("Handler method 'done' is invalid"); + if ((typeof handler.writeTag) !== "function") + throw new Error("Handler method 'writeTag' is invalid"); + if ((typeof handler.writeText) !== "function") + throw new Error("Handler method 'writeText' is invalid"); + if ((typeof handler.writeComment) !== "function") + throw new Error("Handler method 'writeComment' is invalid"); + if ((typeof handler.writeDirective) !== "function") + throw new Error("Handler method 'writeDirective' is invalid"); +}; + +//Writes parsed elements out to the handler +Parser.prototype.writeHandler = function(forceFlush) { + forceFlush = !!forceFlush; + if (this._tagStack.length && !forceFlush) + return; + while (this._elements.length) { + var element = this._elements.shift(); + switch (element.type) { + case ElementType.Comment: + this._handler.writeComment(element); + break; + case ElementType.Directive: + this._handler.writeDirective(element); + break; + case ElementType.Text: + this._handler.writeText(element); + break; + default: + this._handler.writeTag(element); + break; + } + } +}; + +Parser.prototype.handleError = function(error) { + if ((typeof this._handler.error) === "function") + this._handler.error(error); + else throw error; +}; \ No newline at end of file diff --git a/lib/RssHandler.js b/lib/RssHandler.js new file mode 100644 index 0000000..caf7f83 --- /dev/null +++ b/lib/RssHandler.js @@ -0,0 +1,112 @@ +var DefaultHandler = require("./DefaultHandler.js"), + DomUtils = require("./DomUtils.js"); + +//TODO: make this a trully streamable handler +function RssHandler (callback) { + RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false }); +} + +function inherits (ctor, superCtor) { + var tempCtor = function(){}; + tempCtor.prototype = superCtor.prototype; + ctor.super_ = superCtor; + ctor.prototype = new tempCtor(); + ctor.prototype.constructor = ctor; +} + +inherits(RssHandler, DefaultHandler); + +RssHandler.prototype.done = function() { + var feed = { }; + var feedRoot; + + var found = DomUtils.getElementsByTagName(function (value) { return(value === "rss" || value === "feed"); }, this.dom, false); + if (found.length) { + feedRoot = found[0]; + } + if (feedRoot) { + if (feedRoot.name === "rss") { + feed.type = "rss"; + feedRoot = feedRoot.children[0]; // + feed.id = ""; + try { + feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.description = DomUtils.getElementsByTagName("description", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.updated = new Date(DomUtils.getElementsByTagName("lastBuildDate", feedRoot.children, false)[0].children[0].data); + } catch (ex) { } + try { + feed.author = DomUtils.getElementsByTagName("managingEditor", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + feed.items = []; + DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) { + var entry = {}; + try { + entry.id = DomUtils.getElementsByTagName("guid", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.description = DomUtils.getElementsByTagName("description", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.pubDate = new Date(DomUtils.getElementsByTagName("pubDate", item.children, false)[0].children[0].data); + } catch (ex) { } + feed.items.push(entry); + }); + } else { + feed.type = "atom"; + try { + feed.id = DomUtils.getElementsByTagName("id", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href; + } catch (ex) { } + try { + feed.description = DomUtils.getElementsByTagName("subtitle", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.updated = new Date(DomUtils.getElementsByTagName("updated", feedRoot.children, false)[0].children[0].data); + } catch (ex) { } + try { + feed.author = DomUtils.getElementsByTagName("email", feedRoot.children, true)[0].children[0].data; + } catch (ex) { } + feed.items = []; + DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) { + var entry = {}; + try { + entry.id = DomUtils.getElementsByTagName("id", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].attribs.href; + } catch (ex) { } + try { + entry.description = DomUtils.getElementsByTagName("summary", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.pubDate = new Date(DomUtils.getElementsByTagName("updated", item.children, false)[0].children[0].data); + } catch (ex) { } + feed.items.push(entry); + }); + } + + this.dom = feed; + } + RssHandler.super_.prototype.done.call(this); +}; \ No newline at end of file diff --git a/lib/htmlparser.js b/lib/htmlparser.js index c56928b..e14ae86 100644 --- a/lib/htmlparser.js +++ b/lib/htmlparser.js @@ -1,822 +1,5 @@ -/*********************************************** -Copyright 2010, 2011, Chris Winberry . All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to -deal in the Software without restriction, including without limitation the -rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -IN THE SOFTWARE. -***********************************************/ -/* v1.8.0s */ - -(function () { - -function runningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!runningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - else if (this.Tautologistics.NodeHtmlParser) - return; //NodeHtmlParser already defined! - this.Tautologistics.NodeHtmlParser = {}; - exports = this.Tautologistics.NodeHtmlParser; -} - -//Types of elements found in the DOM -var ElementType = { - Text: "text" //Plain text - , Directive: "directive" //Special tag - , Comment: "comment" //Special tag - , Script: "script" //Special tag - , Style: "style" //Special tag - , Tag: "tag" //Any tag that isn't special -} - -function Parser (handler, options) { - this._options = options ? options : { }; - if (this._options.includeLocation == undefined) { - this._options.includeLocation = false; //Do not track element position in document by default - } - - this.validateHandler(handler); - this._handler = handler; - this.reset(); -} - - //**"Static"**// - //Regular expressions used for cleaning up and parsing (stateless) - Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace - Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents - Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on - Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element - - //Regular expressions used for parsing (stateful) - Parser._reAttrib = //Find attributes in a tag - /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; - Parser._reTags = /[\<\>]/g; //Find tag markers - - //**Public**// - //Methods// - //Parses a complete HTML and pushes it to the handler - Parser.prototype.parseComplete = function Parser$parseComplete (data) { - this.reset(); - this.parseChunk(data); - this.done(); - } - - //Parses a piece of an HTML document - Parser.prototype.parseChunk = function Parser$parseChunk (data) { - if (this._done) - this.handleError(new Error("Attempted to parse chunk after parsing already done")); - this._buffer += data; //FIXME: this can be a bottleneck - this.parseTags(); - } - - //Tells the parser that the HTML being parsed is complete - Parser.prototype.done = function Parser$done () { - if (this._done) - return; - this._done = true; - - //Push any unparsed text into a final element in the element list - if (this._buffer.length) { - var rawData = this._buffer; - this._buffer = ""; - var element = { - raw: rawData - , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") - , type: this._parseState - }; - if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style) - element.name = this.parseTagName(element.data); - this.parseAttribs(element); - this._elements.push(element); - } - - this.writeHandler(); - this._handler.done(); - } - - //Resets the parser to a blank state, ready to parse a new HTML document - Parser.prototype.reset = function Parser$reset () { - this._buffer = ""; - this._done = false; - this._elements = []; - this._elementsCurrent = 0; - this._current = 0; - this._next = 0; - this._location = { - row: 0 - , col: 0 - , charOffset: 0 - , inBuffer: 0 - }; - this._parseState = ElementType.Text; - this._prevTagSep = ''; - this._tagStack = []; - this._handler.reset(); - } - - //**Private**// - //Properties// - Parser.prototype._options = null; //Parser options for how to behave - Parser.prototype._handler = null; //Handler for parsed elements - Parser.prototype._buffer = null; //Buffer of unparsed data - Parser.prototype._done = false; //Flag indicating whether parsing is done - Parser.prototype._elements = null; //Array of parsed elements - Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed - Parser.prototype._current = 0; //Position in data that has already been parsed - Parser.prototype._next = 0; //Position in data of the next tag marker (<>) - Parser.prototype._location = null; //Position tracking for elements in a stream - Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed - Parser.prototype._prevTagSep = ''; //Previous tag marker found - //Stack of element types previously encountered; keeps track of when - //parsing occurs inside a script/comment/style tag - Parser.prototype._tagStack = null; - - //Methods// - //Takes an array of elements and parses any found attributes - Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) { - var idxEnd = elements.length; - var idx = 0; - - while (idx < idxEnd) { - var element = elements[idx++]; - if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.style) - this.parseAttribs(element); - } - - return(elements); - } - - //Takes an element and adds an "attribs" property for any element attributes found - Parser.prototype.parseAttribs = function Parser$parseAttribs (element) { - //Only parse attributes for tags - if (element.type != ElementType.Script && element.type != ElementType.Style && element.type != ElementType.Tag) - return; - - var tagName = element.data.split(Parser._reWhitespace, 1)[0]; - var attribRaw = element.data.substring(tagName.length); - if (attribRaw.length < 1) - return; - - var match; - Parser._reAttrib.lastIndex = 0; - while (match = Parser._reAttrib.exec(attribRaw)) { - if (element.attribs == undefined) - element.attribs = {}; - - if (typeof match[1] == "string" && match[1].length) { - element.attribs[match[1]] = match[2]; - } else if (typeof match[3] == "string" && match[3].length) { - element.attribs[match[3].toString()] = match[4].toString(); - } else if (typeof match[5] == "string" && match[5].length) { - element.attribs[match[5]] = match[6]; - } else if (typeof match[7] == "string" && match[7].length) { - element.attribs[match[7]] = match[7]; - } - } - } - - //Extracts the base tag name from the data value of an element - Parser.prototype.parseTagName = function Parser$parseTagName (data) { - if (data == null || data == "") - return(""); - var match = Parser._reTagName.exec(data); - if (!match) - return(""); - return((match[1] ? "/" : "") + match[2]); - } - - //Parses through HTML text and returns an array of found elements - //I admit, this function is rather large but splitting up had an noticeable impact on speed - Parser.prototype.parseTags = function Parser$parseTags () { - var bufferEnd = this._buffer.length - 1; - while (Parser._reTags.test(this._buffer)) { - this._next = Parser._reTags.lastIndex - 1; - var tagSep = this._buffer.charAt(this._next); //The currently found tag marker - var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse - - //A new element to eventually be appended to the element list - var element = { - raw: rawData - , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") - , type: this._parseState - }; - - var elementName = this.parseTagName(element.data); - - //This section inspects the current tag stack and modifies the current - //element if we're actually parsing a special area (script/comment/style tag) - if (this._tagStack.length) { //We're parsing inside a script/comment/style tag - if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag - if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack - this._tagStack.pop(); - else { //Not a closing script tag - if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment - //All data from here to script close is now a text element - element.type = ElementType.Text; - //If the previous element is text, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) { - var prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; - element.raw = element.data = ""; //This causes the current element to not be added to the element list - } - } - } - } - else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag - if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack - this._tagStack.pop(); - else { - if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment - //All data from here to style close is now a text element - element.type = ElementType.Text; - //If the previous element is text, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) { - var prevElement = this._elements[this._elements.length - 1]; - if (element.raw != "") { - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; - element.raw = element.data = ""; //This causes the current element to not be added to the element list - } else { //Element is empty, so just append the last tag marker found - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep; - } - } else { //The previous element was not text - if (element.raw != "") { - element.raw = element.data = element.raw; - } - } - } - } - } - else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag - var rawLen = element.raw.length; - if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") { - //Actually, we're no longer in a style tag, so pop it off the stack - this._tagStack.pop(); - //If the previous element is a comment, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) { - var prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, ""); - element.raw = element.data = ""; //This causes the current element to not be added to the element list - element.type = ElementType.Text; - } - else //Previous element not a comment - element.type = ElementType.Comment; //Change the current element's type to a comment - } - else { //Still in a comment tag - element.type = ElementType.Comment; - //If the previous element is a comment, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) { - var prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep; - element.raw = element.data = ""; //This causes the current element to not be added to the element list - element.type = ElementType.Text; - } - else - element.raw = element.data = element.raw + tagSep; - } - } - } - - //Processing of non-special tags - if (element.type == ElementType.Tag) { - element.name = elementName; - - if (element.raw.indexOf("!--") == 0) { //This tag is really comment - element.type = ElementType.Comment; - delete element["name"]; - var rawLen = element.raw.length; - //Check if the comment is terminated in the current element - if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">") - element.raw = element.data = element.raw.replace(Parser._reTrimComment, ""); - else { //It's not so push the comment onto the tag stack - element.raw += tagSep; - this._tagStack.push(ElementType.Comment); - } - } - else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) { - element.type = ElementType.Directive; - //TODO: what about CDATA? - } - else if (element.name == "script") { - element.type = ElementType.Script; - //Special tag, push onto the tag stack if not terminated - if (element.data.charAt(element.data.length - 1) != "/") - this._tagStack.push(ElementType.Script); - } - else if (element.name == "/script") - element.type = ElementType.Script; - else if (element.name == "style") { - element.type = ElementType.Style; - //Special tag, push onto the tag stack if not terminated - if (element.data.charAt(element.data.length - 1) != "/") - this._tagStack.push(ElementType.Style); - } - else if (element.name == "/style") - element.type = ElementType.Style; - if (element.name && element.name.charAt(0) == "/") - element.data = element.name; - } - - //Add all tags and non-empty text elements to the element list - if (element.raw != "" || element.type != ElementType.Text) { - if (this._options.includeLocation && !element.location) { - element.location = this.getLocation(element.type == ElementType.Tag); - } - this.parseAttribs(element); - this._elements.push(element); - //If tag self-terminates, add an explicit, separate closing tag - if ( - element.type != ElementType.Text - && - element.type != ElementType.Comment - && - element.type != ElementType.Directive - && - element.data.charAt(element.data.length - 1) == "/" - ) - this._elements.push({ - raw: "/" + element.name - , data: "/" + element.name - , name: "/" + element.name - , type: element.type - }); - } - this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text; - this._current = this._next + 1; - this._prevTagSep = tagSep; - } - - if (this._options.includeLocation) { - this.getLocation(); - this._location.row += this._location.inBuffer; - this._location.inBuffer = 0; - this._location.charOffset = 0; - } - this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : ""; - this._current = 0; - - this.writeHandler(); - } - - Parser.prototype.getLocation = function Parser$getLocation (startTag) { - var c, - l = this._location, - end = this._current - (startTag ? 1 : 0), - chunk = startTag && l.charOffset == 0 && this._current == 0; - - for (; l.charOffset < end; l.charOffset++) { - c = this._buffer.charAt(l.charOffset); - if (c == '\n') { - l.inBuffer++; - l.col = 0; - } else if (c != '\r') { - l.col++; - } - } - return { - line: l.row + l.inBuffer + 1 - , col: l.col + (chunk ? 0: 1) - }; - } - - //Checks the handler to make it is an object with the right "interface" - Parser.prototype.validateHandler = function Parser$validateHandler (handler) { - if ((typeof handler) != "object") - throw new Error("Handler is not an object"); - if ((typeof handler.reset) != "function") - throw new Error("Handler method 'reset' is invalid"); - if ((typeof handler.done) != "function") - throw new Error("Handler method 'done' is invalid"); - if ((typeof handler.writeTag) != "function") - throw new Error("Handler method 'writeTag' is invalid"); - if ((typeof handler.writeText) != "function") - throw new Error("Handler method 'writeText' is invalid"); - if ((typeof handler.writeComment) != "function") - throw new Error("Handler method 'writeComment' is invalid"); - if ((typeof handler.writeDirective) != "function") - throw new Error("Handler method 'writeDirective' is invalid"); - } - - //Writes parsed elements out to the handler - Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) { - forceFlush = !!forceFlush; - if (this._tagStack.length && !forceFlush) - return; - while (this._elements.length) { - var element = this._elements.shift(); - switch (element.type) { - case ElementType.Comment: - this._handler.writeComment(element); - break; - case ElementType.Directive: - this._handler.writeDirective(element); - break; - case ElementType.Text: - this._handler.writeText(element); - break; - default: - this._handler.writeTag(element); - break; - } - } - } - - Parser.prototype.handleError = function Parser$handleError (error) { - if ((typeof this._handler.error) == "function") - this._handler.error(error); - else - throw error; - } - -//TODO: make this a trully streamable handler -function RssHandler (callback) { - RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false }); -} -inherits(RssHandler, DefaultHandler); - - RssHandler.prototype.done = function RssHandler$done () { - var feed = { }; - var feedRoot; - - var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false); - if (found.length) { - feedRoot = found[0]; - } - if (feedRoot) { - if (feedRoot.name == "rss") { - feed.type = "rss"; - feedRoot = feedRoot.children[0]; // - feed.id = ""; - try { - feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.description = DomUtils.getElementsByTagName("description", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.updated = new Date(DomUtils.getElementsByTagName("lastBuildDate", feedRoot.children, false)[0].children[0].data); - } catch (ex) { } - try { - feed.author = DomUtils.getElementsByTagName("managingEditor", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - feed.items = []; - DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) { - var entry = {}; - try { - entry.id = DomUtils.getElementsByTagName("guid", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.description = DomUtils.getElementsByTagName("description", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.pubDate = new Date(DomUtils.getElementsByTagName("pubDate", item.children, false)[0].children[0].data); - } catch (ex) { } - feed.items.push(entry); - }); - } else { - feed.type = "atom"; - try { - feed.id = DomUtils.getElementsByTagName("id", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href; - } catch (ex) { } - try { - feed.description = DomUtils.getElementsByTagName("subtitle", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.updated = new Date(DomUtils.getElementsByTagName("updated", feedRoot.children, false)[0].children[0].data); - } catch (ex) { } - try { - feed.author = DomUtils.getElementsByTagName("email", feedRoot.children, true)[0].children[0].data; - } catch (ex) { } - feed.items = []; - DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) { - var entry = {}; - try { - entry.id = DomUtils.getElementsByTagName("id", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].attribs.href; - } catch (ex) { } - try { - entry.description = DomUtils.getElementsByTagName("summary", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.pubDate = new Date(DomUtils.getElementsByTagName("updated", item.children, false)[0].children[0].data); - } catch (ex) { } - feed.items.push(entry); - }); - } - - this.dom = feed; - } - RssHandler.super_.prototype.done.call(this); - } - -/////////////////////////////////////////////////// - -function DefaultHandler (callback, options) { - this.reset(); - this._options = options ? options : { }; - if (this._options.ignoreWhitespace == undefined) - this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes - if (this._options.verbose == undefined) - this._options.verbose = true; //Keep data property for tags and raw property for all - if (this._options.enforceEmptyTags == undefined) - this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec - if ((typeof callback) == "function") - this._callback = callback; -} - - //**"Static"**// - //HTML Tags that shouldn't contain child nodes - DefaultHandler._emptyTags = { - area: 1 - , base: 1 - , basefont: 1 - , br: 1 - , col: 1 - , frame: 1 - , hr: 1 - , img: 1 - , input: 1 - , isindex: 1 - , link: 1 - , meta: 1 - , param: 1 - , embed: 1 - } - //Regex to detect whitespace only text nodes - DefaultHandler.reWhitespace = /^\s*$/; - - //**Public**// - //Properties// - DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML - //Methods// - //Resets the handler back to starting state - DefaultHandler.prototype.reset = function DefaultHandler$reset() { - this.dom = []; - this._done = false; - this._tagStack = []; - this._tagStack.last = function DefaultHandler$_tagStack$last () { - return(this.length ? this[this.length - 1] : null); - } - } - //Signals the handler that parsing is done - DefaultHandler.prototype.done = function DefaultHandler$done () { - this._done = true; - this.handleCallback(null); - } - DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) { - this.handleElement(element); - } - DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) { - if (this._options.ignoreWhitespace) - if (DefaultHandler.reWhitespace.test(element.data)) - return; - this.handleElement(element); - } - DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) { - this.handleElement(element); - } - DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) { - this.handleElement(element); - } - DefaultHandler.prototype.error = function DefaultHandler$error (error) { - this.handleCallback(error); - } - - //**Private**// - //Properties// - DefaultHandler.prototype._options = null; //Handler options for how to behave - DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done - DefaultHandler.prototype._done = false; //Flag indicating whether handler has been notified of parsing completed - DefaultHandler.prototype._tagStack = null; //List of parents to the currently element being processed - //Methods// - DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) { - if ((typeof this._callback) != "function") - if (error) - throw error; - else - return; - this._callback(error, this.dom); - } - - DefaultHandler.prototype.isEmptyTag = function(element) { - var name = element.name.toLowerCase(); - if (name.charAt(0) == '/') { - name = name.substring(1); - } - return this._options.enforceEmptyTags && !!DefaultHandler._emptyTags[name]; - }; - - DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) { - if (this._done) - this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()")); - if (!this._options.verbose) { -// element.raw = null; //FIXME: Not clean - //FIXME: Serious performance problem using delete - delete element.raw; - if (element.type == "tag" || element.type == "script" || element.type == "style") - delete element.data; - } - if (!this._tagStack.last()) { //There are no parent elements - //If the element can be a container, add it to the tag stack and the top level list - if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) { - if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag - this.dom.push(element); - if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children - this._tagStack.push(element); - } - } - } - else //Otherwise just add to the top level list - this.dom.push(element); - } - else { //There are parent elements - //If the element can be a container, add it as a child of the element - //on top of the tag stack and then add it to the tag stack - if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) { - if (element.name.charAt(0) == "/") { - //This is a closing tag, scan the tagStack to find the matching opening tag - //and pop the stack up to the opening tag's parent - var baseName = element.name.substring(1); - if (!this.isEmptyTag(element)) { - var pos = this._tagStack.length - 1; - while (pos > -1 && this._tagStack[pos--].name != baseName) { } - if (pos > -1 || this._tagStack[0].name == baseName) - while (pos < this._tagStack.length - 1) - this._tagStack.pop(); - } - } - else { //This is not a closing tag - if (!this._tagStack.last().children) - this._tagStack.last().children = []; - this._tagStack.last().children.push(element); - if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children - this._tagStack.push(element); - } - } - else { //This is not a container element - if (!this._tagStack.last().children) - this._tagStack.last().children = []; - this._tagStack.last().children.push(element); - } - } - } - - var DomUtils = { - testElement: function DomUtils$testElement (options, element) { - if (!element) { - return false; - } - - for (var key in options) { - if (key == "tag_name") { - if (element.type != "tag" && element.type != "script" && element.type != "style") { - return false; - } - if (!options["tag_name"](element.name)) { - return false; - } - } else if (key == "tag_type") { - if (!options["tag_type"](element.type)) { - return false; - } - } else if (key == "tag_contains") { - if (element.type != "text" && element.type != "comment" && element.type != "directive") { - return false; - } - if (!options["tag_contains"](element.data)) { - return false; - } - } else { - if (!element.attribs || !options[key](element.attribs[key])) { - return false; - } - } - } - - return true; - } - - , getElements: function DomUtils$getElements (options, currentElement, recurse, limit) { - recurse = (recurse === undefined || recurse === null) || !!recurse; - limit = isNaN(parseInt(limit)) ? -1 : parseInt(limit); - - if (!currentElement) { - return([]); - } - - var found = []; - var elementList; - - function getTest (checkVal) { - return(function (value) { return(value == checkVal); }); - } - for (var key in options) { - if ((typeof options[key]) != "function") { - options[key] = getTest(options[key]); - } - } - - if (DomUtils.testElement(options, currentElement)) { - found.push(currentElement); - } - - if (limit >= 0 && found.length >= limit) { - return(found); - } - - if (recurse && currentElement.children) { - elementList = currentElement.children; - } else if (currentElement instanceof Array) { - elementList = currentElement; - } else { - return(found); - } - - for (var i = 0; i < elementList.length; i++) { - found = found.concat(DomUtils.getElements(options, elementList[i], recurse, limit)); - if (limit >= 0 && found.length >= limit) { - break; - } - } - - return(found); - } - - , getElementById: function DomUtils$getElementById (id, currentElement, recurse) { - var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1); - return(result.length ? result[0] : null); - } - - , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) { - return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit)); - } - - , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) { - return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit)); - } - } - - function inherits (ctor, superCtor) { - var tempCtor = function(){}; - tempCtor.prototype = superCtor.prototype; - ctor.super_ = superCtor; - ctor.prototype = new tempCtor(); - ctor.prototype.constructor = ctor; - } - -exports.Parser = Parser; - -exports.DefaultHandler = DefaultHandler; - -exports.RssHandler = RssHandler; - -exports.ElementType = ElementType; - -exports.DomUtils = DomUtils; - -})(); +exports.Parser = require("./Parser.js"); +exports.DefaultHandler = require("./DefaultHandler.js"); +exports.RssHandler = require("./RssHandler.js"); +exports.ElementType = require("./ElementType.js"); +exports.DomUtils = require("./DomUtils.js"); \ No newline at end of file diff --git a/lib/htmlparser.min.js b/lib/htmlparser.min.js deleted file mode 100644 index 2e09f29..0000000 --- a/lib/htmlparser.min.js +++ /dev/null @@ -1,22 +0,0 @@ -/*********************************************** -Copyright 2010, 2011, Chris Winberry . All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to -deal in the Software without restriction, including without limitation the -rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -IN THE SOFTWARE. -***********************************************/ -/* v1.8.0 */ -(function(){function e(a,c){this._options=c?c:{};if(this._options.includeLocation==undefined)this._options.includeLocation=false;this.validateHandler(a);this._handler=a;this.reset()}function n(a){n.super_.call(this,a,{ignoreWhitespace:true,verbose:false,enforceEmptyTags:false})}function i(a,c){this.reset();this._options=c?c:{};if(this._options.ignoreWhitespace==undefined)this._options.ignoreWhitespace=false;if(this._options.verbose==undefined)this._options.verbose=true;if(this._options.enforceEmptyTags== undefined)this._options.enforceEmptyTags=true;if(typeof a=="function")this._callback=a}if(!(typeof require=="function"&&typeof exports=="object"&&typeof module=="object"&&typeof __filename=="string"&&typeof __dirname=="string")){if(this.Tautologistics){if(this.Tautologistics.NodeHtmlParser)return}else this.Tautologistics={};this.Tautologistics.NodeHtmlParser={};exports=this.Tautologistics.NodeHtmlParser}var d={Text:"text",Directive:"directive",Comment:"comment",Script:"script",Style:"style",Tag:"tag"}; e._reTrim=/(^\s+|\s+$)/g;e._reTrimComment=/(^\!--|--$)/g;e._reWhitespace=/\s/g;e._reTagName=/^\s*(\/?)\s*([^\s\/]+)/;e._reAttrib=/([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;e._reTags=/[\<\>]/g;e.prototype.parseComplete=function(a){this.reset();this.parseChunk(a);this.done()};e.prototype.parseChunk=function(a){this._done&&this.handleError(Error("Attempted to parse chunk after parsing already done"));this._buffer+=a;this.parseTags()}; e.prototype.done=function(){if(!this._done){this._done=true;if(this._buffer.length){var a=this._buffer;this._buffer="";a={raw:a,data:this._parseState==d.Text?a:a.replace(e._reTrim,""),type:this._parseState};if(this._parseState==d.Tag||this._parseState==d.Script||this._parseState==d.Style)a.name=this.parseTagName(a.data);this.parseAttribs(a);this._elements.push(a)}this.writeHandler();this._handler.done()}};e.prototype.reset=function(){this._buffer="";this._done=false;this._elements=[];this._next=this._current= this._elementsCurrent=0;this._location={row:0,col:0,charOffset:0,inBuffer:0};this._parseState=d.Text;this._prevTagSep="";this._tagStack=[];this._handler.reset()};e.prototype._options=null;e.prototype._handler=null;e.prototype._buffer=null;e.prototype._done=false;e.prototype._elements=null;e.prototype._elementsCurrent=0;e.prototype._current=0;e.prototype._next=0;e.prototype._location=null;e.prototype._parseState=d.Text;e.prototype._prevTagSep="";e.prototype._tagStack=null;e.prototype.parseTagAttribs= function(a){for(var c=a.length,b=0;b"){this._tagStack.pop(); if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=(g.raw+b.raw).replace(e._reTrimComment,"");b.raw=b.data="";b.type=d.Text}else b.type=d.Comment}else{b.type=d.Comment;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+b.raw+c;b.raw=b.data="";b.type=d.Text}else b.raw=b.data=b.raw+c}}if(b.type==d.Tag){b.name=h;if(b.raw.indexOf("!--")== 0){b.type=d.Comment;delete b.name;g=b.raw.length;if(b.raw.charAt(g-1)=="-"&&b.raw.charAt(g-2)=="-"&&c==">")b.raw=b.data=b.raw.replace(e._reTrimComment,"");else{b.raw+=c;this._tagStack.push(d.Comment)}}else if(b.raw.indexOf("!")==0||b.raw.indexOf("?")==0)b.type=d.Directive;else if(b.name=="script"){b.type=d.Script;b.data.charAt(b.data.length-1)!="/"&&this._tagStack.push(d.Script)}else if(b.name=="/script")b.type=d.Script;else if(b.name=="style"){b.type=d.Style;b.data.charAt(b.data.length-1)!="/"&& this._tagStack.push(d.Style)}else if(b.name=="/style")b.type=d.Style;if(b.name&&b.name.charAt(0)=="/")b.data=b.name}if(b.raw!=""||b.type!=d.Text){if(this._options.includeLocation&&!b.location)b.location=this.getLocation(b.type==d.Tag);this.parseAttribs(b);this._elements.push(b);b.type!=d.Text&&b.type!=d.Comment&&b.type!=d.Directive&&b.data.charAt(b.data.length-1)=="/"&&this._elements.push({raw:"/"+b.name,data:"/"+b.name,name:"/"+b.name,type:b.type})}this._parseState=c=="<"?d.Tag:d.Text;this._current= this._next+1;this._prevTagSep=c}if(this._options.includeLocation){this.getLocation();this._location.row+=this._location.inBuffer;this._location.inBuffer=0;this._location.charOffset=0}this._buffer=this._current<=a?this._buffer.substring(this._current):"";this._current=0;this.writeHandler()};e.prototype.getLocation=function(a){for(var c=this._location,b=this._current-(a?1:0),h=a&&c.charOffset==0&&this._current==0;c.charOffset-1&&this._tagStack[a--].name!=c;);if(a>-1||this._tagStack[0].name==c)for(;a=0&&l.length>=h)return l;if(b&&c.children)c=c.children;else if(c instanceof Array)c=c;else return l; for(m=0;m=0&&l.length>=h)break}return l},getElementById:function(a,c,b){a=f.getElements({id:a},c,b,1);return a.length?a[0]:null},getElementsByTagName:function(a,c,b,h){return f.getElements({tag_name:a},c,b,h)},getElementsByTagType:function(a,c,b,h){return f.getElements({tag_type:a},c,b,h)}};exports.Parser=e;exports.DefaultHandler=i;exports.RssHandler=n;exports.ElementType=d;exports.DomUtils=f})(); \ No newline at end of file diff --git a/lib/node-htmlparser.js b/lib/node-htmlparser.js deleted file mode 100644 index 1fc03ea..0000000 --- a/lib/node-htmlparser.js +++ /dev/null @@ -1,6 +0,0 @@ -var htmlparser = require("./htmlparser"); -exports.Parser = htmlparser.Parser; -exports.DefaultHandler = htmlparser.DefaultHandler; -exports.RssHandler = htmlparser.RssHandler; -exports.ElementType = htmlparser.ElementType; -exports.DomUtils = htmlparser.DomUtils; diff --git a/lib/node-htmlparser.min.js b/lib/node-htmlparser.min.js deleted file mode 100644 index 27d5eea..0000000 --- a/lib/node-htmlparser.min.js +++ /dev/null @@ -1,6 +0,0 @@ -var htmlparser = require("./htmlparser.min"); -exports.Parser = htmlparser.Parser; -exports.DefaultHandler = htmlparser.DefaultHandler; -exports.RssHandler = htmlparser.RssHandler; -exports.ElementType = htmlparser.ElementType; -exports.DomUtils = htmlparser.DomUtils; diff --git a/profile.js b/profile.js deleted file mode 100644 index f9d0ef2..0000000 --- a/profile.js +++ /dev/null @@ -1,63 +0,0 @@ -//node --prof --prof_auto profile.js -//deps/v8/tools/mac-tick-processor v8.log -var sys = require("sys"); -var fs = require("fs"); -var http = require("http"); -var htmlparser = require("./lib/htmlparser"); -//var libxml = require('./libxmljs'); - -var testNHP = true; //Should node-htmlparser be exercised? -var testLXJS = false; //Should libxmljs be exercised? -var testIterations = 100; //Number of test loops to run - -var testHost = "localhost"; //Host to fetch test HTML from -var testPort = 80; //Port on host to fetch test HTML from -var testPath = "/~chris/feed.xml"; //Path on host to fetch HTML from - -function getMillisecs () { - return((new Date()).getTime()); -} - -function timeExecutions (loops, func) { - var start = getMillisecs(); - - while (loops--) - func(); - - return(getMillisecs() - start); -} - -var html = ""; -http.createClient(testPort, testHost) - .request("GET", testPath, { host: testHost }) - .addListener("response", function (response) { - if (response.statusCode == "200") { - response.setEncoding("utf8"); - response.addListener("data", function (chunk) { - html += chunk; - }).addListener("end", function() { - var timeNodeHtmlParser = !testNHP ? 0 : timeExecutions(testIterations, function () { - var handler = new htmlparser.DefaultHandler(function(err, dom) { - if (err) - sys.debug("Error: " + err); - }); - var parser = new htmlparser.Parser(handler, { includeLocation: true }); - parser.parseComplete(html); - }) - - var timeLibXmlJs = !testLXJS ? 0 : timeExecutions(testIterations, function () { - var dom = libxml.parseHtmlString(html); - }) - - if (testNHP) - sys.debug("NodeHtmlParser: " + timeNodeHtmlParser); - if (testLXJS) - sys.debug("LibXmlJs: " + timeLibXmlJs); - if (testNHP && testLXJS) - sys.debug("Difference: " + ((timeNodeHtmlParser - timeLibXmlJs) / timeLibXmlJs) * 100); - }); - } - else - sys.debug("Error: got response status " + response.statusCode); - }) - .end(); diff --git a/runtests.min.html b/runtests.min.html deleted file mode 100644 index 73ea4c7..0000000 --- a/runtests.min.html +++ /dev/null @@ -1,108 +0,0 @@ - - - - - Node.js HTML Parser - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/runtests.min.js b/runtests.min.js deleted file mode 100644 index df33736..0000000 --- a/runtests.min.js +++ /dev/null @@ -1,75 +0,0 @@ -/*********************************************** -Copyright 2010, Chris Winberry . All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to -deal in the Software without restriction, including without limitation the -rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -IN THE SOFTWARE. -***********************************************/ - -var sys = require("sys"); -var fs = require("fs"); -var htmlparser = require("./lib/htmlparser.min"); - -var testFolder = "./tests"; -var chunkSize = 5; - -var testFiles = fs.readdirSync(testFolder); -var testCount = 0; -var failedCount = 0; -for (var i in testFiles) { - testCount++; - var fileParts = testFiles[i].split("."); - fileParts.pop(); - var moduleName = fileParts.join("."); - var test = require(testFolder + "/" + moduleName); - var handlerCallback = function handlerCallback (error) { - if (error) - sys.puts("Handler error: " + error); - } - var handler = (test.type == "rss") ? - new htmlparser.RssHandler(handlerCallback, test.options.handler) - : - new htmlparser.DefaultHandler(handlerCallback, test.options.handler) - ; - var parser = new htmlparser.Parser(handler, test.options.parser); - parser.parseComplete(test.html); - var resultComplete = handler.dom; - var chunkPos = 0; - parser.reset(); - while (chunkPos < test.html.length) { - parser.parseChunk(test.html.substring(chunkPos, chunkPos + chunkSize)); - chunkPos += chunkSize; - } - parser.done(); - var resultChunk = handler.dom; - var testResult = - sys.inspect(resultComplete, false, null) === sys.inspect(test.expected, false, null) - && - sys.inspect(resultChunk, false, null) === sys.inspect(test.expected, false, null) - ; - sys.puts("[" + test.name + "\]: " + (testResult ? "passed" : "FAILED")); - if (!testResult) { - failedCount++; - sys.puts("== Complete =="); - sys.puts(sys.inspect(resultComplete, false, null)); - sys.puts("== Chunked =="); - sys.puts(sys.inspect(resultChunk, false, null)); - sys.puts("== Expected =="); - sys.puts(sys.inspect(test.expected, false, null)); - } -} -sys.puts("Total tests: " + testCount); -sys.puts("Failed tests: " + failedCount); diff --git a/snippet.js b/snippet.js deleted file mode 100644 index 9448ea3..0000000 --- a/snippet.js +++ /dev/null @@ -1,15 +0,0 @@ -//node --prof --prof_auto profile.js -//deps/v8/tools/mac-tick-processor v8.log -var sys = require("sys"); -var htmlparser = require("./htmlparser"); - -var html = "text"; - -var handler = new htmlparser.DefaultHandler(function(err, dom) { - if (err) - sys.debug("Error: " + err); - else - sys.debug(sys.inspect(dom, false, null)); -}, { enforceEmptyTags: true }); -var parser = new htmlparser.Parser(handler); -parser.parseComplete(html); diff --git a/utils_example.js b/utils_example.js deleted file mode 100644 index d219de5..0000000 --- a/utils_example.js +++ /dev/null @@ -1,35 +0,0 @@ -//node --prof --prof_auto profile.js -//deps/v8/tools/mac-tick-processor v8.log -var sys = require("sys"); -var htmlparser = require("./lib/htmlparser"); - -var html = "text atext btext ctext ehhhhellowworld"; - -var handler = new htmlparser.DefaultHandler(function(err, dom) { - if (err) { - sys.debug("Error: " + err); - } - else { - sys.debug(sys.inspect(dom, false, null)); - var id = htmlparser.DomUtils.getElementById("x", dom); - sys.debug("id: " + sys.inspect(id, false, null)); - var class = htmlparser.DomUtils.getElements({ class: "y" }, dom); - sys.debug("class: " + sys.inspect(class, false, null)); - var multiclass = htmlparser.DomUtils.getElements({ class: function (value) { return(value && value.indexOf("h") > -1); } }, dom); - sys.debug("multiclass: " + sys.inspect(multiclass, false, null)); - var name = htmlparser.DomUtils.getElementsByTagName("a", dom); - sys.debug("name: " + sys.inspect(name, false, null)); - var text = htmlparser.DomUtils.getElementsByTagType("text", dom); - sys.debug("text: " + sys.inspect(text, false, null)); - var nested = htmlparser.DomUtils.getElements({ tag_name: "d", id: "z", class: "w" }, dom); - nested = htmlparser.DomUtils.getElementsByTagName("e", nested); - nested = htmlparser.DomUtils.getElementsByTagType("text", nested); - sys.debug("nested: " + sys.inspect(nested, false, null)); - var double = htmlparser.DomUtils.getElementsByTagName("yy", dom); - sys.debug("double: " + sys.inspect(double, false, null)); - var single = htmlparser.DomUtils.getElements( { tag_name: "yy", id: "secondyy" }, dom); - sys.debug("single: " + sys.inspect(single, false, null)); - } -}, { verbose: false }); -var parser = new htmlparser.Parser(handler); -parser.parseComplete(html); From 01fb1badef8c2be003211903e8349c7e6404c478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 27 Aug 2011 15:10:14 +0200 Subject: [PATCH 002/450] didn't export constructors --- lib/DefaultHandler.js | 12 +++-- lib/DomUtils.js | 2 +- lib/ElementType.js | 2 +- lib/Parser.js | 4 +- lib/RssHandler.js | 4 +- runtests.html | 108 ------------------------------------------ runtests.js | 1 + 7 files changed, 16 insertions(+), 117 deletions(-) delete mode 100644 runtests.html diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 1536f6e..5b06fc2 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -18,7 +18,7 @@ function DefaultHandler (callback, options) { //**"Static"**// //HTML Tags that shouldn't contain child nodes DefaultHandler._emptyTags = { - area: 1 + area: 1 , base: 1 , basefont: 1 , br: 1 @@ -34,7 +34,7 @@ DefaultHandler._emptyTags = { , embed: 1 }; //Regex to detect whitespace only text nodes -DefaultHandler.reWhitespace = /^\s*$/; +var reWhitespace = /^\s*$/; //**Public**// //Methods// @@ -53,8 +53,8 @@ DefaultHandler.prototype.done = function() { this.handleCallback(null); }; DefaultHandler.prototype.writeText = function(element) { - if (this._options.ignoreWhitespace) - if (DefaultHandler.reWhitespace.test(element.data)) + if(this._options.ignoreWhitespace) + if(reWhitespace.test(element.data)) return; this.handleElement(element); }; @@ -132,4 +132,6 @@ DefaultHandler.prototype.handleElement = function(element) { this._tagStack.last().children.push(element); } } -}; \ No newline at end of file +}; + +exports = DefaultHandler; \ No newline at end of file diff --git a/lib/DomUtils.js b/lib/DomUtils.js index f930ba0..a7420f6 100644 --- a/lib/DomUtils.js +++ b/lib/DomUtils.js @@ -1,4 +1,4 @@ -var DomUtils = { +exports = { testElement: function(options, element) { if (!element) { return false; diff --git a/lib/ElementType.js b/lib/ElementType.js index 09d3d9f..96c89fa 100644 --- a/lib/ElementType.js +++ b/lib/ElementType.js @@ -1,5 +1,5 @@ //Types of elements found in the DOM -var ElementType = { +exports = { Text: "text" //Plain text , Directive: "directive" //Special tag , Comment: "comment" //Special tag diff --git a/lib/Parser.js b/lib/Parser.js index ecb7be3..2e818f1 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -394,4 +394,6 @@ Parser.prototype.handleError = function(error) { if ((typeof this._handler.error) === "function") this._handler.error(error); else throw error; -}; \ No newline at end of file +}; + +exports = Parser; \ No newline at end of file diff --git a/lib/RssHandler.js b/lib/RssHandler.js index caf7f83..259b52e 100644 --- a/lib/RssHandler.js +++ b/lib/RssHandler.js @@ -109,4 +109,6 @@ RssHandler.prototype.done = function() { this.dom = feed; } RssHandler.super_.prototype.done.call(this); -}; \ No newline at end of file +}; + +exports = RssHandler; \ No newline at end of file diff --git a/runtests.html b/runtests.html deleted file mode 100644 index e89702d..0000000 --- a/runtests.html +++ /dev/null @@ -1,108 +0,0 @@ - - - - - Node.js HTML Parser - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/runtests.js b/runtests.js index e906fe4..3ddcd9e 100644 --- a/runtests.js +++ b/runtests.js @@ -39,6 +39,7 @@ for (var i in testFiles) { if (error) sys.puts("Handler error: " + error); } + console.log(testFiles[i]); var handler = (test.type == "rss") ? new htmlparser.RssHandler(handlerCallback, test.options.handler) : From b92ecd7c09575f262a8d10be2d807a83e3750121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 27 Aug 2011 15:12:22 +0200 Subject: [PATCH 003/450] Now all tests pass --- lib/DefaultHandler.js | 2 +- lib/DomUtils.js | 4 +++- lib/ElementType.js | 2 +- lib/Parser.js | 2 +- lib/RssHandler.js | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 5b06fc2..3f79c70 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -134,4 +134,4 @@ DefaultHandler.prototype.handleElement = function(element) { } }; -exports = DefaultHandler; \ No newline at end of file +module.exports = DefaultHandler; \ No newline at end of file diff --git a/lib/DomUtils.js b/lib/DomUtils.js index a7420f6..f7a6f3e 100644 --- a/lib/DomUtils.js +++ b/lib/DomUtils.js @@ -1,4 +1,4 @@ -exports = { +var DomUtils = { testElement: function(options, element) { if (!element) { return false; @@ -92,3 +92,5 @@ exports = { return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit)); } }; + +module.exports = DomUtils; \ No newline at end of file diff --git a/lib/ElementType.js b/lib/ElementType.js index 96c89fa..c112c91 100644 --- a/lib/ElementType.js +++ b/lib/ElementType.js @@ -1,5 +1,5 @@ //Types of elements found in the DOM -exports = { +module.exports = { Text: "text" //Plain text , Directive: "directive" //Special tag , Comment: "comment" //Special tag diff --git a/lib/Parser.js b/lib/Parser.js index 2e818f1..b74bc2c 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -396,4 +396,4 @@ Parser.prototype.handleError = function(error) { else throw error; }; -exports = Parser; \ No newline at end of file +module.exports = Parser; \ No newline at end of file diff --git a/lib/RssHandler.js b/lib/RssHandler.js index 259b52e..e14ad58 100644 --- a/lib/RssHandler.js +++ b/lib/RssHandler.js @@ -111,4 +111,4 @@ RssHandler.prototype.done = function() { RssHandler.super_.prototype.done.call(this); }; -exports = RssHandler; \ No newline at end of file +module.exports = RssHandler; \ No newline at end of file From 31bcb1213f6b614f33e4e7961dcb2c0f88607a67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 27 Aug 2011 15:14:07 +0200 Subject: [PATCH 004/450] moved runtests.js to tests-directory --- runtests.js => tests/00-runtests.js | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename runtests.js => tests/00-runtests.js (100%) diff --git a/runtests.js b/tests/00-runtests.js similarity index 100% rename from runtests.js rename to tests/00-runtests.js From c11def8225e4985e2c45083c3e233b34dfc1c4e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 27 Aug 2011 15:29:46 +0200 Subject: [PATCH 005/450] fixed tests --- tests/00-runtests.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/00-runtests.js b/tests/00-runtests.js index 3ddcd9e..b879227 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -21,15 +21,15 @@ IN THE SOFTWARE. var sys = require("sys"); var fs = require("fs"); -var htmlparser = require("./lib/htmlparser"); +var htmlparser = require("../lib/htmlparser"); -var testFolder = "./tests"; +var testFolder = "."; var chunkSize = 5; var testFiles = fs.readdirSync(testFolder); var testCount = 0; var failedCount = 0; -for (var i in testFiles) { +for (var i = 1; i < testFiles.length; i++) { testCount++; var fileParts = testFiles[i].split("."); fileParts.pop(); From 2119bde08611a84cb8326b00ce43d4e5a198bbc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 27 Aug 2011 15:30:47 +0200 Subject: [PATCH 006/450] moved "last()"-method from _callStack to DefaultHandlers prototype --- lib/DefaultHandler.js | 55 ++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 3f79c70..8bfdfb0 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -15,23 +15,27 @@ function DefaultHandler (callback, options) { this._callback = callback; } -//**"Static"**// +DefaultHandler.prototype._lastTag = function() { + var stack = this._tagStack; + return(stack.length ? stack[stack.length - 1] : null); +}; + //HTML Tags that shouldn't contain child nodes -DefaultHandler._emptyTags = { - area: 1 - , base: 1 - , basefont: 1 - , br: 1 - , col: 1 - , frame: 1 - , hr: 1 - , img: 1 - , input: 1 - , isindex: 1 - , link: 1 - , meta: 1 - , param: 1 - , embed: 1 +var _emptyTags = { + area: true + , base: true + , basefont: true + , br: true + , col: true + , frame: true + , hr: true + , img: true + , input: true + , isindex: true + , link: true + , meta: true + , param: true + , embed: true }; //Regex to detect whitespace only text nodes var reWhitespace = /^\s*$/; @@ -43,9 +47,6 @@ DefaultHandler.prototype.reset = function() { this.dom = []; this._done = false; this._tagStack = []; - this._tagStack.last = function() { - return(this.length ? this[this.length - 1] : null); - }; }; //Signals the handler that parsing is done DefaultHandler.prototype.done = function() { @@ -75,7 +76,7 @@ DefaultHandler.prototype.isEmptyTag = function(element) { if (name.charAt(0) === '/') { name = name.substring(1); } - return this._options.enforceEmptyTags && !!DefaultHandler._emptyTags[name]; + return this._options.enforceEmptyTags && _emptyTags[name]; }; DefaultHandler.prototype.writeTag = DefaultHandler.prototype.writeDirective = DefaultHandler.prototype.writeComment = @@ -89,7 +90,7 @@ DefaultHandler.prototype.handleElement = function(element) { if (element.type === "tag" || element.type === "script" || element.type === "style") delete element.data; } - if (!this._tagStack.last()) { //There are no parent elements + if (!this._lastTag()) { //There are no parent elements //If the element can be a container, add it to the tag stack and the top level list if (element.type !== ElementType.Text && element.type !== ElementType.Comment && element.type !== ElementType.Directive) { if (element.name.charAt(0) !== "/") { //Ignore closing tags that obviously don't have an opening tag @@ -119,17 +120,17 @@ DefaultHandler.prototype.handleElement = function(element) { } } else { //This is not a closing tag - if (!this._tagStack.last().children) - this._tagStack.last().children = []; - this._tagStack.last().children.push(element); + if (!this._lastTag().children) + this._lastTag().children = []; + this._lastTag().children.push(element); if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children this._tagStack.push(element); } } else { //This is not a container element - if (!this._tagStack.last().children) - this._tagStack.last().children = []; - this._tagStack.last().children.push(element); + if (!this._lastTag().children) + this._lastTag().children = []; + this._lastTag().children.push(element); } } }; From 4a4110b431b2046c1d298f5973c2a2aa3ae642d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 27 Aug 2011 15:54:37 +0200 Subject: [PATCH 007/450] removed repeating code in RssHandler --- lib/RssHandler.js | 111 ++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 63 deletions(-) diff --git a/lib/RssHandler.js b/lib/RssHandler.js index e14ad58..7273c10 100644 --- a/lib/RssHandler.js +++ b/lib/RssHandler.js @@ -16,9 +16,15 @@ function inherits (ctor, superCtor) { inherits(RssHandler, DefaultHandler); +function fetch(what, where, recurse){ + try{ return DomUtils.getElementsByTagName(what, where, !!recurse)[0].children[0].data; + } catch(e){return false;} +} + RssHandler.prototype.done = function() { var feed = { }; var feedRoot; + var tmp; var found = DomUtils.getElementsByTagName(function (value) { return(value === "rss" || value === "feed"); }, this.dom, false); if (found.length) { @@ -29,79 +35,58 @@ RssHandler.prototype.done = function() { feed.type = "rss"; feedRoot = feedRoot.children[0]; // feed.id = ""; - try { - feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.description = DomUtils.getElementsByTagName("description", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.updated = new Date(DomUtils.getElementsByTagName("lastBuildDate", feedRoot.children, false)[0].children[0].data); - } catch (ex) { } - try { - feed.author = DomUtils.getElementsByTagName("managingEditor", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } + if(tmp = fetch("title", feedRoot.children)) + feed.title = tmp; + if(tmp = fetch("link", feedRoot.children)) + feed.link = tmp; + if(tmp = fetch("description", feedRoot.children)) + feed.description = tmp; + if(tmp = fetch("lastBuildDate", feedRoot.children)) + feed.updated = new Date(tmp); + if(tmp = fetch("managingEditor", feedRoot.children)) + feed.author = tmp; feed.items = []; DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) { var entry = {}; - try { - entry.id = DomUtils.getElementsByTagName("guid", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.description = DomUtils.getElementsByTagName("description", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.pubDate = new Date(DomUtils.getElementsByTagName("pubDate", item.children, false)[0].children[0].data); - } catch (ex) { } + if(tmp = fetch("guid", item.children)) + entry.id = tmp; + if(tmp = fetch("title", item.children)) + entry.title = tmp; + if(tmp = fetch("link", item.children)) + entry.link = tmp; + if(tmp = fetch("description", item.children)) + entry.description = tmp; + if(tmp = fetch("pubDate", item.children)) + entry.pubDate = new Date(tmp); feed.items.push(entry); }); } else { feed.type = "atom"; - try { - feed.id = DomUtils.getElementsByTagName("id", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href; - } catch (ex) { } - try { - feed.description = DomUtils.getElementsByTagName("subtitle", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.updated = new Date(DomUtils.getElementsByTagName("updated", feedRoot.children, false)[0].children[0].data); - } catch (ex) { } - try { - feed.author = DomUtils.getElementsByTagName("email", feedRoot.children, true)[0].children[0].data; - } catch (ex) { } + if(tmp = fetch("id", feedRoot.children)) + feed.id = tmp; + if(tmp = fetch("title", feedRoot.children)) + feed.title = tmp; + try{ feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href; + }catch (ex){} + if(tmp = fetch("subtitle", feedRoot.children)) + feed.description = tmp; + if(tmp = fetch("updated", feedRoot.children)) + feed.updated = new Date(tmp); + if(tmp = fetch("email", feedRoot.children, true)) + feed.author = tmp; feed.items = []; DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) { var entry = {}; - try { - entry.id = DomUtils.getElementsByTagName("id", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].attribs.href; - } catch (ex) { } - try { - entry.description = DomUtils.getElementsByTagName("summary", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.pubDate = new Date(DomUtils.getElementsByTagName("updated", item.children, false)[0].children[0].data); - } catch (ex) { } + if(tmp = fetch("id", item.children)) + entry.id = tmp; + if(tmp = fetch("title", item.children)) + entry.title = tmp; + try { entry.link = DomUtils.getElementsByTagName("link", item.children)[0].attribs.href; + } catch(ex){} + if(tmp = fetch("summary", item.children)) + entry.description = tmp; + if(tmp = fetch("updated", item.children)) + entry.pubDate = new Date(tmp); feed.items.push(entry); }); } From 91a6a86f12b33b9db867fe4891c6e99a70b81b64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 27 Aug 2011 16:02:51 +0200 Subject: [PATCH 008/450] again some cleanup --- lib/RssHandler.js | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/lib/RssHandler.js b/lib/RssHandler.js index 7273c10..00edbf0 100644 --- a/lib/RssHandler.js +++ b/lib/RssHandler.js @@ -16,9 +16,15 @@ function inherits (ctor, superCtor) { inherits(RssHandler, DefaultHandler); +function getElements(what, where, one, recurse){ + var ret = DomUtils.getElementsByTagName(what, where, !!recurse); + if(one) try{ return ret[0]; } catch(e){return false;} + else return ret; +} function fetch(what, where, recurse){ - try{ return DomUtils.getElementsByTagName(what, where, !!recurse)[0].children[0].data; - } catch(e){return false;} + var ret = getElements(what, where, true, !!recurse); + if(ret) try{ return ret.children[0].data; } catch(e){return false;} + else return false; } RssHandler.prototype.done = function() { @@ -26,7 +32,7 @@ RssHandler.prototype.done = function() { var feedRoot; var tmp; - var found = DomUtils.getElementsByTagName(function (value) { return(value === "rss" || value === "feed"); }, this.dom, false); + var found = getElements(function (value) { return(value === "rss" || value === "feed"); }, this.dom); if (found.length) { feedRoot = found[0]; } @@ -46,7 +52,7 @@ RssHandler.prototype.done = function() { if(tmp = fetch("managingEditor", feedRoot.children)) feed.author = tmp; feed.items = []; - DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) { + getElements("item", feedRoot.children).forEach(function (item, index, list) { var entry = {}; if(tmp = fetch("guid", item.children)) entry.id = tmp; @@ -66,7 +72,7 @@ RssHandler.prototype.done = function() { feed.id = tmp; if(tmp = fetch("title", feedRoot.children)) feed.title = tmp; - try{ feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href; + try{ feed.link = getElements("link", feedRoot.children, true).attribs.href; }catch (ex){} if(tmp = fetch("subtitle", feedRoot.children)) feed.description = tmp; @@ -75,13 +81,13 @@ RssHandler.prototype.done = function() { if(tmp = fetch("email", feedRoot.children, true)) feed.author = tmp; feed.items = []; - DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) { + getElements("entry", feedRoot.children).forEach(function (item, index, list) { var entry = {}; if(tmp = fetch("id", item.children)) entry.id = tmp; if(tmp = fetch("title", item.children)) entry.title = tmp; - try { entry.link = DomUtils.getElementsByTagName("link", item.children)[0].attribs.href; + try { entry.link = getElements("link", item.children, true).attribs.href; } catch(ex){} if(tmp = fetch("summary", item.children)) entry.description = tmp; From 9a8786055c08fe9118140aa03c5280591b8902cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 28 Aug 2011 10:49:26 +0200 Subject: [PATCH 009/450] Added EventedHandler, using an interface like sax.js --- lib/DefaultHandler.js | 4 +-- lib/EventedHandler.js | 79 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 3 deletions(-) create mode 100644 lib/EventedHandler.js diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 8bfdfb0..545e688 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -37,8 +37,6 @@ var _emptyTags = { , param: true , embed: true }; -//Regex to detect whitespace only text nodes -var reWhitespace = /^\s*$/; //**Public**// //Methods// @@ -55,7 +53,7 @@ DefaultHandler.prototype.done = function() { }; DefaultHandler.prototype.writeText = function(element) { if(this._options.ignoreWhitespace) - if(reWhitespace.test(element.data)) + if(element.data.trim() === "") return; this.handleElement(element); }; diff --git a/lib/EventedHandler.js b/lib/EventedHandler.js new file mode 100644 index 0000000..d233976 --- /dev/null +++ b/lib/EventedHandler.js @@ -0,0 +1,79 @@ +var EventedHandler = function(cbs){ + //map the handlers to their callbacks + this.writeComment = stripData(cbs.oncomment); + this.writeDirective = stripData(cbs.onprocessinginstruction); + this.writeText = stripData(cbs.ontext); + this.done = cbs.onend || emptyFunction; + + //if someone wants to listen to that + this.reset = cbs.onreset || emptyFunction; + this.error = cbs.onerror; //if nothing was set, the error is thrown + + //functions to be called within writeTag + this.onOpenTag = openTagCB(cbs.onopentag, cbs.onattribute); + this.onCloseTag = cbs.onclosetag || emptyFunction; + + //privates + this._stack = []; +}; + +var emptyFunction = function(){}; +var stripData = function(callback){ + if(typeof callback !== "function") return emptyFunction; + return function(data){ + callback(data.data); + }; +}; +var openTagCB = function(openTag, attribute){ + function open(name, attributes){ openTag({name:name, attributes:attributes}); } + function attr(name, attributes){ for(var i in attributes) attribute({name:i, value:attributes[i]}); } + if(openTag){ + if(attribute) return function(name, attributes){open(name,attributes); attr(attributes);}; + else return open; + } + else if(attribute) return attr; + else return emptyFunction; +}; + +//HTML Tags that shouldn't contain child nodes +var emptyTags = { + area: true + , base: true + , basefont: true + , br: true + , col: true + , frame: true + , hr: true + , img: true + , input: true + , isindex: true + , link: true + , meta: true + , param: true + , embed: true +}; + +EventedHandler.prototype.writeTag = function(element){ + var closing = element.name.charAt(0) === "/", + name = closing ? element.name.substring(1) : element.name, + attributes = element.attribs || {}, + empty = emptyTags[name]; + + if(closing){ + if(!empty){ + var i = this._stack.length - 1; + while(i !== -1 && this._stack[i--].name !== name){} + if( (i+=1) !== 0) + while(i < this._stack.length) this.onCloseTag(this._stack.pop().name); + } + else if(name === "br"){ //special case for
s + this.onOpenTag(name, attributes); + this.onCloseTag(name); + } + } + else{ + this.onOpenTag(name, attributes); + if(empty) this.onCloseTag(name); + else this._tagStack.push(element); + } +}; \ No newline at end of file From 38a3502a0cfdfb65ffeb68e0e8fc6fa0eba77e16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 28 Aug 2011 11:06:18 +0200 Subject: [PATCH 010/450] Some improvements inside the parser. Still very ugly. --- lib/Parser.js | 196 +++++++++++++++++++++++++------------------------- 1 file changed, 97 insertions(+), 99 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index b74bc2c..9375f6f 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -1,8 +1,8 @@ var ElementType = require("./ElementType.js"); -function Parser (handler, options) { +function Parser (handler, options){ this._options = options ? options : { }; - if (this._options.includeLocation === undefined) { + if(this._options.includeLocation === undefined){ this._options.includeLocation = false; //Do not track element position in document by default } @@ -28,49 +28,54 @@ function Parser (handler, options) { //**"Static"**// //Regular expressions used for cleaning up and parsing (stateless) -Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace -Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents -Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on -Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element +var _reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace +var _reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents +var _reWhitespace = /\s/g; //Used to find any whitespace to split on +var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element //Regular expressions used for parsing (stateful) -Parser._reAttrib = //Find attributes in a tag +var _reAttrib = //Find attributes in a tag /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; -Parser._reTags = /[<\>]/g; //Find tag markers +var _reTags = /[<\>]/g; //Find tag markers + +var tagTypes = {}; +tagTypes[ ElementType.Script ] = true; +tagTypes[ ElementType.Style ] = true; +tagTypes[ ElementType.Tag ] = true; //**Public**// //Methods// //Parses a complete HTML and pushes it to the handler -Parser.prototype.parseComplete = function(data) { +Parser.prototype.parseComplete = function(data){ this.reset(); this.parseChunk(data); this.done(); }; //Parses a piece of an HTML document -Parser.prototype.parseChunk = function(data) { - if (this._done) +Parser.prototype.parseChunk = function(data){ + if(this._done) this.handleError(new Error("Attempted to parse chunk after parsing already done")); this._buffer += data; //FIXME: this can be a bottleneck this.parseTags(); }; //Tells the parser that the HTML being parsed is complete -Parser.prototype.done = function() { - if (this._done) +Parser.prototype.done = function(){ + if(this._done) return; this._done = true; //Push any unparsed text into a final element in the element list - if (this._buffer.length) { + if(this._buffer.length){ var rawData = this._buffer; this._buffer = ""; var element = { raw: rawData - , data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") + , data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(_reTrim, "") , type: this._parseState }; - if (this._parseState === ElementType.Tag || this._parseState === ElementType.Script || this._parseState === ElementType.Style) + if(this._parseState === ElementType.Tag || this._parseState === ElementType.Script || this._parseState === ElementType.Style) element.name = this.parseTagName(element.data); this.parseAttribs(element); this._elements.push(element); @@ -81,7 +86,7 @@ Parser.prototype.done = function() { }; //Resets the parser to a blank state, ready to parse a new HTML document -Parser.prototype.reset = function() { +Parser.prototype.reset = function(){ this._buffer = ""; this._done = false; this._elements = []; @@ -103,71 +108,65 @@ Parser.prototype.reset = function() { //**Private**// //Methods// //Takes an array of elements and parses any found attributes -Parser.prototype.parseTagAttribs = function(elements) { - var idxEnd = elements.length; - var idx = 0; - - while (idx < idxEnd) { - var element = elements[idx++]; - if (element.type === ElementType.Tag || element.type === ElementType.Script || element.type === ElementType.style) +Parser.prototype.parseTagAttribs = function(elements){ + for(var i = 0, j = elements.length; i < j; i++){ + var element = elements[i]; + if(tagTypes[element.type]) this.parseAttribs(element); } - + return(elements); }; //Takes an element and adds an "attribs" property for any element attributes found -Parser.prototype.parseAttribs = function(element) { +Parser.prototype.parseAttribs = function(element){ //Only parse attributes for tags - if (element.type !== ElementType.Script && element.type !== ElementType.Style && element.type !== ElementType.Tag) - return; + if(!tagTypes[element.type]) return; - var tagName = element.data.split(Parser._reWhitespace, 1)[0]; + var tagName = element.data.split(_reWhitespace, 1)[0]; var attribRaw = element.data.substring(tagName.length); - if (attribRaw.length < 1) + if(attribRaw.length < 1) return; var match; - Parser._reAttrib.lastIndex = 0; - while (match = Parser._reAttrib.exec(attribRaw)) { - if (element.attribs === undefined) + _reAttrib.lastIndex = 0; + while (match = _reAttrib.exec(attribRaw)){ + if(element.attribs === undefined) element.attribs = {}; - if (typeof match[1] === "string" && match[1].length) { + if(typeof match[1] === "string" && match[1].length){ element.attribs[match[1]] = match[2]; - } else if (typeof match[3] === "string" && match[3].length) { + } else if(typeof match[3] === "string" && match[3].length){ element.attribs[match[3].toString()] = match[4].toString(); - } else if (typeof match[5] === "string" && match[5].length) { + } else if(typeof match[5] === "string" && match[5].length){ element.attribs[match[5]] = match[6]; - } else if (typeof match[7] === "string" && match[7].length) { + } else if(typeof match[7] === "string" && match[7].length){ element.attribs[match[7]] = match[7]; } } }; //Extracts the base tag name from the data value of an element -Parser.prototype.parseTagName = function(data) { - if (data === null || data === "") - return(""); - var match = Parser._reTagName.exec(data); - if (!match) - return(""); - return((match[1] ? "/" : "") + match[2]); +Parser.prototype.parseTagName = function(data){ + if(!data) return ""; + var match = _reTagName.exec(data); + if(!match) return ""; + return (match[1] ? "/" : "") + match[2]; }; //Parses through HTML text and returns an array of found elements //I admit, this function is rather large but splitting up had an noticeable impact on speed -Parser.prototype.parseTags = function() { +Parser.prototype.parseTags = function(){ var bufferEnd = this._buffer.length - 1; - while (Parser._reTags.test(this._buffer)) { - this._next = Parser._reTags.lastIndex - 1; + while (_reTags.test(this._buffer)){ + this._next = _reTags.lastIndex - 1; var tagSep = this._buffer.charAt(this._next); //The currently found tag marker var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse //A new element to eventually be appended to the element list var element = { raw: rawData - , data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") + , data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(_reTrim, "") , type: this._parseState }; @@ -175,16 +174,16 @@ Parser.prototype.parseTags = function() { //This section inspects the current tag stack and modifies the current //element if we're actually parsing a special area (script/comment/style tag) - if (this._tagStack.length) { //We're parsing inside a script/comment/style tag - if (this._tagStack[this._tagStack.length - 1] === ElementType.Script) { //We're currently in a script tag - if (elementName === "/script") //Actually, we're no longer in a script tag, so pop it off the stack + if(this._tagStack.length){ //We're parsing inside a script/comment/style tag + if(this._tagStack[this._tagStack.length - 1] === ElementType.Script){ //We're currently in a script tag + if(elementName === "/script") //Actually, we're no longer in a script tag, so pop it off the stack this._tagStack.pop(); else { //Not a closing script tag - if (element.raw.indexOf("!--") !== 0) { //Make sure we're not in a comment + if(element.raw.indexOf("!--") !== 0){ //Make sure we're not in a comment //All data from here to script close is now a text element element.type = ElementType.Text; //If the previous element is text, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text) { + if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text){ prevElement = this._elements[this._elements.length - 1]; prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; element.raw = element.data = ""; //This causes the current element to not be added to the element list @@ -192,39 +191,39 @@ Parser.prototype.parseTags = function() { } } } - else if (this._tagStack[this._tagStack.length - 1] === ElementType.Style) { //We're currently in a style tag - if (elementName === "/style") //Actually, we're no longer in a style tag, so pop it off the stack + else if(this._tagStack[this._tagStack.length - 1] === ElementType.Style){ //We're currently in a style tag + if(elementName === "/style") //Actually, we're no longer in a style tag, so pop it off the stack this._tagStack.pop(); else { - if (element.raw.indexOf("!--") !== 0) { //Make sure we're not in a comment + if(element.raw.indexOf("!--") !== 0){ //Make sure we're not in a comment //All data from here to style close is now a text element element.type = ElementType.Text; //If the previous element is text, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text) { + if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text){ prevElement = this._elements[this._elements.length - 1]; - if (element.raw !== "") { + if(element.raw !== ""){ prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; element.raw = element.data = ""; //This causes the current element to not be added to the element list } else { //Element is empty, so just append the last tag marker found prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep; } } else { //The previous element was not text - if (element.raw !== "") { + if(element.raw !== ""){ element.raw = element.data = element.raw; } } } } } - else if (this._tagStack[this._tagStack.length - 1] === ElementType.Comment) { //We're currently in a comment tag + else if(this._tagStack[this._tagStack.length - 1] === ElementType.Comment){ //We're currently in a comment tag rawLen = element.raw.length; - if (element.raw.charAt(rawLen - 2) === "-" && element.raw.charAt(rawLen - 1) === "-" && tagSep === ">") { + if(element.raw.charAt(rawLen - 2) === "-" && element.raw.charAt(rawLen - 1) === "-" && tagSep === ">"){ //Actually, we're no longer in a style tag, so pop it off the stack this._tagStack.pop(); //If the previous element is a comment, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Comment) { + if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Comment){ prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, ""); + prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(_reTrimComment, ""); element.raw = element.data = ""; //This causes the current element to not be added to the element list element.type = ElementType.Text; } @@ -234,7 +233,7 @@ Parser.prototype.parseTags = function() { else { //Still in a comment tag element.type = ElementType.Comment; //If the previous element is a comment, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Comment) { + if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Comment){ prevElement = this._elements[this._elements.length - 1]; prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep; element.raw = element.data = ""; //This causes the current element to not be added to the element list @@ -247,54 +246,54 @@ Parser.prototype.parseTags = function() { } //Processing of non-special tags - if (element.type === ElementType.Tag) { + if(element.type === ElementType.Tag){ element.name = elementName; - if (element.raw.indexOf("!--") === 0) { //This tag is really comment + if(element.raw.indexOf("!--") === 0){ //This tag is really comment element.type = ElementType.Comment; delete element.name; rawLen = element.raw.length; //Check if the comment is terminated in the current element - if (element.raw.charAt(rawLen - 1) === "-" && element.raw.charAt(rawLen - 2) === "-" && tagSep === ">") - element.raw = element.data = element.raw.replace(Parser._reTrimComment, ""); + if(element.raw.charAt(rawLen - 1) === "-" && element.raw.charAt(rawLen - 2) === "-" && tagSep === ">") + element.raw = element.data = element.raw.replace(_reTrimComment, ""); else { //It's not so push the comment onto the tag stack element.raw += tagSep; this._tagStack.push(ElementType.Comment); } } - else if (element.raw.indexOf("!") === 0 || element.raw.indexOf("?") === 0) { + else if(element.raw.indexOf("!") === 0 || element.raw.indexOf("?") === 0){ element.type = ElementType.Directive; //TODO: what about CDATA? } - else if (element.name === "script") { + else if(element.name === "script"){ element.type = ElementType.Script; //Special tag, push onto the tag stack if not terminated - if (element.data.charAt(element.data.length - 1) !== "/") + if(element.data.charAt(element.data.length - 1) !== "/") this._tagStack.push(ElementType.Script); } - else if (element.name === "/script") + else if(element.name === "/script") element.type = ElementType.Script; - else if (element.name === "style") { + else if(element.name === "style"){ element.type = ElementType.Style; //Special tag, push onto the tag stack if not terminated - if (element.data.charAt(element.data.length - 1) !== "/") + if(element.data.charAt(element.data.length - 1) !== "/") this._tagStack.push(ElementType.Style); } - else if (element.name === "/style") + else if(element.name === "/style") element.type = ElementType.Style; - if (element.name && element.name.charAt(0) === "/") + if(element.name && element.name.charAt(0) === "/") element.data = element.name; } //Add all tags and non-empty text elements to the element list - if (element.raw !== "" || element.type !== ElementType.Text) { - if (this._options.includeLocation && !element.location) { + if(element.raw !== "" || element.type !== ElementType.Text){ + if(this._options.includeLocation && !element.location){ element.location = this.getLocation(element.type === ElementType.Tag); } this.parseAttribs(element); this._elements.push(element); //If tag self-terminates, add an explicit, separate closing tag - if ( + if( element.type !== ElementType.Text && element.type !== ElementType.Comment @@ -315,7 +314,7 @@ Parser.prototype.parseTags = function() { this._prevTagSep = tagSep; } - if (this._options.includeLocation) { + if(this._options.includeLocation){ this.getLocation(); this._location.row += this._location.inBuffer; this._location.inBuffer = 0; @@ -327,18 +326,18 @@ Parser.prototype.parseTags = function() { this.writeHandler(); }; -Parser.prototype.getLocation = function(startTag) { +Parser.prototype.getLocation = function(startTag){ var c, l = this._location, end = this._current - (startTag ? 1 : 0), chunk = startTag && l.charOffset === 0 && this._current === 0; - for (; l.charOffset < end; l.charOffset++) { + for (; l.charOffset < end; l.charOffset++){ c = this._buffer.charAt(l.charOffset); - if (c === '\n') { + if(c === '\n'){ l.inBuffer++; l.col = 0; - } else if (c !== '\r') { + } else if(c !== '\r'){ l.col++; } } @@ -349,31 +348,30 @@ Parser.prototype.getLocation = function(startTag) { }; //Checks the handler to make it is an object with the right "interface" -Parser.prototype.validateHandler = function(handler) { - if ((typeof handler) !== "object") +Parser.prototype.validateHandler = function(handler){ + if((typeof handler) !== "object") throw new Error("Handler is not an object"); - if ((typeof handler.reset) !== "function") + if((typeof handler.reset) !== "function") throw new Error("Handler method 'reset' is invalid"); - if ((typeof handler.done) !== "function") + if((typeof handler.done) !== "function") throw new Error("Handler method 'done' is invalid"); - if ((typeof handler.writeTag) !== "function") + if((typeof handler.writeTag) !== "function") throw new Error("Handler method 'writeTag' is invalid"); - if ((typeof handler.writeText) !== "function") + if((typeof handler.writeText) !== "function") throw new Error("Handler method 'writeText' is invalid"); - if ((typeof handler.writeComment) !== "function") + if((typeof handler.writeComment) !== "function") throw new Error("Handler method 'writeComment' is invalid"); - if ((typeof handler.writeDirective) !== "function") + if((typeof handler.writeDirective) !== "function") throw new Error("Handler method 'writeDirective' is invalid"); }; //Writes parsed elements out to the handler -Parser.prototype.writeHandler = function(forceFlush) { - forceFlush = !!forceFlush; - if (this._tagStack.length && !forceFlush) +Parser.prototype.writeHandler = function(forceFlush){ + if(this._tagStack.length && !forceFlush) return; - while (this._elements.length) { + while (this._elements.length){ var element = this._elements.shift(); - switch (element.type) { + switch (element.type){ case ElementType.Comment: this._handler.writeComment(element); break; @@ -390,8 +388,8 @@ Parser.prototype.writeHandler = function(forceFlush) { } }; -Parser.prototype.handleError = function(error) { - if ((typeof this._handler.error) === "function") +Parser.prototype.handleError = function(error){ + if((typeof this._handler.error) === "function") this._handler.error(error); else throw error; }; From a94fedf8c70198aefd9328226b5ec30dff263043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 28 Aug 2011 11:10:07 +0200 Subject: [PATCH 011/450] Export the evented handler --- lib/EventedHandler.js | 4 +++- lib/htmlparser.js | 13 ++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/EventedHandler.js b/lib/EventedHandler.js index d233976..f4cd91e 100644 --- a/lib/EventedHandler.js +++ b/lib/EventedHandler.js @@ -76,4 +76,6 @@ EventedHandler.prototype.writeTag = function(element){ if(empty) this.onCloseTag(name); else this._tagStack.push(element); } -}; \ No newline at end of file +}; + +module.exports = EventedHandler; \ No newline at end of file diff --git a/lib/htmlparser.js b/lib/htmlparser.js index e14ae86..72457cd 100644 --- a/lib/htmlparser.js +++ b/lib/htmlparser.js @@ -1,5 +1,8 @@ -exports.Parser = require("./Parser.js"); -exports.DefaultHandler = require("./DefaultHandler.js"); -exports.RssHandler = require("./RssHandler.js"); -exports.ElementType = require("./ElementType.js"); -exports.DomUtils = require("./DomUtils.js"); \ No newline at end of file +module.exports = { + Parser: require("./Parser.js"), + DefaultHandler: require("./DefaultHandler.js"), + RssHandler: require("./RssHandler.js"), + ElementType: require("./ElementType.js"), + DomUtils: require("./DomUtils.js"), + EventedHandler: require("./EventedHandler.js") +} \ No newline at end of file From 2b57694b591e9933d8260b6b01fcb176648b64c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 28 Aug 2011 11:34:14 +0200 Subject: [PATCH 012/450] Fixed a bug & restructured some code --- lib/EventedHandler.js | 19 ++----------------- lib/htmlparser.js | 4 ++-- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/lib/EventedHandler.js b/lib/EventedHandler.js index f4cd91e..eb5aa9d 100644 --- a/lib/EventedHandler.js +++ b/lib/EventedHandler.js @@ -36,22 +36,7 @@ var openTagCB = function(openTag, attribute){ }; //HTML Tags that shouldn't contain child nodes -var emptyTags = { - area: true - , base: true - , basefont: true - , br: true - , col: true - , frame: true - , hr: true - , img: true - , input: true - , isindex: true - , link: true - , meta: true - , param: true - , embed: true -}; +var emptyTags={area:true,base:true,basefont:true,br:true,col:true,frame:true,hr:true,img:true,input:true,isindex:true,link:true,meta:true,param:true,embed:true}; EventedHandler.prototype.writeTag = function(element){ var closing = element.name.charAt(0) === "/", @@ -74,7 +59,7 @@ EventedHandler.prototype.writeTag = function(element){ else{ this.onOpenTag(name, attributes); if(empty) this.onCloseTag(name); - else this._tagStack.push(element); + else this._stack.push(element); } }; diff --git a/lib/htmlparser.js b/lib/htmlparser.js index 72457cd..9aa5bde 100644 --- a/lib/htmlparser.js +++ b/lib/htmlparser.js @@ -2,7 +2,7 @@ module.exports = { Parser: require("./Parser.js"), DefaultHandler: require("./DefaultHandler.js"), RssHandler: require("./RssHandler.js"), + EventedHandler: require("./EventedHandler.js"), ElementType: require("./ElementType.js"), - DomUtils: require("./DomUtils.js"), - EventedHandler: require("./EventedHandler.js") + DomUtils: require("./DomUtils.js") } \ No newline at end of file From 1883157fd81e8dac9dc2ce6ffcbbc21333f01412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 28 Aug 2011 12:04:02 +0200 Subject: [PATCH 013/450] Wrong number of arguments was passed in EventedHandler --- lib/EventedHandler.js | 2 +- lib/Parser.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/EventedHandler.js b/lib/EventedHandler.js index eb5aa9d..1787e43 100644 --- a/lib/EventedHandler.js +++ b/lib/EventedHandler.js @@ -28,7 +28,7 @@ var openTagCB = function(openTag, attribute){ function open(name, attributes){ openTag({name:name, attributes:attributes}); } function attr(name, attributes){ for(var i in attributes) attribute({name:i, value:attributes[i]}); } if(openTag){ - if(attribute) return function(name, attributes){open(name,attributes); attr(attributes);}; + if(attribute) return function(name, attributes){open(name,attributes); attr(null, attributes);}; else return open; } else if(attribute) return attr; diff --git a/lib/Parser.js b/lib/Parser.js index 9375f6f..c092b8c 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -75,7 +75,7 @@ Parser.prototype.done = function(){ , data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(_reTrim, "") , type: this._parseState }; - if(this._parseState === ElementType.Tag || this._parseState === ElementType.Script || this._parseState === ElementType.Style) + if(tagTypes[this._parseState]) element.name = this.parseTagName(element.data); this.parseAttribs(element); this._elements.push(element); From 19c01c3623e7f159af857dc198b16d7186ef3566 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 28 Aug 2011 12:13:45 +0200 Subject: [PATCH 014/450] Removed repeating code --- lib/Parser.js | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index c092b8c..fbbaabb 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -55,7 +55,7 @@ Parser.prototype.parseComplete = function(data){ //Parses a piece of an HTML document Parser.prototype.parseChunk = function(data){ if(this._done) - this.handleError(new Error("Attempted to parse chunk after parsing already done")); + this.handleError(Error("Attempted to parse chunk after parsing already done")); this._buffer += data; //FIXME: this can be a bottleneck this.parseTags(); }; @@ -349,20 +349,12 @@ Parser.prototype.getLocation = function(startTag){ //Checks the handler to make it is an object with the right "interface" Parser.prototype.validateHandler = function(handler){ - if((typeof handler) !== "object") - throw new Error("Handler is not an object"); - if((typeof handler.reset) !== "function") - throw new Error("Handler method 'reset' is invalid"); - if((typeof handler.done) !== "function") - throw new Error("Handler method 'done' is invalid"); - if((typeof handler.writeTag) !== "function") - throw new Error("Handler method 'writeTag' is invalid"); - if((typeof handler.writeText) !== "function") - throw new Error("Handler method 'writeText' is invalid"); - if((typeof handler.writeComment) !== "function") - throw new Error("Handler method 'writeComment' is invalid"); - if((typeof handler.writeDirective) !== "function") - throw new Error("Handler method 'writeDirective' is invalid"); + if(typeof handler !== "object") + throw Error("Handler is not an object"); + ["reset", "done", "writeTag", "writeText", "writeComment", "writeDirective"].forEach(function(name){ + if(typeof handler[name] !== "function") + throw Error("Handler method '" + name + "' is invalid"); + }); }; //Writes parsed elements out to the handler @@ -389,7 +381,7 @@ Parser.prototype.writeHandler = function(forceFlush){ }; Parser.prototype.handleError = function(error){ - if((typeof this._handler.error) === "function") + if(typeof this._handler.error === "function") this._handler.error(error); else throw error; }; From 262bbc6b2421505af2292d07ad1bd382ec649272 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 28 Aug 2011 19:37:33 +0200 Subject: [PATCH 015/450] renamed module, added it to npm (as "htmlparser2") --- package.json | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/package.json b/package.json index b395c90..82865a1 100644 --- a/package.json +++ b/package.json @@ -1,23 +1,22 @@ { - "name": "htmlparser" - , "description": "Forgiving HTML/XML/RSS Parser in JS for *both* Node and Browsers" - , "version": "1.7.3" - , "author": "Chris Winberry " - , "contributors": [] + "name": "htmlparser2" + , "description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface (EventedHandler)." + , "version": "1.0.0" + , "author": "Felix Boehm " + , "contributors": [ "Chris Winberry " ] , "repository": { "type": "git" - , "url": "git://github.com/tautologistics/node-htmlparser.git" + , "url": "git://github.com/fb55/node-htmlparser.git" } , "bugs": { - "mail": "chris@winberry.net" - , "web": "http://github.com/tautologistics/node-htmlparser/issues" + "mail": "me@feedic.com" + , "web": "http://github.com/fb55/node-htmlparser/issues" } - , "os": [ "linux", "darwin", "freebsd", "win32" ] , "directories": { "lib": "./lib/" } , "main": "./lib/htmlparser" - , "engines": { "node": ">=0.1.33" } + , "engines": { "node": ">0" } , "licenses": [{ "type": "MIT" , "url": "http://github.com/tautologistics/node-htmlparser/raw/master/LICENSE" }] -} +} \ No newline at end of file From 14ee72eb9eb61e1d9ee25e7fef14f39d7415b773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 21 Oct 2011 14:38:21 +0200 Subject: [PATCH 016/450] Replaced indexOf(a)!==0 with substring(a.length)!==a, charAt(length-1) with substr(-1) + many other improvements --- lib/Parser.js | 116 +++++++++++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 63 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index fbbaabb..1eb557f 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -28,9 +28,8 @@ function Parser (handler, options){ //**"Static"**// //Regular expressions used for cleaning up and parsing (stateless) -var _reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace var _reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents -var _reWhitespace = /\s/g; //Used to find any whitespace to split on +var _reWhitespace = /\s/; //Used to find any whitespace to split on var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element //Regular expressions used for parsing (stateful) @@ -62,8 +61,7 @@ Parser.prototype.parseChunk = function(data){ //Tells the parser that the HTML being parsed is complete Parser.prototype.done = function(){ - if(this._done) - return; + if(this._done) return; this._done = true; //Push any unparsed text into a final element in the element list @@ -72,7 +70,7 @@ Parser.prototype.done = function(){ this._buffer = ""; var element = { raw: rawData - , data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(_reTrim, "") + , data: (this._parseState === ElementType.Text) ? rawData : rawData.trim() , type: this._parseState }; if(tagTypes[this._parseState]) @@ -88,8 +86,8 @@ Parser.prototype.done = function(){ //Resets the parser to a blank state, ready to parse a new HTML document Parser.prototype.reset = function(){ this._buffer = ""; + this._prevTagSep = ""; this._done = false; - this._elements = []; this._elementsCurrent = 0; this._current = 0; this._next = 0; @@ -100,8 +98,8 @@ Parser.prototype.reset = function(){ , inBuffer: 0 }; this._parseState = ElementType.Text; - this._prevTagSep = ''; this._tagStack = []; + this._elements = []; this._handler.reset(); }; @@ -166,7 +164,7 @@ Parser.prototype.parseTags = function(){ //A new element to eventually be appended to the element list var element = { raw: rawData - , data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(_reTrim, "") + , data: (this._parseState === ElementType.Text) ? rawData : rawData.trim() , type: this._parseState }; @@ -175,49 +173,47 @@ Parser.prototype.parseTags = function(){ //This section inspects the current tag stack and modifies the current //element if we're actually parsing a special area (script/comment/style tag) if(this._tagStack.length){ //We're parsing inside a script/comment/style tag - if(this._tagStack[this._tagStack.length - 1] === ElementType.Script){ //We're currently in a script tag + var type = this._tagStack[this._tagStack.length - 1]; + if(type === ElementType.Script){ //We're currently in a script tag if(elementName === "/script") //Actually, we're no longer in a script tag, so pop it off the stack this._tagStack.pop(); else { //Not a closing script tag - if(element.raw.indexOf("!--") !== 0){ //Make sure we're not in a comment + if(rawData.substring(0, 3) !== "!--"){ //Make sure we're not in a comment //All data from here to script close is now a text element element.type = ElementType.Text; //If the previous element is text, append the current text to it if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text){ prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; + prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + rawData; element.raw = element.data = ""; //This causes the current element to not be added to the element list } } } } - else if(this._tagStack[this._tagStack.length - 1] === ElementType.Style){ //We're currently in a style tag + else if(type === ElementType.Style){ //We're currently in a style tag if(elementName === "/style") //Actually, we're no longer in a style tag, so pop it off the stack this._tagStack.pop(); else { - if(element.raw.indexOf("!--") !== 0){ //Make sure we're not in a comment + if(rawData.substring(0, 3) !== "!--"){ //Make sure we're not in a comment //All data from here to style close is now a text element element.type = ElementType.Text; //If the previous element is text, append the current text to it if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text){ prevElement = this._elements[this._elements.length - 1]; - if(element.raw !== ""){ - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; + if(rawData !== ""){ + prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + rawData; element.raw = element.data = ""; //This causes the current element to not be added to the element list } else { //Element is empty, so just append the last tag marker found prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep; } - } else { //The previous element was not text - if(element.raw !== ""){ - element.raw = element.data = element.raw; - } + } else {//The previous element was not text + if(rawData !== "") element.data = rawData; } } } } - else if(this._tagStack[this._tagStack.length - 1] === ElementType.Comment){ //We're currently in a comment tag - rawLen = element.raw.length; - if(element.raw.charAt(rawLen - 2) === "-" && element.raw.charAt(rawLen - 1) === "-" && tagSep === ">"){ + else if(type === ElementType.Comment){ //We're currently in a comment tag + if(rawData.substr(-2) === "--" && tagSep === ">"){ //Actually, we're no longer in a style tag, so pop it off the stack this._tagStack.pop(); //If the previous element is a comment, append the current text to it @@ -247,42 +243,40 @@ Parser.prototype.parseTags = function(){ //Processing of non-special tags if(element.type === ElementType.Tag){ - element.name = elementName; - - if(element.raw.indexOf("!--") === 0){ //This tag is really comment + if(element.raw.substring(0, 3) === "!--"){ //This tag is really comment element.type = ElementType.Comment; - delete element.name; rawLen = element.raw.length; //Check if the comment is terminated in the current element - if(element.raw.charAt(rawLen - 1) === "-" && element.raw.charAt(rawLen - 2) === "-" && tagSep === ">") + if(element.raw.substr(-2) === "--" && tagSep === ">") element.raw = element.data = element.raw.replace(_reTrimComment, ""); else { //It's not so push the comment onto the tag stack element.raw += tagSep; this._tagStack.push(ElementType.Comment); } } - else if(element.raw.indexOf("!") === 0 || element.raw.indexOf("?") === 0){ - element.type = ElementType.Directive; - //TODO: what about CDATA? - } - else if(element.name === "script"){ - element.type = ElementType.Script; - //Special tag, push onto the tag stack if not terminated - if(element.data.charAt(element.data.length - 1) !== "/") - this._tagStack.push(ElementType.Script); - } - else if(element.name === "/script") - element.type = ElementType.Script; - else if(element.name === "style"){ - element.type = ElementType.Style; - //Special tag, push onto the tag stack if not terminated - if(element.data.charAt(element.data.length - 1) !== "/") - this._tagStack.push(ElementType.Style); + else { + element.name = elementName; + + if(element.raw[0] === "!" || element.raw[0] === "?"){ + element.type = ElementType.Directive; + //TODO: what about CDATA? + } + else if(elementName[0] === "/"){ + element.data = element.name; + if(elementName === "/script") element.type = ElementType.Script; + else if(elementName === "/style") element.type = ElementType.Style; + } + else if(elementName === "script"){ + element.type = ElementType.Script; + //Special tag, push onto the tag stack if not terminated + if(element.data.substr(-1) !== "/") this._tagStack.push(ElementType.Script); + } + else if(elementName === "style"){ + element.type = ElementType.Style; + //Special tag, push onto the tag stack if not terminated + if(element.data.substr(-1) !== "/") this._tagStack.push(ElementType.Style); + } } - else if(element.name === "/style") - element.type = ElementType.Style; - if(element.name && element.name.charAt(0) === "/") - element.data = element.name; } //Add all tags and non-empty text elements to the element list @@ -300,7 +294,7 @@ Parser.prototype.parseTags = function(){ && element.type !== ElementType.Directive && - element.data.charAt(element.data.length - 1) === "/" + element.data.substr(-1) === "/" ) this._elements.push({ raw: "/" + element.name @@ -329,17 +323,18 @@ Parser.prototype.parseTags = function(){ Parser.prototype.getLocation = function(startTag){ var c, l = this._location, - end = this._current - (startTag ? 1 : 0), - chunk = startTag && l.charOffset === 0 && this._current === 0; + end = this._current, + chunk = startTag && l.charOffset === 0 && end === 0; + + if(startTag) end--; for (; l.charOffset < end; l.charOffset++){ - c = this._buffer.charAt(l.charOffset); + c = this._buffer[l.charOffset]; if(c === '\n'){ l.inBuffer++; l.col = 0; - } else if(c !== '\r'){ + } else if(c !== '\r') l.col++; - } } return { line: l.row + l.inBuffer + 1 @@ -364,18 +359,13 @@ Parser.prototype.writeHandler = function(forceFlush){ while (this._elements.length){ var element = this._elements.shift(); switch (element.type){ - case ElementType.Comment: - this._handler.writeComment(element); - break; - case ElementType.Directive: - this._handler.writeDirective(element); + case ElementType.Comment: this._handler.writeComment(element); break; - case ElementType.Text: - this._handler.writeText(element); + case ElementType.Directive: this._handler.writeDirective(element); break; - default: - this._handler.writeTag(element); + case ElementType.Text: this._handler.writeText(element); break; + default: this._handler.writeTag(element); } } }; From fd49686da4f9faa50b8d42e1c9267f110eb55f82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 21 Oct 2011 14:38:57 +0200 Subject: [PATCH 017/450] 1.1.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 82865a1..7129c9d 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "htmlparser2" , "description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface (EventedHandler)." - , "version": "1.0.0" + , "version": "1.1.0" , "author": "Felix Boehm " , "contributors": [ "Chris Winberry " ] , "repository": { From cf39b1c42c506fc2a0bf096ba2a68f9fa4697d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 21 Oct 2011 15:21:54 +0200 Subject: [PATCH 018/450] Restructured DomUtils --- lib/DomUtils.js | 86 ++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 54 deletions(-) diff --git a/lib/DomUtils.js b/lib/DomUtils.js index f7a6f3e..a6e0bdf 100644 --- a/lib/DomUtils.js +++ b/lib/DomUtils.js @@ -1,96 +1,74 @@ -var DomUtils = { - testElement: function(options, element) { - if (!element) { - return false; - } +module.exports = { + testElement: function(options, element) { + if (!element) return false; + + var type = element.type; for (var key in options) { if (key === "tag_name") { - if (element.type !== "tag" && element.type !== "script" && element.type !== "style") { - return false; - } - if (!options.tag_name(element.name)) { - return false; - } + if (type !== "tag" && type !== "script" && type !== "style") return false; + if (!options.tag_name(element.name)) return false; } else if (key === "tag_type") { - if (!options.tag_type(element.type)) { - return false; - } + if (!options.tag_type(type)) return false; } else if (key === "tag_contains") { - if (element.type !== "text" && element.type !== "comment" && element.type !== "directive") { - return false; - } - if (!options.tag_contains(element.data)) { - return false; - } - } else { - if (!element.attribs || !options[key](element.attribs[key])) { - return false; - } - } + if (type !== "text" && type !== "comment" && type !== "directive") return false; + if (!options.tag_contains(element.data)) return false; + } else if (!element.attribs || !options[key](element.attribs[key])) + return false; } return true; } , getElements: function(options, currentElement, recurse, limit) { + if (!currentElement) return []; + recurse = (recurse === undefined || recurse === null) || !!recurse; - limit = isNaN(parseInt(limit, 10)) ? -1 : parseInt(limit, 10); - if (!currentElement) { - return([]); - } + var parsed_limit = parseInt(limit, 10); + limit = isNaN(parsed_limit) ? -1 : parsed_limit; var found = []; var elementList; function getTest (checkVal) { - return(function (value) { return(value === checkVal); }); + return function (value) { return value === checkVal; }; } for (var key in options) { - if ((typeof options[key]) !== "function") { + if (typeof options[key] !== "function") { options[key] = getTest(options[key]); } } - if (DomUtils.testElement(options, currentElement)) { + if (this.testElement(options, currentElement)) { found.push(currentElement); } - if (limit >= 0 && found.length >= limit) { - return(found); - } + if (limit >= 0 && found.length >= limit) return found; - if (recurse && currentElement.children) { - elementList = currentElement.children; - } else if (currentElement instanceof Array) { - elementList = currentElement; - } else { - return(found); - } + if(recurse && currentElement.children) elementList = currentElement.children; + else if(Array.isArray(currentElement)) elementList = currentElement; + else return found; for (var i = 0; i < elementList.length; i++) { - found = found.concat(DomUtils.getElements(options, elementList[i], recurse, limit)); - if (limit >= 0 && found.length >= limit) { - break; - } + found = found.concat(this.getElements(options, elementList[i], recurse, limit)); + + if (limit >= 0 && found.length >= limit) break; } - return(found); + return found; } , getElementById: function(id, currentElement, recurse) { - var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1); - return(result.length ? result[0] : null); + var result = this.getElements({ id: id }, currentElement, recurse, 1); + return result.length ? result[0] : null; } , getElementsByTagName: function(name, currentElement, recurse, limit) { - return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit)); + return this.getElements({ tag_name: name }, currentElement, recurse, limit); } , getElementsByTagType: function(type, currentElement, recurse, limit) { - return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit)); + return this.getElements({ tag_type: type }, currentElement, recurse, limit); } -}; - -module.exports = DomUtils; \ No newline at end of file +}; \ No newline at end of file From 796fec61fedb32a35b9cc38d0c485803166a11e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 22 Oct 2011 19:57:42 +0200 Subject: [PATCH 019/450] Some small improvements --- lib/Parser.js | 131 +++++++++++++++++++++---------------------- tests/00-runtests.js | 13 +++-- 2 files changed, 73 insertions(+), 71 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 1eb557f..aafbaa5 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -14,7 +14,6 @@ function Parser (handler, options){ this._elements = []; this._elementsCurrent = 0; this._current = 0; - this._next = 0; this._location = { row: 0 , col: 0 @@ -35,7 +34,7 @@ var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an el //Regular expressions used for parsing (stateful) var _reAttrib = //Find attributes in a tag /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; -var _reTags = /[<\>]/g; //Find tag markers +var _reTags = /[<>]/g; //Find tag markers var tagTypes = {}; tagTypes[ ElementType.Script ] = true; @@ -69,17 +68,18 @@ Parser.prototype.done = function(){ var rawData = this._buffer; this._buffer = ""; var element = { - raw: rawData - , data: (this._parseState === ElementType.Text) ? rawData : rawData.trim() + raw: rawData + , data: this._parseState === ElementType.Text ? rawData : rawData.trim() , type: this._parseState }; - if(tagTypes[this._parseState]) + if(tagTypes[this._parseState]){ element.name = this.parseTagName(element.data); - this.parseAttribs(element); + this.parseAttribs(element); + } this._elements.push(element); } - this.writeHandler(); + this.writeHandler(true); this._handler.done(); }; @@ -90,7 +90,6 @@ Parser.prototype.reset = function(){ this._done = false; this._elementsCurrent = 0; this._current = 0; - this._next = 0; this._location = { row: 0 , col: 0 @@ -109,7 +108,6 @@ Parser.prototype.reset = function(){ Parser.prototype.parseTagAttribs = function(elements){ for(var i = 0, j = elements.length; i < j; i++){ var element = elements[i]; - if(tagTypes[element.type]) this.parseAttribs(element); } @@ -120,11 +118,11 @@ Parser.prototype.parseTagAttribs = function(elements){ Parser.prototype.parseAttribs = function(element){ //Only parse attributes for tags if(!tagTypes[element.type]) return; - - var tagName = element.data.split(_reWhitespace, 1)[0]; - var attribRaw = element.data.substring(tagName.length); - if(attribRaw.length < 1) - return; + + var pos = element.data.search(_reWhitespace); + if(pos === -1) return; + var attribRaw = element.data.substr(pos); + if(attribRaw === "") return; var match; _reAttrib.lastIndex = 0; @@ -132,13 +130,13 @@ Parser.prototype.parseAttribs = function(element){ if(element.attribs === undefined) element.attribs = {}; - if(typeof match[1] === "string" && match[1].length){ + if(match[1]){ element.attribs[match[1]] = match[2]; - } else if(typeof match[3] === "string" && match[3].length){ - element.attribs[match[3].toString()] = match[4].toString(); - } else if(typeof match[5] === "string" && match[5].length){ + } else if(match[3]){ + element.attribs[match[3]] = match[4]; + } else if(match[5]){ element.attribs[match[5]] = match[6]; - } else if(typeof match[7] === "string" && match[7].length){ + } else if(match[7]){ element.attribs[match[7]] = match[7]; } } @@ -147,44 +145,47 @@ Parser.prototype.parseAttribs = function(element){ //Extracts the base tag name from the data value of an element Parser.prototype.parseTagName = function(data){ if(!data) return ""; - var match = _reTagName.exec(data); - if(!match) return ""; - return (match[1] ? "/" : "") + match[2]; + var match = data.match(_reTagName); + if(match === null) return ""; + return match[1] + match[2]; }; //Parses through HTML text and returns an array of found elements //I admit, this function is rather large but splitting up had an noticeable impact on speed Parser.prototype.parseTags = function(){ - var bufferEnd = this._buffer.length - 1; - while (_reTags.test(this._buffer)){ - this._next = _reTags.lastIndex - 1; - var tagSep = this._buffer.charAt(this._next); //The currently found tag marker - var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse + var buffer = this._buffer, stack = this._tagStack; + + var next, tagSep, rawData, element, elementName, prevElement, rawLen; + + while (_reTags.test(buffer)){ + next = _reTags.lastIndex - 1; + tagSep = buffer.charAt(next); //The currently found tag marker + rawData = buffer.substring(this._current, next); //The next chunk of data to parse //A new element to eventually be appended to the element list - var element = { + element = { raw: rawData , data: (this._parseState === ElementType.Text) ? rawData : rawData.trim() , type: this._parseState }; - var elementName = this.parseTagName(element.data), prevElement, rawLen; + elementName = this.parseTagName(element.data); //This section inspects the current tag stack and modifies the current //element if we're actually parsing a special area (script/comment/style tag) - if(this._tagStack.length){ //We're parsing inside a script/comment/style tag - var type = this._tagStack[this._tagStack.length - 1]; + if(stack.length){ //We're parsing inside a script/comment/style tag + var type = stack[stack.length - 1]; if(type === ElementType.Script){ //We're currently in a script tag if(elementName === "/script") //Actually, we're no longer in a script tag, so pop it off the stack - this._tagStack.pop(); + stack.pop(); else { //Not a closing script tag if(rawData.substring(0, 3) !== "!--"){ //Make sure we're not in a comment //All data from here to script close is now a text element element.type = ElementType.Text; //If the previous element is text, append the current text to it - if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text){ - prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + rawData; + prevElement = this._elements && this._elements[this._elements.length - 1]; + if(prevElement && prevElement.type === ElementType.Text){ + prevElement.data = prevElement.raw += this._prevTagSep + rawData; element.raw = element.data = ""; //This causes the current element to not be added to the element list } } @@ -192,19 +193,19 @@ Parser.prototype.parseTags = function(){ } else if(type === ElementType.Style){ //We're currently in a style tag if(elementName === "/style") //Actually, we're no longer in a style tag, so pop it off the stack - this._tagStack.pop(); + stack.pop(); else { if(rawData.substring(0, 3) !== "!--"){ //Make sure we're not in a comment //All data from here to style close is now a text element element.type = ElementType.Text; //If the previous element is text, append the current text to it - if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Text){ - prevElement = this._elements[this._elements.length - 1]; + prevElement = this._elements && this._elements[this._elements.length - 1]; + if(prevElement && prevElement.type === ElementType.Text){ if(rawData !== ""){ - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + rawData; + prevElement.data = prevElement.raw += this._prevTagSep + rawData; element.raw = element.data = ""; //This causes the current element to not be added to the element list } else { //Element is empty, so just append the last tag marker found - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep; + prevElement.data = prevElement.raw += this._prevTagSep; } } else {//The previous element was not text if(rawData !== "") element.data = rawData; @@ -213,12 +214,14 @@ Parser.prototype.parseTags = function(){ } } else if(type === ElementType.Comment){ //We're currently in a comment tag + + prevElement = this._elements && this._elements[this._elements.length - 1]; + if(rawData.substr(-2) === "--" && tagSep === ">"){ //Actually, we're no longer in a style tag, so pop it off the stack - this._tagStack.pop(); + stack.pop(); //If the previous element is a comment, append the current text to it - if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Comment){ - prevElement = this._elements[this._elements.length - 1]; + if(prevElement && prevElement.type === ElementType.Comment){ prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(_reTrimComment, ""); element.raw = element.data = ""; //This causes the current element to not be added to the element list element.type = ElementType.Text; @@ -229,14 +232,13 @@ Parser.prototype.parseTags = function(){ else { //Still in a comment tag element.type = ElementType.Comment; //If the previous element is a comment, append the current text to it - if(this._elements.length && this._elements[this._elements.length - 1].type === ElementType.Comment){ - prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep; + if(prevElement && prevElement.type === ElementType.Comment){ + prevElement.data = prevElement.raw += element.raw + tagSep; element.raw = element.data = ""; //This causes the current element to not be added to the element list element.type = ElementType.Text; } else - element.raw = element.data = element.raw + tagSep; + element.data = element.raw += tagSep; } } } @@ -247,34 +249,34 @@ Parser.prototype.parseTags = function(){ element.type = ElementType.Comment; rawLen = element.raw.length; //Check if the comment is terminated in the current element - if(element.raw.substr(-2) === "--" && tagSep === ">") + if(tagSep === ">" && element.raw.substr(-2) === "--") element.raw = element.data = element.raw.replace(_reTrimComment, ""); else { //It's not so push the comment onto the tag stack element.raw += tagSep; - this._tagStack.push(ElementType.Comment); + stack.push(ElementType.Comment); } } else { element.name = elementName; - if(element.raw[0] === "!" || element.raw[0] === "?"){ + if(element.raw.charAt(0) === "!" || element.raw.charAt(0) === "?"){ element.type = ElementType.Directive; //TODO: what about CDATA? } - else if(elementName[0] === "/"){ - element.data = element.name; + else if(elementName.charAt(0) === "/"){ + element.data = elementName; if(elementName === "/script") element.type = ElementType.Script; else if(elementName === "/style") element.type = ElementType.Style; } else if(elementName === "script"){ element.type = ElementType.Script; //Special tag, push onto the tag stack if not terminated - if(element.data.substr(-1) !== "/") this._tagStack.push(ElementType.Script); + if(element.data.substr(-1) !== "/") stack.push(ElementType.Script); } else if(elementName === "style"){ element.type = ElementType.Style; //Special tag, push onto the tag stack if not terminated - if(element.data.substr(-1) !== "/") this._tagStack.push(ElementType.Style); + if(element.data.substr(-1) !== "/") stack.push(ElementType.Style); } } } @@ -287,24 +289,21 @@ Parser.prototype.parseTags = function(){ this.parseAttribs(element); this._elements.push(element); //If tag self-terminates, add an explicit, separate closing tag - if( - element.type !== ElementType.Text - && - element.type !== ElementType.Comment - && - element.type !== ElementType.Directive - && - element.data.substr(-1) === "/" - ) + if( element.data.substr(-1) === "/" + && element.type !== ElementType.Text + && element.type !== ElementType.Comment + && element.type !== ElementType.Directive + ){ this._elements.push({ - raw: "/" + element.name + raw: "/" + element.name , data: "/" + element.name , name: "/" + element.name , type: element.type }); + } } this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text; - this._current = this._next + 1; + this._current = next + 1; this._prevTagSep = tagSep; } @@ -314,7 +313,7 @@ Parser.prototype.parseTags = function(){ this._location.inBuffer = 0; this._location.charOffset = 0; } - this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : ""; + this._buffer = this._buffer.substring(this._current); this._current = 0; this.writeHandler(); diff --git a/tests/00-runtests.js b/tests/00-runtests.js index b879227..f20464c 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -29,18 +29,18 @@ var chunkSize = 5; var testFiles = fs.readdirSync(testFolder); var testCount = 0; var failedCount = 0; +var totalTime = 0; for (var i = 1; i < testFiles.length; i++) { testCount++; - var fileParts = testFiles[i].split("."); - fileParts.pop(); - var moduleName = fileParts.join("."); + var moduleName = testFiles[i]; var test = require(testFolder + "/" + moduleName); var handlerCallback = function handlerCallback (error) { if (error) sys.puts("Handler error: " + error); } console.log(testFiles[i]); - var handler = (test.type == "rss") ? + var start = Date.now(); + var handler = (test.type === "rss") ? new htmlparser.RssHandler(handlerCallback, test.options.handler) : new htmlparser.DefaultHandler(handlerCallback, test.options.handler) @@ -61,7 +61,9 @@ for (var i = 1; i < testFiles.length; i++) { && sys.inspect(resultChunk, false, null) === sys.inspect(test.expected, false, null) ; - sys.puts("[" + test.name + "\]: " + (testResult ? "passed" : "FAILED")); + var took = Date.now() - start; + totalTime += took; + sys.puts("[" + test.name + "\]: " + (testResult ? "passed" : "FAILED") + " (took: " + took + "ms)"); if (!testResult) { failedCount++; sys.puts("== Complete =="); @@ -74,3 +76,4 @@ for (var i = 1; i < testFiles.length; i++) { } sys.puts("Total tests: " + testCount); sys.puts("Failed tests: " + failedCount); +sys.puts("Total time: " + totalTime); From 499bfbd3956a21aa21236d261d1f0b9c8aca6065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 21:32:41 +0200 Subject: [PATCH 020/450] npm bugfix --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 7129c9d..e3950ab 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,7 @@ } , "bugs": { "mail": "me@feedic.com" - , "web": "http://github.com/fb55/node-htmlparser/issues" + , "url": "http://github.com/fb55/node-htmlparser/issues" } , "directories": { "lib": "./lib/" } , "main": "./lib/htmlparser" From 718b5ccc080cf4a6e94a0c15bedf868b6f61896f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 21:33:24 +0200 Subject: [PATCH 021/450] use util.inherits (function was taken from node anyway) --- lib/RssHandler.js | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/lib/RssHandler.js b/lib/RssHandler.js index 00edbf0..8481e9c 100644 --- a/lib/RssHandler.js +++ b/lib/RssHandler.js @@ -1,29 +1,24 @@ var DefaultHandler = require("./DefaultHandler.js"), - DomUtils = require("./DomUtils.js"); + DomUtils = require("./DomUtils.js"), + inherits = require("util").inherits; //TODO: make this a trully streamable handler function RssHandler (callback) { RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false }); } -function inherits (ctor, superCtor) { - var tempCtor = function(){}; - tempCtor.prototype = superCtor.prototype; - ctor.super_ = superCtor; - ctor.prototype = new tempCtor(); - ctor.prototype.constructor = ctor; -} - inherits(RssHandler, DefaultHandler); function getElements(what, where, one, recurse){ var ret = DomUtils.getElementsByTagName(what, where, !!recurse); - if(one) try{ return ret[0]; } catch(e){return false;} + if(one) + if(ret && ret.length > 0) return ret[0]; + else return false; else return ret; } function fetch(what, where, recurse){ var ret = getElements(what, where, true, !!recurse); - if(ret) try{ return ret.children[0].data; } catch(e){return false;} + if(ret && ret.children && ret.children.length > 0) return ret.children[0].data; else return false; } @@ -32,7 +27,7 @@ RssHandler.prototype.done = function() { var feedRoot; var tmp; - var found = getElements(function (value) { return(value === "rss" || value === "feed"); }, this.dom); + var found = getElements(function(value) { return value === "rss" || value === "feed"; }, this.dom); if (found.length) { feedRoot = found[0]; } From 4da4cfdf0de4a5805d8345686d62d4174c151430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 21:33:55 +0200 Subject: [PATCH 022/450] Code cleanup, preparation for big changes --- lib/Parser.js | 233 +++++++++++++++++++++----------------------------- 1 file changed, 99 insertions(+), 134 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index aafbaa5..26c01af 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -1,18 +1,17 @@ var ElementType = require("./ElementType.js"); function Parser (handler, options){ - this._options = options ? options : { }; - if(this._options.includeLocation === undefined){ - this._options.includeLocation = false; //Do not track element position in document by default - } + this._options = options ? options : { + includeLocation: false, //Do not track element position in document by default + xmlMode: false //Special behaviour for script/style tags by default + }; - this.validateHandler(handler); + validateHandler(handler); this._handler = handler; - + this._buffer = ""; this._done = false; this._elements = []; - this._elementsCurrent = 0; this._current = 0; this._location = { row: 0 @@ -73,8 +72,8 @@ Parser.prototype.done = function(){ , type: this._parseState }; if(tagTypes[this._parseState]){ - element.name = this.parseTagName(element.data); - this.parseAttribs(element); + element.name = parseTagName(element.data); + parseAttribs(element); } this._elements.push(element); } @@ -88,7 +87,6 @@ Parser.prototype.reset = function(){ this._buffer = ""; this._prevTagSep = ""; this._done = false; - this._elementsCurrent = 0; this._current = 0; this._location = { row: 0 @@ -103,47 +101,32 @@ Parser.prototype.reset = function(){ }; //**Private**// -//Methods// -//Takes an array of elements and parses any found attributes -Parser.prototype.parseTagAttribs = function(elements){ - for(var i = 0, j = elements.length; i < j; i++){ - var element = elements[i]; - this.parseAttribs(element); - } - - return(elements); -}; - -//Takes an element and adds an "attribs" property for any element attributes found -Parser.prototype.parseAttribs = function(element){ +//Takes an element and adds an "attribs" property for any element attributes found +var parseAttribs = function(element){ //Only parse attributes for tags if(!tagTypes[element.type]) return; - + var pos = element.data.search(_reWhitespace); if(pos === -1) return; var attribRaw = element.data.substr(pos); if(attribRaw === "") return; - var match; _reAttrib.lastIndex = 0; - while (match = _reAttrib.exec(attribRaw)){ - if(element.attribs === undefined) - element.attribs = {}; - - if(match[1]){ - element.attribs[match[1]] = match[2]; - } else if(match[3]){ - element.attribs[match[3]] = match[4]; - } else if(match[5]){ - element.attribs[match[5]] = match[6]; - } else if(match[7]){ - element.attribs[match[7]] = match[7]; + var match = _reAttrib.exec(attribRaw); + if(match){ + element.attribs = {}; + do{ + if(match[1]) element.attribs[match[1]] = match[2]; + else if(match[3]) element.attribs[match[3]] = match[4]; + else if(match[5]) element.attribs[match[5]] = match[6]; + else if(match[7]) element.attribs[match[7]] = match[7]; } + while(match = _reAttrib.exec(attribRaw)); } }; //Extracts the base tag name from the data value of an element -Parser.prototype.parseTagName = function(data){ +var parseTagName = function(data){ if(!data) return ""; var match = data.match(_reTagName); if(match === null) return ""; @@ -153,14 +136,16 @@ Parser.prototype.parseTagName = function(data){ //Parses through HTML text and returns an array of found elements //I admit, this function is rather large but splitting up had an noticeable impact on speed Parser.prototype.parseTags = function(){ - var buffer = this._buffer, stack = this._tagStack; - - var next, tagSep, rawData, element, elementName, prevElement, rawLen; - + var buffer = this._buffer, stack = this._tagStack, handler = this._handler; + + var next, type, tagSep, rawData, element, elementName, prevElement, elementType; + while (_reTags.test(buffer)){ next = _reTags.lastIndex - 1; tagSep = buffer.charAt(next); //The currently found tag marker rawData = buffer.substring(this._current, next); //The next chunk of data to parse + elementType = this._parseState; + type = stack.slice(-1)[0]; //A new element to eventually be appended to the element list element = { @@ -169,112 +154,88 @@ Parser.prototype.parseTags = function(){ , type: this._parseState }; - elementName = this.parseTagName(element.data); + if(this._parseState === ElementType.Tag) elementName = parseTagName(rawData); + else elementName = ""; + //This section inspects the current tag stack and modifies the current //element if we're actually parsing a special area (script/comment/style tag) - if(stack.length){ //We're parsing inside a script/comment/style tag - var type = stack[stack.length - 1]; - if(type === ElementType.Script){ //We're currently in a script tag - if(elementName === "/script") //Actually, we're no longer in a script tag, so pop it off the stack - stack.pop(); - else { //Not a closing script tag - if(rawData.substring(0, 3) !== "!--"){ //Make sure we're not in a comment - //All data from here to script close is now a text element - element.type = ElementType.Text; - //If the previous element is text, append the current text to it - prevElement = this._elements && this._elements[this._elements.length - 1]; - if(prevElement && prevElement.type === ElementType.Text){ - prevElement.data = prevElement.raw += this._prevTagSep + rawData; - element.raw = element.data = ""; //This causes the current element to not be added to the element list - } - } - } - } - else if(type === ElementType.Style){ //We're currently in a style tag - if(elementName === "/style") //Actually, we're no longer in a style tag, so pop it off the stack - stack.pop(); - else { - if(rawData.substring(0, 3) !== "!--"){ //Make sure we're not in a comment - //All data from here to style close is now a text element - element.type = ElementType.Text; - //If the previous element is text, append the current text to it - prevElement = this._elements && this._elements[this._elements.length - 1]; - if(prevElement && prevElement.type === ElementType.Text){ - if(rawData !== ""){ - prevElement.data = prevElement.raw += this._prevTagSep + rawData; - element.raw = element.data = ""; //This causes the current element to not be added to the element list - } else { //Element is empty, so just append the last tag marker found - prevElement.data = prevElement.raw += this._prevTagSep; - } - } else {//The previous element was not text - if(rawData !== "") element.data = rawData; - } - } - } - } - else if(type === ElementType.Comment){ //We're currently in a comment tag - - prevElement = this._elements && this._elements[this._elements.length - 1]; - - if(rawData.substr(-2) === "--" && tagSep === ">"){ - //Actually, we're no longer in a style tag, so pop it off the stack - stack.pop(); - //If the previous element is a comment, append the current text to it - if(prevElement && prevElement.type === ElementType.Comment){ - prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(_reTrimComment, ""); - element.raw = element.data = ""; //This causes the current element to not be added to the element list - element.type = ElementType.Text; - } - else //Previous element not a comment - element.type = ElementType.Comment; //Change the current element's type to a comment - } - else { //Still in a comment tag - element.type = ElementType.Comment; - //If the previous element is a comment, append the current text to it - if(prevElement && prevElement.type === ElementType.Comment){ - prevElement.data = prevElement.raw += element.raw + tagSep; - element.raw = element.data = ""; //This causes the current element to not be added to the element list - element.type = ElementType.Text; - } - else - element.data = element.raw += tagSep; - } + if(!type){ /* nothing */ } + else if(type === ElementType.Script && elementName === "/script") stack.pop(); + else if(type === ElementType.Style && elementName === "/style") stack.pop(); + else if(!this._options.xmlMode && (type === ElementType.Script || type === ElementType.Style)){ + //special behaviour for script & style tags + if(rawData.substring(0, 3) !== "!--"){ //Make sure we're not in a comment + //All data from here to style close is now a text element + elementType = ElementType.Text; + //If the previous element is text, append the current text to it + prevElement = this._elements && this._elements[this._elements.length - 1]; + if(prevElement && prevElement.type === ElementType.Text){ + prevElement.data = prevElement.raw += this._prevTagSep + rawData; + rawData = element.data = ""; //This causes the current element to not be added to the element list + } else element.data = rawData; //The previous element was not text } } + else if(type === ElementType.Comment){ //We're currently in a comment tag + + prevElement = this._elements && this._elements[this._elements.length - 1]; + + if(rawData.substr(-2) === "--" && tagSep === ">"){ + stack.pop(); + //If the previous element is a comment, append the current text to it + if(prevElement && prevElement.type === ElementType.Comment){ //Previous element was a comment + prevElement.raw = prevElement.data = (prevElement.raw + rawData).replace(_reTrimComment, ""); + rawData = element.data = ""; //This causes the current element to not be added to the element list + elementType = ElementType.Text; + } + else elementType = ElementType.Comment; //Change the current element's type to a comment + } + else { //Still in a comment tag + elementType = ElementType.Comment; + //If the previous element is a comment, append the current text to it + if(prevElement && prevElement.type === ElementType.Comment){ + prevElement.data = prevElement.raw += rawData + tagSep; + rawData = element.data = ""; //This causes the current element to not be added to the element list + elementType = ElementType.Text; + } + else + element.data = rawData += tagSep; + } + } + + //Processing of non-special tags - if(element.type === ElementType.Tag){ - if(element.raw.substring(0, 3) === "!--"){ //This tag is really comment - element.type = ElementType.Comment; - rawLen = element.raw.length; + if(elementType === ElementType.Tag){ + if(rawData.substring(0, 3) === "!--"){ //This tag is really comment + elementType = ElementType.Comment; //Check if the comment is terminated in the current element - if(tagSep === ">" && element.raw.substr(-2) === "--") - element.raw = element.data = element.raw.replace(_reTrimComment, ""); + if(tagSep === ">" && rawData.substr(-2) === "--") + rawData = element.data = rawData.replace(_reTrimComment, ""); else { //It's not so push the comment onto the tag stack - element.raw += tagSep; + rawData += tagSep; stack.push(ElementType.Comment); } } else { element.name = elementName; - - if(element.raw.charAt(0) === "!" || element.raw.charAt(0) === "?"){ - element.type = ElementType.Directive; + + if(rawData.charAt(0) === "!" || rawData.charAt(0) === "?"){ + elementType = ElementType.Directive; //TODO: what about CDATA? } else if(elementName.charAt(0) === "/"){ element.data = elementName; - if(elementName === "/script") element.type = ElementType.Script; - else if(elementName === "/style") element.type = ElementType.Style; + if(elementName === "/script") elementType = ElementType.Script; + else if(elementName === "/style") elementType = ElementType.Style; } else if(elementName === "script"){ - element.type = ElementType.Script; + elementType = ElementType.Script; //Special tag, push onto the tag stack if not terminated if(element.data.substr(-1) !== "/") stack.push(ElementType.Script); } else if(elementName === "style"){ - element.type = ElementType.Style; + elementType = ElementType.Style; //Special tag, push onto the tag stack if not terminated if(element.data.substr(-1) !== "/") stack.push(ElementType.Style); } @@ -282,23 +243,26 @@ Parser.prototype.parseTags = function(){ } //Add all tags and non-empty text elements to the element list - if(element.raw !== "" || element.type !== ElementType.Text){ + if(rawData !== "" || elementType !== ElementType.Text){ + element.raw = rawData; + element.type = elementType; + if(this._options.includeLocation && !element.location){ - element.location = this.getLocation(element.type === ElementType.Tag); + element.location = this.getLocation(elementType === ElementType.Tag); } - this.parseAttribs(element); + parseAttribs(element); this._elements.push(element); //If tag self-terminates, add an explicit, separate closing tag if( element.data.substr(-1) === "/" - && element.type !== ElementType.Text - && element.type !== ElementType.Comment - && element.type !== ElementType.Directive + && elementType !== ElementType.Text + && elementType !== ElementType.Comment + && elementType !== ElementType.Directive ){ this._elements.push({ raw: "/" + element.name , data: "/" + element.name , name: "/" + element.name - , type: element.type + , type: elementType }); } } @@ -315,6 +279,7 @@ Parser.prototype.parseTags = function(){ } this._buffer = this._buffer.substring(this._current); this._current = 0; + _reTags.lastIndex = 0; this.writeHandler(); }; @@ -324,9 +289,9 @@ Parser.prototype.getLocation = function(startTag){ l = this._location, end = this._current, chunk = startTag && l.charOffset === 0 && end === 0; - + if(startTag) end--; - + for (; l.charOffset < end; l.charOffset++){ c = this._buffer[l.charOffset]; if(c === '\n'){ @@ -342,7 +307,7 @@ Parser.prototype.getLocation = function(startTag){ }; //Checks the handler to make it is an object with the right "interface" -Parser.prototype.validateHandler = function(handler){ +var validateHandler = function(handler){ if(typeof handler !== "object") throw Error("Handler is not an object"); ["reset", "done", "writeTag", "writeText", "writeComment", "writeDirective"].forEach(function(name){ From db4f6383b56921ad1e87d74004e3c54428e05531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 21:48:13 +0200 Subject: [PATCH 023/450] Create element object later --- lib/Parser.js | 60 +++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 26c01af..5f70387 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -138,25 +138,25 @@ var parseTagName = function(data){ Parser.prototype.parseTags = function(){ var buffer = this._buffer, stack = this._tagStack, handler = this._handler; - var next, type, tagSep, rawData, element, elementName, prevElement, elementType; + var next, type, tagSep, rawData, element, elementName, prevElement, elementType, elementData, includeName; while (_reTags.test(buffer)){ next = _reTags.lastIndex - 1; tagSep = buffer.charAt(next); //The currently found tag marker rawData = buffer.substring(this._current, next); //The next chunk of data to parse + includeName = false; elementType = this._parseState; + + if(elementType === ElementType.Text){ + elementData = rawData; + elementName = ""; + } + else{ + elementData = rawData.trim(); + elementName = parseTagName(elementData); + } type = stack.slice(-1)[0]; - //A new element to eventually be appended to the element list - element = { - raw: rawData - , data: (this._parseState === ElementType.Text) ? rawData : rawData.trim() - , type: this._parseState - }; - - if(this._parseState === ElementType.Tag) elementName = parseTagName(rawData); - else elementName = ""; - //This section inspects the current tag stack and modifies the current //element if we're actually parsing a special area (script/comment/style tag) @@ -172,8 +172,8 @@ Parser.prototype.parseTags = function(){ prevElement = this._elements && this._elements[this._elements.length - 1]; if(prevElement && prevElement.type === ElementType.Text){ prevElement.data = prevElement.raw += this._prevTagSep + rawData; - rawData = element.data = ""; //This causes the current element to not be added to the element list - } else element.data = rawData; //The previous element was not text + rawData = elementData = ""; //This causes the current element to not be added to the element list + } else elementData = rawData; //The previous element was not text } } else if(type === ElementType.Comment){ //We're currently in a comment tag @@ -185,7 +185,7 @@ Parser.prototype.parseTags = function(){ //If the previous element is a comment, append the current text to it if(prevElement && prevElement.type === ElementType.Comment){ //Previous element was a comment prevElement.raw = prevElement.data = (prevElement.raw + rawData).replace(_reTrimComment, ""); - rawData = element.data = ""; //This causes the current element to not be added to the element list + rawData = elementData = ""; //This causes the current element to not be added to the element list elementType = ElementType.Text; } else elementType = ElementType.Comment; //Change the current element's type to a comment @@ -195,11 +195,11 @@ Parser.prototype.parseTags = function(){ //If the previous element is a comment, append the current text to it if(prevElement && prevElement.type === ElementType.Comment){ prevElement.data = prevElement.raw += rawData + tagSep; - rawData = element.data = ""; //This causes the current element to not be added to the element list + rawData = elementData = ""; //This causes the current element to not be added to the element list elementType = ElementType.Text; } else - element.data = rawData += tagSep; + elementData = rawData += tagSep; } } @@ -211,49 +211,53 @@ Parser.prototype.parseTags = function(){ elementType = ElementType.Comment; //Check if the comment is terminated in the current element if(tagSep === ">" && rawData.substr(-2) === "--") - rawData = element.data = rawData.replace(_reTrimComment, ""); + rawData = elementData = rawData.replace(_reTrimComment, ""); else { //It's not so push the comment onto the tag stack rawData += tagSep; stack.push(ElementType.Comment); } } else { - element.name = elementName; + includeName = true; if(rawData.charAt(0) === "!" || rawData.charAt(0) === "?"){ elementType = ElementType.Directive; //TODO: what about CDATA? } else if(elementName.charAt(0) === "/"){ - element.data = elementName; + elementData = elementName; if(elementName === "/script") elementType = ElementType.Script; else if(elementName === "/style") elementType = ElementType.Style; } else if(elementName === "script"){ elementType = ElementType.Script; //Special tag, push onto the tag stack if not terminated - if(element.data.substr(-1) !== "/") stack.push(ElementType.Script); + if(elementData.substr(-1) !== "/") stack.push(ElementType.Script); } else if(elementName === "style"){ elementType = ElementType.Style; //Special tag, push onto the tag stack if not terminated - if(element.data.substr(-1) !== "/") stack.push(ElementType.Style); + if(elementData.substr(-1) !== "/") stack.push(ElementType.Style); } } } //Add all tags and non-empty text elements to the element list if(rawData !== "" || elementType !== ElementType.Text){ - element.raw = rawData; - element.type = elementType; - - if(this._options.includeLocation && !element.location){ - element.location = this.getLocation(elementType === ElementType.Tag); - } + element = { + raw: rawData, + data: elementData, + type: elementType + }; + + if(includeName) element.name = elementName; + if(this._options.includeLocation) element.location = this.getLocation(elementType === ElementType.Tag); + parseAttribs(element); this._elements.push(element); + //If tag self-terminates, add an explicit, separate closing tag - if( element.data.substr(-1) === "/" + if( elementData.substr(-1) === "/" && elementType !== ElementType.Text && elementType !== ElementType.Comment && elementType !== ElementType.Directive From 7529828f7affe34591f5b3e55a56c2a42c0e41b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 22:05:22 +0200 Subject: [PATCH 024/450] Prepared big changes The element stack will be removed, a new event structure will be introduced --- lib/Parser.js | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 5f70387..b02a26f 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -255,17 +255,38 @@ Parser.prototype.parseTags = function(){ parseAttribs(element); this._elements.push(element); + + /* + switch(elementType){ + case ElementType.Text: + this._handler.ontext(element); + case ElementType.Tag: + case ElementType.Style: + case ElementType.Script: + if(elementName[0] === "/") this._handler.onclosetag(elementName.substr(1)); + else this._handler.onopentag(element); + break; + case ElementType.Comment: + this._handler.oncomment(element); + break; + case ElementType.Directive: + this._handler.onprocessinginstruction; + break; + default: throw Error("Unsupported type: " + elementType); + } + */ //If tag self-terminates, add an explicit, separate closing tag if( elementData.substr(-1) === "/" && elementType !== ElementType.Text && elementType !== ElementType.Comment && elementType !== ElementType.Directive - ){ + ){ + //this._handler.onclosetag(elementName); this._elements.push({ - raw: "/" + element.name - , data: "/" + element.name - , name: "/" + element.name + raw: "/" + elementName + , data: "/" + elementName + , name: "/" + elementName , type: elementType }); } From ce7636e9a4b8575e2eea9c094892d30d18e2862c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 22:06:06 +0200 Subject: [PATCH 025/450] Some steps preparing for the upcoming changes --- lib/EventedHandler.js | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/EventedHandler.js b/lib/EventedHandler.js index 1787e43..a64bf75 100644 --- a/lib/EventedHandler.js +++ b/lib/EventedHandler.js @@ -10,8 +10,8 @@ var EventedHandler = function(cbs){ this.error = cbs.onerror; //if nothing was set, the error is thrown //functions to be called within writeTag - this.onOpenTag = openTagCB(cbs.onopentag, cbs.onattribute); - this.onCloseTag = cbs.onclosetag || emptyFunction; + this.onopentag = openTagCB(cbs.onopentag, cbs.onattribute); + this.onclosetag = cbs.onclosetag || emptyFunction; //privates this._stack = []; @@ -49,16 +49,16 @@ EventedHandler.prototype.writeTag = function(element){ var i = this._stack.length - 1; while(i !== -1 && this._stack[i--].name !== name){} if( (i+=1) !== 0) - while(i < this._stack.length) this.onCloseTag(this._stack.pop().name); + while(i < this._stack.length) this.onclosetag(this._stack.pop().name); } else if(name === "br"){ //special case for
s - this.onOpenTag(name, attributes); - this.onCloseTag(name); + this.onopentag(name, attributes); + this.onclosetag(name); } } else{ - this.onOpenTag(name, attributes); - if(empty) this.onCloseTag(name); + this.onopentag(name, attributes); + if(empty) this.onclosetag(name); else this._stack.push(element); } }; From f22f65d9627c35aecc06bcc749b3718acf5531e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 22:19:26 +0200 Subject: [PATCH 026/450] Removed regexp for comment trimming use String#slice instead --- lib/Parser.js | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index b02a26f..4552cd9 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -1,7 +1,7 @@ var ElementType = require("./ElementType.js"); function Parser (handler, options){ - this._options = options ? options : { + this._options = options || { includeLocation: false, //Do not track element position in document by default xmlMode: false //Special behaviour for script/style tags by default }; @@ -10,7 +10,9 @@ function Parser (handler, options){ this._handler = handler; this._buffer = ""; + this._prevTagSep = ""; this._done = false; + this._tagStack = []; this._elements = []; this._current = 0; this._location = { @@ -20,13 +22,10 @@ function Parser (handler, options){ , inBuffer: 0 }; this._parseState = ElementType.Text; - this._prevTagSep = ''; - this._tagStack = []; } //**"Static"**// //Regular expressions used for cleaning up and parsing (stateless) -var _reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents var _reWhitespace = /\s/; //Used to find any whitespace to split on var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element @@ -182,9 +181,10 @@ Parser.prototype.parseTags = function(){ if(rawData.substr(-2) === "--" && tagSep === ">"){ stack.pop(); + rawData = rawData.slice(0, -2); //If the previous element is a comment, append the current text to it if(prevElement && prevElement.type === ElementType.Comment){ //Previous element was a comment - prevElement.raw = prevElement.data = (prevElement.raw + rawData).replace(_reTrimComment, ""); + prevElement.data = prevElement.raw += rawData; rawData = elementData = ""; //This causes the current element to not be added to the element list elementType = ElementType.Text; } @@ -209,9 +209,10 @@ Parser.prototype.parseTags = function(){ if(elementType === ElementType.Tag){ if(rawData.substring(0, 3) === "!--"){ //This tag is really comment elementType = ElementType.Comment; + rawData = rawData.substr(3); //Check if the comment is terminated in the current element if(tagSep === ">" && rawData.substr(-2) === "--") - rawData = elementData = rawData.replace(_reTrimComment, ""); + elementData = rawData = rawData.slice(0, -2); else { //It's not so push the comment onto the tag stack rawData += tagSep; stack.push(ElementType.Comment); From b6937eacfb59e0ec62b52c2c375940344a1a14c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 22:46:16 +0200 Subject: [PATCH 027/450] Deleted browser related content in tests --- tests/01-basic.js | 29 ------------------------- tests/02-single_tag_1.js | 31 +-------------------------- tests/03-single_tag_2.js | 31 +-------------------------- tests/04-unescaped_in_script.js | 31 +-------------------------- tests/05-tags_in_comment.js | 31 +-------------------------- tests/06-comment_in_script.js | 31 +-------------------------- tests/07-unescaped_in_style.js | 31 +-------------------------- tests/08-extra_spaces_in_tag.js | 29 ------------------------- tests/09-unquoted_attrib.js | 31 +-------------------------- tests/10-singular_attribute.js | 31 +-------------------------- tests/11-text_outside_tags.js | 31 +-------------------------- tests/12-text_only.js | 31 +-------------------------- tests/13-comment_in_text.js | 31 +-------------------------- tests/14-comment_in_text_in_script.js | 31 +-------------------------- tests/15-non-verbose.js | 31 +-------------------------- tests/16-ignore_whitespace.js | 31 +-------------------------- tests/17-xml_namespace.js | 31 +-------------------------- tests/18-enforce_empty_tags.js | 31 +-------------------------- tests/19-ignore_empty_tags.js | 31 +-------------------------- tests/20-rss.js | 31 +-------------------------- tests/21-atom.js | 31 +-------------------------- tests/22-position_data.js | 31 +-------------------------- 22 files changed, 20 insertions(+), 658 deletions(-) diff --git a/tests/01-basic.js b/tests/01-basic.js index 7846898..4f1ce6b 100644 --- a/tests/01-basic.js +++ b/tests/01-basic.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Basic test"; exports.options = { handler: {} @@ -57,5 +30,3 @@ exports.expected = ] } ]; - -})(); diff --git a/tests/02-single_tag_1.js b/tests/02-single_tag_1.js index 1735b5e..0180f55 100644 --- a/tests/02-single_tag_1.js +++ b/tests/02-single_tag_1.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Single Tag 1"; exports.options = { handler: {} @@ -34,6 +7,4 @@ exports.html = "
text
"; exports.expected = [ { raw: 'br', data: 'br', type: 'tag', name: 'br' } , { raw: 'text', data: 'text', type: 'text' } - ]; - -})(); + ]; \ No newline at end of file diff --git a/tests/03-single_tag_2.js b/tests/03-single_tag_2.js index 2e6e92c..9363dda 100644 --- a/tests/03-single_tag_2.js +++ b/tests/03-single_tag_2.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Single Tag 2"; exports.options = { handler: {} @@ -35,6 +8,4 @@ exports.expected = [ { raw: 'br', data: 'br', type: 'tag', name: 'br' } , { raw: 'text', data: 'text', type: 'text' } , { raw: 'br', data: 'br', type: 'tag', name: 'br' } - ]; - -})(); + ]; \ No newline at end of file diff --git a/tests/04-unescaped_in_script.js b/tests/04-unescaped_in_script.js index fb2cc3a..8f0bc3a 100644 --- a/tests/04-unescaped_in_script.js +++ b/tests/04-unescaped_in_script.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Unescaped chars in script"; exports.options = { handler: {} @@ -51,6 +24,4 @@ exports.expected = } ] } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/05-tags_in_comment.js b/tests/05-tags_in_comment.js index 68a0779..9f66f6b 100644 --- a/tests/05-tags_in_comment.js +++ b/tests/05-tags_in_comment.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Special char in comment"; exports.options = { handler: {} @@ -43,6 +16,4 @@ exports.expected = } ] } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/06-comment_in_script.js b/tests/06-comment_in_script.js index 2d04ec0..af8468a 100644 --- a/tests/06-comment_in_script.js +++ b/tests/06-comment_in_script.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Script source in comment"; exports.options = { handler: {} @@ -43,6 +16,4 @@ exports.expected = } ] } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/07-unescaped_in_style.js b/tests/07-unescaped_in_style.js index 563a64a..c5817fc 100644 --- a/tests/07-unescaped_in_style.js +++ b/tests/07-unescaped_in_style.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Unescaped chars in style"; exports.options = { handler: {} @@ -44,6 +17,4 @@ exports.expected = } ] } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/08-extra_spaces_in_tag.js b/tests/08-extra_spaces_in_tag.js index 1767565..5c85bed 100644 --- a/tests/08-extra_spaces_in_tag.js +++ b/tests/08-extra_spaces_in_tag.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Extra spaces in tag"; exports.options = { handler: {} @@ -45,5 +18,3 @@ exports.expected = ] } ]; - -})(); diff --git a/tests/09-unquoted_attrib.js b/tests/09-unquoted_attrib.js index da6bac7..d448a54 100644 --- a/tests/09-unquoted_attrib.js +++ b/tests/09-unquoted_attrib.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Unquoted attributes"; exports.options = { handler: {} @@ -44,6 +17,4 @@ exports.expected = } ] } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/10-singular_attribute.js b/tests/10-singular_attribute.js index 6c22e1a..d749b94 100644 --- a/tests/10-singular_attribute.js +++ b/tests/10-singular_attribute.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Singular attribute"; exports.options = { handler: {} @@ -38,6 +11,4 @@ exports.expected = , name: 'option' , attribs: { value: 'foo', selected: 'selected' } } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/11-text_outside_tags.js b/tests/11-text_outside_tags.js index ae63136..ae40c76 100644 --- a/tests/11-text_outside_tags.js +++ b/tests/11-text_outside_tags.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Text outside tags"; exports.options = { handler: {} @@ -45,6 +18,4 @@ exports.expected = , data: '\nline two' , type: 'text' } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/12-text_only.js b/tests/12-text_only.js index 64fab9e..9612840 100644 --- a/tests/12-text_only.js +++ b/tests/12-text_only.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Only text"; exports.options = { handler: {} @@ -36,6 +9,4 @@ exports.expected = , data: 'this is the text' , type: 'text' } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/13-comment_in_text.js b/tests/13-comment_in_text.js index e201ef6..c40d891 100644 --- a/tests/13-comment_in_text.js +++ b/tests/13-comment_in_text.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Comment within text"; exports.options = { handler: {} @@ -44,6 +17,4 @@ exports.expected = , data: ' the text' , type: 'text' } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/14-comment_in_text_in_script.js b/tests/14-comment_in_text_in_script.js index 215a02e..8534610 100644 --- a/tests/14-comment_in_text_in_script.js +++ b/tests/14-comment_in_text_in_script.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Comment within text within script"; exports.options = { handler: {} @@ -52,6 +25,4 @@ exports.expected = ] } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/15-non-verbose.js b/tests/15-non-verbose.js index 829fce4..9d5a30f 100644 --- a/tests/15-non-verbose.js +++ b/tests/15-non-verbose.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Option 'verbose' set to 'false'"; exports.options = { handler: { verbose: false } @@ -41,6 +14,4 @@ exports.expected = } ] } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/16-ignore_whitespace.js b/tests/16-ignore_whitespace.js index 68f4439..beb0f34 100644 --- a/tests/16-ignore_whitespace.js +++ b/tests/16-ignore_whitespace.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Options 'ignoreWhitespace' set to 'true'"; exports.options = { handler: { ignoreWhitespace: true } @@ -66,6 +39,4 @@ exports.expected = } ] } -]; - -})(); +]; \ No newline at end of file diff --git a/tests/17-xml_namespace.js b/tests/17-xml_namespace.js index 562f26b..a2c0d1f 100644 --- a/tests/17-xml_namespace.js +++ b/tests/17-xml_namespace.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "XML Namespace"; exports.options = { handler: {} @@ -33,6 +6,4 @@ exports.options = { exports.html = "text"; exports.expected = [ { raw: 'ns:tag', data: 'ns:tag', type: 'tag', name: 'ns:tag', children: [ { raw: 'text', data: 'text', type: 'text' } ] } - ]; - -})(); + ]; \ No newline at end of file diff --git a/tests/18-enforce_empty_tags.js b/tests/18-enforce_empty_tags.js index 3ea3757..01af3e3 100644 --- a/tests/18-enforce_empty_tags.js +++ b/tests/18-enforce_empty_tags.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Enforce empty tags"; exports.options = { handler: {} @@ -35,6 +8,4 @@ exports.expected = [ { raw: 'link', data: 'link', type: 'tag', name: 'link' } , { raw: 'text', data: 'text', type: 'text' } - ]; - -})(); + ]; \ No newline at end of file diff --git a/tests/19-ignore_empty_tags.js b/tests/19-ignore_empty_tags.js index 4f47a59..abb508f 100644 --- a/tests/19-ignore_empty_tags.js +++ b/tests/19-ignore_empty_tags.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Ignore empty tags"; exports.options = { handler: { enforceEmptyTags: false } @@ -36,6 +9,4 @@ exports.expected = { raw: 'link', data: 'link', type: 'tag', name: 'link', children: [ { raw: 'text', data: 'text', type: 'text' } ] } - ]; - -})(); + ]; \ No newline at end of file diff --git a/tests/20-rss.js b/tests/20-rss.js index 52442d6..8179702 100644 --- a/tests/20-rss.js +++ b/tests/20-rss.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "RSS (2.0)"; exports.options = { handler: {} @@ -115,6 +88,4 @@ exports.expected = { , pubDate: new Date("Tue, 20 May 2003 08:56:02 GMT") } ] - }; - -})(); + }; \ No newline at end of file diff --git a/tests/21-atom.js b/tests/21-atom.js index 4d8c279..f78fdd2 100644 --- a/tests/21-atom.js +++ b/tests/21-atom.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Atom (1.0)"; exports.options = { handler: {} @@ -75,6 +48,4 @@ exports.expected = { , pubDate: new Date("2003-12-13T18:30:02Z") } ] - }; - -})(); + }; \ No newline at end of file diff --git a/tests/22-position_data.js b/tests/22-position_data.js index fcd7c90..b9fecb6 100644 --- a/tests/22-position_data.js +++ b/tests/22-position_data.js @@ -1,30 +1,3 @@ -(function () { - -function RunningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!RunningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - if (!this.Tautologistics.NodeHtmlParser) - this.Tautologistics.NodeHtmlParser = {}; - if (!this.Tautologistics.NodeHtmlParser.Tests) - this.Tautologistics.NodeHtmlParser.Tests = []; - exports = {}; - this.Tautologistics.NodeHtmlParser.Tests.push(exports); -} - exports.name = "Postion data"; exports.options = { handler: {} @@ -95,6 +68,4 @@ exports.expected = [ } }] } - ]; - -})(); + ]; \ No newline at end of file From dc5fe9c9ae212a09ea31948499d8c31672042191 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 22:47:06 +0200 Subject: [PATCH 028/450] Added a test https://github.com/tautologistics/node-htmlparser/issues/29 --- tests/23-template_script_tags.js | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/23-template_script_tags.js diff --git a/tests/23-template_script_tags.js b/tests/23-template_script_tags.js new file mode 100644 index 0000000..fb63af1 --- /dev/null +++ b/tests/23-template_script_tags.js @@ -0,0 +1,15 @@ +exports.name = "Template script tags"; +exports.options = { + handler: {} + , parser: {} +}; +exports.html = ""; +exports.expected = [ { raw: 'script type="text/template"', + data: 'script type="text/template"', + type: 'script', + name: 'script', + attribs: { type: 'text/template' }, + children: + [ { raw: '

Heading1

', + data: '

Heading1

', + type: 'text' } ] } ]; \ No newline at end of file From 5c7ec40364f86ecc822aa2e9d89826d874595fa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 25 Oct 2011 22:48:48 +0200 Subject: [PATCH 029/450] Fixed test 23 --- tests/23-template_script_tags.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/23-template_script_tags.js b/tests/23-template_script_tags.js index fb63af1..6e256d2 100644 --- a/tests/23-template_script_tags.js +++ b/tests/23-template_script_tags.js @@ -10,6 +10,6 @@ exports.expected = [ { raw: 'script type="text/template"', name: 'script', attribs: { type: 'text/template' }, children: - [ { raw: '

Heading1

', - data: '

Heading1

', + [ { raw: '

Heading1

', + data: '

Heading1

', type: 'text' } ] } ]; \ No newline at end of file From bf7f439bfee88906e2bea08a0a54564cabb01d78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Wed, 26 Oct 2011 10:11:53 +0200 Subject: [PATCH 030/450] Removed trash in runtests.js --- tests/00-runtests.js | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/tests/00-runtests.js b/tests/00-runtests.js index f20464c..486d69c 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -1,24 +1,3 @@ -/*********************************************** -Copyright 2010, Chris Winberry . All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to -deal in the Software without restriction, including without limitation the -rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -IN THE SOFTWARE. -***********************************************/ - var sys = require("sys"); var fs = require("fs"); var htmlparser = require("../lib/htmlparser"); @@ -74,6 +53,6 @@ for (var i = 1; i < testFiles.length; i++) { sys.puts(sys.inspect(test.expected, false, null)); } } +sys.puts("Total time: " + totalTime); sys.puts("Total tests: " + testCount); sys.puts("Failed tests: " + failedCount); -sys.puts("Total time: " + totalTime); From 6051bf19fd240db54e2ebbaf0ec2317d0c1fe271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Wed, 26 Oct 2011 10:13:39 +0200 Subject: [PATCH 031/450] Replaced _reTags with indexOf --- lib/Parser.js | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 4552cd9..c58ed57 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -32,7 +32,6 @@ var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an el //Regular expressions used for parsing (stateful) var _reAttrib = //Find attributes in a tag /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; -var _reTags = /[<>]/g; //Find tag markers var tagTypes = {}; tagTypes[ ElementType.Script ] = true; @@ -137,13 +136,20 @@ var parseTagName = function(data){ Parser.prototype.parseTags = function(){ var buffer = this._buffer, stack = this._tagStack, handler = this._handler; - var next, type, tagSep, rawData, element, elementName, prevElement, elementType, elementData, includeName; + var next, type, tagSep, rawData, element, elementName, prevElement, elementType, elementData, includeName = false; + + var opening = buffer.indexOf("<"), closing = buffer.indexOf(">"); - while (_reTags.test(buffer)){ - next = _reTags.lastIndex - 1; - tagSep = buffer.charAt(next); //The currently found tag marker + while(opening !== -1 || closing !== -1){ + if(closing === -1 || (opening !== -1 && opening < closing)){ + next = opening; + opening = buffer.indexOf(tagSep = "<", next + 1); + } + else{ + next = closing; + closing = buffer.indexOf(tagSep = ">", next + 1); + } rawData = buffer.substring(this._current, next); //The next chunk of data to parse - includeName = false; elementType = this._parseState; if(elementType === ElementType.Text){ @@ -198,8 +204,7 @@ Parser.prototype.parseTags = function(){ rawData = elementData = ""; //This causes the current element to not be added to the element list elementType = ElementType.Text; } - else - elementData = rawData += tagSep; + else elementData = rawData += tagSep; } } @@ -251,7 +256,10 @@ Parser.prototype.parseTags = function(){ type: elementType }; - if(includeName) element.name = elementName; + if(includeName){ + element.name = elementName; + includeName = false; + } if(this._options.includeLocation) element.location = this.getLocation(elementType === ElementType.Tag); parseAttribs(element); @@ -278,10 +286,10 @@ Parser.prototype.parseTags = function(){ */ //If tag self-terminates, add an explicit, separate closing tag - if( elementData.substr(-1) === "/" - && elementType !== ElementType.Text + if( elementType !== ElementType.Text && elementType !== ElementType.Comment && elementType !== ElementType.Directive + && elementData.substr(-1) === "/" ){ //this._handler.onclosetag(elementName); this._elements.push({ @@ -305,7 +313,6 @@ Parser.prototype.parseTags = function(){ } this._buffer = this._buffer.substring(this._current); this._current = 0; - _reTags.lastIndex = 0; this.writeHandler(); }; From b51764486672bf81ca8ec766ab4405c33aea0bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Wed, 26 Oct 2011 15:43:51 +0200 Subject: [PATCH 032/450] Added doctype to first test --- tests/01-basic.js | 50 ++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/tests/01-basic.js b/tests/01-basic.js index 4f1ce6b..2901583 100644 --- a/tests/01-basic.js +++ b/tests/01-basic.js @@ -3,30 +3,26 @@ exports.options = { handler: {} , parser: {} }; -exports.html = "The TitleHello world"; -exports.expected = - [ { raw: 'html' - , data: 'html' - , type: 'tag' - , name: 'html' - , children: - [ { raw: 'title' - , data: 'title' - , type: 'tag' - , name: 'title' - , children: [ { raw: 'The Title', data: 'The Title', type: 'text' } ] - } - , { raw: 'body' - , data: 'body' - , type: 'tag' - , name: 'body' - , children: - [ { raw: 'Hello world' - , data: 'Hello world' - , type: 'text' - } - ] - } - ] - } - ]; +exports.html = "The TitleHello world"; +exports.expected = [ { raw: '!DOCTYPE html', + data: '!DOCTYPE html', + type: 'directive', + name: '!DOCTYPE' }, + { raw: 'html', + data: 'html', + type: 'tag', + name: 'html', + children: + [ { raw: 'title', + data: 'title', + type: 'tag', + name: 'title', + children: [ { raw: 'The Title', data: 'The Title', type: 'text' } ] }, + { raw: 'body', + data: 'body', + type: 'tag', + name: 'body', + children: + [ { raw: 'Hello world', + data: 'Hello world', + type: 'text' } ] } ] } ]; From bcf8aeb3d4ce3555c81be709dc935e88ab71a357 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Wed, 26 Oct 2011 16:13:20 +0200 Subject: [PATCH 033/450] Removed char loop from Parser#getLocation Use String#split & String#replace instead --- lib/Parser.js | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index c58ed57..5377291 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -16,10 +16,10 @@ function Parser (handler, options){ this._elements = []; this._current = 0; this._location = { - row: 0 - , col: 0 - , charOffset: 0 - , inBuffer: 0 + row: 0, + col: 0, + charOffset: 0, + inBuffer: 0 }; this._parseState = ElementType.Text; } @@ -28,10 +28,10 @@ function Parser (handler, options){ //Regular expressions used for cleaning up and parsing (stateless) var _reWhitespace = /\s/; //Used to find any whitespace to split on var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element +var _reRow = RegExp("\r","g"); -//Regular expressions used for parsing (stateful) -var _reAttrib = //Find attributes in a tag - /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; +//Find attributes in a tag +var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; var tagTypes = {}; tagTypes[ ElementType.Script ] = true; @@ -49,8 +49,7 @@ Parser.prototype.parseComplete = function(data){ //Parses a piece of an HTML document Parser.prototype.parseChunk = function(data){ - if(this._done) - this.handleError(Error("Attempted to parse chunk after parsing already done")); + if(this._done) this.handleError(Error("Attempted to parse chunk after parsing already done")); this._buffer += data; //FIXME: this can be a bottleneck this.parseTags(); }; @@ -318,24 +317,27 @@ Parser.prototype.parseTags = function(){ }; Parser.prototype.getLocation = function(startTag){ - var c, - l = this._location, + var c, end, chunk, + l = this._location; + if(startTag){ + end = this._current-1, + chunk = l.charOffset === 0 && end === -1; + } else { end = this._current, - chunk = startTag && l.charOffset === 0 && end === 0; - - if(startTag) end--; - - for (; l.charOffset < end; l.charOffset++){ - c = this._buffer[l.charOffset]; - if(c === '\n'){ - l.inBuffer++; - l.col = 0; - } else if(c !== '\r') - l.col++; + chunk = false; } + + var str = this._buffer.substring(l.charOffset, l.charOffset = end); + var rows = str.split("\n"), + rowNum = rows.length - 1; + + l.inBuffer += rowNum; + if(rowNum !== 0) l.col = rows[rowNum].replace(_reRow,"").length; + else l.col += str.replace(_reRow,"").length; + return { - line: l.row + l.inBuffer + 1 - , col: l.col + (chunk ? 0: 1) + line: l.row + l.inBuffer + 1, + col: l.col + (chunk ? 0: 1) }; }; From 8175f9ea506543812a7a028295158874fd6a6cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 28 Oct 2011 18:34:11 +0200 Subject: [PATCH 034/450] Renamed htmlparser.js to index.js The old name was misleading. --- lib/{htmlparser.js => index.js} | 0 package.json | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename lib/{htmlparser.js => index.js} (100%) diff --git a/lib/htmlparser.js b/lib/index.js similarity index 100% rename from lib/htmlparser.js rename to lib/index.js diff --git a/package.json b/package.json index e3950ab..c5e6fa3 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,7 @@ , "url": "http://github.com/fb55/node-htmlparser/issues" } , "directories": { "lib": "./lib/" } - , "main": "./lib/htmlparser" + , "main": "./lib/" , "engines": { "node": ">0" } , "licenses": [{ "type": "MIT" From e614ee0509767e65620fc726520a6020a0891acf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 28 Oct 2011 18:35:50 +0200 Subject: [PATCH 035/450] Don't allow Date: Fri, 28 Oct 2011 20:02:24 +0200 Subject: [PATCH 036/450] corrected reference in runtests --- tests/00-runtests.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/00-runtests.js b/tests/00-runtests.js index 486d69c..071bf64 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -1,6 +1,6 @@ var sys = require("sys"); var fs = require("fs"); -var htmlparser = require("../lib/htmlparser"); +var htmlparser = require(".."); var testFolder = "."; var chunkSize = 5; From cb0ab4ba812f8f64d43b2fb1b044302338519708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 28 Oct 2011 20:02:59 +0200 Subject: [PATCH 037/450] Small changes to Parser#getLocation --- lib/Parser.js | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 3942130..7bab01f 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -327,13 +327,14 @@ Parser.prototype.getLocation = function(startTag){ chunk = false; } - var str = this._buffer.substring(l.charOffset, l.charOffset = end); - var rows = str.split("\n"), + var rows = this._buffer.substring(l.charOffset, l.charOffset = end).split("\n"), rowNum = rows.length - 1; l.inBuffer += rowNum; - if(rowNum !== 0) l.col = rows[rowNum].replace(_reRow,"").length; - else l.col += str.replace(_reRow,"").length; + + var num = rows[rowNum].replace(_reRow,"").length; + if(rowNum == 0) l.col += num; + else l.col = num; return { line: l.row + l.inBuffer + 1, From a997bc0715651c925aecca8aa721e78fd0176c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 28 Oct 2011 20:07:17 +0200 Subject: [PATCH 038/450] Updated readme --- README.md | 99 ++++++++++++++++++++----------------------------------- 1 file changed, 36 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index 52467ba..edd9028 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,15 @@ #NodeHtmlParser -A forgiving HTML/XML/RSS parser written in JS for both the browser and NodeJS (yes, despite the name it works just fine in any modern browser). The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output. +A forgiving HTML/XML/RSS parser written in JS for NodeJS. The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output. ##Installing - - npm install htmlparser + `npm install htmlparser` ##Running Tests + `node tests/00-runtests.js` -###Run tests under node: - node runtests.js - -###Run tests in browser: -View runtests.html in any browser - -##Usage In Node +##Usage var htmlparser = require("htmlparser"); - var rawHtml = "Xyz - , Style: "style" //Special tag - , Tag: "tag" //Any tag that isn't special + Text: "text", /*Plain text*/ + Directive: "directive", /*Special tag */ + Comment: "comment", /*Special tag */ + Script: "script", /*Special tag */ + Style: "style", /*Special tag */ + Tag: "tag" /*Any tag that isn't special*/ }; \ No newline at end of file diff --git a/tests/00-runtests.js b/tests/00-runtests.js index 55e3204..8df0fc7 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -12,7 +12,6 @@ var totalTime = 0; var name = __filename.split("/").slice(-1)[0]; for (var i = 1; i < testFiles.length; i++) { if(testFiles[i] === name) continue; - console.log(testFiles[i], __filename); testCount++; var moduleName = testFiles[i]; var test = require(testFolder + "/" + moduleName); From 03dd39e225e9ba31010479c02480fb76b3097fb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 4 Nov 2011 18:26:07 +0100 Subject: [PATCH 052/450] Removed duplicated code --- lib/DefaultHandler.js | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 545e688..7073fd5 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -81,16 +81,18 @@ DefaultHandler.prototype.writeTag = DefaultHandler.prototype.writeDirective = De DefaultHandler.prototype.handleElement = function(element) { if (this._done) this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()")); + var couldBeContainer = element.type === ElementType.Tag || element.type === ElementType.Script || element.type === ElementType.Style; if (!this._options.verbose) { //element.raw = null; //FIXME: Not clean //FIXME: Serious performance problem using delete delete element.raw; - if (element.type === "tag" || element.type === "script" || element.type === "style") + if(couldBeContainer) delete element.data; } - if (!this._lastTag()) { //There are no parent elements + var lastTag = this._lastTag(); + if (!lastTag) { //There are no parent elements //If the element can be a container, add it to the tag stack and the top level list - if (element.type !== ElementType.Text && element.type !== ElementType.Comment && element.type !== ElementType.Directive) { + if(couldBeContainer){ if (element.name.charAt(0) !== "/") { //Ignore closing tags that obviously don't have an opening tag this.dom.push(element); if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children @@ -104,31 +106,31 @@ DefaultHandler.prototype.handleElement = function(element) { else { //There are parent elements //If the element can be a container, add it as a child of the element //on top of the tag stack and then add it to the tag stack - if (element.type !== ElementType.Text && element.type !== ElementType.Comment && element.type !== ElementType.Directive) { + if(couldBeContainer){ if (element.name.charAt(0) === "/") { //This is a closing tag, scan the tagStack to find the matching opening tag //and pop the stack up to the opening tag's parent var baseName = element.name.substring(1); if (!this.isEmptyTag(element)) { var pos = this._tagStack.length - 1; - while (pos > -1 && this._tagStack[pos--].name !== baseName) { } - if (pos > -1 || this._tagStack[0].name === baseName) - while (pos < this._tagStack.length - 1) + while (pos !== -1 && this._tagStack[pos--].name !== baseName) { } + if (pos !== -1 || this._tagStack[0].name === baseName) + while(pos < this._tagStack.length - 1) this._tagStack.pop(); } } else { //This is not a closing tag - if (!this._lastTag().children) - this._lastTag().children = []; - this._lastTag().children.push(element); + if (!lastTag.children) + lastTag.children = []; + lastTag.children.push(element); if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children this._tagStack.push(element); } } else { //This is not a container element - if (!this._lastTag().children) - this._lastTag().children = []; - this._lastTag().children.push(element); + if (!lastTag.children) + lastTag.children = []; + lastTag.children.push(element); } } }; From eb37d11ebbaf35aa833d2f36955ad9fdbf39d1c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 4 Nov 2011 20:33:48 +0100 Subject: [PATCH 053/450] Splited writeTag in both handlers to smaller functions + added a test for events --- lib/DefaultHandler.js | 164 ++++++++++++++++-------------------------- lib/EventedHandler.js | 50 +++++++------ tests/00-runtests.js | 22 ++++-- tests/25-events.js | 20 ++++++ 4 files changed, 126 insertions(+), 130 deletions(-) create mode 100644 tests/25-events.js diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 7073fd5..471618b 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -1,42 +1,28 @@ var ElementType = require("./ElementType.js"); -function DefaultHandler (callback, options) { +function DefaultHandler(callback, options){ this.dom = []; this._done = false; this._tagStack = []; - this._options = options ? options : { }; - if (this._options.ignoreWhitespace === undefined) - this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes - if (this._options.verbose === undefined) - this._options.verbose = true; //Keep data property for tags and raw property for all - if (this._options.enforceEmptyTags === undefined) - this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec - if ((typeof callback) === "function") - this._callback = callback; + if(options){ + this._options = options; + if(typeof this._options.verbose === "undefined") + this._options.verbose = true; + if (typeof this._options.enforceEmptyTags === "undefined") + this._options.enforceEmptyTags = true; + } + this._callback = callback; } -DefaultHandler.prototype._lastTag = function() { - var stack = this._tagStack; - return(stack.length ? stack[stack.length - 1] : null); +//default options +DefaultHandler.prototype._options = { + ignoreWhitespace: false, //Keep whitespace-only text nodes + verbose: true, //Keep data property for tags and raw property for all + enforceEmptyTags: true //Don't allow children for HTML tags defined as empty in spec }; //HTML Tags that shouldn't contain child nodes -var _emptyTags = { - area: true - , base: true - , basefont: true - , br: true - , col: true - , frame: true - , hr: true - , img: true - , input: true - , isindex: true - , link: true - , meta: true - , param: true - , embed: true -}; +var emptyTags={area:true,base:true,basefont:true,br:true,col:true,frame:true,hr:true,img:true,input:true,isindex:true,link:true,meta:true,param:true,embed:true}; //**Public**// //Methods// @@ -51,88 +37,60 @@ DefaultHandler.prototype.done = function() { this._done = true; this.handleCallback(null); }; -DefaultHandler.prototype.writeText = function(element) { - if(this._options.ignoreWhitespace) - if(element.data.trim() === "") - return; - this.handleElement(element); -}; //Methods// DefaultHandler.prototype.error = -DefaultHandler.prototype.handleCallback = function(error) { - if ((typeof this._callback) !== "function") - if (error) - throw error; - else - return; - this._callback(error, this.dom); +DefaultHandler.prototype.handleCallback = function(error){ + if(typeof this._callback === "function") + this._callback(error, this.dom); + else if(error) throw error; }; -DefaultHandler.prototype.isEmptyTag = function(element) { - var name = element.name.toLowerCase(); - if (name.charAt(0) === '/') { - name = name.substring(1); - } - return this._options.enforceEmptyTags && _emptyTags[name]; +DefaultHandler.prototype._isEmptyTag = function(name) { + return this._options.enforceEmptyTags && emptyTags[name]; }; -DefaultHandler.prototype.writeTag = DefaultHandler.prototype.writeDirective = DefaultHandler.prototype.writeComment = -DefaultHandler.prototype.handleElement = function(element) { - if (this._done) - this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()")); - var couldBeContainer = element.type === ElementType.Tag || element.type === ElementType.Script || element.type === ElementType.Style; - if (!this._options.verbose) { - //element.raw = null; //FIXME: Not clean - //FIXME: Serious performance problem using delete - delete element.raw; - if(couldBeContainer) - delete element.data; - } - var lastTag = this._lastTag(); - if (!lastTag) { //There are no parent elements - //If the element can be a container, add it to the tag stack and the top level list - if(couldBeContainer){ - if (element.name.charAt(0) !== "/") { //Ignore closing tags that obviously don't have an opening tag - this.dom.push(element); - if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children - this._tagStack.push(element); - } - } - } - else //Otherwise just add to the top level list - this.dom.push(element); - } - else { //There are parent elements - //If the element can be a container, add it as a child of the element - //on top of the tag stack and then add it to the tag stack - if(couldBeContainer){ - if (element.name.charAt(0) === "/") { - //This is a closing tag, scan the tagStack to find the matching opening tag - //and pop the stack up to the opening tag's parent - var baseName = element.name.substring(1); - if (!this.isEmptyTag(element)) { - var pos = this._tagStack.length - 1; - while (pos !== -1 && this._tagStack[pos--].name !== baseName) { } - if (pos !== -1 || this._tagStack[0].name === baseName) - while(pos < this._tagStack.length - 1) - this._tagStack.pop(); - } - } - else { //This is not a closing tag - if (!lastTag.children) - lastTag.children = []; - lastTag.children.push(element); - if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children - this._tagStack.push(element); - } - } - else { //This is not a container element - if (!lastTag.children) - lastTag.children = []; - lastTag.children.push(element); - } +DefaultHandler.prototype._closeTag = function(name){ + //Ignore closing tags that obviously don't have an opening tag + if(!this._tagStack || this._isEmptyTag(name)) return; + + var pos = this._tagStack.length - 1; + while (pos !== -1 && this._tagStack[pos--].name !== name) { } + if ( ++pos !== 0 || this._tagStack[0].name === name) + while(pos < this._tagStack.length) + this._tagStack.pop(); +}; + +DefaultHandler.prototype._addDomElement = function(element){ + if(!this._options.verbose) delete element.raw; + + var lastTag = this._tagStack[this._tagStack.length-1]; + if(!lastTag) this.dom.push(element); + else{ //There are parent elements + if(!lastTag.children) lastTag.children = [element]; + else lastTag.children.push(element); } +} + +DefaultHandler.prototype._openTag = function(element){ + if(!this._options.verbose) delete element.data; + + this._addDomElement(element); + + //Don't add tags to the tag stack that can't have children + if(!this._isEmptyTag(element.name)) this._tagStack.push(element); +} + +DefaultHandler.prototype.writeText = function(element){ + if(this._options.ignoreWhitespace && element.data.trim() === "") return; + this._addDomElement(element); +}; + +DefaultHandler.prototype.writeDirective = DefaultHandler.prototype.writeComment = DefaultHandler.prototype._addDomElement; + +DefaultHandler.prototype.writeTag = function(element) { + if(element.name.charAt(0) === "/") this._closeTag(element.name.substr(1)); + else this._openTag(element); }; module.exports = DefaultHandler; \ No newline at end of file diff --git a/lib/EventedHandler.js b/lib/EventedHandler.js index a64bf75..428992f 100644 --- a/lib/EventedHandler.js +++ b/lib/EventedHandler.js @@ -25,9 +25,13 @@ var stripData = function(callback){ }; }; var openTagCB = function(openTag, attribute){ - function open(name, attributes){ openTag({name:name, attributes:attributes}); } function attr(name, attributes){ for(var i in attributes) attribute({name:i, value:attributes[i]}); } if(openTag){ + var open; + if(openTag.length === 1){ //to be compatible with sax.js + open = function open(name, attributes){ openTag({name:name, attributes:attributes}); } + } + else open = openTag if(attribute) return function(name, attributes){open(name,attributes); attr(null, attributes);}; else return open; } @@ -38,29 +42,31 @@ var openTagCB = function(openTag, attribute){ //HTML Tags that shouldn't contain child nodes var emptyTags={area:true,base:true,basefont:true,br:true,col:true,frame:true,hr:true,img:true,input:true,isindex:true,link:true,meta:true,param:true,embed:true}; -EventedHandler.prototype.writeTag = function(element){ - var closing = element.name.charAt(0) === "/", - name = closing ? element.name.substring(1) : element.name, - attributes = element.attribs || {}, - empty = emptyTags[name]; - - if(closing){ - if(!empty){ - var i = this._stack.length - 1; - while(i !== -1 && this._stack[i--].name !== name){} - if( (i+=1) !== 0) - while(i < this._stack.length) this.onclosetag(this._stack.pop().name); - } - else if(name === "br"){ //special case for
s - this.onopentag(name, attributes); - this.onclosetag(name); - } +EventedHandler.prototype._openTag = function(name, attrs){ + this.onopentag(name, attrs); + if(emptyTags[name]) this.onclosetag(name); + else this._stack.push(name); +}; + +EventedHandler.prototype._closeTag = function(name){ + if(!emptyTags[name] && this._stack){ + var i = this._stack.length-1; + while(i !== -1 && this._stack[i--] !== name){}; + if( ++i !== 0 || this._stack[0] === name) + while(i < this._stack.length) + this.onclosetag(this._stack.pop()); } - else{ - this.onopentag(name, attributes); - if(empty) this.onclosetag(name); - else this._stack.push(element); + else if(name === "br"){//many browsers (eg. Safari) convert
to
+ this.onopentag(name, {}); + this.onclosetag(name); } }; +EventedHandler.prototype.writeTag = function(element){ + if(element.name.charAt(0) === "/") + this._closeTag(element.name.substr(1)); + else + this._openTag(element.name, element.attribs || {}); +}; + module.exports = EventedHandler; \ No newline at end of file diff --git a/tests/00-runtests.js b/tests/00-runtests.js index 8df0fc7..238dc59 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -10,6 +10,7 @@ var testCount = 0; var failedCount = 0; var totalTime = 0; var name = __filename.split("/").slice(-1)[0]; +var handler; for (var i = 1; i < testFiles.length; i++) { if(testFiles[i] === name) continue; testCount++; @@ -21,14 +22,22 @@ for (var i = 1; i < testFiles.length; i++) { } console.log(testFiles[i]); var start = Date.now(); - var handler = (test.type === "rss") ? - new htmlparser.RssHandler(handlerCallback, test.options.handler) - : - new htmlparser.DefaultHandler(handlerCallback, test.options.handler) - ; + if(test.type === "rss"){ + handler = new htmlparser.RssHandler(handlerCallback, test.options.handler); + } + else if(test.type === "event"){ + handler = new htmlparser.EventedHandler(test.options.handler); + } + else{ + handler = new htmlparser.DefaultHandler(handlerCallback, test.options.handler); + } var parser = new htmlparser.Parser(handler, test.options.parser); parser.parseComplete(test.html); var resultComplete = handler.dom; + if(test.type === "event"){ + resultComplete = test.result; + test.result = []; + } var chunkPos = 0; parser.reset(); while (chunkPos < test.html.length) { @@ -37,6 +46,9 @@ for (var i = 1; i < testFiles.length; i++) { } parser.done(); var resultChunk = handler.dom; + if(test.type === "event"){ + resultChunk = test.result; + } var testResult = sys.inspect(resultComplete, false, null) === sys.inspect(test.expected, false, null) && diff --git a/tests/25-events.js b/tests/25-events.js new file mode 100644 index 0000000..1a27bd0 --- /dev/null +++ b/tests/25-events.js @@ -0,0 +1,20 @@ +exports.name = "Events"; +exports.type = "event"; +exports.result = []; +exports.options = {handler: { + onopentag: function(name, attributes){ + exports.result.push({event:"open", name: name, attributes: attributes}); + }, + onclosetag: function(name){ + exports.result.push({event:"close", name: name}); + }, + ontext: function(text){ + exports.result.push({event:"text", text: text}); + } +}, parser: {}}; +exports.html = "

adsf

"; +exports.expected = [ { event: 'open', + name: 'h1', + attributes: { class: 'test' } }, + { event: 'text', text: 'adsf' }, + { event: 'close', name: 'h1' } ]; \ No newline at end of file From 38da6e0d0f6b05d3bd15a8aacc98f617f7996ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 4 Nov 2011 21:46:01 +0100 Subject: [PATCH 054/450] added a limit to the elements fetched inside RssHandler --- lib/RssHandler.js | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/RssHandler.js b/lib/RssHandler.js index ac0b9eb..377d9f5 100644 --- a/lib/RssHandler.js +++ b/lib/RssHandler.js @@ -12,14 +12,15 @@ inherits(RssHandler, DefaultHandler); RssHandler.prototype.done = DefaultHandler.prototype.done; function getElements(what, where, one, recurse){ - var ret = DomUtils.getElementsByTagName(what, where, !!recurse); - if(one) + if(one){ + var ret = DomUtils.getElementsByTagName(what, where, recurse, 1) if(ret && ret.length > 0) return ret[0]; else return false; - else return ret; + } + else return DomUtils.getElementsByTagName(what, where, recurse); } function fetch(what, where, recurse){ - var ret = getElements(what, where, true, !!recurse); + var ret = getElements(what, where, true, recurse); if(ret && ret.children && ret.children.length > 0) return ret.children[0].data; else return false; } From 5101cc2b1d6eb4cd0ededcad348baca65b81869e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 4 Nov 2011 22:36:12 +0100 Subject: [PATCH 055/450] improved DomUtils what I really wanted to do is now in comments (it failed multiple times) --- lib/DomUtils.js | 110 +++++++++++++++++++++++----------------------- lib/RssHandler.js | 10 ++--- 2 files changed, 59 insertions(+), 61 deletions(-) diff --git a/lib/DomUtils.js b/lib/DomUtils.js index a6e0bdf..9f4c2b5 100644 --- a/lib/DomUtils.js +++ b/lib/DomUtils.js @@ -1,74 +1,76 @@ -module.exports = { - testElement: function(options, element) { - if (!element) return false; - - var type = element.type; +var ElementType = require("./ElementType.js"); - for (var key in options) { - if (key === "tag_name") { - if (type !== "tag" && type !== "script" && type !== "style") return false; - if (!options.tag_name(element.name)) return false; - } else if (key === "tag_type") { - if (!options.tag_type(type)) return false; - } else if (key === "tag_contains") { - if (type !== "text" && type !== "comment" && type !== "directive") return false; - if (!options.tag_contains(element.data)) return false; - } else if (!element.attribs || !options[key](element.attribs[key])) - return false; - } - - return true; - } +function getTest (checkVal) { + return function (value) { return value === checkVal; }; +} - , getElements: function(options, currentElement, recurse, limit) { - if (!currentElement) return []; +function testElement(options, element) { + if (!element) return false; + + var type = element.type; - recurse = (recurse === undefined || recurse === null) || !!recurse; + for (var key in options) { + if (key === "tag_name") { + if (type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false; + if (!options.tag_name(element.name)) return false; + } else if (key === "tag_type") { + if (!options.tag_type(type)) return false; + } else if (key === "tag_contains") { + if (type !== ElementType.Text && type !== ElementType.Comment && type !== ElementType.Directive) return false; + if (!options.tag_contains(element.data)) return false; + } else if (!element.attribs || !options[key](element.attribs[key])) + return false; + } - var parsed_limit = parseInt(limit, 10); - limit = isNaN(parsed_limit) ? -1 : parsed_limit; + return true; +} - var found = []; - var elementList; +module.exports = { + testElement: testElement, + + getElements: function(options, currentElement, recurse, limit){ + recurse = recurse === undefined || recurse === null || recurse; + if(isNaN(limit)) limit = -1; - function getTest (checkVal) { - return function (value) { return value === checkVal; }; - } - for (var key in options) { - if (typeof options[key] !== "function") { + for(var key in options){ + if (typeof options[key] !== "function") options[key] = getTest(options[key]); - } - } - - if (this.testElement(options, currentElement)) { - found.push(currentElement); } - - if (limit >= 0 && found.length >= limit) return found; - - if(recurse && currentElement.children) elementList = currentElement.children; - else if(Array.isArray(currentElement)) elementList = currentElement; - else return found; - - for (var i = 0; i < elementList.length; i++) { - found = found.concat(this.getElements(options, elementList[i], recurse, limit)); - - if (limit >= 0 && found.length >= limit) break; - } - - return found; + return this.testAttr(testElement.bind(null, options), currentElement, recurse, limit); } , getElementById: function(id, currentElement, recurse) { var result = this.getElements({ id: id }, currentElement, recurse, 1); return result.length ? result[0] : null; + //function(elem){return elem.attribs && elem.attribs.id === id;} } , getElementsByTagName: function(name, currentElement, recurse, limit) { return this.getElements({ tag_name: name }, currentElement, recurse, limit); + /*function(elem){ + var type = elem.type; + if(type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false; + return elem.name === name; + };*/ } - , getElementsByTagType: function(type, currentElement, recurse, limit) { - return this.getElements({ tag_type: type }, currentElement, recurse, limit); + , getElementsByTagType: function(type, currentElement, recurse, limit){ + return this.testAttr(function(elem){return elem.type === type;}, currentElement, recurse, limit); + //function(elem){return elem.type === type;} } -}; \ No newline at end of file + + , testAttr: function(test, element, recurse, limit){ + var found = [], elementList; + if(!element) return found; + if(test(element)) found.push(element); + + if(recurse && element.children) elementList = element.children; + else if(Array.isArray(element)) elementList = element; + else return found; + + for(var i = 0, j = elementList.length; i < j && (limit < 0 || found.length < limit); i++){ + found = found.concat(this.testAttr(test, elementList[i], recurse, limit)); + } + + return found; + }}; \ No newline at end of file diff --git a/lib/RssHandler.js b/lib/RssHandler.js index 377d9f5..f9b63d4 100644 --- a/lib/RssHandler.js +++ b/lib/RssHandler.js @@ -12,16 +12,12 @@ inherits(RssHandler, DefaultHandler); RssHandler.prototype.done = DefaultHandler.prototype.done; function getElements(what, where, one, recurse){ - if(one){ - var ret = DomUtils.getElementsByTagName(what, where, recurse, 1) - if(ret && ret.length > 0) return ret[0]; - else return false; - } - else return DomUtils.getElementsByTagName(what, where, recurse); + if(one) return DomUtils.getElementsByTagName(what, where, recurse, 1)[0]; + else return DomUtils.getElementsByTagName(what, where, recurse); } function fetch(what, where, recurse){ var ret = getElements(what, where, true, recurse); - if(ret && ret.children && ret.children.length > 0) return ret.children[0].data; + if(ret && (ret = ret.children) && ret.length > 0) return ret[0].data; else return false; } From 0e78ab532b395cdd681f46ed02098e066163d6c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 4 Nov 2011 22:53:34 +0100 Subject: [PATCH 056/450] some small adjustments --- lib/DefaultHandler.js | 5 ++--- lib/Parser.js | 2 +- tests/00-runtests.js | 7 ++----- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 471618b..2201731 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -57,8 +57,7 @@ DefaultHandler.prototype._closeTag = function(name){ var pos = this._tagStack.length - 1; while (pos !== -1 && this._tagStack[pos--].name !== name) { } if ( ++pos !== 0 || this._tagStack[0].name === name) - while(pos < this._tagStack.length) - this._tagStack.pop(); + this._tagStack.splice(pos, this._tagStack.length); }; DefaultHandler.prototype._addDomElement = function(element){ @@ -86,7 +85,7 @@ DefaultHandler.prototype.writeText = function(element){ this._addDomElement(element); }; -DefaultHandler.prototype.writeDirective = DefaultHandler.prototype.writeComment = DefaultHandler.prototype._addDomElement; +DefaultHandler.prototype.writeComment = DefaultHandler.prototype.writeDirective = DefaultHandler.prototype._addDomElement; DefaultHandler.prototype.writeTag = function(element) { if(element.name.charAt(0) === "/") this._closeTag(element.name.substr(1)); diff --git a/lib/Parser.js b/lib/Parser.js index 70d2397..a464cc7 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -212,7 +212,7 @@ Parser.prototype.parseTags = function(){ if(elementType === ElementType.Tag){ if(rawData.substring(0, 3) === "!--"){ //This tag is really comment elementType = ElementType.Comment; - rawData = rawData.substr(3); + elementData = rawData = rawData.substr(3); //Check if the comment is terminated in the current element if(tagSep === ">" && rawData.substr(-2) === "--") elementData = rawData = rawData.slice(0, -2); diff --git a/tests/00-runtests.js b/tests/00-runtests.js index 238dc59..63d8317 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -49,11 +49,8 @@ for (var i = 1; i < testFiles.length; i++) { if(test.type === "event"){ resultChunk = test.result; } - var testResult = - sys.inspect(resultComplete, false, null) === sys.inspect(test.expected, false, null) - && - sys.inspect(resultChunk, false, null) === sys.inspect(test.expected, false, null) - ; + var testResult = sys.inspect(resultComplete, false, null) === sys.inspect(test.expected, false, null) + && sys.inspect(resultChunk, false, null) === sys.inspect(test.expected, false, null); var took = Date.now() - start; totalTime += took; sys.puts("[" + test.name + "\]: " + (testResult ? "passed" : "FAILED") + " (took: " + took + "ms)"); From bc12cd8717f4ed1270538d39e2428c3d4f25d5fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 5 Nov 2011 12:42:32 +0100 Subject: [PATCH 057/450] Replaced _tagStack with _contentFlags, tweaked DefaultHandler That fixed https://github.com/tautologistics/node-htmlparser/issues/29. --- lib/DefaultHandler.js | 30 +++++- lib/Parser.js | 159 +++++++++++++++---------------- tests/23-template_script_tags.js | 6 +- 3 files changed, 107 insertions(+), 88 deletions(-) diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 2201731..6f3b663 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -3,6 +3,7 @@ var ElementType = require("./ElementType.js"); function DefaultHandler(callback, options){ this.dom = []; this._done = false; + this._inSpecialTag = false; this._tagStack = []; if(options){ this._options = options; @@ -30,6 +31,7 @@ var emptyTags={area:true,base:true,basefont:true,br:true,col:true,frame:true,hr: DefaultHandler.prototype.reset = function() { this.dom = []; this._done = false; + this._inSpecialTag = false; this._tagStack = []; }; //Signals the handler that parsing is done @@ -56,17 +58,33 @@ DefaultHandler.prototype._closeTag = function(name){ var pos = this._tagStack.length - 1; while (pos !== -1 && this._tagStack[pos--].name !== name) { } - if ( ++pos !== 0 || this._tagStack[0].name === name) - this._tagStack.splice(pos, this._tagStack.length); + if ( pos !== -1 || this._tagStack[0].name === name) + this._tagStack.splice(pos+1); }; DefaultHandler.prototype._addDomElement = function(element){ if(!this._options.verbose) delete element.raw; - var lastTag = this._tagStack[this._tagStack.length-1]; + var lastTag = this._tagStack[this._tagStack.length-1], tmp; if(!lastTag) this.dom.push(element); else{ //There are parent elements - if(!lastTag.children) lastTag.children = [element]; + if(!lastTag.children){ + lastTag.children = [element]; + return; + } + tmp = lastTag.children[lastTag.children.length-1]; + if(element.type === ElementType.Comment && tmp.type === ElementType.Comment){ + tmp.data += element.data; + if(this._options.verbose) tmp.raw = tmp.data; + } + else if(this._inSpecialTag && element.type === ElementType.Text){ + if(tmp.type !== ElementType.Text) lastTag.children.push(element); + else { + tmp.data += element.data; + if(this._options.verbose) + tmp.raw = tmp.data; + } + } else lastTag.children.push(element); } } @@ -76,6 +94,10 @@ DefaultHandler.prototype._openTag = function(element){ this._addDomElement(element); + if(element.type === ElementType.Script || element.type === ElementType.Style){ + this._inSpecialTag = true; + } + //Don't add tags to the tag stack that can't have children if(!this._isEmptyTag(element.name)) this._tagStack.push(element); } diff --git a/lib/Parser.js b/lib/Parser.js index a464cc7..7566fca 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -11,8 +11,8 @@ function Parser(handler, options){ this._buffer = ""; this._prevTagSep = ""; + this._contentFlags = 0; this._done = false; - this._tagStack = []; this._elements = []; this._current = 0; this._location = { @@ -64,10 +64,10 @@ Parser.prototype.done = function(){ var rawData = this._buffer; this._buffer = ""; var element = { - raw: rawData - , data: this._parseState === ElementType.Text ? rawData : rawData.trim() - , type: this._parseState - }; + raw: rawData, + data: this._parseState === ElementType.Text ? rawData : rawData.trim(), + type: this._parseState + }; if(tagTypes[this._parseState]){ element.name = parseTagName(element.data); var attrs = parseAttributes(element.data); @@ -76,7 +76,7 @@ Parser.prototype.done = function(){ this._elements.push(element); } - this.writeHandler(true); + this.writeHandler(); this._handler.done(); }; @@ -86,6 +86,7 @@ Parser.prototype.reset = function(){ this._prevTagSep = ""; this._done = false; this._current = 0; + this._contentFlags = 0; this._location = { row: 0 , col: 0 @@ -93,7 +94,6 @@ Parser.prototype.reset = function(){ , inBuffer: 0 }; this._parseState = ElementType.Text; - this._tagStack = []; this._elements = []; this._handler.reset(); }; @@ -125,11 +125,18 @@ var parseTagName = function(data){ return match[1] + match[2]; }; +//Special tags that are threated differently +var SpecialTags = {}; +SpecialTags[ElementType.Style] = 1; //2^0 +SpecialTags[ElementType.Script] = 2; //2^1 +SpecialTags["w"] = 4; //2^2 - if set, append prev tag sep to data +SpecialTags[ElementType.Comment] = 8; //2^8 + //Parses through HTML text and returns an array of found elements Parser.prototype.parseTags = function(){ - var buffer = this._buffer, stack = this._tagStack; + var buffer = this._buffer; - var next, type, tagSep, rawData, element, elementName, prevElement, elementType, elementData, attributes, includeName = false; + var next, tagSep, rawData, element, elementName, prevElement, elementType, elementData, attributes, includeName = false; var opening = buffer.indexOf("<"), closing = buffer.indexOf(">"); @@ -155,55 +162,44 @@ Parser.prototype.parseTags = function(){ elementData = rawData; elementName = ""; } - - type = stack[stack.length-1]; //This section inspects the current tag stack and modifies the current //element if we're actually parsing a special area (script/comment/style tag) - if(type === ElementType.Comment){ //We're currently in a comment tag - - prevElement = this._elements[this._elements.length - 1]; + if(this._contentFlags === 0){ /*do nothing*/ } + else if(this._contentFlags >= SpecialTags[ElementType.Comment]){ //We're currently in a comment tag + elementType = ElementType.Comment; //Change the current element's type to a comment if(tagSep === ">" && rawData.substr(-2) === "--"){ //comment ends - stack.pop(); - rawData = rawData.slice(0, -2); - //If the previous element is a comment, append the current text to it - if(prevElement && prevElement.type === ElementType.Comment){ //Previous element was a comment - prevElement.data = prevElement.raw += rawData; - //This causes the current element to not be added to the element list - rawData = elementData = ""; - elementType = ElementType.Text; - } - else elementType = ElementType.Comment; //Change the current element's type to a comment - } - else { //Still in a comment tag - elementType = ElementType.Comment; - //If the previous element is a comment, append the current text to it - if(prevElement && prevElement.type === ElementType.Comment){ - prevElement.data = prevElement.raw += rawData + tagSep; - //This causes the current element to not be added to the element list - rawData = elementData = ""; - elementType = ElementType.Text; - } - else elementData = rawData += tagSep; + this._contentFlags -= SpecialTags[ElementType.Comment]; + elementData = rawData = rawData.slice(0, -2); } + else elementData = rawData += tagSep; + this._prevTagSep = tagSep; } - else if(type === ElementType.Script && elementName === "/script") stack.pop(); - else if(type === ElementType.Style && elementName === "/style") stack.pop(); - else if(!this._options.xmlMode && (type === ElementType.Script || type === ElementType.Style)){ - //special behaviour for script & style tags - if(rawData.substring(0, 3) !== "!--"){ //Make sure we're not in a comment - //All data from here to style close is now a text element - elementType = ElementType.Text; - //If the previous element is text, append the current text to it - prevElement = this._elements[this._elements.length - 1]; - if(prevElement && prevElement.type === ElementType.Text){ - prevElement.data = prevElement.raw += this._prevTagSep + rawData; - //This causes the current element to not be added to the element list - rawData = elementData = ""; - } else elementData = rawData; //The previous element was not text + //if it's a closing tag, remove the flag + else if(this._contentFlags >= SpecialTags[ElementType.Script] && elementName === "/script"){ + this._contentFlags %= SpecialTags["w"]; //remove the written flag + this._contentFlags -= SpecialTags[ElementType.Script]; + } + else if(this._contentFlags >= SpecialTags[ElementType.Style] && elementName === "/style"){ + this._contentFlags %= SpecialTags["w"]; //remove the written flag + this._contentFlags -= SpecialTags[ElementType.Style]; + } + //special behaviour for script & style tags + //Make sure we're not in a comment + else if(!this._options.xmlMode && rawData.substring(0, 3) !== "!--"){ + //All data from here to style close is now a text element + elementType = ElementType.Text; + //If the previous element is text, append the last tag sep to element + if(this._contentFlags >= SpecialTags["w"]){ + elementData = rawData = this._prevTagSep + rawData; + } + else{ //The previous element was not text + this._contentFlags += SpecialTags["w"]; + elementData = rawData; } + this._prevTagSep = tagSep; } @@ -212,13 +208,14 @@ Parser.prototype.parseTags = function(){ if(elementType === ElementType.Tag){ if(rawData.substring(0, 3) === "!--"){ //This tag is really comment elementType = ElementType.Comment; - elementData = rawData = rawData.substr(3); + this._contentFlags %= SpecialTags["w"]; //remove the written flag //Check if the comment is terminated in the current element if(tagSep === ">" && rawData.substr(-2) === "--") - elementData = rawData = rawData.slice(0, -2); + elementData = rawData = rawData.slice(3, -2); else { //It's not so push the comment onto the tag stack - rawData += tagSep; - stack.push(ElementType.Comment); + elementData = rawData = rawData.substr(3) + tagSep; + this._contentFlags += SpecialTags[ElementType.Comment]; + this._prevTagSep = tagSep; } } else { @@ -236,12 +233,18 @@ Parser.prototype.parseTags = function(){ else if(elementName === "script"){ elementType = ElementType.Script; //Special tag, push onto the tag stack if not terminated - if(elementData.substr(-1) !== "/") stack.push(ElementType.Script); + if(elementData.substr(-1) !== "/"){ + this._contentFlags += SpecialTags[ElementType.Script]; + this._prevTagSep = tagSep; + } } else if(elementName === "style"){ elementType = ElementType.Style; //Special tag, push onto the tag stack if not terminated - if(elementData.substr(-1) !== "/") stack.push(ElementType.Style); + if(elementData.substr(-1) !== "/"){ + this._contentFlags += SpecialTags[ElementType.Style]; + this._prevTagSep = tagSep; + } } } } @@ -271,42 +274,35 @@ Parser.prototype.parseTags = function(){ /* switch(elementType){ case ElementType.Text: - this._handler.ontext(rawData); - break; - case ElementType.Tag: - case ElementType.Style: - case ElementType.Script: - if(elementName[0] === "/") this._handler.onclosetag(elementName.substr(1)); - else this._handler.onopentag(elementName, parseAttributes(elementData)); + this._handler.writeText(element); break; case ElementType.Comment: - this._handler.oncomment(rawData); + this._handler.writeComment(element); break; case ElementType.Directive: - this._handler.onprocessinginstruction(rawData); + this._handler.writeDirective(element); break; - default: throw Error("Unsupported type: " + elementType); + //case ElementType.Tag: + //case ElementType.Style: + //case ElementType.Script: + default: + if(elementName[0] === "/") this._handler._closeTag(elementName.substr(1)); + else this._handler._openTag(elementName, parseAttributes(elementData)); } */ //If tag self-terminates, add an explicit, separate closing tag - if( elementType !== ElementType.Text - && elementType !== ElementType.Comment - && elementType !== ElementType.Directive - && elementData.substr(-1) === "/" - ){ - //this._handler.onclosetag(elementName); + if(tagTypes[elementType] && elementData.substr(-1) === "/"){ + //this._handler._closeTag(elementName); this._elements.push({ - raw: elementName = "/" + elementName - , data: elementName - , name: elementName - , type: elementType + raw: elementName = "/" + elementName, + data: elementName, name: elementName, + type: elementType }); } } this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text; this._current = next + 1; - this._prevTagSep = tagSep; } if(this._options.includeLocation){ @@ -332,15 +328,18 @@ Parser.prototype.getLocation = function(startTag){ chunk = false; } - var rows = this._buffer.substring(l.charOffset, l.charOffset = end).split("\n"), + var rows = this._buffer.substring(l.charOffset, end).split("\n"), rowNum = rows.length - 1; + l.charOffset = end; l.inBuffer += rowNum; var num = rows[rowNum].replace(_reRow,"").length; - if(rowNum == 0) l.col += num; + if(rowNum === 0) l.col += num; else l.col = num; + if(arguments.length === 0) return; + return { line: l.row + l.inBuffer + 1, col: l.col + (chunk ? 0: 1) @@ -358,9 +357,7 @@ var validateHandler = function(handler){ }; //Writes parsed elements out to the handler -Parser.prototype.writeHandler = function(forceFlush){ - if(this._tagStack.length && !forceFlush) - return; +Parser.prototype.writeHandler = function(){ while (this._elements.length){ var element = this._elements.shift(); switch (element.type){ diff --git a/tests/23-template_script_tags.js b/tests/23-template_script_tags.js index 6e256d2..24864fd 100644 --- a/tests/23-template_script_tags.js +++ b/tests/23-template_script_tags.js @@ -3,13 +3,13 @@ exports.options = { handler: {} , parser: {} }; -exports.html = ""; +exports.html = ""; exports.expected = [ { raw: 'script type="text/template"', data: 'script type="text/template"', type: 'script', name: 'script', attribs: { type: 'text/template' }, children: - [ { raw: '

Heading1

', - data: '

Heading1

', + [ { raw: '

Heading1

', + data: '

Heading1

', type: 'text' } ] } ]; \ No newline at end of file From dbc23014abe7f831db3691224def8f56e308e0c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 5 Nov 2011 12:53:42 +0100 Subject: [PATCH 058/450] Made openTag & closeTag public methods Just deleted the "_" in front of them --- lib/DefaultHandler.js | 10 +++++----- lib/EventedHandler.js | 8 ++++---- lib/Parser.js | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 6f3b663..b42906f 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -5,7 +5,7 @@ function DefaultHandler(callback, options){ this._done = false; this._inSpecialTag = false; this._tagStack = []; - if(options){ + if(options){ //otherwise, the prototype is used this._options = options; if(typeof this._options.verbose === "undefined") this._options.verbose = true; @@ -52,7 +52,7 @@ DefaultHandler.prototype._isEmptyTag = function(name) { return this._options.enforceEmptyTags && emptyTags[name]; }; -DefaultHandler.prototype._closeTag = function(name){ +DefaultHandler.prototype.closeTag = function(name){ //Ignore closing tags that obviously don't have an opening tag if(!this._tagStack || this._isEmptyTag(name)) return; @@ -89,7 +89,7 @@ DefaultHandler.prototype._addDomElement = function(element){ } } -DefaultHandler.prototype._openTag = function(element){ +DefaultHandler.prototype.openTag = function(element){ if(!this._options.verbose) delete element.data; this._addDomElement(element); @@ -110,8 +110,8 @@ DefaultHandler.prototype.writeText = function(element){ DefaultHandler.prototype.writeComment = DefaultHandler.prototype.writeDirective = DefaultHandler.prototype._addDomElement; DefaultHandler.prototype.writeTag = function(element) { - if(element.name.charAt(0) === "/") this._closeTag(element.name.substr(1)); - else this._openTag(element); + if(element.name.charAt(0) === "/") this.closeTag(element.name.substr(1)); + else this.openTag(element); }; module.exports = DefaultHandler; \ No newline at end of file diff --git a/lib/EventedHandler.js b/lib/EventedHandler.js index 428992f..1ed2b02 100644 --- a/lib/EventedHandler.js +++ b/lib/EventedHandler.js @@ -42,13 +42,13 @@ var openTagCB = function(openTag, attribute){ //HTML Tags that shouldn't contain child nodes var emptyTags={area:true,base:true,basefont:true,br:true,col:true,frame:true,hr:true,img:true,input:true,isindex:true,link:true,meta:true,param:true,embed:true}; -EventedHandler.prototype._openTag = function(name, attrs){ +EventedHandler.prototype.openTag = function(name, attrs){ this.onopentag(name, attrs); if(emptyTags[name]) this.onclosetag(name); else this._stack.push(name); }; -EventedHandler.prototype._closeTag = function(name){ +EventedHandler.prototype.closeTag = function(name){ if(!emptyTags[name] && this._stack){ var i = this._stack.length-1; while(i !== -1 && this._stack[i--] !== name){}; @@ -64,9 +64,9 @@ EventedHandler.prototype._closeTag = function(name){ EventedHandler.prototype.writeTag = function(element){ if(element.name.charAt(0) === "/") - this._closeTag(element.name.substr(1)); + this.closeTag(element.name.substr(1)); else - this._openTag(element.name, element.attribs || {}); + this.openTag(element.name, element.attribs || {}); }; module.exports = EventedHandler; \ No newline at end of file diff --git a/lib/Parser.js b/lib/Parser.js index 7566fca..53a5cd7 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -286,14 +286,14 @@ Parser.prototype.parseTags = function(){ //case ElementType.Style: //case ElementType.Script: default: - if(elementName[0] === "/") this._handler._closeTag(elementName.substr(1)); - else this._handler._openTag(elementName, parseAttributes(elementData)); + if(elementName[0] === "/") this._handler.closeTag(elementName.substr(1)); + else this._handler.openTag(elementName, parseAttributes(elementData)); } */ //If tag self-terminates, add an explicit, separate closing tag if(tagTypes[elementType] && elementData.substr(-1) === "/"){ - //this._handler._closeTag(elementName); + //this._handler.closeTag(elementName); this._elements.push({ raw: elementName = "/" + elementName, data: elementName, name: elementName, From 000bd0231095924ef39a659a0e032e989cf6ab9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 5 Nov 2011 13:00:54 +0100 Subject: [PATCH 059/450] Improved _addDomElement --- lib/DefaultHandler.js | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index b42906f..6fc2bcd 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -65,25 +65,22 @@ DefaultHandler.prototype.closeTag = function(name){ DefaultHandler.prototype._addDomElement = function(element){ if(!this._options.verbose) delete element.raw; - var lastTag = this._tagStack[this._tagStack.length-1], tmp; + var lastTag = this._tagStack[this._tagStack.length-1], lastChild; if(!lastTag) this.dom.push(element); else{ //There are parent elements if(!lastTag.children){ lastTag.children = [element]; return; } - tmp = lastTag.children[lastTag.children.length-1]; - if(element.type === ElementType.Comment && tmp.type === ElementType.Comment){ - tmp.data += element.data; - if(this._options.verbose) tmp.raw = tmp.data; + lastChild = lastTag.children[lastTag.children.length-1]; + if(element.type === ElementType.Comment && lastChild.type === ElementType.Comment){ + lastChild.data += element.data; + if(this._options.verbose) lastChild.raw = lastChild.data; } - else if(this._inSpecialTag && element.type === ElementType.Text){ - if(tmp.type !== ElementType.Text) lastTag.children.push(element); - else { - tmp.data += element.data; - if(this._options.verbose) - tmp.raw = tmp.data; - } + else if(this._inSpecialTag && element.type === ElementType.Text && lastChild.type === ElementType.Text){ + lastChild.data += element.data; + if(this._options.verbose) + lastChild.raw = lastChild.data; } else lastTag.children.push(element); } From 6f2032f5d6f608aab8966cb9418322f9d95c0f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 5 Nov 2011 13:02:22 +0100 Subject: [PATCH 060/450] Set feeds to xmlMode because they are xml, so "; -exports.expected = -[ { raw: 'script' - , data: 'script' - , type: 'script' - , name: 'script' - , children: - [ { raw: 'var foo = 1;' - , data: 'var foo = 1;' - , type: 'comment' - } - ] +exports.expected = [ + { + "type": "script", + "name": "script", + "children": [ + { + "data": "var foo = 1;", + "type": "comment" + } + ] } ]; \ No newline at end of file diff --git a/tests/HTML/07-unescaped_in_style.js b/tests/HTML/07-unescaped_in_style.js index c5817fc..3784336 100644 --- a/tests/HTML/07-unescaped_in_style.js +++ b/tests/HTML/07-unescaped_in_style.js @@ -5,14 +5,11 @@ exports.options = { }; exports.html = ""; exports.expected = -[ { raw: 'style type="text/css"' - , data: 'style type="text/css"' - , type: 'style' +[ { type: 'style' , name: 'style' , attribs: { type: 'text/css' } , children: - [ { raw: '\n body > p\n { font-weight: bold; }' - , data: '\n body > p\n { font-weight: bold; }' + [ { data: '\n body > p\n { font-weight: bold; }' , type: 'text' } ] diff --git a/tests/HTML/11-text_outside_tags.js b/tests/HTML/11-text_outside_tags.js index ae40c76..d544b23 100644 --- a/tests/HTML/11-text_outside_tags.js +++ b/tests/HTML/11-text_outside_tags.js @@ -4,18 +4,17 @@ exports.options = { , parser: {} }; exports.html = "Line one\n
\nline two"; -exports.expected = -[ { raw: 'Line one\n' - , data: 'Line one\n' - , type: 'text' - } - , { raw: 'br' - , data: 'br' - , type: 'tag' - , name: 'br' - } - , { raw: '\nline two' - , data: '\nline two' - , type: 'text' +exports.expected = [ + { + "data": "Line one\n", + "type": "text" + }, + { + "type": "tag", + "name": "br" + }, + { + "data": "\nline two", + "type": "text" } ]; \ No newline at end of file diff --git a/tests/HTML/12-text_only.js b/tests/HTML/12-text_only.js index 9612840..45d774f 100644 --- a/tests/HTML/12-text_only.js +++ b/tests/HTML/12-text_only.js @@ -4,9 +4,9 @@ exports.options = { , parser: {} }; exports.html = "this is the text"; -exports.expected = -[ { raw: 'this is the text' - , data: 'this is the text' - , type: 'text' +exports.expected = [ + { + "data": "this is the text", + "type": "text" } ]; \ No newline at end of file diff --git a/tests/HTML/13-comment_in_text.js b/tests/HTML/13-comment_in_text.js index c40d891..46bd94d 100644 --- a/tests/HTML/13-comment_in_text.js +++ b/tests/HTML/13-comment_in_text.js @@ -4,17 +4,17 @@ exports.options = { , parser: {} }; exports.html = "this is the text"; -exports.expected = -[ { raw: 'this is ' - , data: 'this is ' - , type: 'text' - } -, { raw: ' the comment ' - , data: ' the comment ' - , type: 'comment' - } -, { raw: ' the text' - , data: ' the text' - , type: 'text' +exports.expected = [ + { + "data": "this is ", + "type": "text" + }, + { + "data": " the comment ", + "type": "comment" + }, + { + "data": " the text", + "type": "text" } ]; \ No newline at end of file diff --git a/tests/HTML/14-comment_in_text_in_script.js b/tests/HTML/14-comment_in_text_in_script.js index 8534610..c4fff65 100644 --- a/tests/HTML/14-comment_in_text_in_script.js +++ b/tests/HTML/14-comment_in_text_in_script.js @@ -4,25 +4,23 @@ exports.options = { , parser: {} }; exports.html = ""; -exports.expected = -[ { raw: 'script' - , data: 'script' - , type: 'script' - , name: 'script' - , children: - [ { raw: 'this is ' - , data: 'this is ' - , type: 'text' - } - , { raw: ' the comment ' - , data: ' the comment ' - , type: 'comment' - } - , { raw: ' the text' - , data: ' the text' - , type: 'text' - } - - ] +exports.expected = [ + { + "type": "script", + "name": "script", + "children": [ + { + "data": "this is ", + "type": "text" + }, + { + "data": " the comment ", + "type": "comment" + }, + { + "data": " the text", + "type": "text" + } + ] } ]; \ No newline at end of file diff --git a/tests/HTML/16-ignore_whitespace.js b/tests/HTML/16-ignore_whitespace.js index beb0f34..9049bd8 100644 --- a/tests/HTML/16-ignore_whitespace.js +++ b/tests/HTML/16-ignore_whitespace.js @@ -4,39 +4,35 @@ exports.options = { , parser: {} }; exports.html = "Line one\n
\t\n
\nline two\n
x
"; -exports.expected = -[ { raw: 'Line one\n' - , data: 'Line one\n' - , type: 'text' +exports.expected = [ + { + "data": "Line one\n", + "type": "text" + }, + { + "type": "tag", + "name": "br" + }, + { + "type": "tag", + "name": "br" + }, + { + "data": "\nline two", + "type": "text" + }, + { + "type": "tag", + "name": "font", + "children": [ + { + "type": "tag", + "name": "br" + }, + { + "data": " x ", + "type": "text" + } + ] } - , { raw: 'br' - , data: 'br' - , type: 'tag' - , name: 'br' - } - , { raw: 'br' - , data: 'br' - , type: 'tag' - , name: 'br' - } - , { raw: '\nline two' - , data: '\nline two' - , type: 'text' - } - , { raw: 'font' - , data: 'font' - , type: 'tag' - , name: 'font' - , children: - [ { raw: 'br' - , data: 'br' - , type: 'tag' - , name: 'br' - } - , { raw: ' x ' - , data: ' x ' - , type: 'text' - } - ] - } ]; \ No newline at end of file diff --git a/tests/HTML/17-xml_namespace.js b/tests/HTML/17-xml_namespace.js index a2c0d1f..2789a6e 100644 --- a/tests/HTML/17-xml_namespace.js +++ b/tests/HTML/17-xml_namespace.js @@ -4,6 +4,15 @@ exports.options = { , parser: {} }; exports.html = "text"; -exports.expected = - [ { raw: 'ns:tag', data: 'ns:tag', type: 'tag', name: 'ns:tag', children: [ { raw: 'text', data: 'text', type: 'text' } ] } - ]; \ No newline at end of file +exports.expected = [ + { + "type": "tag", + "name": "ns:tag", + "children": [ + { + "data": "text", + "type": "text" + } + ] + } +]; \ No newline at end of file diff --git a/tests/HTML/18-enforce_empty_tags.js b/tests/HTML/18-enforce_empty_tags.js index 01af3e3..131a353 100644 --- a/tests/HTML/18-enforce_empty_tags.js +++ b/tests/HTML/18-enforce_empty_tags.js @@ -4,8 +4,13 @@ exports.options = { , parser: {} }; exports.html = "text"; -exports.expected = - [ - { raw: 'link', data: 'link', type: 'tag', name: 'link' } - , { raw: 'text', data: 'text', type: 'text' } - ]; \ No newline at end of file +exports.expected = [ + { + "type": "tag", + "name": "link" + }, + { + "data": "text", + "type": "text" + } +]; \ No newline at end of file diff --git a/tests/HTML/19-ignore_empty_tags.js b/tests/HTML/19-ignore_empty_tags.js index abb508f..b50c086 100644 --- a/tests/HTML/19-ignore_empty_tags.js +++ b/tests/HTML/19-ignore_empty_tags.js @@ -4,9 +4,15 @@ exports.options = { , parser: {} }; exports.html = "text"; -exports.expected = - [ - { raw: 'link', data: 'link', type: 'tag', name: 'link', children: [ - { raw: 'text', data: 'text', type: 'text' } - ] } - ]; \ No newline at end of file +exports.expected = [ + { + "type": "tag", + "name": "link", + "children": [ + { + "data": "text", + "type": "text" + } + ] + } +]; \ No newline at end of file diff --git a/tests/HTML/23-template_script_tags.js b/tests/HTML/20-template_script_tags.js similarity index 63% rename from tests/HTML/23-template_script_tags.js rename to tests/HTML/20-template_script_tags.js index 24864fd..e6b7b74 100644 --- a/tests/HTML/23-template_script_tags.js +++ b/tests/HTML/20-template_script_tags.js @@ -4,12 +4,10 @@ exports.options = { , parser: {} }; exports.html = ""; -exports.expected = [ { raw: 'script type="text/template"', - data: 'script type="text/template"', +exports.expected = [ { type: 'script', name: 'script', attribs: { type: 'text/template' }, children: - [ { raw: '

Heading1

', - data: '

Heading1

', + [ { data: '

Heading1

', type: 'text' } ] } ]; \ No newline at end of file diff --git a/tests/HTML/22-position_data.js b/tests/HTML/22-position_data.js deleted file mode 100644 index b9fecb6..0000000 --- a/tests/HTML/22-position_data.js +++ /dev/null @@ -1,71 +0,0 @@ -exports.name = "Postion data"; -exports.options = { - handler: {} - , parser: { includeLocation: true } -}; -exports.html = "\r\n\n\tThe Title\nHello world\r\n\n\n\n"; -exports.expected = [ - { - raw: 'html', - data: 'html', - type: 'tag', - name: 'html', - location: { - line: 1, - col: 1 - }, - children: [{ - raw: '\r\n\n\t', - data: '\r\n\n\t', - type: 'text', - location: { - line: 1, - col: 7 - } - }, { - raw: 'title', - data: 'title', - type: 'tag', - name: 'title', - location: { - line: 3, - col: 2 - }, - children: [{ - raw: 'The Title', - data: 'The Title', - type: 'text', - location: { - line: 3, - col: 9 - } - }] - }, { - raw: 'body', - data: 'body', - type: 'tag', - name: 'body', - location: { - line: 3, - col: 26 - }, - children: [{ - raw: '\nHello world\r\n\n', - data: '\nHello world\r\n\n', - type: 'text', - location: { - line: 3, - col: 32 - } - }] - }, { - raw: '\n\n', - data: '\n\n', - type: 'text', - location: { - line: 6, - col: 8 - } - }] - } - ]; \ No newline at end of file From 2206351f9f1003512fb9d03dc069e07ae34ad080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 11 Nov 2011 19:18:12 +0100 Subject: [PATCH 083/450] Use the new name inside FeedHandler --- lib/FeedHandler.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/FeedHandler.js b/lib/FeedHandler.js index 2b82b1b..878992c 100644 --- a/lib/FeedHandler.js +++ b/lib/FeedHandler.js @@ -3,11 +3,11 @@ var DefaultHandler = require("./DefaultHandler.js"), inherits = require("util").inherits; //TODO: make this a trully streamable handler -function RssHandler(callback){ +function FeedHandler(callback){ DefaultHandler.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false }); } -inherits(RssHandler, DefaultHandler); +inherits(FeedHandler, DefaultHandler); function getElements(what, where, one, recurse){ if(one) return DomUtils.getElementsByTagName(what, where, recurse, 1)[0]; @@ -23,7 +23,7 @@ var isValidFeed = function(value) { return value === "rss" || value === "feed" || value === "rdf:RDF"; } -RssHandler.prototype.done = function() { +FeedHandler.prototype.done = function() { var feed = {}; var feedRoot; var tmp, items, childs; @@ -102,4 +102,4 @@ RssHandler.prototype.done = function() { DefaultHandler.prototype.handleCallback.call(this); }; -module.exports = RssHandler; \ No newline at end of file +module.exports = FeedHandler; \ No newline at end of file From 819069ce475c5424f368b1b00102d08351fa8f55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 11 Nov 2011 19:20:55 +0100 Subject: [PATCH 084/450] Updated readme --- README.md | 64 ++----------------------------------------------------- 1 file changed, 2 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index f7daadb..6e950cc 100644 --- a/README.md +++ b/README.md @@ -58,8 +58,8 @@ Besides, it features an additional handler that provides the interface of [sax.j } parser.done(); -##Parsing RSS/Atom Feeds - new htmlparser.RssHandler(function (error, dom) { +##Parsing RSS/RDF/Atom Feeds + new htmlparser.FeedHandler(function (error, dom) { ... }); @@ -78,7 +78,6 @@ Indicates whether ` text ") +); + +var dom = handler.dom; + +exports.dir = "./DomUtils/"; + +exports.test = function(test, cb){ + cb(null, test.getElements(dom)); + cb(null, test.getByFunction(dom)); +}; \ No newline at end of file diff --git a/tests/DomUtils/01-by_id.js b/tests/DomUtils/01-by_id.js new file mode 100644 index 0000000..578257f --- /dev/null +++ b/tests/DomUtils/01-by_id.js @@ -0,0 +1,54 @@ +var DomUtils = require("../../lib/DomUtils.js"); + +exports.name = "Get element by id"; +exports.getElements = function(dom){ + return DomUtils.getElements({id:"asdf"}, dom, true, 1)[0]; +}; +exports.getByFunction = function(dom){ + return DomUtils.getElementById("asdf", dom, true); +}; +exports.expected = { + "type": "tag", + "name": "tag1", + "attribs": { + "id": "asdf" + }, + "children": [ + { + "data": " ", + "type": "text" + }, + { + "type": "script", + "name": "script", + "children": [ + { + "data": "text", + "type": "text" + } + ] + }, + { + "data": " ", + "type": "text" + }, + { + "data": " comment ", + "type": "comment" + }, + { + "data": " ", + "type": "text" + }, + { + "type": "tag", + "name": "tag2", + "children": [ + { + "data": " text ", + "type": "text" + } + ] + } + ] +}; \ No newline at end of file diff --git a/tests/DomUtils/02-by_tagname.js b/tests/DomUtils/02-by_tagname.js new file mode 100644 index 0000000..280414e --- /dev/null +++ b/tests/DomUtils/02-by_tagname.js @@ -0,0 +1,22 @@ +var DomUtils = require("../../lib/DomUtils.js"); + +exports.name = "Get elements by tagName"; +exports.getElements = function(dom){ + return DomUtils.getElements({tag_name:"tag2"}, dom, true); +}; +exports.getByFunction = function(dom){ + return DomUtils.getElementsByTagName("tag2", dom, true); +}; +exports.expected = []; +for(var i = 0; i < 20; i++) exports.expected.push( + { + "type": "tag", + "name": "tag2", + "children": [ + { + "data": " text ", + "type": "text" + } + ] + } +); \ No newline at end of file diff --git a/tests/DomUtils/03-by_type.js b/tests/DomUtils/03-by_type.js new file mode 100644 index 0000000..16a3971 --- /dev/null +++ b/tests/DomUtils/03-by_type.js @@ -0,0 +1,22 @@ +var DomUtils = require("../../lib/DomUtils.js"); + +exports.name = "Get elements by type"; +exports.getElements = function(dom){ + return DomUtils.getElements({tag_type:"script"}, dom, true); +}; +exports.getByFunction = function(dom){ + return DomUtils.getElementsByTagType("script", dom, true); +}; +exports.expected = []; +for(var i = 0; i < 20; i++) exports.expected.push( + { + "type": "script", + "name": "script", + "children": [ + { + "data": "text", + "type": "text" + } + ] + } +); \ No newline at end of file From 05d1e2ec2c6addc2ce8b480907e3975386603675 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 13 Nov 2011 14:06:52 +0100 Subject: [PATCH 098/450] Added force option to parseTags, removed logic from done --- lib/Parser.js | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index be00b19..34d7d54 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -21,7 +21,6 @@ var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an el var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;//" Parser.prototype._options = { - includeLocation: false, //Do not track element position in document by default xmlMode: false //Special behaviour for script/style tags by default }; @@ -46,22 +45,13 @@ Parser.prototype.done = function(){ if(this._done) return; this._done = true; - //Push any unparsed text into a final element in the element list - if(this._buffer){ - var data = this._buffer; - if(this._parseState === ElementType.Tag){ - data = data.trim(); - var name = parseTagName(data); - if(name.charAt(0) === "/"){ - if(this._cbs.onclosetag) this._cbs.onclosetag(name.substr(1)); - } else if(this._cbs.onopentag){ - this._cbs.onopentag(name, parseAttributes(data), ElementType.Tag); - } - } - else if(this._cbs.ontext) this._cbs.ontext(data); - - this._buffer = ""; + //Parse the buffer to its end + if(this._buffer) this.parseTags(true); + + if(this._cbs.onclosetag){ + while(this._stack.length) this._cbs.onclosetag(this._stack.pop()); } + if(this._cbs.onend) this._cbs.onend(); }; @@ -107,15 +97,18 @@ SpecialTags.w = 4; //2^2 - if set, append prev tag sep to data SpecialTags[ElementType.Comment] = 8; //2^3 //Parses through HTML text and returns an array of found elements -Parser.prototype.parseTags = function(){ +Parser.prototype.parseTags = function(force){ var buffer = this._buffer, current = 0; var next, tagSep, rawData, elementName, prevElement, elementType, elementData, attributes; var opening = buffer.indexOf("<"), closing = buffer.indexOf(">"); + //if force is true, parse everything + if(force) opening = Infinity; + while(opening !== closing){ //just false if both are -1 - if(closing === -1 || (opening !== -1 && opening < closing)){ + if((opening !== -1 && opening < closing) || closing === -1){ next = opening; tagSep = "<"; opening = buffer.indexOf(tagSep, next + 1); @@ -229,7 +222,7 @@ Parser.prototype._processCloseTag = function(name){ this._cbs.onclosetag(this._stack.pop()); } else this._stack.splice(i); - } + } //many browsers (eg. Safari, Chrome) convert
to
else if(name === "br" && !this._options.xmlMode) this._processOpenTag(name, "/"); From 9b1c606c413ad7a625260eae81f63c72ff287a64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 13 Nov 2011 14:33:32 +0100 Subject: [PATCH 099/450] Added option to convert tag names to lower case Also renamed parseTags to _parseTags (so it's officially a private function) and added write as another name for parseChunk --- lib/Parser.js | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 34d7d54..307193c 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -14,14 +14,12 @@ function Parser(cbs, options){ //**"Static"**// //Regular expressions used for cleaning up and parsing (stateless) -var _reWhitespace = /\s/; //Used to find any whitespace to split on -var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element - -//Find attributes in a tag +var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //matches tagnames var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;//" Parser.prototype._options = { - xmlMode: false //Special behaviour for script/style tags by default + xmlMode: false, //Special behaviour for script/style tags by default + lowerCaseTags: false //call .toLowerCase for each tagname }; //**Public**// @@ -34,10 +32,11 @@ Parser.prototype.parseComplete = function(data){ }; //Parses a piece of an HTML document +Parser.prototype.write = Parser.prototype.parseChunk = function(data){ if(this._done) this._handleError(Error("Attempted to parse chunk after parsing already done")); this._buffer += data; //FIXME: this can be a bottleneck - this.parseTags(); + this._parseTags(); }; //Tells the parser that the HTML being parsed is complete @@ -46,7 +45,7 @@ Parser.prototype.done = function(){ this._done = true; //Parse the buffer to its end - if(this._buffer) this.parseTags(true); + if(this._buffer) this._parseTags(true); if(this._cbs.onclosetag){ while(this._stack.length) this._cbs.onclosetag(this._stack.pop()); @@ -64,7 +63,7 @@ Parser.prototype.reset = function(){ //**Private**// //Takes an element and adds an "attribs" property for any element attributes found var parseAttributes = function(data){ - var pos = data.search(_reWhitespace); + var pos = data.search(/\s/); //Find any whitespace if(pos === -1) return; var attribRaw = data.substr(pos); @@ -82,10 +81,13 @@ var parseAttributes = function(data){ }; //Extracts the base tag name from the data value of an element -var parseTagName = function(data){ +Parser.prototype._parseTagName = function(data){ var match = data.match(_reTagName); if(match === null) return ""; - return match[1] + match[2]; + if(this._options.lowerCaseTags){ + return match[1] + match[2].toLowerCase(); + } + else return match[1] + match[2]; }; //Special tags that are threated differently @@ -97,7 +99,7 @@ SpecialTags.w = 4; //2^2 - if set, append prev tag sep to data SpecialTags[ElementType.Comment] = 8; //2^3 //Parses through HTML text and returns an array of found elements -Parser.prototype.parseTags = function(force){ +Parser.prototype._parseTags = function(force){ var buffer = this._buffer, current = 0; var next, tagSep, rawData, elementName, prevElement, elementType, elementData, attributes; @@ -127,7 +129,7 @@ Parser.prototype.parseTags = function(force){ if(elementType === ElementType.Tag){ elementData = rawData.trim(); - elementName = parseTagName(elementData); + elementName = this._parseTagName(elementData); } else{ elementData = rawData; From dc2a3b4ff89db44674eff16a5f110a36eb477028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 22 Nov 2011 19:05:55 +0100 Subject: [PATCH 100/450] Added a prototype for a new FeedHandler (not finished yet!) --- lib/_FeedHandler.js | 100 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 lib/_FeedHandler.js diff --git a/lib/_FeedHandler.js b/lib/_FeedHandler.js new file mode 100644 index 0000000..5f8adeb --- /dev/null +++ b/lib/_FeedHandler.js @@ -0,0 +1,100 @@ +// NOT FINISHED YET! DON'T USE IT! + +//opening tags +var searchRoot = function(tagName){ + if(tagName === "rss" || tagName === "rdf:RDF" || tagName === "feed"){ + if(tagName === "rdf:RDF") this.feed.type = "rdf"; + else this.feed.type = tagName; + this._map = RssFeedMap; + this.onopentag = getChannelElement; + } + else if(tagName === "feed"){ + this.feed.type = "atom"; + this._map = AtomFeedMap; + this.onclosetag = getFeedElements; + this.ontext = writeText; + this.onopentag = getOpenTag; + } +} + +var getChannelElement = function(tagName){ + if(tagName === "channel"){ + this.onopentag = getOpenTag; + this.onclosetag = getFeedElements; + this.ontext = writeText; + } +} + +var getOpenTag = function(tagName, attribs){ + this._level += 1; + if(tagName === this._childName){ + if(this._feed.type === "atom"){ + } + else{ + + } + } else if(tagName === "link" && this._level === 1 + && this._feed.type === "atom" && attribs.href){ + this.feed.link = attribs.href; + } +}; + +//text +var writeText = function(text){ + if(this._stack[this._level]){ + this._stack[this._level] += text; + } else this._stack[this._level] = text; +}; + +//closing tags +var getFeedElements = function(tagName){ + var text = this._stack.pop(); + if(this._level-- === 1){ + var elemName = this._map[tagName]; + if(elemName){ + if(elemName === "updated") text = Date(text); + this._feed[elemName] = text; + } + } +}; + +//mappings +var RssFeedMap = { + title: "title", + link: "link", + description: "description", + lastBuildDate: "updated", + managingEditor: "author"/*, + item: "item"*/ +}; + +var AtomFeedMap = { + id: "id", + title: "title", + subtitle: "description", + updated: "updated", + email: "author"/*, + entry: "item"*/ +}; + +//TODO: make this a trully streamable handler +function FeedHandler(callback, onitem){ + this.onopentag = searchRoot; + this.feed = { + type: null, + id: "", + title: null, + link: null, + description: null, + updated: null, + author: null, + items: [] + }; + this._level = 0; + this._stack = []; + this._map = null; + this.onend = callback; + this.onitem = onitem; //called when a new item was found +} + +module.exports = FeedHandler; \ No newline at end of file From 7ccffcffd97751c97e02c03e3dfffa8c14b589ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 22 Nov 2011 19:06:34 +0100 Subject: [PATCH 101/450] Removed EventedHandler from index.js --- lib/index.js | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/index.js b/lib/index.js index 2ceb6eb..85a26a6 100644 --- a/lib/index.js +++ b/lib/index.js @@ -2,7 +2,6 @@ module.exports = { Parser: require("./Parser.js"), DefaultHandler: require("./DefaultHandler.js"), FeedHandler: require("./FeedHandler.js"), - EventedHandler: require("./EventedHandler.js"), ElementType: require("./ElementType.js"), DomUtils: require("./DomUtils.js") } \ No newline at end of file From df8f48eedd08a4952246ba9cb9faf9d66d4e9f01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 22 Nov 2011 19:06:50 +0100 Subject: [PATCH 102/450] Added callbacks to prototype --- lib/Parser.js | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/Parser.js b/lib/Parser.js index 307193c..47610f0 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -22,6 +22,16 @@ Parser.prototype._options = { lowerCaseTags: false //call .toLowerCase for each tagname }; +Parser.prototype._cbs = { + /* + onopentag, + onclosetag, + ontext, + onprocessinginstruction, + oncomment + */ +}; + //**Public**// //Methods// //Parses a complete HTML and pushes it to the handler From 0073cecb3858453678655867dbc3dd2bce0570b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 22 Nov 2011 19:07:50 +0100 Subject: [PATCH 103/450] Restructured some code in FeedHandler --- lib/FeedHandler.js | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/FeedHandler.js b/lib/FeedHandler.js index 331611d..c72e18c 100644 --- a/lib/FeedHandler.js +++ b/lib/FeedHandler.js @@ -1,13 +1,14 @@ var DefaultHandler = require("./DefaultHandler.js"), - DomUtils = require("./DomUtils.js"), - inherits = require("util").inherits; + DomUtils = require("./DomUtils.js"); //TODO: make this a streamable handler function FeedHandler(callback){ - DefaultHandler.call(this, callback, { ignoreWhitespace: true }); + this.init(callback, { ignoreWhitespace: true }); } -inherits(FeedHandler, DefaultHandler); +require("util").inherits(FeedHandler, DefaultHandler); + +FeedHandler.prototype.init = DefaultHandler; function getElements(what, where, one, recurse){ if(one) return DomUtils.getElementsByTagName(what, where, recurse, 1)[0]; From 3ed5cc4d0d7124c1db8b464d3f0f2e2d9d6380da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Wed, 23 Nov 2011 20:23:29 +0100 Subject: [PATCH 104/450] Updated readme concerning verbose output --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 7356f86..bc195ff 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ This is a fork of the project above. The main difference is that this is just in Besides, it features an additional handler that provides the interface of [sax.js](https://github.com/isaacs/sax-js) (written for my readability port [readabilitySAX](https://github.com/fb55/readabilitysax)). I also fixed a couple of bugs & included some pull requests for the original project (eg. [RDF feed support](https://github.com/tautologistics/node-htmlparser/pull/35)). +The support for location data and verbose output was removed a couple of versions ago. It's still available in [this earlier version](https://github.com/FB55/node-htmlparser/tree/e1ae2b231c66caf75ca9b1328925e0cf95bfecc2) of htmlparser2 (if you really need it, for whatever reason that may be). + ##Usage var htmlparser = require("htmlparser"); From 2750286aa5fe8703234098b58df406be4788ee3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 25 Nov 2011 08:58:17 +0100 Subject: [PATCH 105/450] removed unused vars --- lib/Parser.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Parser.js b/lib/Parser.js index 47610f0..b37a78f 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -112,7 +112,7 @@ SpecialTags[ElementType.Comment] = 8; //2^3 Parser.prototype._parseTags = function(force){ var buffer = this._buffer, current = 0; - var next, tagSep, rawData, elementName, prevElement, elementType, elementData, attributes; + var next, tagSep, rawData, elementName, elementType, elementData; var opening = buffer.indexOf("<"), closing = buffer.indexOf(">"); From 0ad61f63f81ff7d54a863cbf03bda403f09e52d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 25 Nov 2011 09:18:36 +0100 Subject: [PATCH 106/450] Restructured some code --- lib/DefaultHandler.js | 89 ++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 77cd08d..8c389d6 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -5,84 +5,111 @@ function DefaultHandler(callback, options){ this._done = false; this._inSpecialTag = false; this._tagStack = []; - if(options){ //otherwise, the prototype is used - this._options = options; - } + if(options) this._options = options; //otherwise, the prototype is used if(callback) this._callback = callback; } //default options DefaultHandler.prototype._options = { - ignoreWhitespace: false,//Keep whitespace-only text nodes + ignoreWhitespace: false //Keep whitespace-only text nodes }; -//**Public**// -//Methods// //Resets the handler back to starting state DefaultHandler.prototype.onreset = DefaultHandler; //Signals the handler that parsing is done -DefaultHandler.prototype.onend = function() { +DefaultHandler.prototype.onend = function(){ if(this._done) return; this._done = true; this._handleCallback(null); }; -//Methods// -DefaultHandler.prototype.onerror = -DefaultHandler.prototype._handleCallback = function(error){ - if(typeof this._callback === "function") - this._callback(error, this.dom); - else if(error) throw error; +DefaultHandler.prototype.onerror = function(error){ + if(typeof this._callback === "function"){ + return this._callback(error, this.dom); + } else { + if(error) throw error; + } }; +DefaultHandler.prototype._handleCallback = DefaultHandler.prototype.onerror; + DefaultHandler.prototype.onclosetag = function(name){ this._tagStack.pop(); }; DefaultHandler.prototype._addDomElement = function(element){ - var lastTag = this._tagStack[this._tagStack.length-1], lastChild; - if(!lastTag) this.dom.push(element); - else{ //There are parent elements + var lastChild, + lastTag = this._tagStack[this._tagStack.length - 1]; + + if(lastTag){ //There are parent elements if(!lastTag.children){ lastTag.children = [element]; return; } - lastChild = lastTag.children[lastTag.children.length-1]; + lastChild = lastTag.children[lastTag.children.length - 1]; if(this._inSpecialTag && element.type === ElementType.Text && lastChild.type === ElementType.Text){ lastChild.data += element.data; + } else { + lastTag.children.push(element); } - else lastTag.children.push(element); + } + else { + this.dom.push(element); } }; DefaultHandler.prototype.onopentag = function(name, attribs, type){ - if(type === ElementType.Script || type === ElementType.Style) this._inSpecialTag = true; - - var element = {type:type, name:name, attribs:attribs}; + if(type === ElementType.Script || type === ElementType.Style){ + this._inSpecialTag = true; + } + var element = { + type: type, + name: name, + attribs: attribs + }; this._addDomElement(element); this._tagStack.push(element); }; DefaultHandler.prototype.ontext = function(data){ if(this._options.ignoreWhitespace && data.trim() === "") return; - this._addDomElement({data:data, type:ElementType.Text}); + this._addDomElement({ + data: data, + type: ElementType.Text + }); }; DefaultHandler.prototype.oncomment = function(data){ - var lastTag = this._tagStack[this._tagStack.length-1], element, - lastChild = lastTag && lastTag.children && lastTag.children[lastTag.children.length-1]; + var lastTag = this._tagStack[this._tagStack.length - 1]; + var lastChild = lastTag && lastTag.children && lastTag.children[lastTag.children.length - 1]; + + var element; if(!lastChild || lastChild.type !== ElementType.Comment){ - element = {data:data, type: ElementType.Comment}; - if(!lastTag) this.dom.push(element); - else if(!lastChild) lastTag.children = [element]; - else if(lastChild.type !== ElementType.Comment) lastTag.children.push(element); + element = { + data: data, + type: ElementType.Comment + }; + if(!lastTag){ + return this.dom.push(element); + } else if(!lastChild){ + lastTag.children = [element]; + } else { + if(lastChild.type !== ElementType.Comment){ + lastTag.children.push(element); + } + } + } else { + lastChild.data += data; } - else lastChild.data += data; -} +}; DefaultHandler.prototype.onprocessinginstruction = function(name, data){ - this._addDomElement({name:name, data:data, type:ElementType.Directive}); + this._addDomElement({ + name: name, + data: data, + type: ElementType.Directive + }); }; module.exports = DefaultHandler; \ No newline at end of file From 3ce24f4c3eae804918fe0b83dcb37885059206f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 25 Nov 2011 10:56:23 +0100 Subject: [PATCH 107/450] Updated readme --- package.json | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/package.json b/package.json index e394215..27bb103 100644 --- a/package.json +++ b/package.json @@ -1,22 +1,29 @@ { - "name": "htmlparser2" - , "description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface (EventedHandler)." - , "version": "1.5.0" - , "author": "Felix Boehm " - , "contributors": [ "Chris Winberry " ] - , "repository": { - "type": "git" - , "url": "git://github.com/fb55/node-htmlparser.git" - } - , "bugs": { - "mail": "me@feedic.com" - , "url": "http://github.com/fb55/node-htmlparser/issues" - } - , "directories": { "lib": "./lib/" } - , "main": "./lib/" - , "engines": { "node": ">0" } - , "licenses": [{ - "type": "MIT" - , "url": "http://github.com/tautologistics/node-htmlparser/raw/master/LICENSE" + "name": "htmlparser2", + "description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface.", + "version": "1.9.0", + "author": "Felix Boehm ", + "contributors": ["Chris Winberry "], + "repository": { + "type": "git", + "url": "git://github.com/fb55/node-htmlparser.git" + }, + "bugs": { + "mail": "me@feedic.com", + "url": "http://github.com/fb55/node-htmlparser/issues" + }, + "directories": { + "lib": "./lib/" + }, + "main": "./lib/", + "scripts": { + "test": "make test" + }, + "engines": { + "node": ">0" + }, + "licenses": [{ + "type": "MIT", + "url": "http://github.com/tautologistics/node-htmlparser/raw/master/LICENSE" }] } \ No newline at end of file From 2f68f49413cf534f060e36dfd8153cce69718a0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 25 Nov 2011 10:59:13 +0100 Subject: [PATCH 108/450] Throw if there was an error --- tests/00-runtests.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/00-runtests.js b/tests/00-runtests.js index a1ccc67..b1fd7f5 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -55,4 +55,8 @@ function runTests(test){ //log the results console.log("Total time:", totalTime); console.log("Total tests:", testCount); -console.log("Failed tests:", failCount); \ No newline at end of file +console.log("Failed tests:", failCount); + +if(failCount !== 0){ + throw Error("Encountered " + failCount + " errors!"); +} \ No newline at end of file From 51a1370ded8f8416160f4c8657a93e2786750cf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 25 Nov 2011 11:00:51 +0100 Subject: [PATCH 109/450] Added .travis.yml required for http://travis-ci.org/ --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..381c985 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,4 @@ +language: node_js +node_js: + - 0.4 + - 0.6 \ No newline at end of file From a813aed7202e1e213dc9d73c94fc79cf96b73307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 25 Nov 2011 11:04:32 +0100 Subject: [PATCH 110/450] Fixed tests dir in package.json --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 27bb103..f6ea609 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,7 @@ }, "main": "./lib/", "scripts": { - "test": "make test" + "test": "cd tests && node 00-runtests.js" }, "engines": { "node": ">0" From fe24ee7f70cf1833b0128738d793209156cf4287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 25 Nov 2011 11:12:02 +0100 Subject: [PATCH 111/450] Added Travis status to readme --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index bc195ff..41688df 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,10 @@ A forgiving HTML/XML/RSS parser written in JS for NodeJS. The parser can handle ##Running Tests node tests/00-runtests.js +This project is linked to [Travis CI](http://travis-ci.org/). The latest builds status is: + +[![Build Status](https://secure.travis-ci.org/FB55/node-htmlparser.png)](http://travis-ci.org/FB55/node-htmlparser) + ##How is this different from [node-htmlparser](https://github.com/tautologistics/node-htmlparser)? This is a fork of the project above. The main difference is that this is just intended to be used with node. Besides, the code is much better structured, has less duplications and is remarkably faster than the original. From af9023bc5bf8276b84e5876a8d896f7535aef0ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 25 Nov 2011 11:24:04 +0100 Subject: [PATCH 112/450] Updated readme, moved options to wiki --- README.md | 98 +++---------------------------------------------------- 1 file changed, 5 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 41688df..56a1efe 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,9 @@ This project is linked to [Travis CI](http://travis-ci.org/). The latest builds ##How is this different from [node-htmlparser](https://github.com/tautologistics/node-htmlparser)? This is a fork of the project above. The main difference is that this is just intended to be used with node. Besides, the code is much better structured, has less duplications and is remarkably faster than the original. -Besides, it features an additional handler that provides the interface of [sax.js](https://github.com/isaacs/sax-js) (written for my readability port [readabilitySAX](https://github.com/fb55/readabilitysax)). I also fixed a couple of bugs & included some pull requests for the original project (eg. [RDF feed support](https://github.com/tautologistics/node-htmlparser/pull/35)). +Besides, the parser now provides the interface of [sax.js](https://github.com/isaacs/sax-js) (originally intended for my readability port [readabilitySAX](https://github.com/fb55/readabilitysax)). I also fixed a couple of bugs & included some pull requests for the original project (eg. [RDF feed support](https://github.com/tautologistics/node-htmlparser/pull/35)). -The support for location data and verbose output was removed a couple of versions ago. It's still available in [this earlier version](https://github.com/FB55/node-htmlparser/tree/e1ae2b231c66caf75ca9b1328925e0cf95bfecc2) of htmlparser2 (if you really need it, for whatever reason that may be). +The support for location data and verbose output was removed a couple of versions ago. It's still available in the [verbose branch](https://github.com/FB55/node-htmlparser/tree/verbose) (if you really need it, for whatever reason that may be). ##Usage @@ -64,94 +64,6 @@ The support for location data and verbose output was removed a couple of version ... }); -##Parser options - -###Usage - var Parser = new htmlparser.Parser(handler, options); - -###Option: includeLocation -Indicates whether the parser should include the location of a token as part of it. Default: false. - -###Option: xmlMode -Indicates whether `*/ - Style: "style", /*Special tag */ - Tag: "tag" /*Any tag that isn't special*/ + Text: "text", //Plain text + Directive: "directive", //Special tag + Comment: "comment", //Special tag + Script: "script", //Special tag + Style: "style", //Special tag + Tag: "tag" //Any tag that isn't special }; \ No newline at end of file diff --git a/lib/_FeedHandler.js b/lib/_FeedHandler.js index 5f8adeb..73059d4 100644 --- a/lib/_FeedHandler.js +++ b/lib/_FeedHandler.js @@ -68,6 +68,10 @@ var RssFeedMap = { item: "item"*/ }; +var RssItemMap = { + +}; + var AtomFeedMap = { id: "id", title: "title", @@ -77,6 +81,10 @@ var AtomFeedMap = { entry: "item"*/ }; +var AtomItemMap = { + +}; + //TODO: make this a trully streamable handler function FeedHandler(callback, onitem){ this.onopentag = searchRoot; From 2f66caeca5882b44c25896740ec0b6446feace6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 9 Dec 2011 09:46:44 +0100 Subject: [PATCH 123/450] Added oncommentend event, fixed #2 + added test --- lib/DefaultHandler.js | 38 +++++++++++++-------------- lib/Parser.js | 7 +++-- tests/HTML/21-conditional_comments.js | 16 +++++++++++ 3 files changed, 38 insertions(+), 23 deletions(-) create mode 100644 tests/HTML/21-conditional_comments.js diff --git a/lib/DefaultHandler.js b/lib/DefaultHandler.js index 89fbebb..4380e0f 100644 --- a/lib/DefaultHandler.js +++ b/lib/DefaultHandler.js @@ -86,26 +86,26 @@ DefaultHandler.prototype.ontext = function(data){ DefaultHandler.prototype.oncomment = function(data){ var lastTag = this._tagStack[this._tagStack.length - 1]; - var lastChild = lastTag && lastTag.children && lastTag.children[lastTag.children.length - 1]; - - var element; - if(!lastChild || lastChild.type !== ElementType.Comment){ - element = { - data: data, - type: ElementType.Comment - }; - if(!lastTag){ - return this.dom.push(element); - } else if(!lastChild){ - lastTag.children = [element]; - } else { - if(lastChild.type !== ElementType.Comment){ - lastTag.children.push(element); - } - } - } else { - lastChild.data += data; + + if(lastTag && lastTag.type === ElementType.Comment){ + lastTag.data += data; + return; } + + var element = { + data: data, + type: ElementType.Comment + }; + + if(!lastTag) this.dom.push(element); + else if(!lastTag.children) lastTag.children = [element]; + else lastTag.children.push(element); + + this._tagStack.push(element); +}; + +DefaultHandler.prototype.oncommentend = function(){ + this._tagStack.pop(); }; DefaultHandler.prototype.onprocessinginstruction = function(name, data){ diff --git a/lib/Parser.js b/lib/Parser.js index beeef93..323699c 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -211,11 +211,10 @@ Parser.prototype._processComment = function(rawData, tagSep){ if(tagSep === ">" && rawData.substr(-2) === "--"){ //comment ends //remove the written flag (also removes the comment flag) this._contentFlags %= SpecialTags.w; - rawData = rawData.slice(0, -2); + if(this._cbs.oncomment) this._cbs.oncomment(rawData.slice(0, -2)); + if(this._cbs.oncommentend) this._cbs.oncommentend(); } - else rawData += tagSep; - - if(this._cbs.oncomment) this._cbs.oncomment(rawData); + else if(this._cbs.oncomment) this._cbs.oncomment(rawData + tagSep); }; var emptyTags = require("./ClosingTags.js").self; diff --git a/tests/HTML/21-conditional_comments.js b/tests/HTML/21-conditional_comments.js new file mode 100644 index 0000000..583981e --- /dev/null +++ b/tests/HTML/21-conditional_comments.js @@ -0,0 +1,16 @@ +exports.name = "Conditional comments"; +exports.options = { + handler: {} + , parser: {} +}; +exports.html = ""; +exports.expected = [ + { + "data": "[if lt IE 7]> Date: Sun, 11 Dec 2011 13:27:10 +0100 Subject: [PATCH 124/450] Added Parser#end as an alias for #done --- lib/Parser.js | 7 ++++--- package.json | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 323699c..30a5aa2 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -37,8 +37,8 @@ Parser.prototype._cbs = { //Parses a complete HTML and pushes it to the handler Parser.prototype.parseComplete = function(data){ this.reset(); - this.parseChunk(data); - this.done(); + this.write(data); + this.end(); }; //Parses a piece of an HTML document @@ -50,11 +50,12 @@ Parser.prototype.parseChunk = function(data){ }; //Tells the parser that the HTML being parsed is complete -Parser.prototype.done = function(){ +Parser.prototype.end = Parser.prototype.done = function(chunk){ if(this._done) return; this._done = true; //Parse the buffer to its end + if(chunk) this._buffer += chunk; if(this._buffer) this._parseTags(true); if(this._cbs.onclosetag){ diff --git a/package.json b/package.json index 60ccf7f..cdf5a46 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "htmlparser2", "description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface.", - "version": "2.0.0", + "version": "2.0.2", "author": "Felix Boehm ", "contributors": ["Chris Winberry "], "repository": { From 48cae534208c7009908e96b301b5c8273d34db68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Thu, 15 Dec 2011 20:47:37 +0100 Subject: [PATCH 125/450] Accept malformed directives --- lib/Parser.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 30a5aa2..2db3cff 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -14,7 +14,7 @@ function Parser(cbs, options){ //**"Static"**// //Regular expressions used for cleaning up and parsing (stateless) -var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //matches tagnames +var _reTagName = /^\s*([\/\?\!]?)\s*([^\s\/]+)/; //matches tagnames var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;//" Parser.prototype._options = { @@ -74,8 +74,8 @@ Parser.prototype.reset = function(){ //**Private**// //Takes an element and adds an "attribs" property for any element attributes found var parseAttributes = function(data){ - var pos = data.search(/\s/), attrs = {}; //Find any whitespace - if(pos === -1) return attrs; + var pos = data.search(/\w\s/) + 1, attrs = {}; //Find any whitespace + if(pos === 0) return attrs; var attribRaw = data.substr(pos); _reAttrib.lastIndex = 0; From af54c2870d488cfc1a4e7b01916f44568062993d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Thu, 15 Dec 2011 20:48:22 +0100 Subject: [PATCH 126/450] Added getInnerHTML & getOuterHTML methods to DomUtils For #3 --- lib/DomUtils.js | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/lib/DomUtils.js b/lib/DomUtils.js index 6b3bb7d..5ef7e75 100644 --- a/lib/DomUtils.js +++ b/lib/DomUtils.js @@ -97,5 +97,38 @@ module.exports = { return filter(function(elem){return type(elem.type);}, element, recurse, limit); } else return filter(function(elem){return elem.type === type;}, element, recurse, limit); + }, + + getInnerHTML: function(elem){ + if(!elem.children) return ""; + + var childs = elem.children, + childNum = childs.length, + ret = ""; + + for(var i = 0; i < childNum; i++){ + ret += this.getOuterHTML(childs[i]); + } + + return ret; + }, + + getOuterHTML: function(elem){ + var type = elem.type; + + if(type === "text") return elem.data; + if(type === "comment") return ""; + + var ret = "<" + elem.name; + + for(var i in elem.attribs){ + ret += " " + i + "=\"" + elem.attribs[i] + "\""; + } + + ret += ">"; + + if(type === "directive") return ret; + + return ret + this.getInnerHTML(elem) + ""; } }; \ No newline at end of file From d6469cf0f1650311911092ca2919b85286f226e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 16 Dec 2011 11:07:29 +0100 Subject: [PATCH 127/450] Make empty tags self-closing, handle attributes better in DomUtils --- lib/DomUtils.js | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/lib/DomUtils.js b/lib/DomUtils.js index 5ef7e75..1b55ea9 100644 --- a/lib/DomUtils.js +++ b/lib/DomUtils.js @@ -116,19 +116,24 @@ module.exports = { getOuterHTML: function(elem){ var type = elem.type; - if(type === "text") return elem.data; - if(type === "comment") return ""; + if(type === ElementType.Text) return elem.data; + if(type === ElementType.Comment) return ""; var ret = "<" + elem.name; - for(var i in elem.attribs){ - ret += " " + i + "=\"" + elem.attribs[i] + "\""; + var value; + for(var name in elem.attribs){ + value = elem.attribs[name]; + ret += " " + name + "="; + + if(/^\w+$/.test(value)) ret += value; + else if(value.indeOf("\"") !== -1) ret += "'" + value + "'"; + else ret += "\"" + value + "\""; } - ret += ">"; + if(type === ElementType.Directive) return ret + ">"; + if(type === ElementType.Tag && !elem.children) return ret + "/>"; - if(type === "directive") return ret; - - return ret + this.getInnerHTML(elem) + ""; + return ">" + ret + this.getInnerHTML(elem) + ""; } }; \ No newline at end of file From b059a99e021de8b3ddf51b9787ed804848bcd612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 16 Dec 2011 11:30:20 +0100 Subject: [PATCH 128/450] Added a better regexp to test for unquoted attributes Is now matching the HTML spec --- lib/DomUtils.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/DomUtils.js b/lib/DomUtils.js index 1b55ea9..3285f80 100644 --- a/lib/DomUtils.js +++ b/lib/DomUtils.js @@ -126,13 +126,13 @@ module.exports = { value = elem.attribs[name]; ret += " " + name + "="; - if(/^\w+$/.test(value)) ret += value; + if(/^[^\s"\'\`\=\<\>]+$/.test(value)) ret += value; else if(value.indeOf("\"") !== -1) ret += "'" + value + "'"; else ret += "\"" + value + "\""; } if(type === ElementType.Directive) return ret + ">"; - if(type === ElementType.Tag && !elem.children) return ret + "/>"; + if(type === ElementType.Tag && !elem.children) return ret + " />"; return ">" + ret + this.getInnerHTML(elem) + ""; } From fa7321a6c74245b4650330b2b165ff86bea1c44f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 18 Dec 2011 19:12:54 +0100 Subject: [PATCH 129/450] Restructured Parser#_parseTags made it much cleaner & compact --- lib/Parser.js | 116 +++++++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 2db3cff..0e998e5 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -14,8 +14,8 @@ function Parser(cbs, options){ //**"Static"**// //Regular expressions used for cleaning up and parsing (stateless) -var _reTagName = /^\s*([\/\?\!]?)\s*([^\s\/]+)/; //matches tagnames -var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;//" +var _reTagName = /^([\?\!]?)\s*([^\s\/]+)/; //matches tagnames +var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^'"\s]+))|([^=<>\"\'\s\/]+)/g; Parser.prototype._options = { xmlMode: false, //Special behaviour for script/style tags by default @@ -82,10 +82,8 @@ var parseAttributes = function(data){ var match; while(match = _reAttrib.exec(attribRaw)){ - if(match[1]) attrs[match[1]] = match[2]; - else if(match[3]) attrs[match[3]] = match[4]; - else if(match[5]) attrs[match[5]] = match[6]; - else if(match[7]) attrs[match[7]] = match[7]; + if(match[1]) attrs[match[1]] = match[2] || match[3] || match[4]; + else attrs[match[5]] = match[5]; } return attrs; @@ -138,68 +136,58 @@ Parser.prototype._parseTags = function(force){ current = next + 1; this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text; - if(elementType === ElementType.Tag){ - elementData = rawData.trim(); - elementName = this._parseTagName(elementData); - } - else{ - elementData = rawData; - elementName = ""; - } - - //This section inspects the current tag stack and modifies the current - //element if we're actually parsing a special area (script/comment/style tag) - if(this._contentFlags === 0){ /*do nothing*/ } - else if(this._contentFlags >= SpecialTags[ElementType.Comment]){ + if(this._contentFlags >= SpecialTags[ElementType.Comment]){ //We're currently in a comment tag this._processComment(rawData, tagSep); continue; } - //if it's a closing tag, remove the flag - else if(this._contentFlags >= SpecialTags[ElementType.Script] && elementName === "/script"){ - //remove the script flag (also removes the written flag) - this._contentFlags %= SpecialTags[ElementType.Script]; - } - else if(this._contentFlags >= SpecialTags[ElementType.Style] && elementName === "/style"){ - //remove the style flag (also removes the written flag) - this._contentFlags %= SpecialTags[ElementType.Style]; - } - //special behaviour for script & style tags - //Make sure we're not in a comment - else if(!this._options.xmlMode && rawData.substring(0, 3) !== "!--"){ - //If the previous element is text, append the last tag sep to element - if(this._contentFlags >= SpecialTags.w){ - if(this._cbs.ontext) this._cbs.ontext(this._prevTagSep + rawData); - } - else{ //The previous element was not text - this._contentFlags += SpecialTags.w; - if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData); - } - this._prevTagSep = tagSep; - continue; - } - - //Processing of non-special tags + if(elementType === ElementType.Tag){ - if(rawData.substring(0, 3) === "!--"){ //This tag is a comment - this._contentFlags += SpecialTags[ElementType.Comment]; - this._processComment(rawData.substr(3), tagSep); - continue; + elementData = rawData.trimLeft(); + if(elementData.charAt(0) === "/"){ + elementName = this._parseTagName(elementData.substr(1)); + if(this._contentFlags !== 0){ + //if it's a closing tag, remove the flag + if(this._contentFlags >= SpecialTags[ElementType.Script] && elementName === "script"){ + //remove the script flag (also removes the written flag) + this._contentFlags %= SpecialTags[ElementType.Script]; + } + else if(this._contentFlags >= SpecialTags[ElementType.Style] && elementName === "style"){ + //remove the style flag (also removes the written flag) + this._contentFlags %= SpecialTags[ElementType.Style]; + } + else { + this._writeSpecial(rawData, tagSep); + continue; + } + } + this._processCloseTag(elementName); } - - if(rawData.charAt(0) === "!" || rawData.charAt(0) === "?"){ - //ElementType.Directive + else if(elementData.charAt(0) === "!" || elementData.charAt(0) === "?"){ + if(elementData.substr(0, 3) === "!--"){ + //This tag is a comment + this._contentFlags += SpecialTags[ElementType.Comment]; + this._processComment(rawData.substr(3), tagSep); + } + else if(this._contentFlags !== 0){ + this._writeSpecial(rawData, tagSep); + } + //This tag is a directive //TODO: what about CDATA? - if(this._cbs.onprocessinginstruction){ - this._cbs.onprocessinginstruction(elementName, elementData); + else if(this._cbs.onprocessinginstruction){ + this._cbs.onprocessinginstruction(this._parseTagName(elementData), elementData); } - continue; } - if(elementName.charAt(0) === "/") this._processCloseTag(elementName.substr(1)); - else this._processOpenTag(elementName, elementData, tagSep); + else if(this._contentFlags !== 0) this._writeSpecial(rawData, tagSep); + else this._processOpenTag(this._parseTagName(elementData), elementData, tagSep); } - else if(elementType === ElementType.Text && rawData !== "" && this._cbs.ontext){ - this._cbs.ontext(elementData); + else{ + if(this._contentFlags !== 0){ + this._writeSpecial(rawData, tagSep); + } + else if(rawData !== "" && this._cbs.ontext){ + this._cbs.ontext(rawData); + } } } @@ -218,6 +206,18 @@ Parser.prototype._processComment = function(rawData, tagSep){ else if(this._cbs.oncomment) this._cbs.oncomment(rawData + tagSep); }; +Parser.prototype._writeSpecial = function(rawData, tagSep){ + //if the previous element is text, append the last tag sep to element + if(this._contentFlags >= SpecialTags.w){ + if(this._cbs.ontext) this._cbs.ontext(this._prevTagSep + rawData); + } + else{ //The previous element was not text + this._contentFlags += SpecialTags.w; + if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData); + } + this._prevTagSep = tagSep; +}; + var emptyTags = require("./ClosingTags.js").self; Parser.prototype._isEmptyTag = function(name){ From c530ba5298e79a16c1556ace87a9328bab08acce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 18 Dec 2011 19:31:36 +0100 Subject: [PATCH 130/450] Shortened Parser#_parseTagName, removed elementName var --- lib/Parser.js | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 0e998e5..7cca147 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -14,7 +14,7 @@ function Parser(cbs, options){ //**"Static"**// //Regular expressions used for cleaning up and parsing (stateless) -var _reTagName = /^([\?\!]?)\s*([^\s\/]+)/; //matches tagnames +var _reTagName = /[^\s\/]+/; //matches tagnames var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^'"\s]+))|([^=<>\"\'\s\/]+)/g; Parser.prototype._options = { @@ -94,9 +94,9 @@ Parser.prototype._parseTagName = function(data){ var match = data.match(_reTagName); if(match === null) return ""; if(this._options.lowerCaseTags){ - return match[1] + match[2].toLowerCase(); + return match[0].toLowerCase(); } - else return match[1] + match[2]; + else return match[0]; }; //Special tags that are threated differently @@ -111,7 +111,7 @@ SpecialTags[ElementType.Comment] = 8; //2^3 Parser.prototype._parseTags = function(force){ var buffer = this._buffer, current = 0; - var next, tagSep, rawData, elementName, elementType, elementData; + var next, tagSep, rawData, elementType, elementData; var opening = buffer.indexOf("<"), closing = buffer.indexOf(">"); @@ -145,14 +145,15 @@ Parser.prototype._parseTags = function(force){ if(elementType === ElementType.Tag){ elementData = rawData.trimLeft(); if(elementData.charAt(0) === "/"){ - elementName = this._parseTagName(elementData.substr(1)); + //elementData = elementData.substr(1).trim(); + elementData = this._parseTagName(elementData.substr(1)); if(this._contentFlags !== 0){ //if it's a closing tag, remove the flag - if(this._contentFlags >= SpecialTags[ElementType.Script] && elementName === "script"){ + if(this._contentFlags >= SpecialTags[ElementType.Script] && elementData === "script"){ //remove the script flag (also removes the written flag) this._contentFlags %= SpecialTags[ElementType.Script]; } - else if(this._contentFlags >= SpecialTags[ElementType.Style] && elementName === "style"){ + else if(this._contentFlags >= SpecialTags[ElementType.Style] && elementData === "style"){ //remove the style flag (also removes the written flag) this._contentFlags %= SpecialTags[ElementType.Style]; } @@ -161,7 +162,7 @@ Parser.prototype._parseTags = function(force){ continue; } } - this._processCloseTag(elementName); + this._processCloseTag(elementData); } else if(elementData.charAt(0) === "!" || elementData.charAt(0) === "?"){ if(elementData.substr(0, 3) === "!--"){ @@ -175,7 +176,10 @@ Parser.prototype._parseTags = function(force){ //This tag is a directive //TODO: what about CDATA? else if(this._cbs.onprocessinginstruction){ - this._cbs.onprocessinginstruction(this._parseTagName(elementData), elementData); + this._cbs.onprocessinginstruction( + elementData.charAt(0) + this._parseTagName(elementData.substr(1)), + elementData + ); } } else if(this._contentFlags !== 0) this._writeSpecial(rawData, tagSep); From 05abd8f755e2281712f5f3869dab3b28f2da2004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 18 Dec 2011 19:46:00 +0100 Subject: [PATCH 131/450] Replaced Parser#_parseState with Parser#_tagSep instead Also removed Parser#_prevTagSep --- lib/Parser.js | 49 ++++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 7cca147..ef4c57e 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -5,11 +5,10 @@ function Parser(cbs, options){ if(cbs) this._cbs = cbs; this._buffer = ""; - this._prevTagSep = ""; + this._tagSep = ""; this._stack = []; this._contentFlags = 0; this._done = false; - this._parseState = ElementType.Text; } //**"Static"**// @@ -111,7 +110,7 @@ SpecialTags[ElementType.Comment] = 8; //2^3 Parser.prototype._parseTags = function(force){ var buffer = this._buffer, current = 0; - var next, tagSep, rawData, elementType, elementData; + var next, rawData, elementType, elementData, lastTagSep; var opening = buffer.indexOf("<"), closing = buffer.indexOf(">"); @@ -119,30 +118,30 @@ Parser.prototype._parseTags = function(force){ if(force) opening = Infinity; while(opening !== closing){ //just false if both are -1 + lastTagSep = this._tagSep; + if((opening !== -1 && opening < closing) || closing === -1){ next = opening; - tagSep = "<"; - opening = buffer.indexOf(tagSep, next + 1); + this._tagSep = "<"; + opening = buffer.indexOf("<", next + 1); } else{ next = closing; - tagSep = ">"; - closing = buffer.indexOf(tagSep, next + 1); + this._tagSep = ">"; + closing = buffer.indexOf(">", next + 1); } rawData = buffer.substring(current, next); //The next chunk of data to parse - elementType = this._parseState; //set elements for next run current = next + 1; - this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text; if(this._contentFlags >= SpecialTags[ElementType.Comment]){ //We're currently in a comment tag - this._processComment(rawData, tagSep); + this._processComment(rawData); continue; } - if(elementType === ElementType.Tag){ + if(lastTagSep === "<"){ elementData = rawData.trimLeft(); if(elementData.charAt(0) === "/"){ //elementData = elementData.substr(1).trim(); @@ -158,7 +157,7 @@ Parser.prototype._parseTags = function(force){ this._contentFlags %= SpecialTags[ElementType.Style]; } else { - this._writeSpecial(rawData, tagSep); + this._writeSpecial(rawData, lastTagSep); continue; } } @@ -168,10 +167,10 @@ Parser.prototype._parseTags = function(force){ if(elementData.substr(0, 3) === "!--"){ //This tag is a comment this._contentFlags += SpecialTags[ElementType.Comment]; - this._processComment(rawData.substr(3), tagSep); + this._processComment(rawData.substr(3)); } else if(this._contentFlags !== 0){ - this._writeSpecial(rawData, tagSep); + this._writeSpecial(rawData, lastTagSep); } //This tag is a directive //TODO: what about CDATA? @@ -182,12 +181,12 @@ Parser.prototype._parseTags = function(force){ ); } } - else if(this._contentFlags !== 0) this._writeSpecial(rawData, tagSep); - else this._processOpenTag(this._parseTagName(elementData), elementData, tagSep); + else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep); + else this._processOpenTag(this._parseTagName(elementData), elementData); } else{ if(this._contentFlags !== 0){ - this._writeSpecial(rawData, tagSep); + this._writeSpecial(rawData, lastTagSep); } else if(rawData !== "" && this._cbs.ontext){ this._cbs.ontext(rawData); @@ -198,28 +197,25 @@ Parser.prototype._parseTags = function(force){ this._buffer = buffer.substring(current); }; -Parser.prototype._processComment = function(rawData, tagSep){ - this._prevTagSep = tagSep; - - if(tagSep === ">" && rawData.substr(-2) === "--"){ //comment ends +Parser.prototype._processComment = function(rawData){ + if(this._tagSep === ">" && rawData.substr(-2) === "--"){ //comment ends //remove the written flag (also removes the comment flag) this._contentFlags %= SpecialTags.w; if(this._cbs.oncomment) this._cbs.oncomment(rawData.slice(0, -2)); if(this._cbs.oncommentend) this._cbs.oncommentend(); } - else if(this._cbs.oncomment) this._cbs.oncomment(rawData + tagSep); + else if(this._cbs.oncomment) this._cbs.oncomment(rawData + this._tagSep); }; -Parser.prototype._writeSpecial = function(rawData, tagSep){ +Parser.prototype._writeSpecial = function(rawData, lastTagSep){ //if the previous element is text, append the last tag sep to element if(this._contentFlags >= SpecialTags.w){ - if(this._cbs.ontext) this._cbs.ontext(this._prevTagSep + rawData); + if(this._cbs.ontext) this._cbs.ontext(lastTagSep + rawData); } else{ //The previous element was not text this._contentFlags += SpecialTags.w; if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData); } - this._prevTagSep = tagSep; }; var emptyTags = require("./ClosingTags.js").self; @@ -244,7 +240,7 @@ Parser.prototype._processCloseTag = function(name){ this._processOpenTag(name, "/"); }; -Parser.prototype._processOpenTag = function(name, data, tagSep){ +Parser.prototype._processOpenTag = function(name, data){ var type = ElementType.Tag; if(this._options.xmlMode){ /*do nothing*/ } else if(name === "script") type = ElementType.Script; @@ -260,7 +256,6 @@ Parser.prototype._processOpenTag = function(name, data, tagSep){ } else { this._contentFlags += SpecialTags[type]; this._stack.push(name); - this._prevTagSep = tagSep; } }; From 3c95ad90150ac0089fee788d12d119e73e5d7d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 18 Dec 2011 19:51:08 +0100 Subject: [PATCH 132/450] Removed Parser#_isEmptyTag --- lib/Parser.js | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index ef4c57e..bf2a11c 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -220,12 +220,8 @@ Parser.prototype._writeSpecial = function(rawData, lastTagSep){ var emptyTags = require("./ClosingTags.js").self; -Parser.prototype._isEmptyTag = function(name){ - return !this._options.xmlMode && emptyTags[name]; -}; - Parser.prototype._processCloseTag = function(name){ - if(this._stack && !this._isEmptyTag(name)){ + if(this._stack && (!emptyTags[name] || this._options.xmlMode)){ var i = this._stack.length; while(i !== 0 && this._stack[--i] !== name){} if(i !== 0 || this._stack[0] === name) @@ -251,7 +247,7 @@ Parser.prototype._processOpenTag = function(name, data){ } //If tag self-terminates, add an explicit, separate closing tag - if(data.substr(-1) === "/" || this._isEmptyTag(name)){ + if(data.substr(-1) === "/" || (emptyTags[name] && !this._options.xmlMode)){ if(this._cbs.onclosetag) this._cbs.onclosetag(name); } else { this._contentFlags += SpecialTags[type]; From 563ea0ef6ac52b3022aa59b7d30b3bbfdc6e6ec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Mon, 19 Dec 2011 22:27:45 +0100 Subject: [PATCH 133/450] Fixed a bug, introduced Parser#_wroteSpecial The bug: When a `` occurred inside a ` - Tag: "tag" //Any tag that isn't special + Tag: "tag", //Any tag that isn't special + CDATA: "cdata" }; \ No newline at end of file diff --git a/lib/Parser.js b/lib/Parser.js index 0115e69..082f8b4 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -13,8 +13,9 @@ function Parser(cbs, options){ } //Regular expressions used for cleaning up and parsing (stateless) -var _reTagName = /[^\s\/]+/; //matches tag names -var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^'"\s]+))|([^=<>\"\'\s\/]+)/g; +var _reAttrib = /\s([^=\"\'\s\/]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g, + _reAttribStart = /\s+[^=\"\'\s\/]/, + _reTail = /\s|\//; Parser.prototype._options = { xmlMode: false, //Special behavior for script/style tags by default @@ -23,11 +24,16 @@ Parser.prototype._options = { Parser.prototype._cbs = { /* - onopentag, - onclosetag, - ontext, - onprocessinginstruction, - oncomment + oncdatastart, + oncdataend, + ontext, + onprocessinginstruction, + oncomment, + oncommentend, + onclosetag, + onopentag, + onerror, + onreset */ }; @@ -71,16 +77,13 @@ Parser.prototype.reset = function(){ //parses the attribute string var parseAttributes = function(data){ - var pos = data.search(/\w\s/) + 1, attrs = {}; //Find any whitespace - if(pos === 0) return attrs; - var attribRaw = data.substr(pos); - - _reAttrib.lastIndex = 0; + var pos = data.search(_reAttribStart), attrs = {}; + if(pos === -1) return attrs; + var attribRaw = data.substring(pos); var match; while(match = _reAttrib.exec(attribRaw)){ - if(match[1]) attrs[match[1]] = match[2] || match[3] || match[4]; - else attrs[match[5]] = match[5]; + attrs[match[1]] = match[2] || match[3] || match[4] || match[1]; } return attrs; @@ -88,12 +91,12 @@ var parseAttributes = function(data){ //Extracts the base tag name from the data value of an element Parser.prototype._parseTagName = function(data){ - var match = data.match(_reTagName); - if(match === null) return ""; - if(this._options.lowerCaseTags){ - return match[0].toLowerCase(); - } - else return match[0]; + var pos = data.search(_reTail), match; + if(pos === -1) match = data; + else match = data.substr(0, pos); + + if(!this._options.lowerCaseTags) return match; + return match.toLowerCase(); }; //Special tags that are treated differently @@ -101,7 +104,8 @@ var SpecialTags = {}; //SpecialTags[ElementType.Tag] = 0; SpecialTags[ElementType.Style] = 1; //2^0 SpecialTags[ElementType.Script] = 2; //2^1 -SpecialTags[ElementType.Comment] = 4; //2^3 +SpecialTags[ElementType.Comment] = 4; //2^2 +SpecialTags[ElementType.CDATA] = 8; //2^3 //Parses through HTML text and returns an array of found elements Parser.prototype._parseTags = function(force){ @@ -132,13 +136,21 @@ Parser.prototype._parseTags = function(force){ //set elements for next run current = next + 1; - if(this._contentFlags >= SpecialTags[ElementType.Comment]){ + if(this._contentFlags >= SpecialTags[ElementType.CDATA]){ + if(this._tagSep === ">" && rawData.substr(-2) === "]]"){ + if(rawData.length !== 2 && this._cbs.ontext){ + this._cbs.ontext(rawData.slice(0,-2)); + } + this._contentFlags -= SpecialTags[ElementType.CDATA]; + if(this._cbs.oncdataend) this._cbs.oncdataend(); + } + else if(this._cbs.ontext) this._cbs.ontext(rawData + this._tagSep); + } + else if(this._contentFlags >= SpecialTags[ElementType.Comment]){ //We're currently in a comment tag this._processComment(rawData); - continue; } - - if(lastTagSep === "<"){ + else if(lastTagSep === "<"){ elementData = rawData.trimLeft(); if(elementData.charAt(0) === "/"){ //elementData = elementData.substr(1).trim(); @@ -160,23 +172,41 @@ Parser.prototype._parseTags = function(force){ } this._processCloseTag(elementData); } - else if(elementData.charAt(0) === "!" || elementData.charAt(0) === "?"){ - if(elementData.substr(0, 3) === "!--"){ + else if(elementData.charAt(0) === "!"){ + if(elementData.substr(1, 2) === "--"){ //This tag is a comment this._contentFlags += SpecialTags[ElementType.Comment]; this._processComment(rawData.substr(3)); } + else if(elementData.substr(1, 7) === "[CDATA["){ + if(this._cbs.oncdatastart) this._cbs.oncdatastart(); + if(this._tagSep === ">" && elementData.substr(-2) === "]]"){ + if(this._cbs.oncdataend) this._cbs.oncdataend(); + if(this._cbs.ontext) this._cbs.ontext(elementData.slice(8, -2)); + } + else{ + if(this._cbs.ontext) this._cbs.ontext(elementData.substr(8)); + this._contentFlags += SpecialTags[ElementType.CDATA]; + } + } else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep); //This tag is a directive - //TODO: what about CDATA? else if(this._cbs.onprocessinginstruction){ this._cbs.onprocessinginstruction( - elementData.charAt(0) + this._parseTagName(elementData.substr(1)), + "!" + this._parseTagName(elementData.substr(1)), elementData ); } } else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep); + else if(elementData.charAt(0) === "?"){ + if(this._cbs.onprocessinginstruction){ + this._cbs.onprocessinginstruction( + "?" + this._parseTagName(elementData.substr(1)), + elementData + ); + } + } else this._processOpenTag(this._parseTagName(elementData), elementData); } else{ diff --git a/tests/Events/04-cdata.js b/tests/Events/04-cdata.js new file mode 100644 index 0000000..197235d --- /dev/null +++ b/tests/Events/04-cdata.js @@ -0,0 +1,75 @@ +exports.name = "simple"; +exports.options = {handler: {}, parser: {}}; +exports.html = "<> fo]]>"; +exports.expected = [ + { + "event": "opentag", + "data": [ + "tag", + {}, + "tag" + ] + }, + { + "event": "cdatastart", + "data": [] + }, + { + "event": "text", + "data": [ + " asdf " + ] + }, + { + "event": "text", + "data": [ + "<" + ] + }, + { + "event": "text", + "data": [ + "asdf>" + ] + }, + { + "event": "text", + "data": [ + "<" + ] + }, + { + "event": "text", + "data": [ + "/adsf>" + ] + }, + { + "event": "text", + "data": [ + "<" + ] + }, + { + "event": "text", + "data": [ + ">" + ] + }, + { + "event": "text", + "data": [ + " fo" + ] + }, + { + "event": "cdataend", + "data": [] + }, + { + "event": "closetag", + "data": [ + "tag" + ] + } +]; \ No newline at end of file diff --git a/tests/Feeds/03-rdf.js b/tests/Feeds/03-rdf.js index 8d7d385..96e4c89 100644 --- a/tests/Feeds/03-rdf.js +++ b/tests/Feeds/03-rdf.js @@ -1,5 +1,4 @@ exports.name = "RDF test"; -exports.type = "rss"; exports.options = { handler: {}, parser: { @@ -10,17 +9,20 @@ exports.options = { exports.html = '\n\n\t\n\t\tcraigslist | all community in SF bay area\n\t\thttp://sfbay.craigslist.org/ccc/\n\t\t\n\t\ten-us\n\t\tCopyright 2011 craigslist, inc.\n\t\twebmaster@craigslist.org\n\t\twebmaster@craigslist.org\n\t\thttp://sfbay.craigslist.org/ccc//\n\t\tcraigslist | all community in SF bay area\n\t\tCollection\n\t\t2011-11-04T09:39:10-07:00\n\t\t4\n\t\thourly\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\t\n\t\t<![CDATA[ Music Equipment Repair and Consignment ]]>\n\t\t\nhttp://sfbay.craigslist.org/sby/muc/2681301534.html\n\n\t\t
We are pleased to announce our NEW LOCATION: 1199 N 5th st. San Jose, ca 95112. Please call ahead, by appointment only.

Recently featured by Metro Newspaper in their 2011 Best of the Silicon Valley edition see it online here:
http://www.metroactive.com/best-of-silicon-valley/2011/music-nightlife/editor-picks.html

Guitar Set up (acoustic and electronic) $40!\n]]>
\n\t\t2011-11-04T09:35:17-07:00\n\t\ten-us\n\t\tCopyright 2011 craigslist, inc.\n\t\t\nhttp://sfbay.craigslist.org/sby/muc/2681301534.html\n\n\t\t\n\t\ttext\n\t\t2011-11-04T09:35:17-07:00\n\t
\n\t\n\t\t<![CDATA[\nRide Offered - Oakland/BART to LA/SFV - TODAY 3PM 11/04 (oakland north / temescal)\n]]>\n\t\t\nhttp://sfbay.craigslist.org/eby/rid/2685010755.html\n\n\t\t\n]]>\n\t\t2011-11-04T09:34:54-07:00\n\t\ten-us\n\t\tCopyright 2011 craigslist, inc.\n\t\t\nhttp://sfbay.craigslist.org/eby/rid/2685010755.html\n\n\t\t\n\t\ttext\n\t\t2011-11-04T09:34:54-07:00\n\t\n
'; exports.expected = { - type: 'rdf:RDF', - id: '', - title: 'craigslist | all community in SF bay area', - link: 'http://sfbay.craigslist.org/ccc/', - items: [{ - title: '![CDATA[ Music Equipment Repair and Consignment ]]', - link: '\nhttp://sfbay.craigslist.org/sby/muc/2681301534.html\n', - description: '![CDATA[\nSan Jose Rock Shop offers musical instrument repair and consignment! (408) 215-2065' - }, { - title: '![CDATA[\nRide Offered - Oakland/BART to LA/SFV - TODAY 3PM 11/04 (oakland north / temescal)\n]]', - link: '\nhttp://sfbay.craigslist.org/eby/rid/2685010755.html\n', - description: '![CDATA[\nIm offering a lift for up to two people from Oakland (or near any BART station in the East Bay/580/880 Corridor, or San Jose/Morgan Hill, Gilroy) to the San Fernando Valley / Los Angeles area. Specifically, Im leaving from Oakland between 2:30 and 3:00pm (this is flexible, but if I leave too late my girlfriend will kill me), and heading to Woodland Hills via the 580, I-5, 405, and 101.' - }] + "type": "rdf:RDF", + "id": "", + "title": "craigslist | all community in SF bay area", + "link": "http://sfbay.craigslist.org/ccc/", + "items": [ + { + "title": " Music Equipment Repair and Consignment ", + "link": "\nhttp://sfbay.craigslist.org/sby/muc/2681301534.html\n", + "description": "\nSan Jose Rock Shop offers musical instrument repair and consignment! (408) 215-2065" + }, + { + "title": "\nRide Offered - Oakland/BART to LA/SFV - TODAY 3PM 11/04 (oakland north / temescal)\n", + "link": "\nhttp://sfbay.craigslist.org/eby/rid/2685010755.html\n", + "description": "\nIm offering a lift for up to two people from Oakland (or near any BART station in the East Bay/580/880 Corridor, or San Jose/Morgan Hill, Gilroy) to the San Fernando Valley / Los Angeles area. Specifically, Im leaving from Oakland between 2:30 and 3:00pm (this is flexible, but if I leave too late my girlfriend will kill me), and heading to Woodland Hills via the 580, I-5, 405, and 101." + } + ] }; \ No newline at end of file From b927498cc4cfcc7e796814cc0b3f7bb7cc89da2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Thu, 22 Dec 2011 19:58:20 +0100 Subject: [PATCH 146/450] Switched to ES6 proxies inside events test If they are not available, use a static object --- package.json | 2 +- tests/03-events.js | 61 +++++++++++++++++++------------ tests/Events/01-simple.js | 29 ++++++++++++--- tests/Events/02-template.js | 35 ++++++++++++------ tests/Events/03-lowercase_tags.js | 29 ++++++++++++--- 5 files changed, 111 insertions(+), 45 deletions(-) diff --git a/package.json b/package.json index 6869a4f..b875cfb 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,7 @@ }, "main": "./lib/", "scripts": { - "test": "node tests/00-runtests.js" + "test": "node --harmony_proxies tests/00-runtests.js" }, "engines": "node >= 0.3.0", "licenses": [{ diff --git a/tests/03-events.js b/tests/03-events.js index 0cbf54a..196db95 100644 --- a/tests/03-events.js +++ b/tests/03-events.js @@ -1,29 +1,44 @@ -var helper = require("./test-helper.js"); +var helper = require("./test-helper.js"), + sliceArr = Array.prototype.slice; exports.dir = "/Events/"; exports.test = function(test, cb){ - var tokens = []; - var cbs = { - onopentag: function(name, attributes){ - tokens.push({event:"open", name: name, attributes: attributes}); - }, - onclosetag: function(name){ - tokens.push({event:"close", name: name}); - }, - ontext: function(text){ - tokens.push({event:"text", text: text}); - }, - oncomment: function(data){ - tokens.push({event:"comment", data:data}); - }, - onprocessinginstruction: function(name, data){ - tokens.push({event:"processing", name:name, data:data}); - }, - onend: function(){ - //deletes all tokens - cb(null, tokens.splice(0)); - } - }; + var tokens = [], cbs; + if(typeof Proxy !== "undefined"){ + cbs = Proxy.create({ get: function(a, name){ + if(name === "onend"){ + return function(){ + cb(null, tokens.splice(0)); + } + } + if(name === "onreset") return function(){}; + return function(){ + tokens.push({ + event: name.substr(2), + data: sliceArr.apply(arguments) + }); + } + }}); + } + else{ + cbs = { + onerror: cb, + onend: function(){ + cb(null, tokens.splice(0)); + } + }; + ["cdatastart", "cdataend", "text" + , "processinginstruction", "comment" + , "commentend", "closetag" + , "opentag"].forEach(function(name){ + cbs["on" + name] = function(){ + tokens.push({ + event: name, + data: sliceArr.apply(arguments) + }); + } + }); + } helper.writeToParser(cbs, test.options.parser, test.html); }; \ No newline at end of file diff --git a/tests/Events/01-simple.js b/tests/Events/01-simple.js index 2cf35f7..cfecf9c 100644 --- a/tests/Events/01-simple.js +++ b/tests/Events/01-simple.js @@ -1,8 +1,27 @@ exports.name = "simple"; exports.options = {handler: {}, parser: {}}; exports.html = "

adsf

"; -exports.expected = [ { event: 'open', - name: 'h1', - attributes: { class: 'test' } }, - { event: 'text', text: 'adsf' }, - { event: 'close', name: 'h1' } ]; \ No newline at end of file +exports.expected = [ + { + "event": "opentag", + "data": [ + "h1", + { + "class": "test" + }, + "tag" + ] + }, + { + "event": "text", + "data": [ + "adsf" + ] + }, + { + "event": "closetag", + "data": [ + "h1" + ] + } +]; \ No newline at end of file diff --git a/tests/Events/02-template.js b/tests/Events/02-template.js index 76a29ab..6f68857 100644 --- a/tests/Events/02-template.js +++ b/tests/Events/02-template.js @@ -3,30 +3,43 @@ exports.options = {handler: {}, parser: {}}; exports.html = ""; exports.expected = [ { - "event": "open", - "name": "script", - "attributes": { - "type": "text/template" - } + "event": "opentag", + "data": [ + "script", + { + "type": "text/template" + }, + "script" + ] }, { "event": "text", - "text": "Heading1" + "data": [ + ">Heading1" + ] }, { "event": "text", - "text": "" + "data": [ + ">" + ] }, { - "event": "close", - "name": "script" + "event": "closetag", + "data": [ + "script" + ] } ]; \ No newline at end of file diff --git a/tests/Events/03-lowercase_tags.js b/tests/Events/03-lowercase_tags.js index 1ef2ad5..42348f9 100644 --- a/tests/Events/03-lowercase_tags.js +++ b/tests/Events/03-lowercase_tags.js @@ -1,8 +1,27 @@ exports.name = "simple"; exports.options = {handler: {}, parser: {lowerCaseTags:true}}; exports.html = "

adsf

"; -exports.expected = [ { event: 'open', - name: 'h1', - attributes: { class: 'test' } }, - { event: 'text', text: 'adsf' }, - { event: 'close', name: 'h1' } ]; \ No newline at end of file +exports.expected = [ + { + "event": "opentag", + "data": [ + "h1", + { + "class": "test" + }, + "tag" + ] + }, + { + "event": "text", + "data": [ + "adsf" + ] + }, + { + "event": "closetag", + "data": [ + "h1" + ] + } +]; \ No newline at end of file From 8c304017bb326151cbade4d64f185f73ab2073a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 23 Dec 2011 18:27:15 +0100 Subject: [PATCH 147/450] Improved attrib parsing (again) --- lib/Parser.js | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 082f8b4..25d01cb 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -13,8 +13,7 @@ function Parser(cbs, options){ } //Regular expressions used for cleaning up and parsing (stateless) -var _reAttrib = /\s([^=\"\'\s\/]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g, - _reAttribStart = /\s+[^=\"\'\s\/]/, +var _reAttrib = /\s(\S+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|\s|\/|$)/g, _reTail = /\s|\//; Parser.prototype._options = { @@ -77,12 +76,9 @@ Parser.prototype.reset = function(){ //parses the attribute string var parseAttributes = function(data){ - var pos = data.search(_reAttribStart), attrs = {}; - if(pos === -1) return attrs; - var attribRaw = data.substring(pos); - - var match; - while(match = _reAttrib.exec(attribRaw)){ + var attrs = {}, match; + + while(match = _reAttrib.exec(data)){ attrs[match[1]] = match[2] || match[3] || match[4] || match[1]; } From e70304ac6e9dd29ae81d8964a798f7f3ab24a34d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 24 Dec 2011 13:00:55 +0100 Subject: [PATCH 148/450] Shortened Parser#_parseTagName --- lib/Parser.js | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 25d01cb..b44fbfb 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -14,7 +14,7 @@ function Parser(cbs, options){ //Regular expressions used for cleaning up and parsing (stateless) var _reAttrib = /\s(\S+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|\s|\/|$)/g, - _reTail = /\s|\//; + _reTail = /\s|\/|$/; Parser.prototype._options = { xmlMode: false, //Special behavior for script/style tags by default @@ -23,16 +23,16 @@ Parser.prototype._options = { Parser.prototype._cbs = { /* - oncdatastart, oncdataend, - ontext, - onprocessinginstruction, + oncdatastart, + onclosetag, oncomment, oncommentend, - onclosetag, - onopentag, onerror, - onreset + onopentag, + onprocessinginstruction, + onreset, + ontext */ }; @@ -87,10 +87,7 @@ var parseAttributes = function(data){ //Extracts the base tag name from the data value of an element Parser.prototype._parseTagName = function(data){ - var pos = data.search(_reTail), match; - if(pos === -1) match = data; - else match = data.substr(0, pos); - + var match = data.substr(0, data.search(_reTail)); if(!this._options.lowerCaseTags) return match; return match.toLowerCase(); }; From 0f0d7561449d19335ff2f4513786c42674a60404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 24 Dec 2011 13:01:43 +0100 Subject: [PATCH 149/450] Moved cb names to tests/test-helper, added additional tests to benchmark --- tests/03-events.js | 5 +---- tests/99-benchmark.js | 34 +++++++++++++++++++++++----------- tests/test-helper.js | 4 +++- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/tests/03-events.js b/tests/03-events.js index 196db95..3552a7e 100644 --- a/tests/03-events.js +++ b/tests/03-events.js @@ -28,10 +28,7 @@ exports.test = function(test, cb){ cb(null, tokens.splice(0)); } }; - ["cdatastart", "cdataend", "text" - , "processinginstruction", "comment" - , "commentend", "closetag" - , "opentag"].forEach(function(name){ + helper.EVENTS.forEach(function(name){ cbs["on" + name] = function(){ tokens.push({ event: name, diff --git a/tests/99-benchmark.js b/tests/99-benchmark.js index 373a2fd..ed70b02 100644 --- a/tests/99-benchmark.js +++ b/tests/99-benchmark.js @@ -1,14 +1,26 @@ -var xml = Array(5e3).join(" text "), +var multiply = function(text){ + return Array(5e3+1).join(text); + }, + tests = { + self_closing: multiply("
"), + tag: multiply(" Text "), + comment: multiply(""), + directive: multiply(""), + special: multiply(""), + xml: multiply(" text ") + } empty = function(){}, - parser = new (require("../lib/Parser.js"))({ - onopentag: empty, - onclosetag: empty, - oncomment: empty, - oncommentend: empty, - onprocessinginstruction: empty - }), + cbs = {}; + +require("./test-helper.js").EVENTS.forEach(function(name){ + cbs["on" + name] = empty; +}); + +var parser = new (require("../lib/Parser.js"))(cbs), ben = require("ben"); -console.log("Test took (ms)", ben(1e2, function(){ - parser.parseComplete(xml); -})); \ No newline at end of file +Object.keys(tests).forEach(function(name){ + console.log("Test", name, "took", ben(150, function(){ + parser.parseComplete(tests[name]); + })); +}); \ No newline at end of file diff --git a/tests/test-helper.js b/tests/test-helper.js index f533505..8fc24fc 100644 --- a/tests/test-helper.js +++ b/tests/test-helper.js @@ -10,4 +10,6 @@ exports.writeToParser = function(handler, options, data){ parser.done(); //then parse everything parser.parseComplete(data); -} \ No newline at end of file +} + +exports.EVENTS = ["cdatastart", "cdataend", "text", "processinginstruction", "comment", "commentend", "closetag", "opentag"/*, "error", "end"*/]; \ No newline at end of file From c36eed8af100b17f2563c11378b2b8294ce37721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 24 Dec 2011 13:09:29 +0100 Subject: [PATCH 150/450] Fixed a bug with boolean attributes `` just returned `{foo:"foo"}` as an attribute --- lib/Parser.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Parser.js b/lib/Parser.js index b44fbfb..7b580e7 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -13,7 +13,7 @@ function Parser(cbs, options){ } //Regular expressions used for cleaning up and parsing (stateless) -var _reAttrib = /\s(\S+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|\s|\/|$)/g, +var _reAttrib = /\s(\S+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g, _reTail = /\s|\/|$/; Parser.prototype._options = { From 9c8a5fb9cc97103f80f03291dafbe6df6eef338d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Wed, 28 Dec 2011 11:29:39 +0100 Subject: [PATCH 151/450] Use Array#lastIndexOf inside Parser#_processCloseTag Might be a performance plus in future versions of node (currently it doesn't have much impact, it's just more readable) --- lib/Parser.js | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 7b580e7..7acc5de 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -256,14 +256,13 @@ var emptyTags = { Parser.prototype._processCloseTag = function(name){ if(this._stack && (!emptyTags[name] || this._options.xmlMode)){ - var i = this._stack.length; - while(i !== 0 && this._stack[--i] !== name){} - if(i !== 0 || this._stack[0] === name) + var pos = this._stack.lastIndexOf(name); + if(pos !== -1) if(this._cbs.onclosetag){ - while(i < this._stack.length) - this._cbs.onclosetag(this._stack.pop()); + pos = this._stack.length - pos; + while(pos--) this._cbs.onclosetag(this._stack.pop()); } - else this._stack.splice(i); + else this._stack.splice(pos); } //many browsers (eg. Safari, Chrome) convert
to
else if(name === "br" && !this._options.xmlMode) From de99728f32af00253d7577e63f27aa41f95d01c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 31 Dec 2011 14:05:57 +0100 Subject: [PATCH 152/450] 2.1.3 --- package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index b875cfb..4b1ca20 100644 --- a/package.json +++ b/package.json @@ -1,8 +1,9 @@ { "name": "htmlparser2", "description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface.", - "version": "2.1.2", + "version": "2.1.3", "author": "Felix Boehm ", + "keywords": ["html", "parser", "xml", "dom", "rss", "feed", "atom"], "contributors": ["Chris Winberry "], "repository": { "type": "git", From 0eb23186ecfda26a132213ffde65e69bb819ae30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 31 Dec 2011 14:06:28 +0100 Subject: [PATCH 153/450] Removed FeedHandler prototype It's currently not finished --- lib/_FeedHandler.js | 108 -------------------------------------------- 1 file changed, 108 deletions(-) delete mode 100644 lib/_FeedHandler.js diff --git a/lib/_FeedHandler.js b/lib/_FeedHandler.js deleted file mode 100644 index 73059d4..0000000 --- a/lib/_FeedHandler.js +++ /dev/null @@ -1,108 +0,0 @@ -// NOT FINISHED YET! DON'T USE IT! - -//opening tags -var searchRoot = function(tagName){ - if(tagName === "rss" || tagName === "rdf:RDF" || tagName === "feed"){ - if(tagName === "rdf:RDF") this.feed.type = "rdf"; - else this.feed.type = tagName; - this._map = RssFeedMap; - this.onopentag = getChannelElement; - } - else if(tagName === "feed"){ - this.feed.type = "atom"; - this._map = AtomFeedMap; - this.onclosetag = getFeedElements; - this.ontext = writeText; - this.onopentag = getOpenTag; - } -} - -var getChannelElement = function(tagName){ - if(tagName === "channel"){ - this.onopentag = getOpenTag; - this.onclosetag = getFeedElements; - this.ontext = writeText; - } -} - -var getOpenTag = function(tagName, attribs){ - this._level += 1; - if(tagName === this._childName){ - if(this._feed.type === "atom"){ - } - else{ - - } - } else if(tagName === "link" && this._level === 1 - && this._feed.type === "atom" && attribs.href){ - this.feed.link = attribs.href; - } -}; - -//text -var writeText = function(text){ - if(this._stack[this._level]){ - this._stack[this._level] += text; - } else this._stack[this._level] = text; -}; - -//closing tags -var getFeedElements = function(tagName){ - var text = this._stack.pop(); - if(this._level-- === 1){ - var elemName = this._map[tagName]; - if(elemName){ - if(elemName === "updated") text = Date(text); - this._feed[elemName] = text; - } - } -}; - -//mappings -var RssFeedMap = { - title: "title", - link: "link", - description: "description", - lastBuildDate: "updated", - managingEditor: "author"/*, - item: "item"*/ -}; - -var RssItemMap = { - -}; - -var AtomFeedMap = { - id: "id", - title: "title", - subtitle: "description", - updated: "updated", - email: "author"/*, - entry: "item"*/ -}; - -var AtomItemMap = { - -}; - -//TODO: make this a trully streamable handler -function FeedHandler(callback, onitem){ - this.onopentag = searchRoot; - this.feed = { - type: null, - id: "", - title: null, - link: null, - description: null, - updated: null, - author: null, - items: [] - }; - this._level = 0; - this._stack = []; - this._map = null; - this.onend = callback; - this.onitem = onitem; //called when a new item was found -} - -module.exports = FeedHandler; \ No newline at end of file From c5d1bd3f969f19fbfdece36bf860e64a95d76eab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 31 Dec 2011 15:00:25 +0100 Subject: [PATCH 154/450] Added a Stream interface might be useful when working with other streams (eg. generated by request) --- lib/Stream.js | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++ lib/index.js | 4 ++++ 2 files changed, 62 insertions(+) create mode 100644 lib/Stream.js diff --git a/lib/Stream.js b/lib/Stream.js new file mode 100644 index 0000000..bab2969 --- /dev/null +++ b/lib/Stream.js @@ -0,0 +1,58 @@ +var Parser = require("./Parser.js"); + +var Stream = function(options){ + Parser.call(this, new cbs(this), options); +}; + +require("util").inherits(Stream, require("stream")); + +//util.inherits would overwrite the prototype when called twice, +//so we need a different approach +Object.getOwnPropertyNames(Parser.prototype).forEach(function(name){ + Stream.prototype[name] = Parser.prototype[name]; +}); + +Stream.prototype.writable = true; +Stream.prototype.readable = true; + +var cbs = function(scope){ + this.scope = scope; +}; + +cbs.prototype = { + oncdataend: function(){ + this.scope.emit("cdataend"); + }, + oncdatastart: function(){ + this.scope.emit("cdatastart"); + }, + onclosetag: function(name){ + this.scope.emit("closetag", name); + }, + oncomment: function(text){ + this.scope.emit("comment", text); + }, + oncommentend: function(){ + this.scope.emit("commentend"); + }, + onerror: function(err){ + this.scope.emit("error", err); + }, + onopentag: function(name, attribs, type){ + this.scope.emit("opentag", name, attribs, type); + }, + onprocessinginstruction: function(name, data){ + this.scope.emit("processinginstruction", name, data); + }, + onreset: function(){ + this.scope.emit("reset"); + }, + ontext: function(text){ + this.scope.emit("text", text); + //let the 'pipe' function do something useful + //this.scope.emit("data", text); + } +}; + +module.exports = Stream; +Stream.cbs = cbs; \ No newline at end of file diff --git a/lib/index.js b/lib/index.js index 45b07cb..df40916 100644 --- a/lib/index.js +++ b/lib/index.js @@ -15,6 +15,10 @@ module.exports = { Object.defineProperty(this, "ElementType", {value:require("./ElementType.js")}); return this.ElementType; }, + get Stream(){ + Object.defineProperty(this, "Stream", {value:require("./Stream.js")}); + return this.Stream; + }, get DomUtils(){ Object.defineProperty(this, "DomUtils", {value:require("./DomUtils.js")}); return this.DomUtils; From d62f465d55e2c103ccddfaeb4b8197bbbcb2a0b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 31 Dec 2011 15:34:54 +0100 Subject: [PATCH 155/450] Minor changes --- lib/Parser.js | 2 +- lib/Stream.js | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index 7acc5de..e73230a 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -183,7 +183,7 @@ Parser.prototype._parseTags = function(force){ } } else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep); - //This tag is a directive + //TODO: This isn't a processing instruction, needs a new name else if(this._cbs.onprocessinginstruction){ this._cbs.onprocessinginstruction( "!" + this._parseTagName(elementData.substr(1)), diff --git a/lib/Stream.js b/lib/Stream.js index bab2969..350dfe1 100644 --- a/lib/Stream.js +++ b/lib/Stream.js @@ -54,5 +54,4 @@ cbs.prototype = { } }; -module.exports = Stream; -Stream.cbs = cbs; \ No newline at end of file +module.exports = Stream; \ No newline at end of file From 2469a0d6fc272b320bff7601fc983286bba8a3ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 6 Jan 2012 13:44:00 +0100 Subject: [PATCH 156/450] Added two new events, `onopentagname` and `on attribut` They should allow lighter functions --- lib/Parser.js | 12 ++++++++++++ lib/Stream.js | 6 ++++++ tests/Events/01-simple.js | 13 +++++++++++++ tests/Events/02-template.js | 13 +++++++++++++ tests/Events/03-lowercase_tags.js | 15 ++++++++++++++- tests/Events/04-cdata.js | 8 +++++++- tests/test-helper.js | 2 +- 7 files changed, 66 insertions(+), 3 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index e73230a..741cc64 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -269,15 +269,27 @@ Parser.prototype._processCloseTag = function(name){ this._processOpenTag(name, "/"); }; +Parser.prototype._parseAttributes = function(data){ + for(var match; match = _reAttrib.exec(data);){ + this._cbs.onattribute(match[1], match[2] || match[3] || match[4] || match[1]); + } +}; + Parser.prototype._processOpenTag = function(name, data){ var type = ElementType.Tag; if(this._options.xmlMode){ /*do nothing*/ } else if(name === "script") type = ElementType.Script; else if(name === "style") type = ElementType.Style; + if(this._cbs.onopentagname){ + this._cbs.onopentagname(name); + } if(this._cbs.onopentag){ this._cbs.onopentag(name, parseAttributes(data), type); } + if(this._cbs.onattribute){ + this._parseAttributes(data); + } //If tag self-terminates, add an explicit, separate closing tag if(data.substr(-1) === "/" || (emptyTags[name] && !this._options.xmlMode)){ diff --git a/lib/Stream.js b/lib/Stream.js index 350dfe1..f02abbf 100644 --- a/lib/Stream.js +++ b/lib/Stream.js @@ -40,6 +40,12 @@ cbs.prototype = { }, onopentag: function(name, attribs, type){ this.scope.emit("opentag", name, attribs, type); + }, + onopentagname: function(name){ + this.scope.emit("opentagname", name); + }, + onattribute: function(name, value){ + this.scope.emit("attribute", name, value); }, onprocessinginstruction: function(name, data){ this.scope.emit("processinginstruction", name, data); diff --git a/tests/Events/01-simple.js b/tests/Events/01-simple.js index cfecf9c..2307b05 100644 --- a/tests/Events/01-simple.js +++ b/tests/Events/01-simple.js @@ -2,6 +2,12 @@ exports.name = "simple"; exports.options = {handler: {}, parser: {}}; exports.html = "

adsf

"; exports.expected = [ + { + "event": "opentagname", + "data": [ + "h1" + ] + }, { "event": "opentag", "data": [ @@ -12,6 +18,13 @@ exports.expected = [ "tag" ] }, + { + "event": "attribute", + "data": [ + "class", + "test" + ] + }, { "event": "text", "data": [ diff --git a/tests/Events/02-template.js b/tests/Events/02-template.js index 6f68857..467f02e 100644 --- a/tests/Events/02-template.js +++ b/tests/Events/02-template.js @@ -2,6 +2,12 @@ exports.name = "Template script tags"; exports.options = {handler: {}, parser: {}}; exports.html = ""; exports.expected = [ + { + "event": "opentagname", + "data": [ + "script" + ] + }, { "event": "opentag", "data": [ @@ -12,6 +18,13 @@ exports.expected = [ "script" ] }, + { + "event": "attribute", + "data": [ + "type", + "text/template" + ] + }, { "event": "text", "data": [ diff --git a/tests/Events/03-lowercase_tags.js b/tests/Events/03-lowercase_tags.js index 42348f9..55e8ac5 100644 --- a/tests/Events/03-lowercase_tags.js +++ b/tests/Events/03-lowercase_tags.js @@ -1,7 +1,13 @@ -exports.name = "simple"; +exports.name = "Lowercase tags"; exports.options = {handler: {}, parser: {lowerCaseTags:true}}; exports.html = "

adsf

"; exports.expected = [ + { + "event": "opentagname", + "data": [ + "h1" + ] + }, { "event": "opentag", "data": [ @@ -12,6 +18,13 @@ exports.expected = [ "tag" ] }, + { + "event": "attribute", + "data": [ + "class", + "test" + ] + }, { "event": "text", "data": [ diff --git a/tests/Events/04-cdata.js b/tests/Events/04-cdata.js index 197235d..b0d18be 100644 --- a/tests/Events/04-cdata.js +++ b/tests/Events/04-cdata.js @@ -1,7 +1,13 @@ -exports.name = "simple"; +exports.name = "CDATA"; exports.options = {handler: {}, parser: {}}; exports.html = "<> fo]]>"; exports.expected = [ + { + "event": "opentagname", + "data": [ + "tag" + ] + }, { "event": "opentag", "data": [ diff --git a/tests/test-helper.js b/tests/test-helper.js index 8fc24fc..07af5c9 100644 --- a/tests/test-helper.js +++ b/tests/test-helper.js @@ -12,4 +12,4 @@ exports.writeToParser = function(handler, options, data){ parser.parseComplete(data); } -exports.EVENTS = ["cdatastart", "cdataend", "text", "processinginstruction", "comment", "commentend", "closetag", "opentag"/*, "error", "end"*/]; \ No newline at end of file +exports.EVENTS = ["attribute", "cdatastart", "cdataend", "text", "processinginstruction", "comment", "commentend", "closetag", "opentag", "opentagname"/*, "error", "end"*/]; \ No newline at end of file From 3ea9838e43abf3daaedbc292669dac26e821e74a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 6 Jan 2012 13:44:49 +0100 Subject: [PATCH 157/450] Moved feeds to /tests/Documents They shouldn't be inside the test files --- tests/Documents/Atom_Example.xml | 25 +++++++++++++ tests/Documents/RDF_Example.xml | 63 ++++++++++++++++++++++++++++++++ tests/Documents/RSS_Example.xml | 48 ++++++++++++++++++++++++ tests/Feeds/01-rss.js | 49 +------------------------ tests/Feeds/02-atom.js | 28 +------------- tests/Feeds/03-rdf.js | 2 +- 6 files changed, 139 insertions(+), 76 deletions(-) create mode 100644 tests/Documents/Atom_Example.xml create mode 100644 tests/Documents/RDF_Example.xml create mode 100644 tests/Documents/RSS_Example.xml diff --git a/tests/Documents/Atom_Example.xml b/tests/Documents/Atom_Example.xml new file mode 100644 index 0000000..7349745 --- /dev/null +++ b/tests/Documents/Atom_Example.xml @@ -0,0 +1,25 @@ + + + + Example Feed + A subtitle. + + + urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 + 2003-12-13T18:30:02Z + + John Doe + johndoe@example.com + + + + Atom-Powered Robots Run Amok + + + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + Some text. + + + \ No newline at end of file diff --git a/tests/Documents/RDF_Example.xml b/tests/Documents/RDF_Example.xml new file mode 100644 index 0000000..068da17 --- /dev/null +++ b/tests/Documents/RDF_Example.xml @@ -0,0 +1,63 @@ + + + + craigslist | all community in SF bay area + http://sfbay.craigslist.org/ccc/ + + en-us + Copyright 2011 craigslist, inc. + webmaster@craigslist.org + webmaster@craigslist.org + http://sfbay.craigslist.org/ccc// + craigslist | all community in SF bay area + Collection + 2011-11-04T09:39:10-07:00 + 4 + hourly + + + + + + + + <![CDATA[ Music Equipment Repair and Consignment ]]> + +http://sfbay.craigslist.org/sby/muc/2681301534.html + +
We are pleased to announce our NEW LOCATION: 1199 N 5th st. San Jose, ca 95112. Please call ahead, by appointment only.

Recently featured by Metro Newspaper in their 2011 Best of the Silicon Valley edition see it online here:
http://www.metroactive.com/best-of-silicon-valley/2011/music-nightlife/editor-picks.html

Guitar Set up (acoustic and electronic) $40! +]]>
+ 2011-11-04T09:35:17-07:00 + en-us + Copyright 2011 craigslist, inc. + +http://sfbay.craigslist.org/sby/muc/2681301534.html + + + text + 2011-11-04T09:35:17-07:00 +
+ + <![CDATA[ +Ride Offered - Oakland/BART to LA/SFV - TODAY 3PM 11/04 (oakland north / temescal) +]]> + +http://sfbay.craigslist.org/eby/rid/2685010755.html + + +]]> + 2011-11-04T09:34:54-07:00 + en-us + Copyright 2011 craigslist, inc. + +http://sfbay.craigslist.org/eby/rid/2685010755.html + + + text + 2011-11-04T09:34:54-07:00 + +
\ No newline at end of file diff --git a/tests/Documents/RSS_Example.xml b/tests/Documents/RSS_Example.xml new file mode 100644 index 0000000..0d1fde8 --- /dev/null +++ b/tests/Documents/RSS_Example.xml @@ -0,0 +1,48 @@ + + + + + Liftoff News + http://liftoff.msfc.nasa.gov/ + Liftoff to Space Exploration. + en-us + Tue, 10 Jun 2003 04:00:00 GMT + + Tue, 10 Jun 2003 09:41:01 GMT + http://blogs.law.harvard.edu/tech/rss + Weblog Editor 2.0 + editor@example.com + webmaster@example.com + + + Star City + http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp + How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>. + Tue, 03 Jun 2003 09:39:21 GMT + http://liftoff.msfc.nasa.gov/2003/06/03.html#item573 + + + + Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st. + Fri, 30 May 2003 11:06:42 GMT + http://liftoff.msfc.nasa.gov/2003/05/30.html#item572 + + + + The Engine That Does More + http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp + Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that. + Tue, 27 May 2003 08:37:32 GMT + http://liftoff.msfc.nasa.gov/2003/05/27.html#item571 + + + + Astronauts' Dirty Laundry + http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp + Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options. + Tue, 20 May 2003 08:56:02 GMT + http://liftoff.msfc.nasa.gov/2003/05/20.html#item570 + + + + \ No newline at end of file diff --git a/tests/Feeds/01-rss.js b/tests/Feeds/01-rss.js index 8059ed6..0af3e1a 100644 --- a/tests/Feeds/01-rss.js +++ b/tests/Feeds/01-rss.js @@ -6,54 +6,7 @@ exports.options = { } }; exports.type = "rss"; -//http://cyber.law.harvard.edu/rss/examples/rss2sample.xml -exports.html = '\ -\ - \ - Liftoff News\ - http://liftoff.msfc.nasa.gov/\ - Liftoff to Space Exploration.\ - en-us\ - Tue, 10 Jun 2003 04:00:00 GMT\ -\ - Tue, 10 Jun 2003 09:41:01 GMT\ - http://blogs.law.harvard.edu/tech/rss\ - Weblog Editor 2.0\ - editor@example.com\ - webmaster@example.com\ - \ -\ - Star City\ - http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp\ - How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia\'s <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.\ - Tue, 03 Jun 2003 09:39:21 GMT\ - http://liftoff.msfc.nasa.gov/2003/06/03.html#item573\ -\ - \ - \ - Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st.\ - Fri, 30 May 2003 11:06:42 GMT\ - http://liftoff.msfc.nasa.gov/2003/05/30.html#item572\ -\ - \ - \ - The Engine That Does More\ - http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp\ - Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.\ - Tue, 27 May 2003 08:37:32 GMT\ - http://liftoff.msfc.nasa.gov/2003/05/27.html#item571\ -\ - \ - \ - Astronauts\' Dirty Laundry\ - http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp\ - Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.\ - Tue, 20 May 2003 08:56:02 GMT\ - http://liftoff.msfc.nasa.gov/2003/05/20.html#item570\ -\ - \ - \ -'; +exports.html = require("fs").readFileSync(__dirname+"/../Documents/RSS_Example.xml").toString(); exports.expected = { type: "rss", id: "", diff --git a/tests/Feeds/02-atom.js b/tests/Feeds/02-atom.js index 48674c9..a9c5c47 100644 --- a/tests/Feeds/02-atom.js +++ b/tests/Feeds/02-atom.js @@ -6,33 +6,7 @@ exports.options = { } }; exports.type = "rss"; -//http://en.wikipedia.org/wiki/Atom_%28standard%29 -exports.html = '\ -\ -\ -\ - Example Feed\ - A subtitle.\ - \ - \ - urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6\ - 2003-12-13T18:30:02Z\ - \ - John Doe\ - johndoe@example.com\ - \ -\ - \ - Atom-Powered Robots Run Amok\ - \ - \ - \ - urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a\ - 2003-12-13T18:30:02Z\ - Some text.\ - \ -\ -'; +exports.html = require("fs").readFileSync(__dirname+"/../Documents/Atom_Example.xml").toString(); exports.expected = { type: "atom", id: "urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6", diff --git a/tests/Feeds/03-rdf.js b/tests/Feeds/03-rdf.js index 96e4c89..90c05cb 100644 --- a/tests/Feeds/03-rdf.js +++ b/tests/Feeds/03-rdf.js @@ -6,7 +6,7 @@ exports.options = { } }; -exports.html = '\n\n\t\n\t\tcraigslist | all community in SF bay area\n\t\thttp://sfbay.craigslist.org/ccc/\n\t\t\n\t\ten-us\n\t\tCopyright 2011 craigslist, inc.\n\t\twebmaster@craigslist.org\n\t\twebmaster@craigslist.org\n\t\thttp://sfbay.craigslist.org/ccc//\n\t\tcraigslist | all community in SF bay area\n\t\tCollection\n\t\t2011-11-04T09:39:10-07:00\n\t\t4\n\t\thourly\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\t\n\t\t<![CDATA[ Music Equipment Repair and Consignment ]]>\n\t\t\nhttp://sfbay.craigslist.org/sby/muc/2681301534.html\n\n\t\t
We are pleased to announce our NEW LOCATION: 1199 N 5th st. San Jose, ca 95112. Please call ahead, by appointment only.

Recently featured by Metro Newspaper in their 2011 Best of the Silicon Valley edition see it online here:
http://www.metroactive.com/best-of-silicon-valley/2011/music-nightlife/editor-picks.html

Guitar Set up (acoustic and electronic) $40!\n]]>
\n\t\t2011-11-04T09:35:17-07:00\n\t\ten-us\n\t\tCopyright 2011 craigslist, inc.\n\t\t\nhttp://sfbay.craigslist.org/sby/muc/2681301534.html\n\n\t\t\n\t\ttext\n\t\t2011-11-04T09:35:17-07:00\n\t
\n\t\n\t\t<![CDATA[\nRide Offered - Oakland/BART to LA/SFV - TODAY 3PM 11/04 (oakland north / temescal)\n]]>\n\t\t\nhttp://sfbay.craigslist.org/eby/rid/2685010755.html\n\n\t\t\n]]>\n\t\t2011-11-04T09:34:54-07:00\n\t\ten-us\n\t\tCopyright 2011 craigslist, inc.\n\t\t\nhttp://sfbay.craigslist.org/eby/rid/2685010755.html\n\n\t\t\n\t\ttext\n\t\t2011-11-04T09:34:54-07:00\n\t\n
'; +exports.html = require("fs").readFileSync(__dirname+"/../Documents/RDF_Example.xml").toString(); exports.expected = { "type": "rdf:RDF", From 34dd1b9389276a7d7279298d2e6d663276ba3098 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 6 Jan 2012 14:24:09 +0100 Subject: [PATCH 158/450] Added missing `end` event to Stream --- lib/Stream.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/Stream.js b/lib/Stream.js index f02abbf..c4443f9 100644 --- a/lib/Stream.js +++ b/lib/Stream.js @@ -50,6 +50,9 @@ cbs.prototype = { onprocessinginstruction: function(name, data){ this.scope.emit("processinginstruction", name, data); }, + onend: function(){ + this.scope.emit("end"); + }, onreset: function(){ this.scope.emit("reset"); }, From fbc29a9b8c48a7c34c91651cd118e0cf291c3d85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Fri, 6 Jan 2012 14:34:52 +0100 Subject: [PATCH 159/450] Added a test for Streams, made runtests async --- tests/00-runtests.js | 54 +- tests/05-stream.js | 53 ++ tests/Documents/Basic.html | 1 + tests/HTML/01-basic.js | 2 +- tests/Stream/01-basic.js | 85 +++ tests/Stream/02-RSS.js | 1126 ++++++++++++++++++++++++++++++++++++ 6 files changed, 1295 insertions(+), 26 deletions(-) create mode 100644 tests/05-stream.js create mode 100644 tests/Documents/Basic.html create mode 100644 tests/Stream/01-basic.js create mode 100644 tests/Stream/02-RSS.js diff --git a/tests/00-runtests.js b/tests/00-runtests.js index af716b9..a8c8eb0 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -1,8 +1,8 @@ var fs = require("fs"); -var testCount = 0, - failCount = 0, - totalTime = 0; +var runCount = 0, + testCount = 0, + failCount = 0; function runTests(test){ var begin = Date.now(); @@ -14,15 +14,14 @@ function runTests(test){ }).forEach(function(file){ if(file === false) return; var second = false, - failed = false, - start = Date.now(), - took = 0; + failed = false; + + runCount++; console.log("Testing:", file.name); test.test(file, function(err, dom){ if(err) console.log("Handler error:", err); - took += Date.now() - start; var expected = JSON.stringify(file.expected, null, 2), got = JSON.stringify(dom, null, 2); @@ -31,32 +30,37 @@ function runTests(test){ console.log("Expected", expected, "Got", got, second); } - start = Date.now(); - if(second){ - testCount+=1; - if(failed) failCount+=1; + runCount--; + testCount++; + if(failed) failCount++; - console.log("["+file.name+"]:",failed?"failed":"passed","(took",took,"ms)"); + console.log("["+file.name+"]:", failed ? "failed":"passed"); } else second = true; }); }); - var took = Date.now()-begin; - totalTime+=took; - console.log(test.dir,"took",took); + console.log("->", test.dir.slice(1, -1), "iterated"); }; //run all tests -["./01-html.js", "./02-feed.js", "./03-events.js", "./04-dom_utils.js"] - .map(require) - .forEach(runTests); +[ + "./01-html.js", + "./02-feed.js", + "./03-events.js", + "./04-dom_utils.js", + "./05-stream.js" +].map(require).forEach(runTests); //log the results -console.log("Total time:", totalTime); -console.log("Total tests:", testCount); -console.log("Failed tests:", failCount); - -if(failCount !== 0){ - throw Error("Encountered " + failCount + " errors!"); -} \ No newline at end of file +(function check(){ + if(runCount !== 0){ + return setTimeout(check, 50); + } + console.log("Total tests:", testCount); + console.log("Failed tests:", failCount); + + if(failCount !== 0){ + throw Error("Encountered " + failCount + " errors!"); + } +})(); \ No newline at end of file diff --git a/tests/05-stream.js b/tests/05-stream.js new file mode 100644 index 0000000..92f45bb --- /dev/null +++ b/tests/05-stream.js @@ -0,0 +1,53 @@ +var helper = require("./test-helper.js"), + Stream = require("..").Stream, + sliceArr = Array.prototype.slice, + fs = require("fs"); + +exports.dir = "/Stream/"; + +exports.test = function(test, cb){ + var tokens = [], + stream = new Stream(test.options), + second = false; + + if(typeof Proxy !== "undefined"){ + stream._events = Proxy.create({ get: function(a, name){ + if(name === "end"){ + return function(){ + cb(null, tokens.splice(0)); + if(!second){ + second = true; + stream.parseComplete(fs.readFileSync(__dirname + test.file).toString()); + } + }; + } + if(helper.EVENTS.indexOf(name) !== -1) return function(){ + tokens.push({ + event: name, + data: sliceArr.apply(arguments) + }); + } + }}); + } + else { + stream._events = { + error: cb, + end: function(){ + cb(null, tokens.splice(0)); + if(!second){ + second = true; + stream.parseComplete(fs.readFileSync(__dirname + test.file).toString()); + } + } + }; + helper.EVENTS.forEach(function(name){ + stream._events[name] = function(){ + tokens.push({ + event: name, + data: sliceArr.apply(arguments) + }); + } + }); + } + fs.createReadStream(__dirname + test.file).pipe(stream); +}; \ No newline at end of file diff --git a/tests/Documents/Basic.html b/tests/Documents/Basic.html new file mode 100644 index 0000000..65957a2 --- /dev/null +++ b/tests/Documents/Basic.html @@ -0,0 +1 @@ +The TitleHello world \ No newline at end of file diff --git a/tests/HTML/01-basic.js b/tests/HTML/01-basic.js index cba7edd..beaa970 100644 --- a/tests/HTML/01-basic.js +++ b/tests/HTML/01-basic.js @@ -3,7 +3,7 @@ exports.options = { handler: {} , parser: {} }; -exports.html = "The TitleHello world"; +exports.html = require("fs").readFileSync(__dirname + "/../Documents/Basic.html").toString(); exports.expected = [ { "name": "!DOCTYPE", diff --git a/tests/Stream/01-basic.js b/tests/Stream/01-basic.js new file mode 100644 index 0000000..eada9ab --- /dev/null +++ b/tests/Stream/01-basic.js @@ -0,0 +1,85 @@ +exports.name = "Basic html"; +exports.options = {}; + +exports.file = "/Documents/Basic.html"; +exports.expected = [ + { + "event": "processinginstruction", + "data": [ + "!DOCTYPE", + "!DOCTYPE html" + ] + }, + { + "event": "opentagname", + "data": [ + "html" + ] + }, + { + "event": "opentag", + "data": [ + "html", + {}, + "tag" + ] + }, + { + "event": "opentagname", + "data": [ + "title" + ] + }, + { + "event": "opentag", + "data": [ + "title", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "The Title" + ] + }, + { + "event": "closetag", + "data": [ + "title" + ] + }, + { + "event": "opentagname", + "data": [ + "body" + ] + }, + { + "event": "opentag", + "data": [ + "body", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Hello world" + ] + }, + { + "event": "closetag", + "data": [ + "body" + ] + }, + { + "event": "closetag", + "data": [ + "html" + ] + } +]; \ No newline at end of file diff --git a/tests/Stream/02-RSS.js b/tests/Stream/02-RSS.js new file mode 100644 index 0000000..8accd12 --- /dev/null +++ b/tests/Stream/02-RSS.js @@ -0,0 +1,1126 @@ +exports.name = "RSS feed"; +exports.options = {}; + +exports.file = "/Documents/RSS_Example.xml"; +exports.expected = [ + { + "event": "processinginstruction", + "data": [ + "?xml", + "?xml version=\"1.0\"?" + ] + }, + { + "event": "text", + "data": [ + "\n" + ] + }, + { + "event": "comment", + "data": [ + " http://cyber.law.harvard.edu/rss/examples/rss2sample.xml " + ] + }, + { + "event": "commentend", + "data": [] + }, + { + "event": "text", + "data": [ + "\n" + ] + }, + { + "event": "opentagname", + "data": [ + "rss" + ] + }, + { + "event": "opentag", + "data": [ + "rss", + { + "version": "2.0" + }, + "tag" + ] + }, + { + "event": "attribute", + "data": [ + "version", + "2.0" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "channel" + ] + }, + { + "event": "opentag", + "data": [ + "channel", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "title" + ] + }, + { + "event": "opentag", + "data": [ + "title", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Liftoff News" + ] + }, + { + "event": "closetag", + "data": [ + "title" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "link" + ] + }, + { + "event": "opentag", + "data": [ + "link", + {}, + "tag" + ] + }, + { + "event": "closetag", + "data": [ + "link" + ] + }, + { + "event": "text", + "data": [ + "http://liftoff.msfc.nasa.gov/" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "description" + ] + }, + { + "event": "opentag", + "data": [ + "description", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Liftoff to Space Exploration." + ] + }, + { + "event": "closetag", + "data": [ + "description" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "language" + ] + }, + { + "event": "opentag", + "data": [ + "language", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "en-us" + ] + }, + { + "event": "closetag", + "data": [ + "language" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "pubDate" + ] + }, + { + "event": "opentag", + "data": [ + "pubDate", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Tue, 10 Jun 2003 04:00:00 GMT" + ] + }, + { + "event": "closetag", + "data": [ + "pubDate" + ] + }, + { + "event": "text", + "data": [ + "\n\n " + ] + }, + { + "event": "opentagname", + "data": [ + "lastBuildDate" + ] + }, + { + "event": "opentag", + "data": [ + "lastBuildDate", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Tue, 10 Jun 2003 09:41:01 GMT" + ] + }, + { + "event": "closetag", + "data": [ + "lastBuildDate" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "docs" + ] + }, + { + "event": "opentag", + "data": [ + "docs", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "http://blogs.law.harvard.edu/tech/rss" + ] + }, + { + "event": "closetag", + "data": [ + "docs" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "generator" + ] + }, + { + "event": "opentag", + "data": [ + "generator", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Weblog Editor 2.0" + ] + }, + { + "event": "closetag", + "data": [ + "generator" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "managingEditor" + ] + }, + { + "event": "opentag", + "data": [ + "managingEditor", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "editor@example.com" + ] + }, + { + "event": "closetag", + "data": [ + "managingEditor" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "webMaster" + ] + }, + { + "event": "opentag", + "data": [ + "webMaster", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "webmaster@example.com" + ] + }, + { + "event": "closetag", + "data": [ + "webMaster" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "item" + ] + }, + { + "event": "opentag", + "data": [ + "item", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "\n\n " + ] + }, + { + "event": "opentagname", + "data": [ + "title" + ] + }, + { + "event": "opentag", + "data": [ + "title", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Star City" + ] + }, + { + "event": "closetag", + "data": [ + "title" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "link" + ] + }, + { + "event": "opentag", + "data": [ + "link", + {}, + "tag" + ] + }, + { + "event": "closetag", + "data": [ + "link" + ] + }, + { + "event": "text", + "data": [ + "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "description" + ] + }, + { + "event": "opentag", + "data": [ + "description", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href=\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\">Star City</a>." + ] + }, + { + "event": "closetag", + "data": [ + "description" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "pubDate" + ] + }, + { + "event": "opentag", + "data": [ + "pubDate", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Tue, 03 Jun 2003 09:39:21 GMT" + ] + }, + { + "event": "closetag", + "data": [ + "pubDate" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "guid" + ] + }, + { + "event": "opentag", + "data": [ + "guid", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573" + ] + }, + { + "event": "closetag", + "data": [ + "guid" + ] + }, + { + "event": "text", + "data": [ + "\n\n " + ] + }, + { + "event": "closetag", + "data": [ + "item" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "item" + ] + }, + { + "event": "opentag", + "data": [ + "item", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "description" + ] + }, + { + "event": "opentag", + "data": [ + "description", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href=\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\">partial eclipse of the Sun</a> on Saturday, May 31st." + ] + }, + { + "event": "closetag", + "data": [ + "description" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "pubDate" + ] + }, + { + "event": "opentag", + "data": [ + "pubDate", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Fri, 30 May 2003 11:06:42 GMT" + ] + }, + { + "event": "closetag", + "data": [ + "pubDate" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "guid" + ] + }, + { + "event": "opentag", + "data": [ + "guid", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572" + ] + }, + { + "event": "closetag", + "data": [ + "guid" + ] + }, + { + "event": "text", + "data": [ + "\n\n " + ] + }, + { + "event": "closetag", + "data": [ + "item" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "item" + ] + }, + { + "event": "opentag", + "data": [ + "item", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "title" + ] + }, + { + "event": "opentag", + "data": [ + "title", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "The Engine That Does More" + ] + }, + { + "event": "closetag", + "data": [ + "title" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "link" + ] + }, + { + "event": "opentag", + "data": [ + "link", + {}, + "tag" + ] + }, + { + "event": "closetag", + "data": [ + "link" + ] + }, + { + "event": "text", + "data": [ + "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "description" + ] + }, + { + "event": "opentag", + "data": [ + "description", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that." + ] + }, + { + "event": "closetag", + "data": [ + "description" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "pubDate" + ] + }, + { + "event": "opentag", + "data": [ + "pubDate", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Tue, 27 May 2003 08:37:32 GMT" + ] + }, + { + "event": "closetag", + "data": [ + "pubDate" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "guid" + ] + }, + { + "event": "opentag", + "data": [ + "guid", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571" + ] + }, + { + "event": "closetag", + "data": [ + "guid" + ] + }, + { + "event": "text", + "data": [ + "\n\n " + ] + }, + { + "event": "closetag", + "data": [ + "item" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "item" + ] + }, + { + "event": "opentag", + "data": [ + "item", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "title" + ] + }, + { + "event": "opentag", + "data": [ + "title", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Astronauts' Dirty Laundry" + ] + }, + { + "event": "closetag", + "data": [ + "title" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "link" + ] + }, + { + "event": "opentag", + "data": [ + "link", + {}, + "tag" + ] + }, + { + "event": "closetag", + "data": [ + "link" + ] + }, + { + "event": "text", + "data": [ + "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "description" + ] + }, + { + "event": "opentag", + "data": [ + "description", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options." + ] + }, + { + "event": "closetag", + "data": [ + "description" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "pubDate" + ] + }, + { + "event": "opentag", + "data": [ + "pubDate", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "Tue, 20 May 2003 08:56:02 GMT" + ] + }, + { + "event": "closetag", + "data": [ + "pubDate" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "opentagname", + "data": [ + "guid" + ] + }, + { + "event": "opentag", + "data": [ + "guid", + {}, + "tag" + ] + }, + { + "event": "text", + "data": [ + "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570" + ] + }, + { + "event": "closetag", + "data": [ + "guid" + ] + }, + { + "event": "text", + "data": [ + "\n\n " + ] + }, + { + "event": "closetag", + "data": [ + "item" + ] + }, + { + "event": "text", + "data": [ + "\n " + ] + }, + { + "event": "closetag", + "data": [ + "channel" + ] + }, + { + "event": "text", + "data": [ + "\n" + ] + }, + { + "event": "closetag", + "data": [ + "rss" + ] + } +]; \ No newline at end of file From 98098ed5880dc651f42f6d83e6152048c8303fe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 7 Jan 2012 22:33:50 +0100 Subject: [PATCH 160/450] Changed comments --- lib/ElementType.js | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/ElementType.js b/lib/ElementType.js index b2c8a10..618465e 100644 --- a/lib/ElementType.js +++ b/lib/ElementType.js @@ -1,10 +1,10 @@ //Types of elements found in the DOM module.exports = { - Text: "text", //Plain text - Directive: "directive", //Special tag - Comment: "comment", //Special tag - Script: "script", //Special tag - Style: "style", //Special tag - Tag: "tag", //Any tag that isn't special - CDATA: "cdata" + Text: "text", //Text + Directive: "directive", // + Comment: "comment", // + Script: "script", // text ") From c3259d3bb7e4fb4898538e31cd5f0c537de6d6e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Mon, 9 Jan 2012 15:27:37 +0100 Subject: [PATCH 165/450] Use the new name inside the readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d6697af..1a4fb12 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ The support for location data and verbose output was removed a couple of version ```javascript var htmlparser = require("htmlparser"); var rawHtml = "Xyz text '; +}; +exports.getByFunction = function(dom){ + return DomUtils.getOuterHTML(DomUtils.getElementById("asdf", dom, true)); +}; +exports.expected = ' text '; diff --git a/tests/DomUtils/05-inner_html.js b/tests/DomUtils/05-inner_html.js new file mode 100644 index 0000000..72dba0e --- /dev/null +++ b/tests/DomUtils/05-inner_html.js @@ -0,0 +1,10 @@ +var DomUtils = require("../../lib/DomUtils.js"); + +exports.name = "Get inner HTML"; +exports.getElements = function(dom){ + return ' text '; +}; +exports.getByFunction = function(dom){ + return DomUtils.getInnerHTML(DomUtils.getElementById("asdf", dom, true)); +}; +exports.expected = ' text '; From 862e0b73a3c69ae42abfbd2c2b47f644502a9648 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sat, 28 Jan 2012 17:00:46 +0100 Subject: [PATCH 190/450] Fixed a bug `` got `/` as an attribute --- lib/Parser.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Parser.js b/lib/Parser.js index 21ca835..7827f52 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -14,7 +14,7 @@ function Parser(cbs, options){ } //Regular expressions used for cleaning up and parsing (stateless) -var _reAttrib = /\s(\S+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g, +var _reAttrib = /\s([^\s\/]+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g, _reTail = /\s|\/|$/; Parser.prototype._options = { From a6ce1f43ed4bf9123d920b59e635e8d873569590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 29 Jan 2012 13:27:12 +0100 Subject: [PATCH 191/450] 2.2.3 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 8deb4b5..624215b 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "htmlparser2", "description": "Performance-optimized forgiving HTML/XML/RSS parser", - "version": "2.2.2", + "version": "2.2.3", "author": "Felix Boehm ", "keywords": ["html", "parser", "streams", "xml", "dom", "rss", "feed", "atom"], "contributors": ["Chris Winberry "], From 1ec1578c8c5857b91fba9413fb5dc74a67fb7778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 29 Jan 2012 13:29:15 +0100 Subject: [PATCH 192/450] Sort properties before stringify in runtests --- tests/00-runtests.js | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/00-runtests.js b/tests/00-runtests.js index 03f5e33..ab997ed 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -4,6 +4,14 @@ var runCount = 0, testCount = 0, failCount = 0; +function getSortedObject(obj){ + if(typeof obj !== "object" || Array.isArray(obj)) return obj; + return Object.keys(obj).sort().reduce(function(o, name){ + o[name] = obj[name]; + return o; + }, {}); +}; + function runTests(test){ var begin = Date.now(); //read files, load them, run them @@ -23,8 +31,8 @@ function runTests(test){ test.test(file, function(err, dom){ if(err) console.log("Handler error:", err); - var expected = JSON.stringify(file.expected, null, 2), - got = JSON.stringify(dom, null, 2); + var expected = JSON.stringify(getSortedObject(file.expected), null, 2), + got = JSON.stringify(getSortedObject(dom), null, 2); if(expected !== got){ failed = true; console.log("Expected", expected, "Got", got, second); From d2bdcbc046a6d3eb63d946fbd008d258d57ae6a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 29 Jan 2012 13:29:59 +0100 Subject: [PATCH 193/450] RDF feeds now have a type of `rdf` --- lib/FeedHandler.js | 2 +- tests/Feeds/03-rdf.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/FeedHandler.js b/lib/FeedHandler.js index 6ec480b..ca6c856 100644 --- a/lib/FeedHandler.js +++ b/lib/FeedHandler.js @@ -56,7 +56,7 @@ FeedHandler.prototype.onend = function() { } else{ childs = getElements("channel", feedRoot.children, true).children; - feed.type = feedRoot.name; + feed.type = feedRoot.name.substr(0, 3); feed.id = ""; if(tmp = fetch("title", childs)) feed.title = tmp; if(tmp = fetch("link", childs)) feed.link = tmp; diff --git a/tests/Feeds/03-rdf.js b/tests/Feeds/03-rdf.js index 3460444..96065b1 100644 --- a/tests/Feeds/03-rdf.js +++ b/tests/Feeds/03-rdf.js @@ -9,7 +9,7 @@ exports.options = { exports.html = require("fs").readFileSync(__dirname+"/../Documents/RDF_Example.xml").toString(); exports.expected = { - "type": "rdf:RDF", + "type": "rdf", "id": "", "title": "craigslist | all community in SF bay area", "link": "http://sfbay.craigslist.org/ccc/", From 9eebc264c4b99f1b3434b9491cdf7f385040dae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 29 Jan 2012 14:13:54 +0100 Subject: [PATCH 194/450] Removed clutter from tests --- tests/00-runtests.js | 1 - tests/02-feed.js | 9 ++++++--- tests/Feeds/01-rss.js | 7 ------- tests/Feeds/02-atom.js | 7 ------- tests/Feeds/03-rdf.js | 8 -------- 5 files changed, 6 insertions(+), 26 deletions(-) diff --git a/tests/00-runtests.js b/tests/00-runtests.js index ab997ed..54f7fe0 100644 --- a/tests/00-runtests.js +++ b/tests/00-runtests.js @@ -13,7 +13,6 @@ function getSortedObject(obj){ }; function runTests(test){ - var begin = Date.now(); //read files, load them, run them fs.readdirSync(__dirname + test.dir ).map(function(file){ diff --git a/tests/02-feed.js b/tests/02-feed.js index e2b0911..cd5dc56 100644 --- a/tests/02-feed.js +++ b/tests/02-feed.js @@ -1,7 +1,10 @@ //Runs tests for feeds var helper = require("./test-helper.js"), - FeedHandler = require("../lib/FeedHandler.js"); + FeedHandler = require("../lib/FeedHandler.js"), + parserOpts = { + xmlMode: true + }; exports.dir = "/Feeds/"; @@ -9,6 +12,6 @@ exports.test = function(test, cb){ var handler = new FeedHandler(function(err, dom){ if(err) cb(err, 0); //return the error else cb(null, dom); - }, test.options.handler); - helper.writeToParser(handler, test.options.parser, test.html); + }); + helper.writeToParser(handler, parserOpts, test.html); }; \ No newline at end of file diff --git a/tests/Feeds/01-rss.js b/tests/Feeds/01-rss.js index 0af3e1a..b50a468 100644 --- a/tests/Feeds/01-rss.js +++ b/tests/Feeds/01-rss.js @@ -1,11 +1,4 @@ exports.name = "RSS (2.0)"; -exports.options = { - handler: {}, - parser: { - xmlMode: true - } -}; -exports.type = "rss"; exports.html = require("fs").readFileSync(__dirname+"/../Documents/RSS_Example.xml").toString(); exports.expected = { type: "rss", diff --git a/tests/Feeds/02-atom.js b/tests/Feeds/02-atom.js index a9c5c47..c986d73 100644 --- a/tests/Feeds/02-atom.js +++ b/tests/Feeds/02-atom.js @@ -1,11 +1,4 @@ exports.name = "Atom (1.0)"; -exports.options = { - handler: {}, - parser: { - xmlMode: true - } -}; -exports.type = "rss"; exports.html = require("fs").readFileSync(__dirname+"/../Documents/Atom_Example.xml").toString(); exports.expected = { type: "atom", diff --git a/tests/Feeds/03-rdf.js b/tests/Feeds/03-rdf.js index 96065b1..2c7383c 100644 --- a/tests/Feeds/03-rdf.js +++ b/tests/Feeds/03-rdf.js @@ -1,13 +1,5 @@ exports.name = "RDF test"; -exports.options = { - handler: {}, - parser: { - xmlMode: true - } -}; - exports.html = require("fs").readFileSync(__dirname+"/../Documents/RDF_Example.xml").toString(); - exports.expected = { "type": "rdf", "id": "", From e7cb57f87043b358b24a8193d9720c7e6ef3413c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 7 Feb 2012 19:53:44 +0100 Subject: [PATCH 195/450] Set prototype of emptyTags to null Otherwise, tags named after properties of the object prototype would have been found. --- lib/Parser.js | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Parser.js b/lib/Parser.js index 7827f52..b2033aa 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -254,6 +254,7 @@ Parser.prototype._writeSpecial = function(rawData, lastTagSep){ }; var emptyTags = { + __proto__: null, area: true, base: true, basefont: true, From f406be9731952c436ebfe9721ea4c44fb87a32a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Wed, 8 Feb 2012 20:05:09 +0100 Subject: [PATCH 196/450] Added bench.js taken from astro/node-expat --- README.md | 32 ++++++++++------ tests/bench.js | 100 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 11 deletions(-) create mode 100644 tests/bench.js diff --git a/README.md b/README.md index f1aad76..cb32994 100644 --- a/README.md +++ b/README.md @@ -5,15 +5,6 @@ A forgiving HTML/XML/RSS parser written in JS for NodeJS. The parser can handle ##Installing npm install htmlparser2 -##How is this different from [node-htmlparser](https://github.com/tautologistics/node-htmlparser)? -This is a fork of the project above. The main difference is that this is just intended to be used with node (it runs on other platforms using [browserify](https://github.com/substack/node-browserify)). Besides, the code is much better structured, has less duplications and is remarkably faster than the original. - -The parser now provides a callback interface close to [sax.js](https://github.com/isaacs/sax-js) (originally intended for [readabilitySAX](https://github.com/fb55/readabilitysax)). I also fixed a couple of bugs & included some pull requests for the original project (eg. [RDF feed support](https://github.com/tautologistics/node-htmlparser/pull/35)). - -The support for location data and verbose output was removed a couple of versions ago. It's still available in the [verbose branch](https://github.com/FB55/node-htmlparser/tree/verbose) (if you really need it, for whatever reason that may be). - -The `DefaultHandler` and the `RssHandler` were renamed to clarify their purpose (to `DomHandler` and `FeedHandler`). The old names are still available when requiring `htmlparser2`, so your code should work as expected. - ##Usage ```javascript @@ -56,7 +47,26 @@ Read more about the DomHandler in the [wiki](https://github.com/FB55/node-htmlpa ##Parsing RSS/RDF/Atom Feeds ```javascript -new htmlparser.FeedHandler(function (error, feed) { +new htmlparser.FeedHandler(function( error, feed){ ... }); -``` \ No newline at end of file +``` + +##Performance +Using a slightly modified version of [node-expat](https://github.com/astro/node-expat)s `bench.js`, I received the following results (on a MacBook (late 2010): + +* [htmlparser](https://github.com/tautologistics/node-htmlparser): 51779 el/s +* [sax.js](https://github.com/isaacs/sax-js): 53169 el/s +* [node-expat](https://github.com/astro/node-expat): 103388 el/s +* [htmlparser2](https://github.com/fb55/node-htmlparser): 118614 el/s + +The test may be found in `tests/bench.js`. + +##How is this different from [node-htmlparser](https://github.com/tautologistics/node-htmlparser)? +This is a fork of the project above. The main difference is that this is just intended to be used with node (it runs on other platforms using [browserify](https://github.com/substack/node-browserify)). Besides, the code is much better structured, has less duplications and is remarkably faster than the original. + +The parser now provides a callback interface close to [sax.js](https://github.com/isaacs/sax-js) (originally intended for [readabilitySAX](https://github.com/fb55/readabilitysax)). I also fixed a couple of bugs & included some pull requests for the original project (eg. [RDF feed support](https://github.com/tautologistics/node-htmlparser/pull/35)). + +The support for location data and verbose output was removed a couple of versions ago. It's still available in the [verbose branch](https://github.com/FB55/node-htmlparser/tree/verbose). + +The `DefaultHandler` and the `RssHandler` were renamed to clarify their purpose (to `DomHandler` and `FeedHandler`). The old names are still available when requiring `htmlparser2`, so your code should work as expected. \ No newline at end of file diff --git a/tests/bench.js b/tests/bench.js new file mode 100644 index 0000000..1e7b599 --- /dev/null +++ b/tests/bench.js @@ -0,0 +1,100 @@ +/* +var node_xml = require("node-xml"); + +function NodeXmlParser() { + var parser = new node_xml.SaxParser(function(cb) { }); + this.parse = function(s) { + parser.parseString(s); + }; +} + +var p = new NodeXmlParser(); +*//* +var libxml = require("libxmljs"); + +function LibXmlJsParser() { + var parser = new libxml.SaxPushParser(function(cb) { }); + this.parse = function(s) { + parser.push(s, false); + }; +} + +var p = new LibXmlJsParser(); +*//* +var sax = require('sax'); + +function SaxParser() { + var parser = sax.parser(); + this.parse = function(s) { + parser.write(s); + } +} + +var p = new SaxParser(); +*//* +var expat = require('node-expat'); + +function ExpatParser() { + var parser = new expat.Parser(); + this.parse = function(s) { + parser.parse(s, false); + }; +} + +var p = new ExpatParser(); +*//* +var htmlparser = require('htmlparser'); + +function HtmlParser() { + var handler = new htmlparser.DefaultHandler(); + var parser = new htmlparser.Parser(handler); + this.parse = function(s) { + parser.parseComplete(s); + }; +} + +var p = new HtmlParser(); +*/ +var htmlparser2 = require('htmlparser2/lib/Parser.js'); + +// provide callbacks +// otherwise, parsing would be optimized +var emptyCBs = { + onopentagname: function(){}, + onattribute: function(){}, + ontext: function(){}, + onclosetag: function(){} +}; + +function HtmlParser2() { + var parser = new htmlparser2(emptyCBs); + this.parse = function(s) { + parser.write(s); + }; +} + +var p = new HtmlParser2(); + + +p.parse(""); +var nEl = 0; +(function d() { + p.parse("quux"); + nEl++; + process.nextTick(d); +})(); + +var its =[]; +setInterval(function() { + console.log(nEl + " el/s"); + its.push(nEl); + nEl = 0; +}, 1e3); + +process.on('SIGINT', function () { + var average = its.reduce(function(average, v){ + return average+v; + }) / its.length; + console.log("Average:", average, "el/s"); + process.exit(0); +}); \ No newline at end of file From ebaf3a739fb33d9dc7ae875220abafdfe9cff38c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Thu, 16 Feb 2012 16:59:41 +0100 Subject: [PATCH 197/450] Added ProxyHandler --- lib/ProxyHandler.js | 17 +++++++++++++++++ lib/index.js | 4 ++++ 2 files changed, 21 insertions(+) create mode 100644 lib/ProxyHandler.js diff --git a/lib/ProxyHandler.js b/lib/ProxyHandler.js new file mode 100644 index 0000000..e67f36d --- /dev/null +++ b/lib/ProxyHandler.js @@ -0,0 +1,17 @@ +var ProxyHandler = function(cbs){ + if(cbs) this._cbs = cbs; +}; + +ProxyHandler.prototype._cbs = {}; + +Object.keys(require("./").EVENTS).forEach(function(name){ + ProxyHandler.prototype.__defineGetter__(name, function(){ + return this._cbs[name]; + }); + ProxyHandler.prototype.__defineSetter__(name, function(value){ + //allow functions to be overwritten + Object.defineProperty(this, name, {value: value}); + }); +}); + +module.exports = ProxyHandler; \ No newline at end of file diff --git a/lib/index.js b/lib/index.js index f6e788e..621a06b 100644 --- a/lib/index.js +++ b/lib/index.js @@ -25,6 +25,10 @@ module.exports = { defineProp(this, "WritableStream", {value:require("./WritableStream.js")}); return this.WritableStream; }, + get ProxyHandler(){ + defineProp(this, "ProxyHandler", {value:require("./ProxyHandler.js")}); + return this.ProxyHandler; + }, get DomUtils(){ defineProp(this, "DomUtils", {value:require("./DomUtils.js")}); return this.DomUtils; From 6c3fbf2a748145a3426ee64b18223f07dfffb5a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 28 Feb 2012 21:19:36 +0100 Subject: [PATCH 198/450] Parser#_processOpenTag now takes a single argument It does the name parsing itself --- lib/Parser.js | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/Parser.js b/lib/Parser.js index b2033aa..27a8a95 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -15,7 +15,7 @@ function Parser(cbs, options){ //Regular expressions used for cleaning up and parsing (stateless) var _reAttrib = /\s([^\s\/]+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g, - _reTail = /\s|\/|$/; + _reTail = /\s|\/|$/; Parser.prototype._options = { xmlMode: false, //Special behavior for script/style tags by default @@ -203,7 +203,7 @@ Parser.prototype._parseTags = function(force){ ); } } - else this._processOpenTag(this._parseTagName(elementData), elementData); + else this._processOpenTag(elementData); } else{ if(this._contentFlags !== 0){ @@ -283,7 +283,7 @@ Parser.prototype._processCloseTag = function(name){ } //many browsers (eg. Safari, Chrome) convert
to
else if(name === "br" && !this._options.xmlMode) - this._processOpenTag(name, "/"); + this._processOpenTag(name + "/"); }; Parser.prototype._parseAttributes = function(data){ @@ -301,8 +301,10 @@ var parseAttributes = function(data){ return attrs; }; -Parser.prototype._processOpenTag = function(name, data){ - var type = ElementType.Tag; +Parser.prototype._processOpenTag = function(data){ + var name = this._parseTagName(data), + type = ElementType.Tag; + if(this._options.xmlMode){ /*do nothing*/ } else if(name === "script") type = ElementType.Script; else if(name === "style") type = ElementType.Style; From 6ebf630eb763d4838bf865404db6bd64ba759f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Tue, 28 Feb 2012 21:24:09 +0100 Subject: [PATCH 199/450] Use onopentagname and onattribute events in DomHandler That ridiculous for-in-loop was removed! --- lib/DomHandler.js | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/DomHandler.js b/lib/DomHandler.js index 3000afd..1f6eea3 100644 --- a/lib/DomHandler.js +++ b/lib/DomHandler.js @@ -60,20 +60,21 @@ DomHandler.prototype._addDomElement = function(element){ } }; -DomHandler.prototype.onopentag = function(name, attribs){ +DomHandler.prototype.onopentagname = function(name){ var element = { type: name === "script" ? ElementType.Script : name === "style" ? ElementType.Style : ElementType.Tag, name: name }; - //for some reason, an if doesn't work - for(var i in attribs){ - element.attribs = attribs; - break; - } this._addDomElement(element); this._tagStack.push(element); }; +DomHandler.prototype.onattribute = function(name, value){ + var element = this._tagStack[this._tagStack.length-1]; + if(!("attribs" in element)) element.attribs = {}; + element.attribs[name] = value; +}; + DomHandler.prototype.ontext = function(data){ if(this._options.ignoreWhitespace && data.trim() === "") return; this._addDomElement({ From 3f3c030945fc845fac5c8abe3221776a271d18cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bo=CC=88hm?= Date: Sun, 4 Mar 2012 15:50:54 +0100 Subject: [PATCH 200/450] Use JSON instead of JS files for describing tests --- tests/DOM/01-basic.js | 39 - tests/DOM/01-basic.json | 41 + tests/DOM/02-single_tag_1.js | 20 - tests/DOM/02-single_tag_1.json | 22 + tests/DOM/03-single_tag_2.js | 20 - tests/DOM/03-single_tag_2.json | 22 + tests/DOM/04-unescaped_in_script.js | 27 - tests/DOM/04-unescaped_in_script.json | 29 + tests/DOM/05-tags_in_comment.js | 18 - tests/DOM/05-tags_in_comment.json | 20 + tests/DOM/06-comment_in_script.js | 18 - tests/DOM/06-comment_in_script.json | 20 + tests/DOM/07-unescaped_in_style.js | 17 - tests/DOM/07-unescaped_in_style.json | 23 + tests/DOM/08-extra_spaces_in_tag.js | 21 - tests/DOM/08-extra_spaces_in_tag.json | 23 + tests/DOM/09-unquoted_attrib.js | 21 - tests/DOM/09-unquoted_attrib.json | 23 + tests/DOM/10-singular_attribute.js | 16 - tests/DOM/10-singular_attribute.json | 18 + tests/DOM/11-text_outside_tags.js | 20 - tests/DOM/11-text_outside_tags.json | 22 + tests/DOM/12-text_only.js | 12 - tests/DOM/12-text_only.json | 14 + tests/DOM/13-comment_in_text.js | 20 - tests/DOM/13-comment_in_text.json | 22 + tests/DOM/14-comment_in_text_in_script.js | 26 - tests/DOM/14-comment_in_text_in_script.json | 28 + tests/DOM/15-non-verbose.js | 17 - tests/DOM/15-non-verbose.json | 25 + tests/DOM/16-ignore_whitespace.js | 38 - tests/DOM/16-ignore_whitespace.json | 42 + tests/DOM/17-xml_namespace.js | 18 - tests/DOM/17-xml_namespace.json | 20 + tests/DOM/18-enforce_empty_tags.js | 16 - tests/DOM/18-enforce_empty_tags.json | 18 + tests/DOM/19-ignore_empty_tags.js | 18 - tests/DOM/19-ignore_empty_tags.json | 22 + tests/DOM/20-template_script_tags.js | 13 - tests/DOM/20-template_script_tags.json | 23 + tests/DOM/21-conditional_comments.js | 16 - tests/DOM/21-conditional_comments.json | 18 + tests/DOM/22-lowercase_tags.js | 39 - tests/DOM/22-lowercase_tags.json | 43 + tests/Events/01-simple.js | 39 - tests/Events/01-simple.json | 44 + tests/Events/02-template.js | 57 - tests/Events/02-template.json | 62 ++ tests/Events/03-lowercase_tags.js | 39 - tests/Events/03-lowercase_tags.json | 46 + tests/Events/04-cdata.js | 80 -- tests/Events/04-cdata.json | 85 ++ tests/Stream/01-basic.j.json | 83 ++ tests/Stream/01-basic.js | 82 -- tests/Stream/02-RSS.j.json | 1093 +++++++++++++++++++ tests/Stream/02-RSS.js | 1092 ------------------ 56 files changed, 1951 insertions(+), 1859 deletions(-) delete mode 100644 tests/DOM/01-basic.js create mode 100644 tests/DOM/01-basic.json delete mode 100644 tests/DOM/02-single_tag_1.js create mode 100644 tests/DOM/02-single_tag_1.json delete mode 100644 tests/DOM/03-single_tag_2.js create mode 100644 tests/DOM/03-single_tag_2.json delete mode 100644 tests/DOM/04-unescaped_in_script.js create mode 100644 tests/DOM/04-unescaped_in_script.json delete mode 100644 tests/DOM/05-tags_in_comment.js create mode 100644 tests/DOM/05-tags_in_comment.json delete mode 100644 tests/DOM/06-comment_in_script.js create mode 100644 tests/DOM/06-comment_in_script.json delete mode 100644 tests/DOM/07-unescaped_in_style.js create mode 100644 tests/DOM/07-unescaped_in_style.json delete mode 100644 tests/DOM/08-extra_spaces_in_tag.js create mode 100644 tests/DOM/08-extra_spaces_in_tag.json delete mode 100644 tests/DOM/09-unquoted_attrib.js create mode 100644 tests/DOM/09-unquoted_attrib.json delete mode 100644 tests/DOM/10-singular_attribute.js create mode 100644 tests/DOM/10-singular_attribute.json delete mode 100644 tests/DOM/11-text_outside_tags.js create mode 100644 tests/DOM/11-text_outside_tags.json delete mode 100644 tests/DOM/12-text_only.js create mode 100644 tests/DOM/12-text_only.json delete mode 100644 tests/DOM/13-comment_in_text.js create mode 100644 tests/DOM/13-comment_in_text.json delete mode 100644 tests/DOM/14-comment_in_text_in_script.js create mode 100644 tests/DOM/14-comment_in_text_in_script.json delete mode 100644 tests/DOM/15-non-verbose.js create mode 100644 tests/DOM/15-non-verbose.json delete mode 100644 tests/DOM/16-ignore_whitespace.js create mode 100644 tests/DOM/16-ignore_whitespace.json delete mode 100644 tests/DOM/17-xml_namespace.js create mode 100644 tests/DOM/17-xml_namespace.json delete mode 100644 tests/DOM/18-enforce_empty_tags.js create mode 100644 tests/DOM/18-enforce_empty_tags.json delete mode 100644 tests/DOM/19-ignore_empty_tags.js create mode 100644 tests/DOM/19-ignore_empty_tags.json delete mode 100644 tests/DOM/20-template_script_tags.js create mode 100644 tests/DOM/20-template_script_tags.json delete mode 100644 tests/DOM/21-conditional_comments.js create mode 100644 tests/DOM/21-conditional_comments.json delete mode 100644 tests/DOM/22-lowercase_tags.js create mode 100644 tests/DOM/22-lowercase_tags.json delete mode 100644 tests/Events/01-simple.js create mode 100644 tests/Events/01-simple.json delete mode 100644 tests/Events/02-template.js create mode 100644 tests/Events/02-template.json delete mode 100644 tests/Events/03-lowercase_tags.js create mode 100644 tests/Events/03-lowercase_tags.json delete mode 100644 tests/Events/04-cdata.js create mode 100644 tests/Events/04-cdata.json create mode 100644 tests/Stream/01-basic.j.json delete mode 100644 tests/Stream/01-basic.js create mode 100644 tests/Stream/02-RSS.j.json delete mode 100644 tests/Stream/02-RSS.js diff --git a/tests/DOM/01-basic.js b/tests/DOM/01-basic.js deleted file mode 100644 index beaa970..0000000 --- a/tests/DOM/01-basic.js +++ /dev/null @@ -1,39 +0,0 @@ -exports.name = "Basic test"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = require("fs").readFileSync(__dirname + "/../Documents/Basic.html").toString(); -exports.expected = [ - { - "name": "!DOCTYPE", - "data": "!DOCTYPE html", - "type": "directive" - }, - { - "type": "tag", - "name": "html", - "children": [ - { - "type": "tag", - "name": "title", - "children": [ - { - "data": "The Title", - "type": "text" - } - ] - }, - { - "type": "tag", - "name": "body", - "children": [ - { - "data": "Hello world", - "type": "text" - } - ] - } - ] - } -] diff --git a/tests/DOM/01-basic.json b/tests/DOM/01-basic.json new file mode 100644 index 0000000..7453c30 --- /dev/null +++ b/tests/DOM/01-basic.json @@ -0,0 +1,41 @@ +{ + "name": "Basic test", + "options": { + "handler": {}, + "parser": {} + }, + "html": "The TitleHello world", + "expected": [ + { + "name": "!DOCTYPE", + "data": "!DOCTYPE html", + "type": "directive" + }, + { + "type": "tag", + "name": "html", + "children": [ + { + "type": "tag", + "name": "title", + "children": [ + { + "data": "The Title", + "type": "text" + } + ] + }, + { + "type": "tag", + "name": "body", + "children": [ + { + "data": "Hello world", + "type": "text" + } + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/DOM/02-single_tag_1.js b/tests/DOM/02-single_tag_1.js deleted file mode 100644 index 12ecc38..0000000 --- a/tests/DOM/02-single_tag_1.js +++ /dev/null @@ -1,20 +0,0 @@ -exports.name = "Single Tag 1"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = "
text
"; -exports.expected = [ - { - "type": "tag", - "name": "br" - }, - { - "data": "text", - "type": "text" - }, - { - "type": "tag", - "name": "br" - } -]; \ No newline at end of file diff --git a/tests/DOM/02-single_tag_1.json b/tests/DOM/02-single_tag_1.json new file mode 100644 index 0000000..4efff6a --- /dev/null +++ b/tests/DOM/02-single_tag_1.json @@ -0,0 +1,22 @@ +{ + "name": "Single Tag 1", + "options": { + "handler": {}, + "parser": {} + }, + "html": "
text
", + "expected": [ + { + "type": "tag", + "name": "br" + }, + { + "data": "text", + "type": "text" + }, + { + "type": "tag", + "name": "br" + } + ] +} \ No newline at end of file diff --git a/tests/DOM/03-single_tag_2.js b/tests/DOM/03-single_tag_2.js deleted file mode 100644 index eaeec64..0000000 --- a/tests/DOM/03-single_tag_2.js +++ /dev/null @@ -1,20 +0,0 @@ -exports.name = "Single Tag 2"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = "
text
"; -exports.expected = [ - { - "type": "tag", - "name": "br" - }, - { - "data": "text", - "type": "text" - }, - { - "type": "tag", - "name": "br" - } -]; \ No newline at end of file diff --git a/tests/DOM/03-single_tag_2.json b/tests/DOM/03-single_tag_2.json new file mode 100644 index 0000000..e7b23b8 --- /dev/null +++ b/tests/DOM/03-single_tag_2.json @@ -0,0 +1,22 @@ +{ + "name": "Single Tag 2", + "options": { + "handler": {}, + "parser": {} + }, + "html": "
text
", + "expected": [ + { + "type": "tag", + "name": "br" + }, + { + "data": "text", + "type": "text" + }, + { + "type": "tag", + "name": "br" + } + ] +} \ No newline at end of file diff --git a/tests/DOM/04-unescaped_in_script.js b/tests/DOM/04-unescaped_in_script.js deleted file mode 100644 index 5fdefc4..0000000 --- a/tests/DOM/04-unescaped_in_script.js +++ /dev/null @@ -1,27 +0,0 @@ -exports.name = "Unescaped chars in script"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = ""; -exports.expected = [ - { - 'type': 'tag', - 'name': 'head', - 'children': [ - { - 'type': 'script', - 'name': 'script', - 'attribs': { - 'language': 'Javascript' - }, - 'children': [ - { - 'data': 'var foo = ""; alert(2 > foo); var baz = 10 << 2; var zip = 10 >> 1; var yap = "<<>>>><<";', - 'type': 'text' - } - ] - } - ] - } -]; \ No newline at end of file diff --git a/tests/DOM/04-unescaped_in_script.json b/tests/DOM/04-unescaped_in_script.json new file mode 100644 index 0000000..029d202 --- /dev/null +++ b/tests/DOM/04-unescaped_in_script.json @@ -0,0 +1,29 @@ +{ + "name": "Unescaped chars in script", + "options": { + "handler": {}, + "parser": {} + }, + "html": "", + "expected": [ + { + "type": "tag", + "name": "head", + "children": [ + { + "type": "script", + "name": "script", + "attribs": { + "language": "Javascript" + }, + "children": [ + { + "data": "var foo = \"\"; alert(2 > foo); var baz = 10 << 2; var zip = 10 >> 1; var yap = \"<<>>>><<\";", + "type": "text" + } + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/DOM/05-tags_in_comment.js b/tests/DOM/05-tags_in_comment.js deleted file mode 100644 index e0c770e..0000000 --- a/tests/DOM/05-tags_in_comment.js +++ /dev/null @@ -1,18 +0,0 @@ -exports.name = "Special char in comment"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = ""; -exports.expected = [ - { - "type": "tag", - "name": "head", - "children": [ - { - "data": " commented out tags Test", - "type": "comment" - } - ] - } -]; \ No newline at end of file diff --git a/tests/DOM/05-tags_in_comment.json b/tests/DOM/05-tags_in_comment.json new file mode 100644 index 0000000..577d23b --- /dev/null +++ b/tests/DOM/05-tags_in_comment.json @@ -0,0 +1,20 @@ +{ + "name": "Special char in comment", + "options": { + "handler": {}, + "parser": {} + }, + "html": "", + "expected": [ + { + "type": "tag", + "name": "head", + "children": [ + { + "data": " commented out tags Test", + "type": "comment" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/DOM/06-comment_in_script.js b/tests/DOM/06-comment_in_script.js deleted file mode 100644 index 6022b91..0000000 --- a/tests/DOM/06-comment_in_script.js +++ /dev/null @@ -1,18 +0,0 @@ -exports.name = "Script source in comment"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = ""; -exports.expected = [ - { - "type": "script", - "name": "script", - "children": [ - { - "data": "var foo = 1;", - "type": "comment" - } - ] - } -]; \ No newline at end of file diff --git a/tests/DOM/06-comment_in_script.json b/tests/DOM/06-comment_in_script.json new file mode 100644 index 0000000..a4246f4 --- /dev/null +++ b/tests/DOM/06-comment_in_script.json @@ -0,0 +1,20 @@ +{ + "name": "Script source in comment", + "options": { + "handler": {}, + "parser": {} + }, + "html": "", + "expected": [ + { + "type": "script", + "name": "script", + "children": [ + { + "data": "var foo = 1;", + "type": "comment" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/DOM/07-unescaped_in_style.js b/tests/DOM/07-unescaped_in_style.js deleted file mode 100644 index 3784336..0000000 --- a/tests/DOM/07-unescaped_in_style.js +++ /dev/null @@ -1,17 +0,0 @@ -exports.name = "Unescaped chars in style"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = ""; -exports.expected = -[ { type: 'style' - , name: 'style' - , attribs: { type: 'text/css' } - , children: - [ { data: '\n body > p\n { font-weight: bold; }' - , type: 'text' - } - ] - } -]; \ No newline at end of file diff --git a/tests/DOM/07-unescaped_in_style.json b/tests/DOM/07-unescaped_in_style.json new file mode 100644 index 0000000..d6bf9fb --- /dev/null +++ b/tests/DOM/07-unescaped_in_style.json @@ -0,0 +1,23 @@ +{ + "name": "Unescaped chars in style", + "options": { + "handler": {}, + "parser": {} + }, + "html": "", + "expected": [ + { + "type": "style", + "name": "style", + "attribs": { + "type": "text/css" + }, + "children": [ + { + "data": "\n body > p\n\t{ font-weight: bold; }", + "type": "text" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/DOM/08-extra_spaces_in_tag.js b/tests/DOM/08-extra_spaces_in_tag.js deleted file mode 100644 index 8b6cda5..0000000 --- a/tests/DOM/08-extra_spaces_in_tag.js +++ /dev/null @@ -1,21 +0,0 @@ -exports.name = "Extra spaces in tag"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = "<\n font\t\n size='14' \n>the text<\n / \nfont \n>"; -exports.expected = [ - { - "type": "tag", - "name": "font", - "attribs": { - "size": "14" - }, - "children": [ - { - "data": "the text", - "type": "text" - } - ] - } -]; diff --git a/tests/DOM/08-extra_spaces_in_tag.json b/tests/DOM/08-extra_spaces_in_tag.json new file mode 100644 index 0000000..78b30f4 --- /dev/null +++ b/tests/DOM/08-extra_spaces_in_tag.json @@ -0,0 +1,23 @@ +{ + "name": "Extra spaces in tag", + "options": { + "handler": {}, + "parser": {} + }, + "html": "<\n font\t\n size='14' \n>the text<\n /\t\nfont\t \n>", + "expected": [ + { + "type": "tag", + "name": "font", + "attribs": { + "size": "14" + }, + "children": [ + { + "data": "the text", + "type": "text" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/DOM/09-unquoted_attrib.js b/tests/DOM/09-unquoted_attrib.js deleted file mode 100644 index c787422..0000000 --- a/tests/DOM/09-unquoted_attrib.js +++ /dev/null @@ -1,21 +0,0 @@ -exports.name = "Unquoted attributes"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = "the text"; -exports.expected = [ - { - "type": "tag", - "name": "font", - "attribs": { - "size": "14" - }, - "children": [ - { - "data": "the text", - "type": "text" - } - ] - } -]; \ No newline at end of file diff --git a/tests/DOM/09-unquoted_attrib.json b/tests/DOM/09-unquoted_attrib.json new file mode 100644 index 0000000..ae5f44c --- /dev/null +++ b/tests/DOM/09-unquoted_attrib.json @@ -0,0 +1,23 @@ +{ + "name": "Unquoted attributes", + "options": { + "handler": {}, + "parser": {} + }, + "html": "the text", + "expected": [ + { + "type": "tag", + "name": "font", + "attribs": { + "size": "14" + }, + "children": [ + { + "data": "the text", + "type": "text" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/DOM/10-singular_attribute.js b/tests/DOM/10-singular_attribute.js deleted file mode 100644 index af10053..0000000 --- a/tests/DOM/10-singular_attribute.js +++ /dev/null @@ -1,16 +0,0 @@ -exports.name = "Singular attribute"; -exports.options = { - handler: {} - , parser: {} -}; -exports.html = "