diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..22990c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+lib-cov
+*.seed
+*.log
+*.csv
+*.dat
+*.out
+*.pid
+*.gz
+.*
+*.sqlite
+pids
+logs
+results
+
+node_modules
+npm-debug.log
diff --git a/lib/crawlme.js b/lib/crawlme.js
index 7441bab..f7b0ce0 100644
--- a/lib/crawlme.js
+++ b/lib/crawlme.js
@@ -8,10 +8,11 @@ var async = require('async');
 exports = module.exports = function(options) {
   // regex for stripping html of script tags. Borrowed from jQuery
   var stripScriptTags = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
-
+  // Default options
   options = options || {};
-  options.waitFor = options.waitFor || 1000;
+  options.divider = options.divider || '#!';
+  options.waitFor = options.waitFor || 2000;
   options.protocol = options.protocol || 'http';
   options.cacheSize = options.cacheSize || 1*1024*1024;
   options.cacheRefresh = options.cacheRefresh || 15*60*1000; //15 minutes
 
@@ -42,7 +43,7 @@ exports = module.exports = function(options) {
   // Get the URL to the AJAX version of this page
   function getAjaxUrl(req) {
     var urlParts = req.url.split('?_escaped_fragment_=');
-
+
     // If no fragment in URL this is not a request for an HTML snapshot
     // of an AJAX page.
     if (urlParts.length !== 2) return undefined;
@@ -50,7 +51,7 @@ exports = module.exports = function(options) {
     // Express adds a protocol property to the req object.
     var protocol = req.protocol || options.protocol;
 
-    var url = protocol + '://' + req.headers.host;
+    var url = process.env.CRAWLME_HOST || protocol + '://' + req.headers.host;
     var path = urlParts[0];
     var fragment = urlParts[1];
 
@@ -58,7 +59,7 @@ exports = module.exports = function(options) {
       // We are dealing with crawlable an ajax page without a hash fragment
       url += path; // No hashbang or fragment
     } else {
-      url += path + '#!' + decodeURIComponent(fragment);
+      url += path + options.divider + decodeURIComponent(fragment);
     }
 
     return url;
@@ -75,10 +76,10 @@ exports = module.exports = function(options) {
       if(cached) return cb(null, cached);
     }
 
-    Browser.visit(url, {waitFor: options.waitFor},
-      function(err, browser, status) {
-        if(err) return cb(err);
+    var browser = new Browser();
+    browser.visit(url, function() {
+      browser.wait(options.waitFor, function () {
 
         // links
         var links = browser.queryAll('a');
         links.forEach(function(link) {
@@ -92,8 +93,8 @@ exports = module.exports = function(options) {
         var snapshot = stripScripts(browser.html());
         cache.set(url, snapshot);
         cb(null, snapshot);
-      }
-    );
+      });
+    });
   }
 
   // Start the cache refresh timer
@@ -105,12 +106,13 @@ exports = module.exports = function(options) {
     if ('GET' !== req.method) return next();
 
     // Try to extract the ajax URL from the request
-    var url = getAjaxUrl(req);
+    var url = options.getUrl ? options.getUrl(req) : getAjaxUrl(req);
 
     // If we aren't being crawled continue to next middleware
     if (!url) return next();
 
     // Generate the snapshot
+    console.log('Zombie wants to eat: ' + url);
     getHTMLSnapshot(url, function(err, snapshot) {
       if (err) {
         console.log('Zombie reported an error: ' + err);
diff --git a/package.json b/package.json
index cf0f37b..67c6cd2 100644
--- a/package.json
+++ b/package.json
@@ -2,16 +2,23 @@
   "author": "Aron Kornhall (http://optimalbits.com)",
   "name": "crawlme",
   "description": "Makes your ajax web application indexable by search engines by generating html snapshots on the fly. Caches results for blazing fast responses and better page ranking.",
-  "keywords": ["ajax", "crawling", "google", "indexing", "SEO", "Search Engine Optimization"],
+  "keywords": [
+    "ajax",
+    "crawling",
+    "google",
+    "indexing",
+    "SEO",
+    "Search Engine Optimization"
+  ],
   "version": "0.0.7",
   "main": "./index.js",
   "engines": {
     "node": ">=0.6.10"
   },
   "dependencies": {
-    "zombie": "2.x.x",
+    "async": "0.2.6",
     "lru-cache": "2.3.x",
-    "async": "0.2.6"
+    "zombie": "^4.3.0"
   },
   "devDependencies": {
     "connect": "2.x.x",
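
A minimal usage sketch of the options this patch introduces. Only the option names ('divider', 'waitFor', 'getUrl') and the CRAWLME_HOST variable come from the diff above; the Express wiring, port, and host value are illustrative assumptions, not part of the patch.

    // server.js (hypothetical) - wiring up the patched crawlme middleware
    var express = require('express');
    var crawlme = require('crawlme');

    var app = express();

    app.use(crawlme({
      divider: '#!',   // inserted between the path and the decoded fragment
      waitFor: 2000,   // ms the Zombie browser waits before snapshotting
      // Optional hook that replaces the built-in _escaped_fragment_ parsing
      // entirely; return a full URL to snapshot, or a falsy value to fall
      // through to the next middleware. This body just mirrors the default.
      getUrl: function(req) {
        var parts = req.url.split('?_escaped_fragment_=');
        if (parts.length !== 2) return undefined;
        return 'http://localhost:8080' + parts[0] + '#!' +
          decodeURIComponent(parts[1]);
      }
    }));

    app.use(express.static(__dirname + '/public'));
    app.listen(8080);

Setting CRAWLME_HOST (e.g. CRAWLME_HOST=http://localhost:8080) pins the host the headless browser visits regardless of the incoming Host header, useful behind a proxy; note it is consulted only by the default URL builder, not by a custom getUrl.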