From edb114355d1c139de0b51580cd929a20f4edd56b Mon Sep 17 00:00:00 2001
From: Vaclav Klecanda
Date: Thu, 17 Dec 2015 16:12:44 +0100
Subject: [PATCH 1/4] settable divider

---
 lib/crawlme.js | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/crawlme.js b/lib/crawlme.js
index 7441bab..5183424 100644
--- a/lib/crawlme.js
+++ b/lib/crawlme.js
@@ -8,9 +8,10 @@ var async = require('async');
 
 exports = module.exports = function(options) {
   // regex for stripping html of script tags. Borrowed from jQuery
   var stripScriptTags = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
-
+  // Default options
   options = options || {};
+  options.divider = options.divider || '#!';
   options.waitFor = options.waitFor || 1000;
   options.protocol = options.protocol || 'http';
   options.cacheSize = options.cacheSize || 1*1024*1024;
@@ -42,7 +43,7 @@
   // Get the URL to the AJAX version of this page
   function getAjaxUrl(req) {
     var urlParts = req.url.split('?_escaped_fragment_=');
-
+
     // If no fragment in URL this is not a request for an HTML snapshot
     // of an AJAX page.
     if (urlParts.length !== 2) return undefined;
@@ -58,7 +59,7 @@ exports = module.exports = function(options) {
      // We are dealing with crawlable an ajax page without a hash fragment
      url += path; // No hashbang or fragment
    } else {
-      url += path + '#!' + decodeURIComponent(fragment);
+      url += path + options.divider + decodeURIComponent(fragment);
    }
 
    return url;

From e6dac340958b745536f5333f622292dcbbf99c8c Mon Sep 17 00:00:00 2001
From: Vaclav Klecanda
Date: Wed, 26 Oct 2016 20:55:12 +0200
Subject: [PATCH 2/4] ignore

---
 .gitignore | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..22990c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+lib-cov
+*.seed
+*.log
+*.csv
+*.dat
+*.out
+*.pid
+*.gz
+.*
+*.sqlite
+pids
+logs
+results
+
+node_modules
+npm-debug.log

From b14b2646d76b4d105e894671f73036d637f0b54c Mon Sep 17 00:00:00 2001
From: Vaclav Klecanda
Date: Wed, 26 Oct 2016 20:55:55 +0200
Subject: [PATCH 3/4] new zombie and adaptation

---
 lib/crawlme.js | 14 +++++++-------
 package.json   | 13 ++++++++++---
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/lib/crawlme.js b/lib/crawlme.js
index 5183424..caf9de4 100644
--- a/lib/crawlme.js
+++ b/lib/crawlme.js
@@ -12,7 +12,7 @@ exports = module.exports = function(options) {
   // Default options
   options = options || {};
   options.divider = options.divider || '#!';
-  options.waitFor = options.waitFor || 1000;
+  options.waitFor = options.waitFor || 2000;
   options.protocol = options.protocol || 'http';
   options.cacheSize = options.cacheSize || 1*1024*1024;
   options.cacheRefresh = options.cacheRefresh || 15*60*1000; //15 minutes
@@ -51,7 +51,7 @@
 
     // Express adds a protocol property to the req object.
     var protocol = req.protocol || options.protocol;
-    var url = protocol + '://' + req.headers.host;
+    var url = process.env.CRAWLME_HOST || protocol + '://' + req.headers.host;
     var path = urlParts[0];
     var fragment = urlParts[1];
 
@@ -76,10 +76,10 @@ exports = module.exports = function(options) {
       if(cached) return cb(null, cached);
     }
 
-    Browser.visit(url, {waitFor: options.waitFor},
-      function(err, browser, status) {
-        if(err) return cb(err);
+    var browser = new Browser();
+    browser.visit(url, function() {
+      browser.wait(options.waitFor, function () {
 
         // links
         var links = browser.queryAll('a');
         links.forEach(function(link) {
@@ -93,8 +93,8 @@
         var snapshot = stripScripts(browser.html());
         cache.set(url, snapshot);
         cb(null, snapshot);
-      }
-    );
+      });
+    });
   }
 
   // Start the cache refresh timer

diff --git a/package.json b/package.json
index cf0f37b..67c6cd2 100644
--- a/package.json
+++ b/package.json
@@ -2,16 +2,23 @@
   "author": "Aron Kornhall (http://optimalbits.com)",
   "name": "crawlme",
   "description": "Makes your ajax web application indexable by search engines by generating html snapshots on the fly. Caches results for blazing fast responses and better page ranking.",
-  "keywords": ["ajax", "crawling", "google", "indexing", "SEO", "Search Engine Optimization"],
+  "keywords": [
+    "ajax",
+    "crawling",
+    "google",
+    "indexing",
+    "SEO",
+    "Search Engine Optimization"
+  ],
   "version": "0.0.7",
   "main": "./index.js",
   "engines": {
     "node": ">=0.6.10"
   },
   "dependencies": {
-    "zombie": "2.x.x",
+    "async": "0.2.6",
     "lru-cache": "2.3.x",
-    "async": "0.2.6"
+    "zombie": "^4.3.0"
   },
   "devDependencies": {
     "connect": "2.x.x",

From e80091eedc85d9bebb1f87b3cdc34c22fbc8efb6 Mon Sep 17 00:00:00 2001
From: vencax
Date: Tue, 13 Mar 2018 06:58:24 +0100
Subject: [PATCH 4/4] custom urlgetfunc

---
 lib/crawlme.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/crawlme.js b/lib/crawlme.js
index caf9de4..f7b0ce0 100644
--- a/lib/crawlme.js
+++ b/lib/crawlme.js
@@ -106,12 +106,13 @@ exports = module.exports = function(options) {
     if ('GET' !== req.method) return next();
 
     // Try to extract the ajax URL from the request
-    var url = getAjaxUrl(req);
+    var url = options.getUrl ? options.getUrl(req) : getAjaxUrl(req);
 
     // If we aren't being crawled continue to next middleware
     if (!url) return next();
 
     // Generate the snapshot
+    console.log('Zombie wants to eat: ' + url);
     getHTMLSnapshot(url, function(err, snapshot) {
       if (err) {
         console.log('Zombie reported an error: ' + err);
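
Taken together, the four patches make the hash divider (options.divider, patch 1), the snapshot host (the CRAWLME_HOST environment variable, patch 3) and the URL extraction (options.getUrl, patch 4) configurable. Below is a minimal usage sketch of how the patched middleware could be mounted; the Express app, the port, and the getUrl mapping are illustrative assumptions, not part of the patches themselves.

// app.js -- minimal sketch for the patched crawlme middleware.
// Assumes Express and the patched crawlme are installed; the port and
// the getUrl mapping are hypothetical examples.
var express = require('express');
var crawlme = require('crawlme');

var app = express();

app.use(crawlme({
  divider: '#!',   // fragment divider from patch 1 ('#!' is the default)
  waitFor: 2000,   // ms zombie waits before snapshotting (patch 3 default)
  // Optional hook from patch 4: derive the snapshot URL yourself.
  // Returning a falsy value hands the request to the next middleware,
  // exactly as the built-in getAjaxUrl does.
  getUrl: function(req) {
    var parts = req.url.split('?_escaped_fragment_=');
    if (parts.length !== 2) return undefined;
    return 'http://localhost:3000' + parts[0] + '#!' +
      decodeURIComponent(parts[1]);
  }
}));

app.use(express.static(__dirname + '/public'));

app.listen(3000);

// Patch 3 also reads CRAWLME_HOST, so the host used for snapshots can be
// overridden without code changes:
//   CRAWLME_HOST=https://example.com node app.js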