Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
lib-cov
*.seed
*.log
*.csv
*.dat
*.out
*.pid
*.gz
.*
*.sqlite
pids
logs
results

node_modules
npm-debug.log
24 changes: 13 additions & 11 deletions lib/crawlme.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ var async = require('async');
exports = module.exports = function(options) {
// regex for stripping html of script tags. Borrowed from jQuery
var stripScriptTags = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;

// Default options
options = options || {};
options.waitFor = options.waitFor || 1000;
options.divider = options.divider || '#!';
options.waitFor = options.waitFor || 2000;
options.protocol = options.protocol || 'http';
options.cacheSize = options.cacheSize || 1*1024*1024;
options.cacheRefresh = options.cacheRefresh || 15*60*1000; //15 minutes
Expand Down Expand Up @@ -42,23 +43,23 @@ exports = module.exports = function(options) {
// Get the URL to the AJAX version of this page
function getAjaxUrl(req) {
var urlParts = req.url.split('?_escaped_fragment_=');

// If no fragment in URL this is not a request for an HTML snapshot
// of an AJAX page.
if (urlParts.length !== 2) return undefined;

// Express adds a protocol property to the req object.
var protocol = req.protocol || options.protocol;

var url = protocol + '://' + req.headers.host;
var url = process.env.CRAWLME_HOST || protocol + '://' + req.headers.host;
var path = urlParts[0];
var fragment = urlParts[1];

if (fragment.length === 0) {
// We are dealing with a crawlable ajax page without a hash fragment
url += path; // No hashbang or fragment
} else {
url += path + '#!' + decodeURIComponent(fragment);
url += path + options.divider + decodeURIComponent(fragment);
}

return url;
Expand All @@ -75,10 +76,10 @@ exports = module.exports = function(options) {
if(cached) return cb(null, cached);
}

Browser.visit(url, {waitFor: options.waitFor},
function(err, browser, status) {
if(err) return cb(err);
var browser = new Browser();

browser.visit(url, function() {
browser.wait(options.waitFor, function () {
// links
var links = browser.queryAll('a');
links.forEach(function(link) {
Expand All @@ -92,8 +93,8 @@ exports = module.exports = function(options) {
var snapshot = stripScripts(browser.html());
cache.set(url, snapshot);
cb(null, snapshot);
}
);
});
});
}

// Start the cache refresh timer
Expand All @@ -105,12 +106,13 @@ exports = module.exports = function(options) {
if ('GET' !== req.method) return next();

// Try to extract the ajax URL from the request
var url = getAjaxUrl(req);
var url = options.getUrl ? options.getUrl(req) : getAjaxUrl(req);

// If we aren't being crawled continue to next middleware
if (!url) return next();

// Generate the snapshot
console.log('Zombie wants to eat: ' + url);
getHTMLSnapshot(url, function(err, snapshot) {
if (err) {
console.log('Zombie reported an error: ' + err);
Expand Down
13 changes: 10 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,23 @@
"author": "Aron Kornhall <aron@optimalbits.com> (http://optimalbits.com)",
"name": "crawlme",
"description": "Makes your ajax web application indexable by search engines by generating html snapshots on the fly. Caches results for blazing fast responses and better page ranking.",
"keywords": ["ajax", "crawling", "google", "indexing", "SEO", "Search Engine Optimization"],
"keywords": [
"ajax",
"crawling",
"google",
"indexing",
"SEO",
"Search Engine Optimization"
],
"version": "0.0.7",
"main": "./index.js",
"engines": {
"node": ">=0.6.10"
},
"dependencies": {
"zombie": "2.x.x",
"async": "0.2.6",
"lru-cache": "2.3.x",
"async": "0.2.6"
"zombie": "^4.3.0"
},
"devDependencies": {
"connect": "2.x.x",
Expand Down