-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.js
More file actions
92 lines (76 loc) · 1.96 KB
/
scrape.js
File metadata and controls
92 lines (76 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
// Modules used by the scraper.
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

// Search term (already "+"-joined for use in a query string) and the
// results-page URL built from it.
var search = 'growth+hacking';
var url = 'https://www.google.com/search?q=' + search;

// Shared state: word -> occurrence count, the number of result pages queued
// for download, and the number of pages that have reported completion.
var collection = {};
var totalResults = 0;
var resultsDownloaded = 0;
/**
 * Marks one result page as finished and, once every queued page has
 * reported in, writes the 20 most frequent words to `output.json`.
 *
 * Reads the module-level `collection` (word -> count) and `totalResults`,
 * and increments the module-level `resultsDownloaded` on every call.
 * Calls made before the last page has finished return without writing.
 */
function file() {
    resultsDownloaded++;
    // Only the call that brings the counter up to the total produces output.
    if (resultsDownloaded !== totalResults) {
        return;
    }

    // Object.keys visits own properties only and avoids the implicit global
    // loop variable that a bare `for (prop in ...)` would create.
    var words = Object.keys(collection).map(function(word) {
        return {
            word: word,
            count: collection[word]
        };
    });

    // Highest count first.
    words.sort(function(a, b) {
        return b.count - a.count;
    });

    var keywords = words.slice(0, 20);
    fs.writeFile('output.json', JSON.stringify(keywords, null, 4), function(error) {
        // Do not claim success when the write failed.
        if (error) {
            console.log("Couldn't write output.json because of error: " + error);
            return;
        }
        console.log('File successfully written! - Check your project directory for the output');
    });
}
// Contents of words.txt as a single UTF-8 string, loaded once at startup.
// readFileSync returns the data directly and throws on failure; it takes no
// callback, so none is passed.
var searchFile = fs.readFileSync('words.txt', 'utf8');
// Fetch the search results page, download every linked result, and tally the
// words found on each page into the shared `collection` map. `file()` is
// called once per queued page so it can write the output when all are done.
request(url, function(error, response, body) {
    if (error) {
        console.log("Couldn't get page because of error: " + error);
        // There is no body to parse, so stop here instead of handing an
        // empty value to cheerio.
        return;
    }

    var $ = cheerio.load(body),
        links = $('.r a');

    links.each(function(i, link) {
        var href = $(link).attr('href');
        // An anchor without an href would make the string handling below throw.
        if (!href) {
            return;
        }

        // Strip the "/url?q=" redirect prefix and any trailing "&..." parameters.
        var target = href.replace('/url?q=', '').split('&')[0];
        // Skip links that are still site-relative after stripping the prefix.
        if (target.charAt(0) === '/') {
            return;
        }

        totalResults++;
        request(target, function(pageError, pageResponse, pageBody) {
            if (pageError) {
                console.log("Couldn't get page because of error: " + pageError);
                // A failed download still has to be counted as finished;
                // otherwise the completion counter never reaches the total
                // and the output file is never written.
                file();
                return;
            }

            var $page = cheerio.load(pageBody),
                text = $page('body').text();

            // Collapse whitespace, drop everything except letters and spaces,
            // and lower-case the result.
            text = text
                .replace(/\s+/g, " ")
                .replace(/[^a-zA-Z ]/g, "")
                .toLowerCase();

            text.split(' ').forEach(function(word) {
                if (word.length < 4 || word.length > 20) {
                    return;
                }
                // NOTE(review): searchFile is the raw text of words.txt, so this
                // is a substring test, not a whole-word match. Confirm that is
                // the intended exclusion rule.
                if (searchFile.indexOf(word) > -1) {
                    return;
                }
                // Own-property check: a plain truthiness test would pick up
                // inherited members such as "constructor" and turn the count
                // into NaN.
                if (Object.prototype.hasOwnProperty.call(collection, word)) {
                    collection[word]++;
                } else {
                    collection[word] = 1;
                }
            });

            file();
        });
    });

    // With nothing queued, file() is never called and no output is produced;
    // say so instead of exiting silently.
    if (totalResults === 0) {
        console.log('No result links found; nothing to download.');
    }
});