-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy paththread_scraper.js
More file actions
95 lines (80 loc) · 2.8 KB
/
thread_scraper.js
File metadata and controls
95 lines (80 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// Made by Edward Fox - @icemaz 08/07/2017
// Facepunch thread scraper
var cheerio = require('cheerio'),
Xray = require('x-ray'),
makeDriver = require('request-x-ray'),
request = require('request'),
fs = require('fs');
// Browser-like user agent string sent with every request.
const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36';

// Request settings used both for the x-ray driver and for the initial
// page-count request. The object is intentionally left mutable:
// ThreadScraper's constructor adds the Cookie header and the url to it.
const options = {
    method: 'GET',
    jar: true,
    encoding: 'binary',
    headers: {
        'User-Agent': USER_AGENT,
    },
};
// String filters made available to x-ray selectors via the `| name` syntax.
// Each filter passes non-string values through untouched.
const textFilters = {
    // Strip leading and trailing whitespace.
    trim(value) {
        if (typeof value !== 'string') {
            return value;
        }
        return value.trim();
    },
    // Replace every CR, LF, tab and double quote with a single space.
    whitespace(value) {
        if (typeof value !== 'string') {
            return value;
        }
        return value.replace(/\r|\n|\t|\"/g,' ');
    },
    // Remove every pair of consecutive backslashes.
    urlFilter(value) {
        if (typeof value !== 'string') {
            return value;
        }
        return value.replace(/\\\\/g,'');
    },
};

// x-ray instance used for all scraping in this module.
const x = Xray({ filters: textFilters });

// Route x-ray's page fetches through a driver built from the shared
// request `options`.
const driver = makeDriver(options);
x.driver(driver);
/**
 * Extracts the page number from a pagination link.
 *
 * @param {string|undefined} href - The link's href, expected to contain `page=<n>`.
 * @returns {number} The parsed page count, or 1 when the href is missing or
 *     does not contain a positive page number.
 */
function pageCountFromHref(href) {
    if (typeof href !== 'string') {
        return 1;
    }
    const count = parseInt(href.split('page=')[1], 10);
    return count > 0 ? count : 1;
}

/**
 * Scrapes all pages of a single Facepunch thread.
 *
 * NOTE(review): the constructor writes the cookie and thread URL into the
 * module-level `options` object, which every instance shares and which was
 * also handed to `makeDriver`. The most recently constructed instance
 * therefore decides the Cookie header — confirm this sharing is intended.
 */
class ThreadScraper {
    /**
     * @param {string|number} id - Thread id, used as the `t` query parameter.
     * @param {string} cookie - Value stored as the `Cookie` request header.
     */
    constructor(id, cookie) {
        this.id = id;
        options.headers["Cookie"] = cookie;
        options.url = "https://facepunch.com/showthread.php?t=" + id;
    }

    /** @returns {string|number} The thread id passed to the constructor. */
    get thread_id() {
        return this.id;
    }

    /**
     * Requests the thread's first page to read the page count, then runs the
     * x-ray scrape over the thread limited to that many pages.
     *
     * @param {function(*): void} callback - Invoked with the x-ray result when
     *     the scrape succeeds. On failure the error is logged and the callback
     *     is NOT invoked.
     */
    scrapeThread(callback) {
        const threadId = this.id;
        const threadUrl = "https://facepunch.com/showthread.php?t=" + threadId;
        let curPage = 1;

        // `options.url` is overwritten by every constructor call, so build the
        // request from this instance's own URL instead of trusting it.
        const requestOptions = Object.assign({}, options, { url: threadUrl });

        request(requestOptions, (err, res, body) => {
            if (err) {
                // Same policy as the scrape error branch below: log and stop.
                console.log(err);
                return;
            }

            const $ = cheerio.load(body);

            // Grab the page count, so we know how much we should be scraping.
            // Falls back to 1 when the pagination link is absent or malformed.
            let pageCount = 1;
            if ($('#pagination_top').length > 0) {
                pageCount = pageCountFromHref($('#pagination_top .first_last a').attr('href'));
            }

            const scrape = x(threadUrl, {
                id: 'input[name=t]@value',
                title: '#lastelement',
                nextpage: '#pagination_top a[rel=next]@href',
                posts: x('.posts li', [{
                    post: '@html',
                    username: '.username | trim | whitespace',
                    text: 'blockquote | trim | whitespace',
                    date: '.date | trim | whitespace',
                    //edit: '.postdate span:not(.date)@title | trim | whitespace',
                    ratings: x('.rating_results', ['span | trim | whitespace'])
                }])
            });

            // NOTE(review): a function is passed to paginate() here; confirm the
            // x-ray version in use accepts a function rather than a selector.
            scrape
                .paginate(() => {
                    // Log the page just handled, then hand back the next page's URL.
                    console.log("Current: " + threadUrl + "&page=" + curPage);
                    curPage++;
                    return threadUrl + "&page=" + curPage;
                })
                .limit(pageCount)((scrapeErr, data) => {
                    if (scrapeErr) {
                        console.log(scrapeErr);
                    } else {
                        callback(data);
                    }
                });
        });
    }
}
// The ThreadScraper class is this module's only export.
module.exports = ThreadScraper;