-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.ts
More file actions
174 lines (137 loc) · 4.3 KB
/
scraper.ts
File metadata and controls
174 lines (137 loc) · 4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import * as fs from "fs";
import axios from "axios";
import * as cheerio from "cheerio";
// CLI usage: node scraper.js <tag> [<tag2>] — up to two Danbooru tags.
const [, , tagA, tagB] = process.argv;
const RESULTS_URL = "https://danbooru.donmai.us/posts";
const POST_URL = "https://danbooru.donmai.us/posts";
// Danbooru joins multiple tags in a query with '+'.
// (lowercase `string`, not the boxed `String` wrapper type)
const tags: string = tagB ? `${tagA}+${tagB}` : tagA;
const output_dir = "./output_" + tags + "/";
// Flipped to false by fetch_post_ids when a results page has no posts.
let contentful_page = true;
/**
 * Fetch one page of search results for the current tags.
 *
 * @param page page number (1-based)
 * @returns an axios Response object; exits the process if the request fails
 */
const fetch_results_page = async (page: number) => {
  try {
    return await axios.get(`${RESULTS_URL}?tags=${tags}&page=${page}`);
  } catch (err: unknown) {
    // A failed results page is fatal: log with context and bail.
    console.error(`Failed to fetch results page ${page}:`, err);
    process.exit(1);
  }
};
/**
 * Extract the post ids from a results-page HTML document.
 *
 * Side effects: sets `contentful_page` to false when the page contains
 * no posts, and exits the process when the page is unparseable.
 *
 * @param results_data html data
 * @returns array of post ids
 */
const fetch_post_ids = (results_data: any): string[] => {
  const $ = cheerio.load(results_data);
  const articles = $("article");
  if (articles.length === 0) {
    // Distinguish "genuinely no posts" from a page we failed to parse.
    if (
      $("#posts").children().children().prop("innerText") !==
      "No posts found."
    ) {
      console.error("Unable to fetch posts (泣)");
      process.exit(1);
    }
    console.warn("Seems like that was the last page");
    contentful_page = false;
  }
  const ids: string[] = [];
  for (let i = 0; i < articles.length; i++) {
    const hit = articles[i];
    try {
      if (hit.attribs.id.slice(0, 5) !== "post_")
        throw new Error("Unknown article element found");
    } catch (err: any) {
      console.warn(err.message, err);
      continue;
    }
    // push, not `ids[i] = ...`: assigning by index after a `continue`
    // left an undefined hole in the array, so a skipped middle article
    // produced bogus ids AND defeated the length check below.
    ids.push(hit.attribs["data-id"]);
  }
  if (ids.length !== articles.length)
    console.warn("Missed one or more post ids\n");
  return ids;
};
/**
 * Download the HTML page of every post, all requests in parallel.
 *
 * @param ids array of post ids
 * @returns array with elements of the form ['/post/\<post_id\>', data]
 */
const fetch_post_pages = async (ids: string[]) => {
  // Kick off all requests at once, then await them together.
  const responses = await Promise.all(
    ids.map((id) => axios.get(`${POST_URL}/${id}`))
  );
  // slice(7) strips the leading "/posts/" prefix from the request path.
  return responses.map(
    (res): [string, string] => [res.request.path.slice(7), res.data]
  );
};
/**
 * Pull the CDN file endpoint out of each downloaded post page.
 *
 * @param raw_html array of [post path, post page]
 * @returns array of [post path, cdn endpoints]
 */
const parse_file_urls = (raw_html: [string, string][]) => {
  const urls: [string, string][] = [];
  for (const [post_path, page_html] of raw_html) {
    const $ = cheerio.load(page_html);
    const file_url = $("#content").children("section").attr("data-file-url");
    // Pages without the attribute are silently dropped; the length
    // comparison below surfaces that as a single warning.
    if (file_url) urls.push([post_path, file_url]);
  }
  if (urls.length !== raw_html.length)
    console.warn("Missed one or more images");
  return urls;
};
/**
 * Download every file and persist it under `output_dir`.
 *
 * Individual fetch/write failures are logged and skipped; the function
 * never rejects because of a single bad url.
 *
 * @param urls array of [post path, cdn url] pairs
 * @returns void
 */
const fetch_files = async (urls: [string, string][]) => {
  const reqs = urls.map(([, url]) =>
    axios.get(url, { responseType: "arraybuffer" })
  );
  const write_to_disk: Promise<void>[] = [];
  // Remember which url each queued write belongs to. The original code
  // indexed `urls` with the index of `write_to_disk` in the persist loop
  // below; those indexes diverge as soon as one fetch fails, so write
  // failures were reported against the wrong url.
  const write_urls: string[] = [];
  (await Promise.allSettled(reqs)).forEach((res, i) => {
    if (res.status === "fulfilled") {
      // NOTE(review): slice(-4) assumes a 3-letter extension like ".jpg";
      // a ".jpeg" cdn path would be truncated — confirm against the cdn.
      const path = output_dir + urls[i][0] + res.value.request.path.slice(-4);
      write_to_disk.push(fs.promises.writeFile(path, res.value.data));
      write_urls.push(urls[i][1]);
    } else {
      console.log("Failed to fetch image ", urls[i][1])
    }
  });
  (await Promise.allSettled(write_to_disk)).forEach((res, i) => {
    if (res.status === "rejected") {
      console.error("Failed to persist image ", write_urls[i])
    }
  });
};
/**
 * Walk the results pages in order, downloading every image, until a
 * page comes back empty (which flips `contentful_page` to false).
 */
const main = async () => {
  // Create the output directory once, up front — the original re-checked
  // existence on every page of the loop.
  if (!fs.existsSync(output_dir)) fs.mkdirSync(output_dir);
  let current_page = 1;
  while (contentful_page) {
    console.log("Fetching images from page", current_page);
    const results_page = await fetch_results_page(current_page);
    const post_ids = fetch_post_ids(results_page.data);
    const post_pages = await fetch_post_pages(post_ids);
    const file_urls = parse_file_urls(post_pages);
    await fetch_files(file_urls);
    current_page++;
  }
  console.log("Bai bai!");
  process.exit(0);
};
main();