-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.js
More file actions
144 lines (134 loc) · 4.89 KB
/
scrape.js
File metadata and controls
144 lines (134 loc) · 4.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
const fs = require("fs");
const https = require("https");
const path = require("path");
const mmm = require('mmmagic'),
  Magic = mmm.Magic;
const {connect} = require("./connect-to-dbs");
// File-type detector configured with the MAGIC_MIME_TYPE flag; its result is
// matched against /^image\// when validating downloaded files.
const magic = new Magic(mmm.MAGIC_MIME_TYPE);

// The first CLI argument is the base dataset name. It selects the
// input/<name>/query*.sql files and the `dataset` rows to refresh.
const args = process.argv.slice(2);
if (!args[0]) {
  // `$0` is a shell variable and is never expanded by Node, so print the real script path.
  console.error(`Usage: node ${process.argv[1]} <subset>`);
  process.exit(1);
}
const [baseDatasetName] = args;

// Fail fast when either query file (plain or "-ml" variant) is missing,
// before any database connection is opened.
for (const datasetSuffix of ['', '-ml']) {
  const sqlQueryPath = `input/${baseDatasetName}/query${datasetSuffix}.sql`;
  if (!fs.existsSync(sqlQueryPath)) {
    console.error(`${sqlQueryPath} does not exist`);
    process.exit(1);
  }
}

// Alternative image source, kept for reference:
// const imageRoot="https://inducks.org/hr.php?normalsize=1&image=https://outducks.org/"
// Base URL that image paths are appended to when downloading.
const imageRoot = 'https://res.cloudinary.com/dl7hskxab/image/upload/inducks-covers/';
// Local directory (relative to the working directory) that mirrors the remote image paths.
const filePathRoot = 'input/full/';
/**
 * Checks the local file for `url` and records the outcome in the database.
 *
 * When the detected type starts with `image/`, the URL is linked to the dataset and an
 * `entryurl_details` row (decision `null`) is created if missing, then its personcode is
 * refreshed. Otherwise the URL is unlinked from `dataset_entryurl`, its details row is
 * replaced by one with decision `no_drawing`, and the local file is deleted.
 *
 * @param {object} dgConnection - Connection exposing `query(sql, params)` that returns a promise.
 * @param {string} personcode - Person code stored alongside the URL.
 * @param {string} url - Image path relative to `filePathRoot`; also stored as `sitecode_url`.
 * @param {number} datasetId - Value written to `dataset_entryurl.dataset_id`.
 * @returns {Promise<string>} Resolves with 'OK' for an image. Rejects with 'Invalid' for
 *   anything else, or with the underlying error when a query or the file removal fails.
 */
const addUrlToDataset = (dgConnection, personcode, url, datasetId) => {
  const filePath = `${filePathRoot}${url}`;

  // Links the URL to the dataset and makes sure its details row carries the right personcode.
  const registerImage = async () => {
    await dgConnection.query(
      "insert ignore into dataset_entryurl(dataset_id, sitecode_url) VALUES(?, ?)",
      [datasetId, url]
    );
    await dgConnection.query(
      "insert ignore into entryurl_details(sitecode_url, personcode, decision) VALUES(?, ?, ?)",
      [url, personcode, null]
    );
    // Update the entryurl's personcode, in case a previous scrape stored it wrong
    await dgConnection.query(
      "update entryurl_details set personcode=? where sitecode_url = ?",
      [personcode, url]
    );
  };

  // Unlinks the URL, flags it as 'no_drawing' and removes the local file.
  const markInvalid = async () => {
    console.log(`Marking ${filePath.replace(filePathRoot, '')} as invalid`);
    // The statement has a single placeholder, so it must receive the URL only.
    await dgConnection.query(
      "delete from dataset_entryurl where sitecode_url = ?",
      [url]
    );
    await dgConnection.query(
      "delete from entryurl_details where sitecode_url = ?",
      [url]
    );
    await dgConnection.query(
      "insert ignore into entryurl_details(sitecode_url, personcode, decision) VALUES(?, ?, ?)",
      [url, personcode, 'no_drawing']
    );
    if (fs.existsSync(filePath)) {
      fs.unlinkSync(filePath);
    }
  };

  return new Promise((resolve, reject) => {
    magic.detectFile(filePath, (err, result) => {
      if (err) {
        // A file whose type cannot be detected is handled like a non-image, but say why.
        console.error(`Could not detect the type of ${filePath}: ${err.message || err}`);
      }
      const isImage = !err && /^image\//.test(result);
      // Settle exactly once, and forward query failures instead of leaving the promise pending.
      if (isImage) {
        registerImage().then(() => resolve('OK'), reject);
      } else {
        markInvalid().then(() => reject('Invalid'), reject);
      }
    });
  });
};
/**
 * Makes sure the image for `url` exists locally and is registered with the dataset.
 *
 * - URLs whose stored decision is non-null and not 'ok' are skipped; any local copy is deleted.
 * - A file already on disk is only (re)validated through `addUrlToDataset`.
 * - Otherwise the image is fetched from `imageRoot`, written below `filePathRoot`,
 *   and validated once it has been written completely.
 *
 * @param {object} dgConnection - Connection exposing `query(sql, params)` that returns a promise.
 * @param {string} personcode - Person code forwarded to `addUrlToDataset`.
 * @param {string} url - Image path, relative to both `imageRoot` and `filePathRoot`.
 * @param {number} datasetId - Dataset id forwarded to `addUrlToDataset`.
 * @returns {Promise<string>} 'OK', or 'Ignored' when an already-downloaded file is rejected.
 *   Rejects with the string `Skipped <url> (could not download)` when the transfer fails,
 *   or with the underlying error when the initial query fails.
 */
const downloadAndAddUrlToDataset = async (dgConnection, personcode, url, datasetId) => {
  const filePath = `${filePathRoot}${url}`;

  const invalidDecisions = await dgConnection.query(
    "select decision from entryurl_details where (decision is not null and decision <> 'ok') and sitecode_url=?",
    [url]
  );
  if (invalidDecisions.length > 0) {
    console.log(`Skipped ${url} (marked as invalid)`);
    if (fs.existsSync(filePath)) {
      fs.unlinkSync(filePath);
    }
    return 'OK';
  }

  if (fs.existsSync(filePath)) {
    try {
      await addUrlToDataset(dgConnection, personcode, url, datasetId);
    } catch (err) {
      // Deliberately non-fatal: the existing file was not accepted.
      return 'Ignored';
    }
    console.log(`Skipped ${url} (already downloaded)`);
    return 'OK';
  }

  // Create only the parent directories; the file itself is created by the write stream.
  fs.mkdirSync(path.dirname(filePath), {recursive: true});
  // Strip trailing slashes from the root so the joined URL never contains "//".
  const downloadUrl = `${imageRoot.replace(/\/+$/, '')}/${url}`;

  await new Promise((resolve, reject) => {
    const failure = `Skipped ${url} (could not download)`;
    https.get(downloadUrl, response => {
      // The file is opened only once a response arrives, so a failed request leaves
      // nothing on disk that a later run could mistake for a finished download.
      const file = fs.createWriteStream(filePath);
      const abort = () => {
        file.destroy();
        // Best-effort removal of the partial file; the download failure is what gets reported.
        fs.unlink(filePath, () => reject(failure));
      };
      response.on('error', abort);
      file.on('error', abort);
      // 'finish' fires after all data was handed to the file system; the response's
      // 'end' event can fire while chunks are still buffered in the write stream.
      file.on('finish', resolve);
      response.pipe(file);
    }).on('error', () => reject(failure));
  });

  try {
    await addUrlToDataset(dgConnection, personcode, url, datasetId);
    console.log(`Downloaded ${url}`);
  } catch (err) {
    console.log(`Could not download ${url}`);
  }
  return 'OK';
};
/**
 * Runs the dataset's SQL query and processes every image URL it returns.
 *
 * The query text is read from `input/<baseDatasetName>/query<datasetSuffix>.sql`. Each row
 * is expected to carry a `personcode` and a `|`-separated `entryurl_urls` string. URLs are
 * handled one at a time through `downloadAndAddUrlToDataset`.
 *
 * @param {object} coaConnection - Connection the dataset query is executed on.
 * @param {object} dgConnection - Connection forwarded to `downloadAndAddUrlToDataset`.
 * @param {string} datasetSuffix - Suffix selecting the query file ('' or '-ml' in this script).
 * @param {number} datasetId - Dataset id forwarded to `downloadAndAddUrlToDataset`.
 * @returns {Promise<string>} Resolves with 'OK' once every URL was processed. Rejects when
 *   the query file cannot be read, the query fails, or a URL fails with an unexpected error.
 */
const downloadDatasetFromQuery = async (coaConnection, dgConnection, datasetSuffix, datasetId) => {
  const sql = fs.readFileSync(`input/${baseDatasetName}/query${datasetSuffix}.sql`).toString();
  const response = await coaConnection.query(sql);

  // A file with several statements yields one result per statement and the rows are the
  // last of them; a single statement yields the rows directly (possibly none at all).
  const lastResult = response[response.length - 1];
  const rows = Array.isArray(lastResult) ? lastResult : response;

  for (const {personcode, entryurl_urls} of rows) {
    // Tolerate rows without URLs and drop empty segments so no empty path is processed.
    const urls = (entryurl_urls || '').split('|').filter(Boolean);
    for (const url of urls) {
      try {
        await downloadAndAddUrlToDataset(dgConnection, personcode, url, datasetId);
      } catch (reason) {
        // Failed downloads are reported as plain "Skipped ..." strings and must not stop
        // the run; anything else (e.g. a database error) is unexpected and propagates.
        if (typeof reason !== 'string') {
          throw reason;
        }
        console.log(reason);
      }
    }
  }
  return 'OK';
};
/**
 * Rebuilds both datasets (`<baseDatasetName>` and `<baseDatasetName>-ml`).
 *
 * For each of them the dataset id is looked up by name, its `dataset_entryurl` rows are
 * deleted, and the rows are re-created from the matching query file. Both connections are
 * closed whether or not the run succeeds; on success the process exits with code 0.
 *
 * @returns {Promise<void>} Rejects when a dataset row is missing or any step fails.
 */
const scrape = async () => {
  const {coaConnection, dgConnection} = await connect();
  try {
    for (const datasetSuffix of ['', '-ml']) {
      const datasetName = `${baseDatasetName}${datasetSuffix}`;
      const [dataset] = await dgConnection.query("select id from dataset where name=?", [datasetName]);
      if (!dataset) {
        // Without this guard the failure would surface as a TypeError on `dataset.id`.
        throw new Error(`Dataset "${datasetName}" does not exist`);
      }
      await dgConnection.query("delete from dataset_entryurl where dataset_id=?", [dataset.id]);
      await downloadDatasetFromQuery(coaConnection, dgConnection, datasetSuffix, dataset.id);
    }
  } finally {
    // Close both connections even when the run failed part-way.
    await Promise.all([coaConnection.end(), dgConnection.end()]);
  }
  process.exit(0);
};
// Entry point: run the scrape; report any failure and exit with a non-zero code
// instead of leaving the rejection unhandled.
scrape().catch((err) => {
  console.error(err);
  process.exit(1);
});