-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.js
More file actions
140 lines (107 loc) · 4.31 KB
/
index.js
File metadata and controls
140 lines (107 loc) · 4.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
// Copyright (c) 2021, Fexra
//
// Please see the included LICENSE file for more information.
"use strict";
const fs = require("fs");
const https = require("https");
const { basename, join } = require("path");
const { extract } = require("article-parser");
const { NodeHtmlMarkdown } = require("node-html-markdown");
const slugify = require("slugify");
const sanitize = require("sanitize-filename");
const articles = require("./source/articles.json");
// Helpers

/**
 * Collects every markdown image tag (`![alt](url)`) found in `markdown`.
 *
 * @param {string} markdown - Markdown source to scan.
 * @returns {string[]} The matched tags in document order; an empty array when
 *   there are none (`String.prototype.match` would return `null`).
 */
function findImageTags(markdown) {
  return markdown.match(/(!\[.*?\]\()(.+?)(\))/g) || [];
}

/**
 * Renders a value as a YAML single-quoted scalar so that apostrophes and line
 * breaks in scraped titles/descriptions cannot break the front matter.
 *
 * @param {*} value - Value to render; `null`/`undefined` become an empty string.
 * @returns {string} The quoted scalar, including the surrounding quotes.
 */
function yamlSingleQuoted(value) {
  const text = value == null ? "" : String(value);
  // Inside a single-quoted YAML scalar the only escape is a doubled quote.
  return `'${text.replace(/\s*\n\s*/g, " ").replace(/'/g, "''")}'`;
}

/**
 * Derives the output slug from an article URL: the last path segment with the
 * text after its final hyphen removed (for Medium URLs that is the post id).
 *
 * @param {string} url - Article URL as listed in `articles.json`.
 * @returns {string} The slug; the whole segment when it contains no hyphen.
 */
function slugFromArticleUrl(url) {
  // Ignore query string / fragment and trailing slashes before taking the
  // last segment, so `.../my-post-abc123/?source=rss` still resolves.
  const path = url.split(/[?#]/)[0].replace(/\/+$/, "");
  const segment = path.slice(path.lastIndexOf("/") + 1);
  const lastHyphen = segment.lastIndexOf("-");
  return lastHyphen > 0 ? segment.slice(0, lastHyphen) : segment;
}

/**
 * Downloads `imageUrl` over HTTPS and writes it to `filePath`.
 *
 * The body is buffered and written only after the response completed with
 * status 200, so a failed download never leaves an empty or partial file
 * behind that a later run would mistake for an already-downloaded image.
 *
 * @param {string} imageUrl - Absolute https URL of the image.
 * @param {string} filePath - Destination path on disk.
 * @param {number} [redirectsLeft=5] - How many redirects may still be followed.
 * @returns {Promise<void>} Resolves once the file is written; rejects on
 *   request errors, unexpected status codes or too many redirects.
 */
function downloadImage(imageUrl, filePath, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    https
      .get(imageUrl, (response) => {
        const { statusCode, headers } = response;

        if (statusCode >= 300 && statusCode < 400 && headers.location) {
          response.resume(); // discard the body so the socket is released
          if (redirectsLeft <= 0) {
            reject(new Error(`Too many redirects for ${imageUrl}`));
            return;
          }
          const nextUrl = new URL(headers.location, imageUrl).href;
          downloadImage(nextUrl, filePath, redirectsLeft - 1).then(
            resolve,
            reject
          );
          return;
        }

        if (statusCode !== 200) {
          response.resume();
          reject(new Error(`Unexpected status ${statusCode} for ${imageUrl}`));
          return;
        }

        const chunks = [];
        response.on("data", (chunk) => chunks.push(chunk));
        response.on("error", reject);
        response.on("end", () => {
          fs.promises
            .writeFile(filePath, Buffer.concat(chunks))
            .then(resolve, reject);
        });
      })
      .on("error", reject);
  });
}

/**
 * Scrapes one article, downloads the images it references, rewrites the
 * markdown for Jekyll and stores it in `articlesFolder`.
 *
 * @param {{url: string}} article - Entry from `articles.json`.
 * @param {number} articleCount - 1-based position of the article, for logging.
 * @param {string} articlesFolder - Folder the converted markdown is written to.
 * @param {string} imagesFolder - Folder the downloaded images are written to.
 * @returns {Promise<void>}
 */
async function processArticle(
  article,
  articleCount,
  articlesFolder,
  imagesFolder
) {
  // scrape the article by url
  const scrapedArticle = await extract(article.url);
  if (!scrapedArticle) return;

  // convert article content to markdown
  let markdown = NodeHtmlMarkdown.translate(scrapedArticle.content);

  // find all markdown image tags
  const images = findImageTags(markdown);
  console.log(`Found ${images.length} images for blog article ${articleCount}`);

  // download each image
  let imageCount = 0;
  for (const image of images) {
    // skip avatar + preview images
    if (image.includes("1*_Zp5LjJUruaSaS58AHoS0w.png")) continue;
    if (image.includes("q=20")) continue;

    // format image url + extension
    const imageUrl = image.match(/!\[.*?\]\((.*?)\)/)[1];
    console.log(`Processing image ` + imageUrl);

    const filePath = join(imagesFolder, sanitize(basename(imageUrl)));
    if (fs.existsSync(filePath)) {
      console.log(`Image already downloaded`);
      continue;
    }

    try {
      await downloadImage(imageUrl, filePath);
      imageCount++; // only count downloads that actually succeeded
    } catch (e) {
      console.warn(`Error downloading image ${imageUrl}`);
    }
  }
  console.log(
    `Downloaded ${imageCount} images for blog article ${articleCount}`
  );

  // cleanup article
  // remove medium header (avatar, author and cleanup date)
  markdown = markdown.replace(
    /\[(.*)\].*-----(.*)--------------------------------\)(\n\n)/g,
    ""
  );

  // replace external image urls with the scraped internal jekyll ones
  // NOTE(review): the replacement target follows the same
  // `{{ site.baseurl }}/images/<sanitized name>` convention used for the
  // cover image and the download path below/above — confirm against the site.
  markdown = markdown.replace(
    /!\[\]\(.*\/(.*)\)(\n\n)/g,
    (match, fileName) => {
      if (fileName.includes("q=20")) return ""; // drop low-quality previews
      return (
        "![]({{ site.baseurl }}/images/" +
        sanitize(basename(fileName)) +
        ")\n\n"
      );
    }
  );

  // remove title and add (jekyll style)
  markdown = markdown.replace(/^#.*$/m, "");

  // add jekyll template to the start of the markdown
  let coverImage = "";
  if (scrapedArticle.image) {
    const lastSegment = scrapedArticle.image.split("/").pop();
    coverImage = "{{ site.baseurl }}/images/" + sanitize(basename(lastSegment));
  }
  const published = scrapedArticle.published || "";
  const frontMatter = [
    "---",
    "layout: 'post' ",
    `title: ${yamlSingleQuoted(scrapedArticle.title)} `,
    `description: ${yamlSingleQuoted(scrapedArticle.description)} `,
    `image: ${yamlSingleQuoted(coverImage)} `,
    `date: ${yamlSingleQuoted(published)} `,
    "---",
  ].join("\n");
  // the closing `---` must end its own line
  markdown = frontMatter + (markdown.startsWith("\n") ? "" : "\n") + markdown;

  // store article
  const articleSlug = slugFromArticleUrl(article.url);
  const articleDate = String(published).substring(0, 10);
  await fs.promises.writeFile(
    join(articlesFolder, `${articleDate}-${articleSlug}.md`),
    markdown
  );
}

// Initialize
(async () => {
  try {
    // setup `dist`, `dist/articles` and `dist/images`
    const distFolder = join(__dirname, "dist");
    const articlesFolder = join(distFolder, "articles");
    const imagesFolder = join(distFolder, "images");

    // create the folders if they don't exist
    for (const folder of [distFolder, articlesFolder, imagesFolder]) {
      fs.mkdirSync(folder, { recursive: true });
    }

    // cycle through the medium URLs
    console.log(`Scraping ${articles.length} blog articles`);
    for (const [index, article] of articles.entries()) {
      const articleCount = index + 1;
      console.log(`Processing blog article ${articleCount}`);
      try {
        await processArticle(
          article,
          articleCount,
          articlesFolder,
          imagesFolder
        );
      } catch (err) {
        // one broken article should not stop the remaining ones
        console.error(`Error processing blog article ${articleCount}`, err);
      }
    }
  } catch (err) {
    console.error(err);
  }
})();