-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch.py
More file actions
97 lines (89 loc) · 3.74 KB
/
fetch.py
File metadata and controls
97 lines (89 loc) · 3.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import copy
import os
import re
import bs4
import ftfy
import httpx
from markdownify import markdownify as md
# Locates the date line near the top of an essay: a newline, optional
# indentation, an optional English month name, a four-digit year, and then
# whatever else is on that line (e.g. a revision note).
DATE_REGEX = re.compile(
    r"\n\s*"  # line break followed by any indentation
    r"(January|February|March|April|May|June"
    r"|July|August|September|October|November|December)?"  # optional month
    r"\s*\d{4}\b"  # four-digit year
    r".*"  # remainder of the line
)
# Single-character substitutions applied to each essay's text before it is
# converted to Markdown. Each key is replaced by a plain ASCII stand-in;
# presumably these symbols are not available in the font used downstream
# (TODO confirm).
# NOTE: the last two keys are bare combining marks and may render as if the
# quotes were empty or as an accent over a quote -- edit them with care.
CHAR_REPLACEMENTS = {
    "⟨": "<",  # left angle bracket -> ASCII less-than
    "⟩": ">",  # right angle bracket -> ASCII greater-than
    "≈": "~",  # or "≈" if you still want the symbol but in a supported font
    "̇": ".",  # Unicode combining dot, often replaced with a period or omitted
    "̀": "`",  # Unicode combining grave accent, often omitted or replaced with a backtick
}
def _fetch_essay_index(client):
    """Return ``(title, absolute_url)`` pairs for the essays on the index page.

    Raises ``httpx.HTTPStatusError`` if the index page request fails.
    """
    response = client.get("https://paulgraham.com/articles.html")
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    title_links = []
    for link in soup.find_all("a"):
        href = link.get("href")
        if href is None:
            # Anchor without a target; indexing it would raise KeyError.
            continue
        if not (link.find_parent("font") and link.find_next_sibling("img")):
            continue
        if href.startswith("http"):
            # essays are on the same domain with relative links
            continue
        title = link.get_text()
        if title == "Revenge of the Nerds":
            # this essay is not in the right format for latex
            continue
        title_links.append((title, f"https://paulgraham.com/{href}"))
    return title_links


def _clean_essay_markup(essay_content):
    """Simplify the essay's HTML tree in place ahead of Markdown conversion."""
    # Make relative links absolute. Anchors that have no href, or that only
    # point at an in-page "#fragment", are replaced by their contents.
    for url in essay_content.find_all("a"):
        if "href" not in url.attrs:
            url.unwrap()
        elif not url["href"].startswith("http"):
            if url["href"].startswith("#"):
                url.unwrap()
            else:
                url["href"] = f"https://paulgraham.com/{url['href']}"

    # Drop nested font tags but keep what they wrap.
    for font in essay_content.find_all("font"):
        font.unwrap()

    # TODO: handle images with pandoc & latex
    for img in essay_content.find_all("img"):
        img.decompose()

    # Remove HTML comment nodes from the tree (e.g.: Great Hackers).
    # NOTE(review): an earlier comment here said the comments' content was
    # kept, but extract() removes them -- confirm which one is intended.
    for comment in essay_content.find_all(
        string=lambda text: isinstance(text, bs4.Comment)
    ):
        comment.extract()


def _extract_date(essay_content):
    """Return the raw ``DATE_REGEX`` match for the essay, or ``None``.

    The returned string still carries the leading newline and indentation so
    that it can be located again in the prettified HTML.
    """
    # Search a copy with tables removed so the original tree stays intact.
    essay_content_for_date = copy.deepcopy(essay_content)
    for table in essay_content_for_date.find_all("table"):
        table.decompose()
    # date is in the first 5 lines
    head = "\n".join(essay_content_for_date.prettify().split("\n")[:5])
    match = DATE_REGEX.search(head)
    return match.group() if match else None


def _render_markdown(title, link, essay_content, date):
    """Build the Markdown document (with LaTeX page-break prefix) for an essay."""
    html = essay_content.prettify().replace("<br>", "<br/>")
    # Promote the date line to a heading.
    html = html.replace(date, f"<h4>{date}</h4>\n")
    html = ftfy.fix_text(html)
    for old_char, new_char in CHAR_REPLACEMENTS.items():
        html = html.replace(old_char, new_char)
    return "\\newpage\n\\noindent\n\n" + md(
        f"<h1>{title}</h1>\n<br/>\n<a href='{link}'>{link}</a>\n<br/>\n{html}"
    )


def main():
    """Download every listed essay and write it out as a Markdown file.

    Each essay is saved to ``output/essays/<index>_<date>_<title>.md`` and the
    written paths are listed, one per line, in ``conf/essays-list.txt``.
    Essays whose body or date cannot be found are reported and skipped; the
    index comes from the essay's position on the index page, so skipped
    essays leave gaps in the numbering.

    Raises ``httpx.HTTPStatusError`` if any page request fails.
    """
    essays_dir = os.path.join("output", "essays")
    list_path = os.path.join("conf", "essays-list.txt")
    # Create the output locations up front so a fresh checkout does not fail
    # with FileNotFoundError on the first write.
    os.makedirs(essays_dir, exist_ok=True)
    os.makedirs(os.path.dirname(list_path), exist_ok=True)

    files = []
    # A single client reuses connections across the many requests.
    with httpx.Client() as client:
        title_links = _fetch_essay_index(client)
        for i, (title, link) in enumerate(title_links, start=1):
            print(f"{title}: {link}")
            essay = client.get(link)
            # Decode as windows-1252 regardless of what the response declares
            # (presumably how the essay pages are encoded -- TODO confirm).
            essay.encoding = "windows-1252"
            essay.raise_for_status()
            essay_soup = bs4.BeautifulSoup(essay.text, "html.parser")
            essay_content = essay_soup.find("font", {"face": "verdana"})
            if essay_content is None:
                print(f"Could not find essay body for {title}, skipping...")
                continue

            _clean_essay_markup(essay_content)
            date = _extract_date(essay_content)
            if date is None:
                print(f"Could not find date for {title}, skipping...")
                continue

            markdown = _render_markdown(title, link, essay_content, date)
            idx_date_title = f"{i} {date.strip()} {title}"
            safe_idx_date_title = idx_date_title.replace("/", "&").replace(" ", "_")
            output_file = os.path.join(essays_dir, f"{safe_idx_date_title}.md")
            # Explicit UTF-8 keeps the output independent of the platform's
            # default encoding.
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(markdown)
            files.append(output_file + "\n")

    with open(list_path, "w", encoding="utf-8") as f:
        f.write("".join(files))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()