-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathslim2markdown.py
More file actions
59 lines (54 loc) · 2.63 KB
/
slim2markdown.py
File metadata and controls
59 lines (54 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from lxml import html
from bs4 import BeautifulSoup
import requests
import subprocess
import html2text
import base64
import re
# Scrape policygenius.com insurance articles and store each one as Markdown.
#
# For every (insurance type, section) index page, every anchor whose href
# mentions one of the insurance types is downloaded, converted to Markdown
# with html2text, and written to "<insurance-type>/<slug>.md", where the slug
# is the URL path with "/" replaced by "_".

BASE_URL = "https://www.policygenius.com"

# Seconds to wait for the server before giving up; without a timeout a single
# stalled connection would hang the whole crawl.
REQUEST_TIMEOUT = 30

insurance_type = ["life-insurance", "health-insurance", "renters-insurance", "pet-insurance", "long-term-disability-insurance"]
extensions = ["learn", "guide", "guide/faqs", "define", "learn/states", "companies"]

# Built from insurance_type (rather than typed out a second time) so the link
# filter cannot drift from the list of types; the hand-written pattern had
# "life-insurace", which silently dropped every life-insurance link.
_INSURANCE_LINK = re.compile("|".join(re.escape(name) for name in insurance_type))


def is_insurance_link(href):
    """Return True if *href* mentions any of the known insurance types."""
    return _INSURANCE_LINK.search(href) is not None


def to_absolute_url(href):
    """Return *href* as an absolute URL on BASE_URL, or None if it is not one.

    Site-relative hrefs ("/path") get BASE_URL prepended. Hrefs that already
    point at BASE_URL are returned unchanged. Everything else (other hosts,
    protocol-relative "//host/..." links, schemeless relative paths, mailto:
    and similar) yields None so the caller can skip it.
    """
    if href.startswith('//'):
        return None
    if href.startswith('/'):
        return BASE_URL + href
    if href == BASE_URL or href.startswith(BASE_URL + '/'):
        return href
    return None


def slug_for(url):
    """Return the flat file stem for *url*, an absolute URL on BASE_URL.

    The host prefix and the surrounding slashes are removed and the remaining
    path separators become underscores, e.g.
    "https://www.policygenius.com/life-insurance/learn/" -> "life-insurance_learn".
    """
    return url[len(BASE_URL):].strip('/').replace('/', '_')


def target_directory(slug):
    """Return the insurance type *slug* starts with, or None if there is none.

    The returned name doubles as the output directory for the page.
    """
    for name in insurance_type:
        if slug.startswith(name):
            return name
    return None


def main():
    """Crawl every index page and write each linked article as Markdown."""
    import os  # local import: the file's top-level imports do not include os

    converter = html2text.HTML2Text()
    converter.body_width = 0  # 0 turns off html2text's line wrapping

    # The same links recur on many index pages; remember what was already
    # handled so each article is downloaded and written only once.
    seen = set()

    for ins in insurance_type:
        for ext in extensions:
            index_url = BASE_URL + "/" + ins + "/" + ext + "/"
            listing = requests.get(index_url, timeout=REQUEST_TIMEOUT)
            soup = BeautifulSoup(listing.text, 'lxml')

            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if href is None or not is_insurance_link(href):
                    continue

                url = to_absolute_url(href)
                if url is None or url in seen:
                    continue
                seen.add(url)

                slug = slug_for(url)
                directory = target_directory(slug)
                if directory is None:
                    # No destination for this page, so do not download it.
                    continue

                response = requests.get(url, timeout=REQUEST_TIMEOUT)
                markdown = converter.handle(response.text)
                print(slug)

                os.makedirs(directory, exist_ok=True)
                # Explicit UTF-8: page text regularly contains non-ASCII
                # characters that a locale-default encoding may reject.
                with open(directory + '/' + slug + '.md', 'w', encoding='utf-8') as f:
                    f.write(markdown)


if __name__ == "__main__":
    main()