-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
77 lines (53 loc) · 1.74 KB
/
scraper.py
File metadata and controls
77 lines (53 loc) · 1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from bs4 import BeautifulSoup
import urllib3
import re
# set of article paths already visited (seeded with the start page)
# NOTE: fixed bug — set("wiki/...") built a set of the string's individual
# characters, so `currentUrl in listmap` never matched a full href and the
# duplicate check was dead. Seed with the leading "/" to match href format.
listmap = {"/wiki/Baylor_University"}
# scrape takes a url and gets a list of embedded urls from the html
# returns the number of files visited
def scrape(url, counter):
    """Recursively crawl Wikipedia article links starting from *url*.

    Writes the current URL followed by each article href found on its page
    (tab-separated) to the global file handle ``f``, recursing into links
    not yet recorded in the global ``listmap`` set until ``counter``
    reaches 1000.

    Relies on module-level globals: ``http`` (urllib3 PoolManager),
    ``f`` (open dataset file), ``listmap`` (visited-path set) and
    ``baseUrl`` (site root prepended to relative hrefs).

    Returns the updated counter (total links processed so far).
    """
    # fetch the page over HTTP
    response = http.request('GET', url)
    # parse the HTML from the response body
    soup = BeautifulSoup(response.data, features="html5lib")
    # record the current URL in the dataset file
    f.write(url + "\t")
    # every anchor whose href is an article path like /wiki/Some_Page
    # NOTE: fixed character class — the original "A-z" range accidentally
    # matched the ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
    for link in soup.findAll('a', attrs={'href': re.compile(r"^/wiki/[a-zA-Z0-9_()-]+$")}):
        # parse the href to get the link path
        currentUrl = link.get('href')
        # show progress on stdout
        print(currentUrl)
        # record the outgoing link next to its parent URL
        f.write(currentUrl + "\t")
        # count every link seen (duplicates included — matches original behavior)
        counter = counter + 1
        # cap the crawl so the recursion doesn't run forever
        if counter < 1000:
            if currentUrl in listmap:
                # already visited — don't recurse into it again
                print("duplicate")
            else:
                # mark as visited before recursing to avoid cycles
                listmap.add(currentUrl)
                # terminate this page's record before descending
                f.write("\n")
                # crawl the linked page
                counter = scrape(baseUrl + currentUrl, counter)
    f.write("\n")
    return counter
# counter for number of links processed
counter = 0
# open the dataset file for writing — mode "w" truncates any previous
# contents, so the separate empty-then-append two-step was redundant
f = open("dataset", "w")
# connection pool used by scrape() for all HTTP requests
http = urllib3.PoolManager()
# site root prepended to the relative /wiki/... hrefs
baseUrl = "https://en.wikipedia.org"
# starting node of the crawl
url = 'https://en.wikipedia.org/wiki/Baylor_University'
# crawl and count the number of links processed
counter = scrape(url, counter)
print(counter)
# flush the dataset to disk (the original never closed the handle)
f.close()