# crawler.py — 89 lines (77 loc) · 2.57 KB
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urldefrag, urljoin, urlparse
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup, SoupStrainer
def do_add(s, x):
    """
    Add x to the set s.

    Returns True when x was not already present (i.e. the set grew),
    False when it was a duplicate.
    """
    before = len(s)
    s.add(x)
    return len(s) > before
def handle_uri(queue, url_set, current_url, uri):
    """
    Handle a URI discovered on the page at current_url.

    Relative URIs (empty scheme) are resolved against current_url; only
    http/https targets are accepted (mailto:, javascript:, ftp:, ... are
    rejected).  Fragments are stripped so URLs differing only by '#...'
    are treated as the same resource and deduplicated.

    Returns True if the URL was new and was enqueued, False otherwise.
    """
    scheme = urlparse(uri).scheme
    if scheme not in ('http', 'https', ''):
        # non-web scheme: nothing to crawl
        return False
    # urljoin leaves absolute URLs untouched and resolves relative ones
    # against the page they were found on; urldefrag drops the '#fragment'
    url = urldefrag(urljoin(current_url, uri))[0]
    if url in url_set:
        return False
    url_set.add(url)
    queue.appendleft(url)
    return True
def find_unique_urls(start_url, limit):
    """
    Breadth-first crawl starting at start_url, collecting up to `limit`
    unique URLs (the start URL counts towards the limit).

    Honours robots.txt (fetched once per host and cached), skips
    non-HTML and error responses, and tolerates network failures by
    skipping the offending URL instead of aborting the crawl.

    Returns the set of discovered URLs.
    """
    # set gives O(1) uniqueness checks; deque gives O(1) ends for BFS
    url_set = {start_url}
    queue = deque([start_url])
    robots_cache = {}  # netloc -> RobotFileParser, one robots.txt fetch per host

    while queue and len(url_set) < limit:
        url = queue.pop()
        parts = urlparse(url)
        rp = robots_cache.get(parts.netloc)
        if rp is None:
            rp = RobotFileParser()
            # robots.txt lives at the site root, not at the page URL itself
            rp.set_url(f'{parts.scheme}://{parts.netloc}/robots.txt')
            try:
                rp.read()
            except OSError:
                # unreachable robots.txt: an empty parser defaults to "allow"
                pass
            robots_cache[parts.netloc] = rp
        if not rp.can_fetch('*', url):
            continue
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            continue  # skip unreachable pages instead of crashing the crawl
        # resolve relative links against the final (post-redirect) URL
        current_url = response.url
        content_type = response.headers.get('content-type', '')
        if response.status_code != 200 or not content_type.startswith('text/html'):
            continue
        soup = BeautifulSoup(response.text, 'html.parser',
                             parse_only=SoupStrainer('a'))
        for anchor in soup.children:
            if len(url_set) >= limit:
                break
            # <a> without href (named anchors) or non-tag nodes are skipped
            href = anchor.get('href') if hasattr(anchor, 'get') else None
            if href:
                handle_uri(queue, url_set, current_url, href)
    return url_set
if __name__ == '__main__':
    import sys

    # require a starting URL on the command line; otherwise show usage
    if len(sys.argv) < 2:
        print('Please provide a starting url.')
        print('Use this to run:')
        print("python3 crawler.py 'https://www.mywebsite.com'")
        print('or')
        print("python crawler.py 'https://www.mywebsite.com'")
    else:
        for found_url in find_unique_urls(str(sys.argv[1]), 100):
            print(found_url)