# crawler.py — 89 lines (77 loc) · 2.57 KB
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urldefrag, urljoin, urlparse
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup, SoupStrainer
def do_add(s, x):
    """
    Add x to the set s.

    Returns True when x was not already present (i.e. the set grew),
    False when it was a duplicate.
    """
    before = len(s)
    s.add(x)
    return len(s) > before
def handle_uri(queue, url_set, current_url, uri):
    """
    Handle a URI discovered on the page at current_url.

    Relative URIs (empty scheme) are resolved against current_url; only
    http/https targets are accepted (mailto:, javascript:, ftp:, ... are
    rejected).  Fragments are stripped so URLs differing only by '#...'
    are treated as the same resource and deduplicated.

    Returns True if the URL was new and was enqueued, False otherwise.
    """
    scheme = urlparse(uri).scheme
    if scheme not in ('http', 'https', ''):
        # non-web scheme: nothing to crawl
        return False
    # urljoin leaves absolute URLs untouched and resolves relative ones
    # against the page they were found on; urldefrag drops the '#fragment'
    url = urldefrag(urljoin(current_url, uri))[0]
    if url in url_set:
        return False
    url_set.add(url)
    queue.appendleft(url)
    return True
def find_unique_urls(start_url, limit):
    """
    Breadth-first crawl starting at start_url, collecting up to `limit`
    unique URLs (the start URL counts towards the limit).

    Honours robots.txt (fetched once per host and cached), skips
    non-HTML and error responses, and tolerates network failures by
    skipping the offending URL instead of aborting the crawl.

    Returns the set of discovered URLs.
    """
    # set gives O(1) uniqueness checks; deque gives O(1) ends for BFS
    url_set = {start_url}
    queue = deque([start_url])
    robots_cache = {}  # netloc -> RobotFileParser, one robots.txt fetch per host

    while queue and len(url_set) < limit:
        url = queue.pop()
        parts = urlparse(url)
        rp = robots_cache.get(parts.netloc)
        if rp is None:
            rp = RobotFileParser()
            # robots.txt lives at the site root, not at the page URL itself
            rp.set_url(f'{parts.scheme}://{parts.netloc}/robots.txt')
            try:
                rp.read()
            except OSError:
                # unreachable robots.txt: an empty parser defaults to "allow"
                pass
            robots_cache[parts.netloc] = rp
        if not rp.can_fetch('*', url):
            continue
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            continue  # skip unreachable pages instead of crashing the crawl
        # resolve relative links against the final (post-redirect) URL
        current_url = response.url
        content_type = response.headers.get('content-type', '')
        if response.status_code != 200 or not content_type.startswith('text/html'):
            continue
        soup = BeautifulSoup(response.text, 'html.parser',
                             parse_only=SoupStrainer('a'))
        for anchor in soup.children:
            if len(url_set) >= limit:
                break
            # <a> without href (named anchors) or non-tag nodes are skipped
            href = anchor.get('href') if hasattr(anchor, 'get') else None
            if href:
                handle_uri(queue, url_set, current_url, href)
    return url_set
if __name__ == '__main__':
    import sys

    # require a starting URL on the command line; otherwise show usage
    if len(sys.argv) < 2:
        print('Please provide a starting url.')
        print('Use this to run:')
        print("python3 crawler.py 'https://www.mywebsite.com'")
        print('or')
        print("python crawler.py 'https://www.mywebsite.com'")
    else:
        for found_url in find_unique_urls(str(sys.argv[1]), 100):
            print(found_url)