# crawl_bot.py
from get_domains import *
from file_manage import *
from link_finder import link_crawler
from urllib.request import urlopen
import tldextract
# Importing Stem libraries to talk to the local Tor process
from stem import Signal
from stem.control import Controller
import socks
import socket

# Initiating connection: ask Tor for a fresh circuit before crawling.
# Note: authenticate() expects the plaintext control password; the "16:..."
# string below is the hashed form (HashedControlPassword) from torrc, so
# substitute the corresponding plaintext password here.
with Controller.from_port(port=9051) as controller:
    controller.authenticate("16:975E551CC19179966040700B3CA42216C351480AC90D2A92CE9CE4C1DA")
    controller.signal(Signal.NEWNYM)

# TOR setup: route every socket through the local SOCKS5 proxy
SOCKS_PORT = 9050  # default Tor proxy port from torrc; change to whatever torrc is configured to
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", SOCKS_PORT)
socket.socket = socks.socksocket

# Perform DNS resolution through the proxied socket so lookups do not leak
def getaddrinfo(*args):
    return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))]

socket.getaddrinfo = getaddrinfo
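
# A minimal sanity check, added here as a sketch (not part of the original
# project): after the monkey-patch above, any urlopen() call is routed
# through Tor, so the Tor Project's check API should report a Tor exit node.
def tor_check():
    # Assumes the Tor proxy above is reachable; prints e.g. {"IsTor":true,...}
    response = urlopen('https://check.torproject.org/api/ip')
    print(response.read().decode('utf-8'))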
class Crawl_bot:
    # State lives on the class so the static crawl methods can share it
    folder_name, start_link, domain_name, queued_data, crawled_data = '', '', '', '', ''
    queue = set()
    data_crawled = set()

    def __init__(self, folder_name, start_link, domain_name):
        Crawl_bot.folder_name = folder_name
        Crawl_bot.start_link = start_link
        Crawl_bot.domain_name = domain_name
        Crawl_bot.queued_data = Crawl_bot.folder_name + '/queue.txt'
        Crawl_bot.crawled_data = Crawl_bot.folder_name + '/crawled.txt'
        self.initiate_directory()
        self.crawl_page('Spider starts here', Crawl_bot.start_link)

    @staticmethod
    def initiate_directory():  # Define and create a new project directory on the first run
        create_project_folder(Crawl_bot.folder_name)
        create_data_files(Crawl_bot.folder_name, Crawl_bot.start_link)
        Crawl_bot.queue = convert_to_set(Crawl_bot.queued_data)
        Crawl_bot.data_crawled = convert_to_set(Crawl_bot.crawled_data)

    @staticmethod
    def crawl_page(thread_name, web_url):  # Crawl one page, refill the queue, then update files and user display
        print(web_url)
        if web_url not in Crawl_bot.data_crawled:
            print(thread_name + ' now crawling ' + web_url)
            print('Queue_url ' + str(len(Crawl_bot.queue)) + ' | Crawled_url ' + str(len(Crawl_bot.data_crawled)))
            Crawl_bot.add_url_to_queue(Crawl_bot.collect_url(web_url))
            Crawl_bot.queue.discard(web_url)  # discard() avoids a KeyError if the URL was never queued
            Crawl_bot.data_crawled.add(web_url)
            Crawl_bot.update_folder()
    # Converts the raw response into text after checking that it is actually HTML
    @staticmethod
    def collect_url(web_url):
        html_data_string = ''
        link_finder = None
        try:
            received_response = urlopen(web_url)
            content_type = received_response.getheader('Content-Type')
            if content_type and 'text/html' in content_type:
                data_bytes = received_response.read()
                html_data_string = data_bytes.decode("latin-1")
                link_finder = link_crawler(Crawl_bot.start_link, web_url)
                link_finder.feed(html_data_string)
                # For scraping purposes: save the raw page under its registered domain name
                with open(Crawl_bot.folder_name + '/' + tldextract.extract(web_url).domain, 'w') as f:
                    f.write(html_data_string)
        except Exception as e:
            print(str(e))
            return set()
        if link_finder is None:  # response was not HTML, so there are no links to report
            return set()
        return link_finder.page_urls()
    @staticmethod
    def add_url_to_queue(links):  # Queue newly found links, skipping anything already queued or crawled
        for url in links:
            if (url in Crawl_bot.queue) or (url in Crawl_bot.data_crawled):
                continue
            Crawl_bot.queue.add(url)

    @staticmethod
    def update_folder():  # Write the queue and crawled sets back to the project files
        set_to_file(Crawl_bot.queue, Crawl_bot.queued_data)
        set_to_file(Crawl_bot.data_crawled, Crawl_bot.crawled_data)
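
# Example usage: a sketch under the assumption that the project folder and
# start link are supplied here rather than by a separate driver script (the
# original repo may launch the crawler elsewhere); the .onion address is a
# hypothetical placeholder.
if __name__ == '__main__':
    Crawl_bot('project_data', 'http://exampleonionsite.onion', 'exampleonionsite.onion')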