main.py
import shutil
import sys
import threading
from queue import Queue

import tldextract

from crawl_bot import Crawl_bot
from file_manage import *
from get_domains import *
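# Note: judging by the call sites below, file_manage is assumed to provide
# convert_to_set() (read a text file into a set of lines) and get_domains
# to provide get_domain_name(); both modules sit next to main.py in the repo.
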
def input_url(base_url):
    """Record the start URL and derive the crawl folder name from its domain."""
    global BASE_URL, regex
    BASE_URL = base_url
    url_extract = tldextract.extract(BASE_URL)
    regex = url_extract.domain  # registered domain, used as the folder name
    # Clear out any folder left over from a previous crawl of this domain.
    shutil.rmtree(regex, ignore_errors=True)
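
# Illustrative only: tldextract splits a URL into subdomain, domain and suffix,
# and the script keeps just the registered domain, e.g.
#   tldextract.extract('https://forums.example.co.uk/page').domain  # 'example'
# so crawling that URL would work inside a folder named 'example'.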

if __name__ == '__main__':
    if len(sys.argv) == 2:
        input_url(sys.argv[1])
    else:
        print('Invalid input. Usage: python main.py <base_url>')
        sys.exit(1)

    GET_DOMAIN = get_domain_name(BASE_URL)
    FOLDER_NAME = regex
    data_crawled = FOLDER_NAME + '/crawled.txt'   # links already visited
    data_in_queue = FOLDER_NAME + '/queue.txt'    # links waiting to be crawled
    thread_count = 50
    queue = Queue()
    # The constructor is expected to create FOLDER_NAME/ and seed queue.txt
    # with BASE_URL before the worker threads start reading it.
    Crawl_bot(FOLDER_NAME, BASE_URL, GET_DOMAIN)

    def do_job():  # Worker loop: crawl queued URLs until the process exits
        while True:
            url = queue.get()
            Crawl_bot.crawl_page(threading.current_thread().name, url)
            queue.task_done()

    def queue_jobs():  # Hand every queued link to the workers as a job
        for url_link in convert_to_set(data_in_queue):
            queue.put(url_link)
        queue.join()
        initiate_bot()  # crawling may have discovered new links; go again

    def get_links_to_queue():  # Spawn the pool of worker threads
        for _ in range(thread_count):
            thread = threading.Thread(target=do_job)
            thread.daemon = True  # workers must not keep the process alive
            thread.start()

    def initiate_bot():  # Keep crawling until queue.txt is empty
        links_in_queue = convert_to_set(data_in_queue)
        if len(links_in_queue) > 0:
            print(str(len(links_in_queue)) + ' queued links')
            queue_jobs()

    get_links_to_queue()
    initiate_bot()
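
# Usage sketch, assuming crawl_bot.py, file_manage.py and get_domains.py from
# this repo are importable and tldextract is installed (pip install tldextract):
#
#   python main.py https://example.com
#
# The crawl then works inside example/, reading its frontier from
# example/queue.txt and recording visited pages in example/crawled.txt.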