-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathcronjob.py
More file actions
103 lines (74 loc) · 3.11 KB
/
cronjob.py
File metadata and controls
103 lines (74 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from datetime import date, timedelta
from settings import CRON_SETTINGS
from crontools import newdomains
from crontools import globaldata
from crontools import trends
from settings import *
import numpy as np
import sys
import os
# Upper bound on how many entries of the processed ranklist are kept per run;
# applied as a slice (`[:LIMIT]`) in getting_update().
LIMIT = CRON_SETTINGS['LIMIT']
def getting_update(cur_date, workers, batch_size):
    """Fetch the ranklist for ``cur_date`` and scrape the selected domains.

    Steps, in order: fetch the ranklist, process it and keep at most
    ``LIMIT`` entries, ask ``newdomains.get_url_to_scrap`` which of those
    still have to be scraped, then scrape them in batches.

    Args:
        cur_date: the day whose ranklist is fetched.
        workers: forwarded to ``newdomains.fast_scrap_batches``.
        batch_size: forwarded to ``newdomains.fast_scrap_batches``.

    Returns:
        A ``(urls, url_to_scrap)`` tuple: the truncated processed ranklist
        and the subset that was handed to the scraper.
    """
    print('Updating Database Using Cisco Ranklist Of', cur_date, '...\n')

    print('Fetching cisco ranklist...')
    raw_ranklist = newdomains.fetch_ranklist(cur_date)

    print('Filtering cisco ranklist...')
    processed = newdomains.process_ranklist(raw_ranklist)
    ranked_urls = processed[:LIMIT]

    print('Filtering Already visited domains...')
    pending = newdomains.get_url_to_scrap(ranked_urls)
    print('\n', len(pending), 'new domains found')

    print('\nScrapping urls...')
    newdomains.fast_scrap_batches(pending, workers, batch_size)

    return ranked_urls, pending
def update(cur_date, urls, url_to_scrap):
    """Apply the scraped results for ``cur_date`` to the stored data.

    Reads the temporary scrape output, then updates global data, trends and
    the visited-domain records, and finally clears the temporary data.

    Args:
        cur_date: the day being processed.
        urls: the ranklist returned by ``getting_update``.
        url_to_scrap: the domains that were handed to the scraper; those
            missing from the scrape output are recorded with ``status=0``.
    """
    date_text = str(cur_date)

    # Each temp record carries the domain at index 0 and its embedding as a
    # raw buffer at index 1.
    scraped_rows = list(newdomains.temp_read())
    new_urls = [row[0] for row in scraped_rows]
    embedding = [np.frombuffer(row[1]) for row in scraped_rows]
    print('\n', len(new_urls), 'new domains scrapped')

    rejected = newdomains.get_reject_list(url_to_scrap, new_urls)

    print('\nAdjusting ranks...')
    ranks = newdomains.get_adjusted_ranks(cur_date, new_urls, urls)

    print('performing updates on global_data.....')
    globaldata.add_new_records(cur_date)
    globaldata.delete_records(cur_date)
    globaldata.update_rank(list(ranks.items()))
    globaldata.update_date(list(ranks.keys()), date_text)

    print('updating trends.......')
    trends.update_trends(new_urls, embedding, date_text)

    print('updating visited domains...')
    newdomains.update_visited_domains(list(ranks.keys()), new_urls, cur_date)
    newdomains.add_new_visited_domains(rejected, cur_date, status=0)
    newdomains.delete_blacklisted(cur_date)

    print('clearing temporary data...')
    newdomains.temp_clear()

    print('SUCCESS\n')
def run(workers=None, batch_size=None):
    """Run every pending daily update, oldest first.

    Days are processed one at a time from the day after ``DB_DATE`` up to
    and including yesterday. For each day the scrape runs while the server
    is still up; the server is stopped only around the database update and
    is always started again afterwards, even when the update fails.

    Args:
        workers: number of parallel scraper workers; defaults to
            ``CRON_SETTINGS['WORKERS']`` when ``None``.
        batch_size: scraper batch size; defaults to
            ``CRON_SETTINGS['BATCH_SIZE']`` when ``None``.

    Any exception raised while processing a day is printed and stops the
    remaining updates; it is not re-raised.
    """
    if workers is None:
        workers = CRON_SETTINGS['WORKERS']
    if batch_size is None:
        batch_size = CRON_SETTINGS['BATCH_SIZE']
    print('\nWARNING-: Scrapper will use ', workers, 'parallel workers and batch size', batch_size, '\n')

    # DB_DATE is read as a 'YYYY?MM?DD' string by fixed positions, so any
    # single-character separator is accepted.
    # NOTE(review): DB_DATE is presumably supplied by `from settings import *` - confirm.
    last_done = date(int(DB_DATE[0:4]), int(DB_DATE[5:7]), int(DB_DATE[8:10]))
    cur_date = last_done + timedelta(days=1)
    till_date = date.today() - timedelta(days=1)
    print('Ready to perform all updates from', cur_date, 'to', till_date, '\n')

    while cur_date <= till_date:
        try:
            newdomains.temp_clear()
            urls, url_to_scrap = getting_update(cur_date, workers, batch_size)

            # The server is taken down only for the database update. The
            # scripts are resolved relative to the current working directory.
            os.system("./server_stop.sh")
            try:
                update(cur_date, urls, url_to_scrap)
            finally:
                # Restart unconditionally: without this a failed update
                # would leave the server stopped until someone intervenes.
                os.system("./server_start.sh")

            cur_date += timedelta(days=1)
        except Exception as e:
            print('Error while updating at', cur_date, e)
            print('Stop updates')
            break
# Command-line entry: up to two optional integer arguments, WORKERS then
# BATCH_SIZE. With more than two arguments nothing is run.
if 1 <= len(sys.argv) <= 3:
    run(*[int(arg) for arg in sys.argv[1:]])