-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathsettings.py
More file actions
60 lines (42 loc) · 1.8 KB
/
settings.py
File metadata and controls
60 lines (42 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
Note: Changing settings can cause inconsistency in globaldata as well as trends-related data.
"""
# Names made available by a star-import of this module.
# NOTE(review): KMEANS_PATH, CLUSTERNAME_DB and CRON_SETTINGS are defined in
# this file but not listed here, so a star-import does not bring them in --
# confirm this is intended.
__all__ = [
    'DB_PATH',
    'DB_DATE',
    'DB_DEFAULT_RANK',
    'WINDOW',
    'DB_FIRST_DATE',
    'MAX_CLUSTER',
    'MAX_WORDS',
    'MAX_RANK',
]
import sqlite3
# SQLite database file opened by get_last_update_date().
DB_PATH = './database/web.db'

# Presumably the location of the dumped k-means object -- confirm with the
# code that reads or writes it.
KMEANS_PATH = './dump_obj/kmeans'

# Presumably a temporary database of cluster names -- confirm with its users.
CLUSTERNAME_DB = './database/tempclustername.db'
def get_last_update_date():
    """Return the date up to which the database has been updated.

    Opens the SQLite database at ``DB_PATH`` and reads the largest ``date_p``
    value from the ``size`` table.

    Returns:
        The ``date_p`` value of the newest row, or ``None`` when the query
        fails with a ``sqlite3.Error`` or the ``size`` table has no rows.
    """
    print('loading database updates...')
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        newest_row = conn.execute(
            'SELECT date_p FROM size ORDER BY date_p DESC LIMIT 1'
        ).fetchone()
    except sqlite3.Error as error:
        print('error finding present date', error)
        return None
    finally:
        # Using a sqlite3 connection as a context manager only manages the
        # transaction, so the handle is closed explicitly on every path.
        if conn is not None:
            conn.close()

    # fetchone() yields None for an empty table; indexing it would raise
    # TypeError, which the sqlite3.Error handler above does not catch.
    if newest_row is None:
        print('error finding present date', 'size table is empty')
        return None
    return newest_row[0]
# Present date according to the database: the newest ``date_p`` value in the
# ``size`` table. The lookup runs once, when this module is imported, and
# leaves this set to None if sqlite3 reports an error.
DB_DATE = get_last_update_date()
# Default rank.
DB_DEFAULT_RANK = 150000

# Maximum rank of any website.
MAX_RANK = 200000

# sitedata holds only the sites found within the last WINDOW days.
WINDOW = 30

# Number of clusters.
MAX_CLUSTER = 100

# Date on which the project started.
DB_FIRST_DATE = '2020-04-07'

# Maximum number of words per site kept in the database.
MAX_WORDS = 50
# Settings for the cron job.
CRON_SETTINGS = {
    # A site is rejected if the scraper finds fewer than this many words in it.
    'SUFFICIENT': 50,
    # Maximum time, in seconds, the scraper waits for one URL.
    'TIMEOUT_SCRAPPER': 30,
    # Maximum time, in seconds, the status checker waits for one URL.
    'TIMEOUT_STATUS_CHECK': 10,
    # A URL is rejected if it sends nothing for this many seconds.
    'MAX_WAIT_FOR_RESPONSE': 5,
    # Only the first LIMIT entries of the filtered cisco rank list are used.
    'LIMIT': 150000,
    # Database path to be used only during the cron job.
    'TEMP_DB_PATH': './database/temp.db',
    # Number of workers used for multiprocessing.
    'WORKERS': 30,
    # Number of URLs attempted in one go.
    'BATCH_SIZE': 90,
    # Number of days a URL stays blacklisted when it does not respond.
    'BLACKLIST_TIME': 4,
}