-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_dataset.py
More file actions
184 lines (150 loc) · 6.46 KB
/
generate_dataset.py
File metadata and controls
184 lines (150 loc) · 6.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# NOTE: the wildcard imports stay first so that the explicit imports below
# take precedence over any same-named objects the wildcards export.
from dynamic_feature_extraction import *
from simulation import *
from empirical_distribution_outdegree import sample_empirical_outdegree
from typing import List
import csv
import os
import random
from multiprocessing import Pool
import scipy.stats
from datetime import datetime
# root folder (inside the current working directory) for all output of this script
FILEPATH = os.path.join(os.getcwd(), "Dataset")

# absolute filepath in which to put the features
FEATUREMATRIX_FP = os.path.join(FILEPATH, "DatasetFeatures")
# makedirs(exist_ok=True) also creates the missing "Dataset" parent and does
# not fail when the folder already exists (e.g. created by another process)
os.makedirs(FEATUREMATRIX_FP, exist_ok=True)

# STORE NETWORKS
SOURCE_FP = os.path.join(FILEPATH, "DatasetNetworkSource")
os.makedirs(SOURCE_FP, exist_ok=True)

# number of networks per category: indices START_NUMBER .. FINAL_NUMBER - 1
START_NUMBER = 0
FINAL_NUMBER = 750
# number of worker processes in the multiprocessing pool
NR_OF_PROCESSES = 8
# max size of network being analyzed
NETWORK_SIZE = 20000
# number of different sizes to calculate features for
# maximum width or height for feature matrix, minimum 2
MATRIX_DIMENSIONS = (10, 10)
def dump_features(name: str):
    """
    Calculate and save the feature matrix of one stored network.

    A GraphAnalysis object is built for `name` from the folder SOURCE_FP and
    its calculate_feature_matrix method is called with the dimensions from
    MATRIX_DIMENSIONS, with FEATUREMATRIX_FP as `fp` and with save=True.

    @arg name: string, identifies the network to analyse.
               NOTE(review): presumably GraphAnalysis loads name_nodes.csv and
               name_edges.csv from SOURCE_FP (as the earlier note on this
               function stated) - confirm in dynamic_feature_extraction.
    """
    dim_a = MATRIX_DIMENSIONS[0]
    dim_b = MATRIX_DIMENSIONS[1]
    analysis = GraphAnalysis(name, SOURCE_FP, verbose=False)
    analysis.calculate_feature_matrix(
        a=dim_a,
        b=dim_b,
        fp=FEATUREMATRIX_FP,
        scaled_timesteps=True,
        save=True,
    )
def append_result(name, params, result):
    """
    Append one row to results.csv inside FILEPATH.

    The header row is written first whenever the file is new or still empty.

    @arg name: value for the "Name" column
    @arg params: sequence (list or tuple) of the five values for the columns
                 "Code", "Network Size", "PA", "Fitness", "Aging"
    @arg result: value for the "Result" column
    @raises ValueError: when name + params + result do not add up to 7 columns
    """
    header = ["Name", "Code", "Network Size", "PA", "Fitness", "Aging", "Result"]
    row = [name, *params, result]
    # explicit check instead of assert: asserts are removed when running with -O
    if len(row) != len(header):
        raise ValueError("incorrect length of entry")

    filename = os.path.join(FILEPATH, "results.csv")
    # Always open in append mode: an existing file is never truncated, even
    # when another worker process created it a moment ago.
    # NOTE: several workers write to this file; this is still not a lock.
    with open(filename, "a", newline="") as f:
        writer = csv.writer(f)
        if f.tell() == 0:  # position 0 in append mode means the file is empty
            writer.writerow(header)
        writer.writerow(row)
def name_exists(name):
    """
    Tell whether the feature matrix file of a network is already present.

    @arg name: string, network name; the file looked for is
               FEATUREMATRIX_FP/<name>_matrix.csv
    @return: True when that path is an existing file, False otherwise
    """
    matrix_filename = name + "_matrix.csv"
    matrix_path = os.path.join(FEATUREMATRIX_FP, matrix_filename)
    return os.path.isfile(matrix_path)
def generate_network(name, code, *args, max_tries=1000):
    """
    Generate one network, log the outcome to results.csv and dump its features.

    Nothing is done when the feature matrix for `name` already exists.
    Otherwise a simulator is built from the parameters and asked to generate
    the network until it reports success or `max_tries` attempts were made.
    On success the number of attempts is logged and the features are dumped;
    otherwise a "Died out" entry is logged and no features are dumped.

    @arg name: string, network name used for the stored files and the log row
    @arg code: category code written to the log row
    @arg args: remaining model parameters, forwarded unchanged to
               simulator_from_parameters (main_procedure passes pa, fitness, aging)
    @arg max_tries: keyword-only, maximum number of generation attempts
    """
    if name_exists(name):
        print(datetime.now().strftime("%H:%M:%S"), f"{name} already exists")
        return

    print(datetime.now().strftime("%H:%M:%S"), f"generating {name}")
    info = [code, NETWORK_SIZE, *args]

    for attempt in range(1, max_tries + 1):
        ctbp = simulator_from_parameters(NETWORK_SIZE, sample_empirical_outdegree, *args)
        # a falsy return value is treated as "the process died out": try again
        if ctbp.generate(save_to=[SOURCE_FP, name], debug=False):
            break
    else:
        # only reached when no attempt succeeded, so a success on the very
        # last attempt is never misreported as a failure
        append_result(name, info, f"Died out for {max_tries} tries")
        return

    append_result(name, info, attempt)
    dump_features(name)
# 0 - Uniform Attachment Model
# 1 - Affine PA(a,b)
# 2 - Power-law (xmin, tau) Fitness
# 3 - Exponential (lambda) Fitness
# 4 - Power-law Fitness (xmin, tau),
# Aging (mu, st.dev).
# 5 - Exponential Fitness (lambda),
# Aging (mu, st.dev)
# 6 - PA(a,b) with Uniform (a, b) Fitness
# 7 - PA(a,b),
# Aging(mu, st.dev).
# 8 - PA(a,b),
# Exponential fitness (lambda),
# Aging (mu, st.dev).
def main_procedure():
    """
    Draw the parameters of every network and generate all networks in parallel.

    For each category code 0-8 (in that order) one parameter set is drawn per
    index in range(START_NUMBER, FINAL_NUMBER). Each entry has the form
    [name, code, pa, fitness, aging], where name is "<code>-<index>" and a
    component the category does not use is None. The entries are shuffled and
    passed to generate_network through a Pool of NR_OF_PROCESSES processes.
    """

    def timestamp():
        return datetime.now().strftime("%H:%M:%S")

    def draw(loc, scale):
        # one sample from the uniform distribution on [loc, loc + scale]
        return scipy.stats.uniform(loc=loc, scale=scale).rvs()

    # Every sampler returns the triple (pa, fitness, aging) of one network.

    # 0 - Uniform Attachment
    def uniform_attachment():
        return None, None, None

    # 1 - Affine PA(a,b)
    def affine_pa():
        pa = (draw(1, 3), draw(1, 3))
        return pa, None, None

    # 2 - Power-law (xmin, tau) Fitness
    def powerlaw_fitness():
        fitness = (draw(0, 1), draw(2, 2))
        return None, fitness, None

    # 3 - Exponential (lambda) Fitness
    def exponential_fitness():
        fitness = draw(0.1, 2.9)
        return None, fitness, None

    # 4 - Power-law Fitness (xmin, tau),
    #     Aging (mu, st.dev).
    def powerlaw_fitness_aging():
        fitness = (draw(0.5, 0.5), draw(2, 0.7))
        aging = (draw(0.1, 2.9), 1)
        return None, fitness, aging

    # 5 - Exponential Fitness (lambda),
    #     Aging (mu, st.dev)
    def exponential_fitness_aging():
        fitness = draw(0.1, 0.9)
        aging = (draw(0.1, 2.9), 1)
        return None, fitness, aging

    # 6 - PA(a,b) with Uniform (a, b) Fitness
    def pa_uniform_fitness():
        pa = (draw(1, 3), draw(1, 3))
        fitness = ("uniform", draw(0.1, 0.9), draw(1, 4))
        return pa, fitness, None

    # 7 - PA(a,b),
    #     Aging(mu, st.dev).
    def pa_aging():
        pa = (draw(3.3, 3.7), draw(1, 3))
        # NOTE(review): scale is 29 here but 2.9 in the other aging
        # categories - confirm this is intended and not a typo
        aging = (draw(0.1, 29), 1)
        return pa, None, aging

    # 8 - PA(a,b),
    #     Exponential fitness (lambda),
    #     Aging (mu, st.dev).
    def pa_exponential_fitness_aging():
        pa = (draw(1, 3), draw(1, 3))
        # lambda ∈ [0.1, a + b/E[M]]; 10.29 is used as the value of E[M]
        fitness = draw(0.1, (pa[0] + (pa[1] / 10.29)) - 0.1)
        aging = (draw(0.1, 2.9), 1)
        return pa, fitness, aging

    # position in this list == category code
    samplers = [
        uniform_attachment,
        affine_pa,
        powerlaw_fitness,
        exponential_fitness,
        powerlaw_fitness_aging,
        exponential_fitness_aging,
        pa_uniform_fitness,
        pa_aging,
        pa_exponential_fitness_aging,
    ]

    print(timestamp(), "Generating parameters")
    params = []
    for code, sampler in enumerate(samplers):
        for index in range(START_NUMBER, FINAL_NUMBER):
            pa, fitness, aging = sampler()
            params.append([f"{code}-{index}", code, pa, fitness, aging])

    # mix the categories so they are not processed one after the other
    random.shuffle(params)
    print(timestamp(), f"Generated Params, moving on to generating {len(params)} networks.")

    with Pool(processes=NR_OF_PROCESSES) as pool:
        pool.starmap(generate_network, params)
# Start the generation only when this file is executed as a script. The guard
# keeps an import of this module (for example by the worker processes of the
# multiprocessing pool on spawn-based platforms) from starting it again.
if __name__ == "__main__":
    main_procedure()