-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpredictions_script.py
More file actions
361 lines (257 loc) · 12.9 KB
/
predictions_script.py
File metadata and controls
361 lines (257 loc) · 12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
##########
### Imports
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdFMCS
from scipy.spatial import distance
from scipy.stats import pearsonr
from scipy.stats import fisher_exact
from scipy.stats.contingency import crosstab
from scipy.stats import hypergeom
from sklearn.manifold import TSNE
import random
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
import os
# from spec2vec import Spec2Vec
# from matchms import calculate_scores
# from openeye import oechem
os.chdir('../raw_data')
# write or load files
class Files:
    """Thin wrapper around one file path for pickle/CSV round-trips."""

    def __init__(self, filename):
        self.file = filename

    def write_to_file(self, data):
        """Pickle *data* to the wrapped path."""
        with open(self.file, 'wb') as handle:
            pickle.dump(data, handle)
        return None

    def load_pickle(self):
        """Unpickle and return the object stored at the wrapped path."""
        return pd.read_pickle(self.file)

    def load_csv(self, sep, usecols=None):
        """Read the wrapped path as a delimited table via pandas."""
        return pd.read_csv(self.file, sep=sep, usecols=usecols)
######### THE PREDICTOR
def predict(subject_df, query_df, dims,
            subject_embed='sdl_z2',
            metric='euc',
            query_embed='sdl_z1',
            top_k=20,
            random_preds=False):
    """Rank subject structures for each query spectrum by embedding distance.

    For every query embedding (column `query_embed` of `query_df`) the
    distance/similarity to every subject embedding (column `subject_embed`
    of `subject_df`) is computed over the first `dims` dimensions, the
    `top_k` best hits are selected, and their InChIKey14 prefixes and
    Tanimoto similarities to the query structure are recorded.  If the
    query's own inchikey14 is present in `subject_df`, the "true" score
    and true Tanimoto (always 1.0 — an indexing sanity check) are also
    stored.

    Parameters
    ----------
    subject_df : DataFrame with columns [subject_embed, 'inchikey14', 'smiles', 'cf_class']
    query_df   : DataFrame with columns [query_embed, 'spec_id', 'inchikey14', 'smiles', 'cf_class']
    dims       : number of leading embedding dimensions to compare
    metric     : 'corr' (higher is better), 'euc' or 'cos' (lower is better)
    top_k      : number of hits to keep per query
    random_preds : if True, hit indices are drawn uniformly at random
        (null model); in that mode no per-hit Tanimotos are computed,
        matching the previous behaviour.

    Returns
    -------
    DataFrame indexed by query spec_id, annotated with query- and
    subject-side classes/smiles via add_query_class/add_subject_class.

    Fixes vs. previous revision:
    - `sunject_index` typo removed (it raised NameError in the
      metric='corr' + random_preds=True branch).
    - the "true" score now honours the requested metric: with
      metric='euc' it previously fell through to a cosine distance.
    """
    preds = pd.DataFrame()
    preds.index = [x for x in query_df['spec_id']]  # query spec ids as rownames
    preds[f'top_{top_k}_{metric}'] = None
    preds[f'true_{metric}'] = None
    preds[f'top_{top_k}_inchi14'] = None
    preds[f'top_{top_k}_tanis'] = None
    preds[f'true_tanimoto'] = None  # sanity check: true Tanimoto is always 1

    subject_inchis = list(subject_df['inchikey14'])
    for query_index, query in enumerate(tqdm(query_df[query_embed])):
        query_id = query_df['spec_id'].iloc[query_index]
        query_inchi = query_df['inchikey14'].iloc[query_index]

        # score every subject against this query
        similarity = []
        for subject in subject_df[subject_embed]:
            if metric == 'corr':
                similarity.append(pearsonr(query[:dims], subject[:dims])[0])
            elif metric == 'euc':
                similarity.append(distance.euclidean(query[:dims], subject[:dims]))
            elif metric == 'cos':
                similarity.append(distance.cosine(query[:dims], subject[:dims]))

        # 'corr': higher is better; 'euc'/'cos': lower is better
        if metric == 'corr':
            top_scores = np.sort(similarity)[::-1][:top_k]
        else:
            top_scores = np.sort(similarity)[:top_k]

        top_k_ichi = []
        top_k_tanis = []
        for score in top_scores:
            if random_preds:
                # null model: random structure, no Tanimoto computed
                subject_index = random.randint(0, subject_df.shape[0] - 1)
                top_k_ichi.append(subject_df['inchikey14'].iloc[subject_index])
            else:
                # NOTE: on tied scores .index() returns the first occurrence
                subject_index = similarity.index(score)
                top_k_ichi.append(subject_df['inchikey14'].iloc[subject_index])
                smile1 = subject_df['smiles'].iloc[subject_index]  # subject smile
                smile2 = query_df['smiles'].iloc[query_index]      # query smile
                top_k_tanis.append(tanimoto(smile1, smile2))

        preds.at[query_id, f'top_{top_k}_{metric}'] = top_scores
        preds.at[query_id, f'top_{top_k}_inchi14'] = top_k_ichi
        preds.at[query_id, f'top_{top_k}_tanis'] = top_k_tanis

        # score of the query against its own true structure, when present
        if query_inchi in subject_inchis:
            true_subject_embed = subject_df.loc[
                subject_df['inchikey14'] == query_inchi, subject_embed].iloc[0]
            if metric == 'corr':
                true_score = pearsonr(query[:dims], true_subject_embed[:dims])[0]
            elif metric == 'euc':
                true_score = distance.euclidean(query[:dims], true_subject_embed[:dims])
            else:
                true_score = distance.cosine(query[:dims], true_subject_embed[:dims])
            preds.at[query_id, f'true_{metric}'] = true_score
            # true Tanimoto (should be 1.0 if indexing is consistent)
            smile2 = query_df['smiles'].iloc[query_index]
            true_smile = subject_df.loc[
                subject_df['inchikey14'] == query_inchi, 'smiles'].iloc[0]
            preds.at[query_id, f'true_tanimoto'] = tanimoto(true_smile, smile2)

    # annotate with query/subject classes and smiles
    preds = add_query_class(preds, query_df)
    preds = add_subject_class(subject_df, preds)
    return preds
#######TO COMPUTE TANIS
# function to calculate pairwise tanimoto scores
def tanimoto(smi1, smi2):
    """Tanimoto similarity of the RDKit fingerprints of two SMILES, rounded to 4 dp."""
    fingerprints = [Chem.RDKFingerprint(Chem.MolFromSmiles(s)) for s in (smi1, smi2)]
    return round(DataStructs.FingerprintSimilarity(fingerprints[0], fingerprints[1]), 4)
#### add query classes
def add_query_class(preds_df, query_df):
    """Annotate preds_df (indexed by spec_id) with each query's own metadata.

    Adds three columns — 'query_inchi14', 'query_smile', 'query_class' —
    looked up from query_df's 'inchikey14', 'smiles' and 'cf_class'
    columns by spec_id.

    Raises
    ------
    KeyError if a preds_df index value has no matching spec_id in query_df.

    Previous revision scanned query_df three times per prediction row
    (O(rows * len(query_df))) and carried unused dead code; a single
    indexed lookup table replaces that.  drop_duplicates keeps the first
    occurrence per spec_id, matching the old `.iloc[0]` pick.
    """
    lookup = query_df.drop_duplicates('spec_id').set_index('spec_id')
    meta = lookup.loc[preds_df.index]  # KeyError on a missing spec_id, like the old .iloc[0]
    preds_df['query_inchi14'] = meta['inchikey14'].to_list()
    preds_df['query_smile'] = meta['smiles'].to_list()
    preds_df['query_class'] = meta['cf_class'].to_list()
    return preds_df
# add subject top 20 classes, smiles
def add_subject_class(subject_df, preds_df, top_k=20):
    """Annotate preds_df with the classes and smiles of each row's top-k hits.

    Reads the list column f'top_{top_k}_inchi14' and, for each inchikey14
    in it, looks up 'cf_class' and 'smiles' in subject_df, writing the
    result lists to f'top_{top_k}_classes' and f'top_{top_k}_smiles'.

    Parameters
    ----------
    top_k : int, default 20 — previously the column names were hard-coded
        to 'top_20_*', silently breaking for any other k; the default
        keeps the old behaviour for existing callers.

    Raises
    ------
    KeyError if a hit inchikey14 is absent from subject_df.
    """
    # one indexed lookup table instead of scanning subject_df per hit;
    # drop_duplicates keeps the first row per key, matching the old .iloc[0]
    lookup = subject_df.drop_duplicates('inchikey14').set_index('inchikey14')
    hit_classes = []
    hit_smiles = []
    for hit_inchis in preds_df[f'top_{top_k}_inchi14']:
        hit_classes.append([lookup.at[inchi, 'cf_class'] for inchi in hit_inchis])
        hit_smiles.append([lookup.at[inchi, 'smiles'] for inchi in hit_inchis])
    preds_df[f'top_{top_k}_classes'] = hit_classes
    preds_df[f'top_{top_k}_smiles'] = hit_smiles
    return preds_df
##### Compute predictions from the final model ¶
# load the data
# NOTE(review): paths are relative to the os.chdir('../raw_data') executed
# at the top of this file — presumably run from a sibling directory; verify.
paths = ['./sdl_logs/sdl_optimized_params/train_df_max3_sdl_and_cca_final_model_z_scores.pickle',
'./sdl_logs/sdl_optimized_params/test_df_max3_sdl_and_cca_final_model_z_scores.pickle',
#'./sdl_logs/sdl_optimized_params/val_df_max3_sdl_and_cca_final_model_z_scores.pickle'
]
train_df = Files(paths[0]).load_pickle()
test_df = Files(paths[1]).load_pickle()
# notebook leftover: head(3) has no effect in a script (return value discarded)
test_df.head(3)
# select unique inchikey14 and create a database from training set
# (in-place: one row per unique structure, keeping the first occurrence)
train_df.drop_duplicates('inchikey14', inplace=True)
print(train_df.shape, test_df.shape[0])
# The block below is the real (non-random) prediction sweep over
# metrics x models, kept commented out; only the random baseline runs.
# #%%time
# metrics = ['cos', 'corr']
# models = ['sdl', 'cca']
# #size = 5
# for metric in metrics:
# for model in models:
# print(f'\nModel {model}')
# # cosine distance
# predictions_df = predict(subject_df=train_df,\
# query_df=train_df,dims=100,
# subject_embed=f'{model}_z2', # base name of z scores cols in subject df
# query_embed=f'{model}_z1', # base name of z scores cols in query df
# metric=metric,
# top_k=20)
# # print('\nComputing Distance is Completed successfully\n')
# # # tanimotos and hits
# # scores, hit = get_tanimotos(dist,subject_df=train_df,\
# # query_df=test_df.head(size),\
# # metric=metric)
# print('\nComputing Tanimoto and hits is Completed successfully\n')
# # write the distances to file
# Files(f'./sdl_logs/sdl_optimized_params/train_in_train/{model}_preds/{model}_final_model_train_in_train_{metric}_predictions_df.pickle').write_to_file(predictions_df)
# del dist # rescue memory :)
# Files(f'./sdl_logs/sdl_optimized_params/{model}_preds/{model}_final_model_test_{metric}_tanimoto.pickle').write_to_file(scores[0]) # scores has [tanimoto, mcs]
# Files(f'./sdl_logs/sdl_optimized_params/{model}_preds/{model}_final_model_test_{metric}_hits.pickle').write_to_file(hit)
# del scores, hit # only load when actually using them.
# for comparison, also compute random predictions.
# Null model: test-set queries against the training database with hit
# indices drawn at random (random_preds=True); cosine settings otherwise.
predictions_random = predict(subject_df=train_df,\
query_df=test_df,dims=100,
subject_embed='sdl_z2', # base name of z scores cols in subject df
query_embed='sdl_z1', # base name of z scores cols in query df
metric='cos',
top_k=20,random_preds=True)
# persist the baseline predictions for later comparison plots/analysis
Files(f'./sdl_logs/sdl_optimized_params/test_in_train/sdl_preds/random_final_model_train_in_train_cos_predictions_df.pickle').write_to_file(predictions_random)