-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpredictions_script.py
More file actions
361 lines (257 loc) · 12.9 KB
/
predictions_script.py
File metadata and controls
361 lines (257 loc) · 12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
##########
### Imports
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdFMCS
from scipy.spatial import distance
from scipy.stats import pearsonr
from scipy.stats import fisher_exact
from scipy.stats.contingency import crosstab
from scipy.stats import hypergeom
from sklearn.manifold import TSNE
import random
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
import os
# from spec2vec import Spec2Vec
# from matchms import calculate_scores
# from openeye import oechem
os.chdir('../raw_data')
# write or load files
class Files:
    """Thin wrapper around one file path for pickle/CSV round-trips."""

    def __init__(self, filename):
        self.file = filename

    def write_to_file(self, data):
        """Pickle *data* to the wrapped path."""
        with open(self.file, 'wb') as handle:
            pickle.dump(data, handle)
        return None

    def load_pickle(self):
        """Unpickle and return the object stored at the wrapped path."""
        return pd.read_pickle(self.file)

    def load_csv(self, sep, usecols=None):
        """Read the wrapped path as a delimited table via pandas."""
        return pd.read_csv(self.file, sep=sep, usecols=usecols)
######### THE PREDICTOR
def predict(subject_df, query_df, dims,
            subject_embed='sdl_z2',
            metric='euc',
            query_embed='sdl_z1',
            top_k=20,
            random_preds=False):
    """Rank subject structures for each query spectrum by embedding distance.

    For every query embedding (column `query_embed` of `query_df`) the
    distance/similarity to every subject embedding (column `subject_embed`
    of `subject_df`) is computed over the first `dims` dimensions, the
    `top_k` best hits are selected, and their InChIKey14 prefixes and
    Tanimoto similarities to the query structure are recorded.  If the
    query's own inchikey14 is present in `subject_df`, the "true" score
    and true Tanimoto (always 1.0 — an indexing sanity check) are also
    stored.

    Parameters
    ----------
    subject_df : DataFrame with columns [subject_embed, 'inchikey14', 'smiles', 'cf_class']
    query_df   : DataFrame with columns [query_embed, 'spec_id', 'inchikey14', 'smiles', 'cf_class']
    dims       : number of leading embedding dimensions to compare
    metric     : 'corr' (higher is better), 'euc' or 'cos' (lower is better)
    top_k      : number of hits to keep per query
    random_preds : if True, hit indices are drawn uniformly at random
        (null model); in that mode no per-hit Tanimotos are computed,
        matching the previous behaviour.

    Returns
    -------
    DataFrame indexed by query spec_id, annotated with query- and
    subject-side classes/smiles via add_query_class/add_subject_class.

    Fixes vs. previous revision:
    - `sunject_index` typo removed (it raised NameError in the
      metric='corr' + random_preds=True branch).
    - the "true" score now honours the requested metric: with
      metric='euc' it previously fell through to a cosine distance.
    """
    preds = pd.DataFrame()
    preds.index = [x for x in query_df['spec_id']]  # query spec ids as rownames
    preds[f'top_{top_k}_{metric}'] = None
    preds[f'true_{metric}'] = None
    preds[f'top_{top_k}_inchi14'] = None
    preds[f'top_{top_k}_tanis'] = None
    preds[f'true_tanimoto'] = None  # sanity check: true Tanimoto is always 1

    subject_inchis = list(subject_df['inchikey14'])
    for query_index, query in enumerate(tqdm(query_df[query_embed])):
        query_id = query_df['spec_id'].iloc[query_index]
        query_inchi = query_df['inchikey14'].iloc[query_index]

        # score every subject against this query
        similarity = []
        for subject in subject_df[subject_embed]:
            if metric == 'corr':
                similarity.append(pearsonr(query[:dims], subject[:dims])[0])
            elif metric == 'euc':
                similarity.append(distance.euclidean(query[:dims], subject[:dims]))
            elif metric == 'cos':
                similarity.append(distance.cosine(query[:dims], subject[:dims]))

        # 'corr': higher is better; 'euc'/'cos': lower is better
        if metric == 'corr':
            top_scores = np.sort(similarity)[::-1][:top_k]
        else:
            top_scores = np.sort(similarity)[:top_k]

        top_k_ichi = []
        top_k_tanis = []
        for score in top_scores:
            if random_preds:
                # null model: random structure, no Tanimoto computed
                subject_index = random.randint(0, subject_df.shape[0] - 1)
                top_k_ichi.append(subject_df['inchikey14'].iloc[subject_index])
            else:
                # NOTE: on tied scores .index() returns the first occurrence
                subject_index = similarity.index(score)
                top_k_ichi.append(subject_df['inchikey14'].iloc[subject_index])
                smile1 = subject_df['smiles'].iloc[subject_index]  # subject smile
                smile2 = query_df['smiles'].iloc[query_index]      # query smile
                top_k_tanis.append(tanimoto(smile1, smile2))

        preds.at[query_id, f'top_{top_k}_{metric}'] = top_scores
        preds.at[query_id, f'top_{top_k}_inchi14'] = top_k_ichi
        preds.at[query_id, f'top_{top_k}_tanis'] = top_k_tanis

        # score of the query against its own true structure, when present
        if query_inchi in subject_inchis:
            true_subject_embed = subject_df.loc[
                subject_df['inchikey14'] == query_inchi, subject_embed].iloc[0]
            if metric == 'corr':
                true_score = pearsonr(query[:dims], true_subject_embed[:dims])[0]
            elif metric == 'euc':
                true_score = distance.euclidean(query[:dims], true_subject_embed[:dims])
            else:
                true_score = distance.cosine(query[:dims], true_subject_embed[:dims])
            preds.at[query_id, f'true_{metric}'] = true_score
            # true Tanimoto (should be 1.0 if indexing is consistent)
            smile2 = query_df['smiles'].iloc[query_index]
            true_smile = subject_df.loc[
                subject_df['inchikey14'] == query_inchi, 'smiles'].iloc[0]
            preds.at[query_id, f'true_tanimoto'] = tanimoto(true_smile, smile2)

    # annotate with query/subject classes and smiles
    preds = add_query_class(preds, query_df)
    preds = add_subject_class(subject_df, preds)
    return preds
#######TO COMPUTE TANIS
# function to calculate pairwise tanimoto scores
def tanimoto(smi1, smi2):
    """Tanimoto similarity of the RDKit fingerprints of two SMILES, rounded to 4 dp."""
    fingerprints = [Chem.RDKFingerprint(Chem.MolFromSmiles(s)) for s in (smi1, smi2)]
    return round(DataStructs.FingerprintSimilarity(fingerprints[0], fingerprints[1]), 4)
#### add query classes
def add_query_class(preds_df, query_df):
    """Annotate preds_df (indexed by spec_id) with each query's own metadata.

    Adds three columns — 'query_inchi14', 'query_smile', 'query_class' —
    looked up from query_df's 'inchikey14', 'smiles' and 'cf_class'
    columns by spec_id.

    Raises
    ------
    KeyError if a preds_df index value has no matching spec_id in query_df.

    Previous revision scanned query_df three times per prediction row
    (O(rows * len(query_df))) and carried unused dead code; a single
    indexed lookup table replaces that.  drop_duplicates keeps the first
    occurrence per spec_id, matching the old `.iloc[0]` pick.
    """
    lookup = query_df.drop_duplicates('spec_id').set_index('spec_id')
    meta = lookup.loc[preds_df.index]  # KeyError on a missing spec_id, like the old .iloc[0]
    preds_df['query_inchi14'] = meta['inchikey14'].to_list()
    preds_df['query_smile'] = meta['smiles'].to_list()
    preds_df['query_class'] = meta['cf_class'].to_list()
    return preds_df
# add subject top 20 classes, smiles
def add_subject_class(subject_df, preds_df, top_k=20):
    """Annotate preds_df with the classes and smiles of each row's top-k hits.

    Reads the list column f'top_{top_k}_inchi14' and, for each inchikey14
    in it, looks up 'cf_class' and 'smiles' in subject_df, writing the
    result lists to f'top_{top_k}_classes' and f'top_{top_k}_smiles'.

    Parameters
    ----------
    top_k : int, default 20 — previously the column names were hard-coded
        to 'top_20_*', silently breaking for any other k; the default
        keeps the old behaviour for existing callers.

    Raises
    ------
    KeyError if a hit inchikey14 is absent from subject_df.
    """
    # one indexed lookup table instead of scanning subject_df per hit;
    # drop_duplicates keeps the first row per key, matching the old .iloc[0]
    lookup = subject_df.drop_duplicates('inchikey14').set_index('inchikey14')
    hit_classes = []
    hit_smiles = []
    for hit_inchis in preds_df[f'top_{top_k}_inchi14']:
        hit_classes.append([lookup.at[inchi, 'cf_class'] for inchi in hit_inchis])
        hit_smiles.append([lookup.at[inchi, 'smiles'] for inchi in hit_inchis])
    preds_df[f'top_{top_k}_classes'] = hit_classes
    preds_df[f'top_{top_k}_smiles'] = hit_smiles
    return preds_df
##### Compute predictions from the final model ¶
# load the data
# NOTE(review): paths are relative to the os.chdir('../raw_data') executed
# at the top of this file — presumably run from a sibling directory; verify.
paths = ['./sdl_logs/sdl_optimized_params/train_df_max3_sdl_and_cca_final_model_z_scores.pickle',
'./sdl_logs/sdl_optimized_params/test_df_max3_sdl_and_cca_final_model_z_scores.pickle',
#'./sdl_logs/sdl_optimized_params/val_df_max3_sdl_and_cca_final_model_z_scores.pickle'
]
train_df = Files(paths[0]).load_pickle()
test_df = Files(paths[1]).load_pickle()
# notebook leftover: head(3) has no effect in a script (return value discarded)
test_df.head(3)
# select unique inchikey14 and create a database from training set
# (in-place: one row per unique structure, keeping the first occurrence)
train_df.drop_duplicates('inchikey14', inplace=True)
print(train_df.shape, test_df.shape[0])
# The block below is the real (non-random) prediction sweep over
# metrics x models, kept commented out; only the random baseline runs.
# #%%time
# metrics = ['cos', 'corr']
# models = ['sdl', 'cca']
# #size = 5
# for metric in metrics:
# for model in models:
# print(f'\nModel {model}')
# # cosine distance
# predictions_df = predict(subject_df=train_df,\
# query_df=train_df,dims=100,
# subject_embed=f'{model}_z2', # base name of z scores cols in subject df
# query_embed=f'{model}_z1', # base name of z scores cols in query df
# metric=metric,
# top_k=20)
# # print('\nComputing Distance is Completed successfully\n')
# # # tanimotos and hits
# # scores, hit = get_tanimotos(dist,subject_df=train_df,\
# # query_df=test_df.head(size),\
# # metric=metric)
# print('\nComputing Tanimoto and hits is Completed successfully\n')
# # write the distances to file
# Files(f'./sdl_logs/sdl_optimized_params/train_in_train/{model}_preds/{model}_final_model_train_in_train_{metric}_predictions_df.pickle').write_to_file(predictions_df)
# del dist # rescue memory :)
# Files(f'./sdl_logs/sdl_optimized_params/{model}_preds/{model}_final_model_test_{metric}_tanimoto.pickle').write_to_file(scores[0]) # scores has [tanimoto, mcs]
# Files(f'./sdl_logs/sdl_optimized_params/{model}_preds/{model}_final_model_test_{metric}_hits.pickle').write_to_file(hit)
# del scores, hit # only load when actually using them.
# for comparison, also compute random predictions.
# Null model: test-set queries against the training database with hit
# indices drawn at random (random_preds=True); cosine settings otherwise.
predictions_random = predict(subject_df=train_df,\
query_df=test_df,dims=100,
subject_embed='sdl_z2', # base name of z scores cols in subject df
query_embed='sdl_z1', # base name of z scores cols in query df
metric='cos',
top_k=20,random_preds=True)
# persist the baseline predictions for later comparison plots/analysis
Files(f'./sdl_logs/sdl_optimized_params/test_in_train/sdl_preds/random_final_model_train_in_train_cos_predictions_df.pickle').write_to_file(predictions_random)