-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_cleaner.py
More file actions
74 lines (74 loc) · 2.66 KB
/
data_cleaner.py
File metadata and controls
74 lines (74 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
+import rdkit
+from rdkit import Chem
+import json
+from collections import Counter
+from rdkit.Chem import AllChem
+from rdkit.Chem import Draw
+import pandas as pd
+
+c = open('t_tmp')
+js = c.read()
+reaction_dicts = json.loads(js)
+reaction_df = pd.DataFrame(reaction_dicts)
+new_df = pd.DataFrame()
+
+new_df['ExperimentalYield'] = reaction_df['ExperimentalYield']
+new_df['Reagents'] = reaction_df.loc[:, "Reagents"].apply(lambda l: [e["SmilesFormula"] for e in l])
+new_df['Products'] = reaction_df.loc[:, "Products"].apply(lambda l: [e["SmilesFormula"] for e in l])
+new_rct_series = new_df["Reagents"].apply(lambda l: ".".join(l))
+new_prd_series = new_df["Products"].apply(lambda l: ".".join(l))
+
+def canon(df):
+ df_1 = []
+ for smiles in df:
+ try:
+ canon = Chem.CanonSmiles(smiles)
+ df_1.append(canon)
+ except:
+ mol = Chem.MolFromSmiles(smiles, sanitize=False)
+ rea = AllChem.ReactionFromSmarts("[n:1]([O-:2]):[o:3]>>[N+:1]([O-:2])=[O:3]")
+ rea.Initialize()
+ mol = Chem.MolToSmiles(rea.RunReactants([mol])[0][0])
+ try:
+ canon = Chem.CanonSmiles(mol)
+ df_1.append(canon)
+ except:
+ if '[n+]' in mol:
+ mol = mol.replace('[n+]', '[N+]')
+ if '(:o)' in mol:
+ mol = mol.replace('(:o)', '(=O)')
+ if 'o' in mol:
+ mol = mol.replace('o', 'O')
+ canon = Chem.CanonSmiles(mol)
+ df_1.append(canon)
+ return df_1
+
+new_df['ReagentsCanon'] = new_df['Reagents'].apply(canon)
+new_df['ProductsCanon'] = new_df['Products'].apply(canon)
+
+dict_prod = dict()
+reaction_dicts = []
+for row in new_df.iterrows():
+ for elem in row[1]['ProductsCanon']:
+ reaction_dict = {'ProductCanon':elem, 'ReagentsCanon': row[1]['ReagentsCanon'], 'ExperimentalYield': row[1]['ExperimentalYield']}
+ reaction_dicts.append(reaction_dict)
+
+reaction_df = pd.DataFrame(reaction_dicts)
+new_reag = reaction_df["ReagentsCanon"].apply(lambda f: ".".join(f))
+reaction_df.loc[:, "ReactionSmiles"] = new_reag + ">>" + reaction_df["ProductCanon"]
+del reaction_df['ProductCanon']
+del reaction_df['ReagentsCanon']
+
+def group_reactions(yields):
+
+ if any([elem > 0 for elem in yields]):
+ return 1
+ elif any([elem == 0 for elem in yields]):
+ return 0
+ else:
+ return -1
+
+
+reaction_df = reaction_df.groupby('ReactionSmiles').agg({"ExperimentalYield": group_reactions})
+reaction_df = reaction_df.rename(columns={'ExperimentalYield': 'Flag'})
+reaction_df