-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmapping_functions.py
More file actions
121 lines (99 loc) · 5.13 KB
/
mapping_functions.py
File metadata and controls
121 lines (99 loc) · 5.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def align_dca2pdb(msa_name, pdbseq, msaseq):
"""
Some code taken from
https://github.com/bsir/dca-frustratometer/blob/master/dca_frustratometer.py
:param msa_name:
:param pdbseq: PDB seq string
:param msaseq: MSA seq string
:return:
"""
import numpy as np
from Bio import pairwise2
from evcouplings.compare import mapping as evmp
print("(map_dca2pdb)\taligning dca sequence to pdb coord sequence...")
# need to penalize for opening and adding gaps otherwise mapping is off (s param {-.5,-.1})
alignments_1 = pairwise2.align.globalxs(pdbseq[0], msaseq[0], -.5, -.1)
alignments_2 = pairwise2.align.globalxs(pdbseq[1], msaseq[1], -.5, -.1)
print(pairwise2.format_alignment(*alignments_1[0], full_sequences=True))
print(pairwise2.format_alignment(*alignments_2[0], full_sequences=True))
map_1 = evmp.map_indices(alignments_1[0][0], 1, 0, alignments_1[0][1], 1, 0)
map_2 = evmp.map_indices(alignments_2[0][0], 1 + len(pdbseq[0]), 0,
alignments_2[0][1], 1 + len(msaseq[0]), 0)
map_pdb_dca = map_1.append(map_2)
map_pdb_dca = map_pdb_dca.rename(columns={"i": "pdb_i", "A_i": "pdb_res", "j": "dca_i", "A_j": "dca_res"})
outfile = "results\\reference_maps\\ref_map_{}.txt".format(msa_name.strip(".fas"))
np.savetxt(outfile, map_pdb_dca, header="pdb_i\tpdb_res\tdca_i\tdca_res", fmt="%s\t%s\t%s\t%s", comments='')
print("(map_dca2pdb)\tWrote {}".format(outfile))
map_pdb_dca = map_pdb_dca.dropna()
map_dca2pdb_dict = dict(zip(map_pdb_dca["dca_i"], map_pdb_dca["pdb_i"]))
return map_dca2pdb_dict
def apply_map(array, map_pdb_dca):
"""
:param array:
:param map_pdb_dca:
:return:
"""
import numpy as np
print("(apply_map)")
map_dca_list = []
for i, j, fn_apc, fn in array:
i = int(i)
j = int(j)
# if i in map_pdb_dca.keys() and j in map_pdb_dca.keys():
if str(i) in map_pdb_dca.keys() and str(j) in map_pdb_dca.keys():
map_index_i = int(map_pdb_dca[str(i)])
map_index_j = int(map_pdb_dca[str(j)])
# map_index_i = int(map_pdb_dca[i])
# map_index_j = int(map_pdb_dca[j])
if map_index_i and map_index_j >= 0:
map_dca_list.append([map_index_i, map_index_j, fn_apc, fn, i, j])
return np.array(map_dca_list)
def map_dict(msa_name):
import pandas as pd
import numpy as np
from get_region import get_dca_indices
sifts_table_file = "databases/sifts/pdb_chain_uniprot_plus.csv"
s = pd.read_csv(sifts_table_file, comment="#")
pdbid = msa_name[:4].lower()
chain_1 = msa_name.split("_")[1]
chain_2 = msa_name.split("_")[3]
pdb_start_chain_1 = s.query("pdb_id == @pdbid and pdb_chain == @chain_1").coord_start.values
pdb_start_chain_2 = s.query("pdb_id == @pdbid and pdb_chain == @chain_2").coord_start.values
pdb_end_chain_1 = s.query("pdb_id == @pdbid and pdb_chain == @chain_1").coord_end.values
pdb_end_chain_2 = s.query("pdb_id == @pdbid and pdb_chain == @chain_2").coord_end.values
uniprot_start_chain_1 = s.query("pdb_id == @pdbid and pdb_chain == @chain_1").uniprot_start.values
uniprot_end_chain_1 = s.query("pdb_id == @pdbid and pdb_chain == @chain_1").uniprot_end.values
uniprot_start_chain_2 = s.query("pdb_id == @pdbid and pdb_chain == @chain_2").uniprot_start.values
uniprot_end_chain_2 = s.query("pdb_id == @pdbid and pdb_chain == @chain_2").uniprot_end.values
# pdb
pdb_start_chain_1 = ([int(i) for i in pdb_start_chain_1])
pdb_end_chain_1 = ([int(i) for i in pdb_end_chain_1])
# add last index of end index + 1 for chain 2
pdb_start_chain_2 = ([(int(i) + pdb_end_chain_1[-1] + 1) for i in pdb_start_chain_2])
pdb_end_chain_2 = ([(int(i) + pdb_end_chain_1[-1] + 1) for i in pdb_end_chain_2])
# uniprot
uniprot_start_chain_1 = ([int(i) for i in uniprot_start_chain_1])
uniprot_end_chain_1 = ([int(i) for i in uniprot_end_chain_1])
# add last index of end index + 1 for chain 2
uniprot_start_chain_2 = ([(int(i) + uniprot_end_chain_1[-1] + 1) for i in uniprot_start_chain_2])
uniprot_end_chain_2 = ([(int(i) + uniprot_end_chain_1[-1] + 1) for i in uniprot_end_chain_2])
pdb_start_indices = pdb_start_chain_1 + pdb_start_chain_2
pdb_end_indices = pdb_end_chain_1 + pdb_end_chain_2
uniprot_start_indices = uniprot_start_chain_1 + uniprot_start_chain_2
uniprot_end_indices = uniprot_end_chain_1 + uniprot_end_chain_2
pdb_indices = make_indices(pdb_start_indices, pdb_end_indices)
# pdb_indices = range(1, len(pdb_indices))
uniprot_indices = make_indices(uniprot_start_indices, uniprot_end_indices)
dca_indices = get_dca_indices(msa_name)
uni2pdb = dict(zip(uniprot_indices, pdb_indices))
dca2uni = dict(zip(dca_indices, uniprot_indices))
dca2pdb = dict(zip(dca_indices, pdb_indices))
pdb2uni = dict(zip(pdb_indices, uniprot_indices))
# print(dca2pdb)
return uni2pdb, dca2uni, dca2pdb, pdb2uni
def make_indices(indices_1, indices_2):
import numpy as np
x = []
for i in range(len(indices_1)):
x.append(np.array(range(indices_1[i], indices_2[i])))
return np.concatenate(x)