-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprotein.py
More file actions
102 lines (93 loc) · 4.08 KB
/
protein.py
File metadata and controls
102 lines (93 loc) · 4.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pathlib
import requests
from esm import FastaBatchedDataset
from utils import extract, csv2arr
from message_hub import hub
class Protein:
    """A UniProt entry plus a set of point mutations to be embedded.

    On construction the sequence (FASTA) and annotation table (TSV) of the
    entry are downloaded from the UniProt REST API. Mutations are collected
    in ``self.muts`` as strings of the form ``<from><1-based position><to>``
    (e.g. ``"K2A"``); the literal ``"WT"`` denotes the unmodified sequence
    and is always present.
    """

    # One-letter codes of the 20 standard amino acids scanned by ``deep_mut``.
    AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
    # Base URL of the UniProtKB REST endpoint.
    UNIPROT_URL = "https://rest.uniprot.org/uniprotkb"
    # Seconds to wait for UniProt; without a timeout a stalled connection
    # would block the caller forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, uniprot_id, out_dir, use_cache=False, layer=33):
        """Fetch the entry from UniProt and prepare the output directory.

        Args:
            uniprot_id: UniProt accession; upper-cased before use.
            out_dir: Parent directory (``pathlib.Path`` or string). A
                ``<receptor>_<species>`` sub-directory is created in it.
            use_cache: When true, per-variant CSV files under
                ``<out_dir>/repr`` are reused instead of being recomputed.
            layer: Representation layer passed to ``extract`` and embedded
                in the cache file names.

        Raises:
            requests.HTTPError: If UniProt answers with an error status.
            requests.Timeout: If UniProt does not answer in time.
            IndexError: If the TSV response has no data row, or the entry
                name contains no underscore.
            KeyError: If the TSV response has no "Entry Name" column.
        """
        self.uniprot_id = uniprot_id.upper()

        # First FASTA line is the header; the rest are wrapped sequence lines.
        fasta = self._fetch("fasta")
        self.seq = "".join(line.strip() for line in fasta.splitlines()[1:])

        # The TSV response is a header row followed by one row for the entry.
        rows = [line.split("\t") for line in self._fetch("tsv").splitlines() if line]
        header, values = rows[0], rows[1]
        self.info = {key: values[idx] for idx, key in enumerate(header)}

        # The entry name is split on "_" into receptor and species parts.
        name_parts = self.info["Entry Name"].split("_")
        self.receptor = name_parts[0]
        self.species = name_parts[1]

        self.muts = {"WT"}
        self.layer = layer
        # Accept plain strings as well as Path objects for ``out_dir``.
        self.out_dir = pathlib.Path(out_dir) / f"{self.receptor}_{self.species}"
        self.out_dir.mkdir(parents=True, exist_ok=True)
        self.missing_list = []
        self.use_cache = use_cache

    def _fetch(self, fmt):
        """Download this entry in format ``fmt`` and return the body text."""
        response = requests.get(
            f"{self.UNIPROT_URL}/{self.uniprot_id}.{fmt}",
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly rather than parsing an error page as FASTA/TSV.
        response.raise_for_status()
        return response.text

    def deep_mut(self):
        """Register every single-residue substitution from position 2 onward.

        Each position is tried against all 20 standard amino acids;
        ``add_mut`` discards the identity substitution.
        """
        # NOTE(review): index 0 (residue 1) is skipped - presumably to keep
        # the initiator methionine intact; confirm this is intended.
        for idx in range(1, len(self.seq)):
            mut_from = self.seq[idx]
            for mut_to in self.AMINO_ACIDS:
                self.add_mut(f"{mut_from}{idx + 1}{mut_to}")

    def add_mut(self, mut):
        """Add one mutation if it is consistent with the sequence.

        Args:
            mut: Mutation string ``<from><1-based position><to>``, e.g. "K2A".

        Returns:
            True if the mutation was added; False if the position lies
            outside the sequence, the residue at that position is not
            ``<from>``, or ``<from>`` equals ``<to>``.

        Raises:
            ValueError: If the position part is not an integer.
        """
        mut_from = mut[0]
        mut_to = mut[-1]
        mut_pos = int(mut[1:-1]) - 1
        # Reject out-of-range positions explicitly: a position of 0 would
        # otherwise become index -1 and silently match the last residue.
        if not 0 <= mut_pos < len(self.seq):
            return False
        if mut_from == mut_to or self.seq[mut_pos] != mut_from:
            return False
        self.muts.add(mut)
        return True

    def to_dataset(self):
        """Build a ``FastaBatchedDataset`` of the variants still to be embedded.

        Refreshes ``self.missing_list`` through ``get_missing_muts`` and
        reports progress on the message hub after every generated sequence.
        Labels have the form ``<receptor>_<species>_<mutation>``.
        """
        seq_labels, seq_strs = [], []
        hub.put(("message", "正在查询缓存"))  # "checking cache"
        self.get_missing_muts()
        total = len(self.missing_list)
        for current, mut in enumerate(self.missing_list, start=1):
            if mut == "WT":
                mut_seq = self.seq
            else:
                # Substitute the single residue at the 0-based position.
                mut_pos = int(mut[1:-1]) - 1
                mut_seq = self.seq[:mut_pos] + mut[-1] + self.seq[mut_pos + 1:]
            seq_labels.append(f"{self.receptor}_{self.species}_{mut}")
            seq_strs.append(mut_seq)
            # "generating mutant sequences, progress: current/total"
            hub.put(("progress",f"正在生成突变序列,进度:{current}/{total}"))
        return FastaBatchedDataset(seq_labels, seq_strs)

    def get_missing_muts(self):
        """Determine which mutations have no cached representation file.

        With caching enabled and an existing ``repr`` directory, only the
        mutations whose CSV file is absent are listed; otherwise every
        mutation is listed. The result is stored in ``self.missing_list``
        and returned.
        """
        repr_dir = self.out_dir / "repr"
        if self.use_cache and repr_dir.exists():
            self.missing_list = [
                mut
                for mut in self.muts
                if not (
                    repr_dir
                    / f"{self.receptor}_{self.species}_{mut}_esm2_{self.layer}.csv"
                ).exists()
            ]
        else:
            self.missing_list = list(self.muts)
        if self.missing_list:
            # "<n> sequences need feature extraction"
            hub.put(("message", f"共{len(self.missing_list)}条序列需要提取特征"))
        return self.missing_list

    def collect_repr(self):
        """Return the representations of all registered variants.

        Without caching, the result of ``extract`` is returned directly.
        With caching, missing variants are extracted into
        ``<out_dir>/repr`` and every variant is then loaded from its CSV
        file into a dict keyed by ``<receptor>_<species>_<mutation>``.
        """
        # Build the dataset exactly once; a second build would regenerate
        # every sequence and repeat all hub progress messages.
        dataset = self.to_dataset()
        if not self.use_cache:
            return extract(dataset, repr_layer=self.layer)

        repr_dir = self.out_dir / "repr"
        repr_dir.mkdir(parents=True, exist_ok=True)
        if len(dataset) > 0:
            # Presumably writes one "<label>_esm2_<layer>.csv" per sequence
            # into repr_dir - verify against utils.extract.
            extract(dataset, repr_layer=self.layer, temp_dir=repr_dir)

        all_repr = {}
        for mut in self.muts:
            variant = f"{self.receptor}_{self.species}_{mut}"
            all_repr[variant] = csv2arr(repr_dir / f"{variant}_esm2_{self.layer}.csv")
        return all_repr
if __name__ == "__main__":
    # Manual smoke run: fetch UniProt entry P0DPR3, register the full
    # single-substitution scan and compute the representations, writing
    # any output below ./test.
    protein = Protein("P0DPR3", out_dir=pathlib.Path("test"))
    protein.deep_mut()
    protein.collect_repr()