forked from bmmoore43/MotifDiscovery
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMapEnrichedKmers.py
More file actions
executable file
·142 lines (100 loc) · 4.41 KB
/
MapEnrichedKmers.py
File metadata and controls
executable file
·142 lines (100 loc) · 4.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
PURPOSE:
Given a list of kmers, a folder with mapping files, and a set of genes, list their mapping locations
Before running, set path to Miniconda: export PATH=/mnt/home/azodichr/miniconda3/bin:$PATH
INPUT:
-pos_file FASTA file with positive examples
OUTPUT:
-S
python ~/GitHub/MotifDiscovery/MapEnrichedKmers.py -k ../01_RF_Results/16_2_5_UpdatedRF/L_NNU_NNN_imp.txt -tamo /mnt/home/azodichr/01_DualStress_At/14_LogicClusters/06_MapKmers/02_mapped/ -genes ../01_RF_Results/16_2_5_UpdatedRF/00_SeqFiles/L_NNU.txtls
"""
from collections import defaultdict
import pandas as pd
import sys, os
class DAP_Seq:
def map_specific_kmers(self, DF, PATH, POS, NEG):
#Load feature matrix and save feature names
if isinstance(DF, str): #If loading df as a file
df = pd.read_csv(DF, sep='\t', index_col = 0)
else: #If df is already in pandas format from previous script
df = DF
wanted_kmers = list(df.columns.values)
pos_kmers = defaultdict(list)
neg_kmers = defaultdict(list)
#Make dictionary of positive example genes
#key = kmer, value = list of all genes with that kmer & its location (GENE:Start:Stop:Strand)
for i in os.listdir(PATH):
if POS in i and "0.9max.out.pvalue" in i:
k_list = []
for l in open(os.path.join(PATH,i),'r'):
if l[0].isdigit():
kmer = l.strip().split("\t")[1]
else:
loci = ":".join(l.strip().split("\t")[0:4])
pos_kmers[kmer].append(loci)
#Repeat to make dictionary with negative example genes.
for i in os.listdir(PATH):
if NEG in i and "0.9max.out.pvalue" in i:
k_list = []
for l in open(os.path.join(PATH,i),'r'):
if l[0].isdigit():
kmer = l.strip().split("\t")[1]
else:
loci = ":".join(l.strip().split("\t")[0:4])
neg_kmers[kmer].append(loci)
print(pos_kmers)
#Load bed file
#For each kmer record the highest, lowest, and average availability amongst the genes in the pos vs. neg list. Calculate mean dif.
#Search bed file for positive and negative genes with kmers present
#Output
def all_DAP_sites(self, DAP, POS, NEG):
if isinstance(DAP, str): #If loading df as a file
dap = pd.read_csv(DAP, sep='\t')
else: #If df is already in pandas format from previous script
dap = DAP
with open(POS) as p:
pos = p.read().splitlines()
with open(NEG) as n:
neg = n.read().splitlines()
df_p = dap[dap['Gene'].isin(pos)]
df_p.insert(1, 'Class', 1)
num_pos = df_p.shape[0]
df_n = dap[dap['Gene'].isin(neg)]
df_n.insert(1, 'Class', 0)
num_neg = df_n.shape[0]
frames = [df_p, df_n]
df = pd.concat(frames)
for column in df:
if column == "Class" or column == "Gene":
pass
else:
pass
#print(df[column])
df.to_csv("all_DAP_sites_co2k4_NND_df.txt", sep="\t", index=False)
#-------------------------------------------------------------------------------
if __name__ == '__main__':
DAP_Seq=DAP_Seq() # Print main function help if no imputs given
for i in range (1,len(sys.argv),2):
if sys.argv[i] == '-f': #Directory with mapped tamo files
F = sys.argv[i+1]
if sys.argv[i] == '-maps': #Directory with mapped tamo files
PATH = sys.argv[i+1]
if sys.argv[i] == '-pos': #String that will identify positive genes in the maps directory
POS = sys.argv[i+1]
if sys.argv[i] == '-neg': #String that will identify negative genes in the maps directory
NEG = sys.argv[i+1]
if sys.argv[i] == '-df': #Dataframe of pos-neg examples and kmer features with presence and absense of kmers.
DF = sys.argv[i+1]
if sys.argv[i] == '-DAP_matrix': #DF with all DAP TFs as columns and all genes as rows, 1 = hit in promoter
DAP = sys.argv[i+1]
if len(sys.argv) <= 1:
print(__doc__)
sys.exit()
if F == "map_specific_kmers":
if "" in [DF, PATH, POS, NEG]:
print("Need df, path, pos, and neg")
DAP_Seq.map_specific_kmers(DF, PATH, POS, NEG)
elif F == "all_DAP_sites":
if "" in [DAP, POS, NEG]:
print("Need df, path, pos, and neg")
DAP_Seq.all_DAP_sites(DAP, POS, NEG)