forked from bmmoore43/MotifDiscovery
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter_df_FET.py
More file actions
executable file
·153 lines (117 loc) · 4.46 KB
/
filter_df_FET.py
File metadata and controls
executable file
·153 lines (117 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
PURPOSE:
Take a machine learning dataframe and filter out features that aren't enriched in the positive class
Path to Miniconda: export PATH=/mnt/home/azodichr/miniconda3/bin:$PATH
INPUT:
-df Unfiltered df
-pos FASTA file with positive examples
-neg FASTA file with negative examples - the script will run ML on 50 random samples to get balanced test
-pval P-value cut off for Fisher's exact test (Default = 0.01)
-FDR Default: N. Designate (Y/N) if you want to run FDR correction during enrichment test
OUTPUT:
-df_FET Dataframe that goes into SK-learn for ML.
"""
import pandas as pd
import numpy as np
import sys, os
from math import sqrt
from scipy.stats import fisher_exact
PVAL = 0.01
neg = '0'
pos = '1'
cla = 'Class'
for i in range (1,len(sys.argv),2):
if sys.argv[i] == '-pos': #Fasta file for positive examples
POS = sys.argv[i+1]
if sys.argv[i] == '-neg': #Fasta file for negative examples
NEG = sys.argv[i+1]
if sys.argv[i] == '-pval': #Default is 0.01
PVAL = float(sys.argv[i+1])
if sys.argv[i] == '-df':
DF = sys.argv[i+1]
if sys.argv[i] == "-fdr":
FDR = sys.argv[i+1]
df = pd.read_csv(DF, sep='\t', index_col = 0, header=0)
kmers = list(df)
kmers.remove(cla)
enriched = []
for k in kmers:
#if k == 'ABF2':
temp = df.groupby([cla, k]).size().reset_index(name="Count")
try:
TP = temp.loc[(temp[cla] == 1) & (temp[k] == 1), 'Count'].iloc[0]
except:
TP = 0
try:
TN = temp.loc[(temp[cla] == 0) & (temp[k] == 0), 'Count'].iloc[0]
except:
TN = 0
try:
FP = temp.loc[(temp[cla] == 0) & (temp[k] == 1), 'Count'].iloc[0]
except:
FP = 0
try:
FN = temp.loc[(temp[cla] == 1) & (temp[k] == 0), 'Count'].iloc[0]
except:
FN = 0
#print(TP, FP, TN, FN)
oddsratio,pvalue = fisher_exact([[TP,FN],[FP,TN]],alternative='greater')
if pvalue <= PVAL:
enriched.append(k)
print(enriched)
exit()
if FDR == "N":
for k in positive_present:
try:
count += 1
TP = positive_present[k] #Positive Examples with kmer present
FP = negative_present[k] #Negative Examples with kmer present
TN = num_neg-negative_present[k] #Negative Examples without kmer
FN = num_pos-positive_present[k] #Positive Examples without kmer
oddsratio,pvalue = fisher_exact([[TP,FN],[FP,TN]],alternative='greater')
outFISH.write('\n%s\t%d\t%d\t%.7f' % (k, (positive_present[k]),(negative_present[k]),pvalue))
if pvalue <= PVAL: # Remove unenriched features from dataframe
enriched_kmers[k] = pvalue
if pvalue > PVAL:
DF = DF.drop(k, 1)
if count%10000==0:
print("Completed " + str(count) + " features")
except ValueError:
count += 1
outFISH.write('\n%s\t%d\t%d\t1.0' % (k, (positive_present[k]),(negative_present[k])))
elif FDR == "Y":
fdr_dict = {}
for k in positive_present:
try:
count += 1
TP = positive_present[k] #Positive Examples with kmer present
FP = negative_present[k] #Negative Examples with kmer present
TN = num_neg-negative_present[k] #Negative Examples without kmer
FN = num_pos-positive_present[k] #Positive Examples without kmer
oddsratio,pvalue = fisher_exact([[TP,FN],[FP,TN]],alternative='greater')
outFISH.write('\n%s\t%d\t%d\t%.7f' % (k, (positive_present[k]),(negative_present[k]),pvalue))
pvalue_i = str(pvalue)
fdr_dict[k] = pvalue_i
except ValueError:
count += 1
outFISH.write('\n%s\t%d\t%d\t1.0' % (k, (positive_present[k]),(negative_present[k])))
outFISH.close()
f_path = os.getcwd()+ "/" + SAVE + "_FETresults.txt"
R=('R --vanilla --slave --args '+ os.getcwd() + " " + SAVE + " " + f_path+'< /mnt/home/azodichr/GitHub/MotifDiscovery/FDR.R')
os.system(R)
fdr_file = os.getcwd() +"/" + SAVE + "_FETresults_FDR.csv"
for l in open(fdr_file,'r'):
if "feature" in l:
pass
else:
kmer, poscount, negcount, pval, adjp = l.strip().split(",")
if float(adjp) <= PVAL: #Remove unenriched features from dataframe
enriched_kmers[kmer] = adjp
elif float(adjp) > PVAL:
DF = DF.drop(kmer, 1)
if count%10000==0:
print("Completed " + str(count) + " features")
else:
print("Please include ''-FDR Y/N'' in command to designate if you want FDR correction")
if __name__ == '__main__':
Make_DF(K, PVAL, SAVE)