-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathMap_PWM_biopy.py
More file actions
executable file
·138 lines (112 loc) · 5.23 KB
/
Map_PWM_biopy.py
File metadata and controls
executable file
·138 lines (112 loc) · 5.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
PURPOSE:
Use biopython's mapping function to map motifs to a DNA sequence. Improves on previous lab motif mapping scripts by setting a
maximum precision desired when calculating the score distribution, the computational cost of doing this without a limit
grows exponentially with the length of the motif.
INPUT:
-m PWM file (ending in .pwm) or directory with .pwm files
-fasta Fasta file with ONE sequence only.
-AT ATbias in organism (Arabidopsis = 0.33)
-GC GCbias in organism (Arabidopsis = 0.17)
-p Threshold value for false positives (Default = 1e-5)
-include File with list of PWMs to include (Default = all)
-out Name for output file
OUTPUT:
-out Results: Sequence - Motif - Motif_consensus - Hit_Seq - Hit_position - Hit_score -Threshold
AUTHOR: Christina Azodi
REVISIONS: Submitted 8/10/2016
"""
import os,sys, math
from Bio import motifs, SeqIO
from Bio.Seq import Seq
import numpy as np
import time
def make_pssm(m, ATbias, GCbias, p):
"""Convert pwm into pssm (position-specific scoring matrices) to use for mapping
and calulate thresholds based on lenght and information content of motif"""
#Add pseudocount so that the probablity never becomes zero which will result in pssm = -inf.
pwm = m.counts.normalize(pseudocounts = 0.001)
consensus = pwm.consensus
print("Motif consensus sequence: " + str(consensus))
# Make position-specific scoring matrices & generage stats
background = {'A': float(ATbias), 'C': float(GCbias), 'G': float(GCbias), 'T': float(ATbias)}
pssm = pwm.log_odds(background)
print("Maximum score obtainable from motif: %4.2f" % pssm.max)
print("Minimum score obtainable from motif: %4.2f" % pssm.min)
print("Mean pssm score (a measure of information content of motif compared to background): %s" % str(pssm.mean(background)))
# Determine the threshold for mapping
# Note: Since the space for a score distribution grows exponentially with motif length,
# this uses an approximation with a given precision to keep computation cost manageable
distribution = pssm.distribution(background=background, precision=10**4)
# Set the threshold to p
threshold = distribution.threshold_fpr(p)
print("Threshold for matching (fpr=0.00001): %3f" % threshold)
# Alternative threshold method: balance FN with FP (100 false negatives for every 1 false positive)
#threshold = distribution.threshold_balanced(100)
#print("Threshold for matching (fnr/fpr=100): %3f" % threshold)
return(pssm, consensus, threshold)
def mapping(seq_file, pssm, threshold, consensus, motif_name, motif_len):
"""Map the pssm to the sequence in fasta format"""
seq_name = seq_file.strip().split("/")[-1]
for record in SeqIO.parse(seq_file, "fasta"):
seq = Seq(str(record.seq), m.alphabet)
local_array = np.array([])
for position, score in pssm.search(seq, threshold=threshold):
real_seq = record.seq[position:position+motif_len]
out.write("%s\t%s\t%s\t%s\t%s\t%4.5f\t%4.5f\n" % (seq_name, motif_name, consensus, real_seq, str(position), score, threshold))
#return(x)
if __name__=="__main__":
start_time = time.time()
# Set default values
ATbias = 0.33
GCbias = 0.17
p = 0.00001
include = "all"
#Define arguments
for i in range (1,len(sys.argv),2):
if sys.argv[i] == '-m': # pwm sequence file (TAMO)
f = sys.argv[i+1]
if sys.argv[i] == '-AT': # ATbias in organism (Arabidopsis = 0.33)
ATbias = sys.argv[i+1]
if sys.argv[i] == '-GC': # GCbias in organism (Arabidopsis = 0.17)
GCbias = sys.argv[i+1]
if sys.argv[i] == '-fasta': # FASTA file of the sequence
seq_file = sys.argv[i+1]
if sys.argv[i] == '-p': # threshold (default = 1e-5)
p = sys.argv[i+1]
if sys.argv[i] == '-out': # Results output name
out_file = sys.argv[i+1]
if sys.argv[i] == '-include': # List of file names from pwm directory to include (default = all)
include = sys.argv[i+1]
if len(sys.argv) <= 1:
print(__doc__)
exit()
#threshold= math.pow(10,-float(sys.argv[2])) # P-val, 5 means -log(n,10)
out = open(out_file, 'w')
out.write("Sequence\tMotif\tMotif_consensus\tHit_Seq\tHit_position\tHit_score\tThreshold\n")
# Specify if you only want to include certain pwm's in the directory
if include == "all":
include_if = os.listdir(f)
else:
include_if = open(include, 'r').read().splitlines()
if f[-4:] == ".pwm":
#Make PSSM for mapping
motif_name = f.strip().split("/")[-1]
m = motifs.read(open(f), "pfm")
motif_len = len(m)
pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p)
hits = mapping(seq_file, pssm, threshold, consensus, motif_name, motif_len)
else:
for motif_file in os.listdir(f):
if motif_file in include_if:
motif_name = motif_file.strip().split("/")[-1]
print(motif_name)
path = f + motif_file
m = motifs.read(open(path), "pfm")
motif_len = len(m)
pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p)
hits = mapping(seq_file, pssm, threshold, consensus, motif_name, motif_len)
else:
pass
end_time = time.time()
print("Run time: " + str(float(end_time)-float(start_time)))