-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path1c_GetRegionStats.py
More file actions
172 lines (128 loc) · 4.92 KB
/
1c_GetRegionStats.py
File metadata and controls
172 lines (128 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# IMPORT
import sys
# FUNCTIONS
def DifferenceVaraince(n,S):
''' Calculates the variance in the expected number of SNPs between
sequences for Tajima's D where "n" in the number of sequences and
S in the number of segregating sites.
See https://en.wikipedia.org/wiki/Tajima%27s_D
'''
import math
a1 = sum([1/float(i+1) for i in range(n-1)])
a2 = sum([1/float((i+1)**2) for i in range(n-1)])
b1 = float(n+1)/float(3*(n-1))
b2 = float(2*(n**2 + n + 3))/float(9*n*(n-1))
c1 = b1 - (1/float(a1))
c2 = b2 - float(n+2)/float(a1*n) + float(a2)/float(a1**2)
e1 = c1/float(a1)
e2 = c2/float(a1**2 + a2)
V_of_d = math.sqrt(e1*S + e2*S*(S-1))
return V_of_d
def SeqPairDiff(seq1,seq2):
''' Finds the aligned length, location of differences, and number of differences
between two genomic sequences
'''
bases = ("A","C","G","T")
length = 0
sites = []
difference = 0
for index in range(len(seq1)):
[base1,base2] = [seq1[index],seq2[index]]
if base1 in bases and base2 in bases:
length += 1
if not base1 == base2:
difference += 1
sites.append(index)
return length, sites, difference
# MAIN
print'''
Takes a list regions [1] and a *.index.ref [2] and returns
statistics for each region (NtDiversity and Tajima's D)
'''
region_lines = [ln.strip() for ln in open(sys.argv[1],"r").readlines()]
ref_lines = [ln.strip() for ln in open(sys.argv[2],"r").readlines()]
# Make dictionary of relationship in the reference file
ref_dict = {}
for ln in ref_lines:
[name,file] = ln.split("\t")
ref_dict[name] = file
target_file = ref_dict["Target"]
# Make a dictioanry of regions
reg_dict = {}
for ln in region_lines:
[contig,start,stop] = ln.split("\t")
# Get offset from index file
try:
contig_index_file = ref_dict[contig]
except KeyError as e:
print "No index file for contig " + contig
start_offset = open(contig_index_file,"r").readlines()[int(start)-1].strip()
number_of_lines = int(stop) - int(start)
# Get lines from target file
target_read = open(target_file)
target_read.seek(int(start_offset))
lines = [target_read.readline() for n in range(number_of_lines+1)]
target_read.close()
reg_dict[(contig,start,stop)] = lines
# Convert the regional information into sequences
seq_keys = []
seq_dict = {}
for region in reg_dict:
# Get lines and convert to sequence
sequences = []
region_lines = [ln.strip().split("\t")[2:] for ln in reg_dict[region]]
for index in range(len(region_lines[0])):
sequences.append([ln[index] for ln in region_lines])
seq_keys.append(region)
seq_dict[region] = sequences
# For each region, calculate the pairwise difference between all sequenecs
stats_dict = {}
for region in seq_keys:
# Get sequences
r_seq = seq_dict[region]
seq_num = len(r_seq)
# Make list to store our stats
seq_lens = [] # List of the aligned length between sequences
seq_sites = [] # List of sites with different SNPs
seq_diff = [] # List of differences between sequences
for i in range(len(r_seq)):
for j in range(i+1,len(r_seq)):
[lens,sites,diffs] = SeqPairDiff(r_seq[i],r_seq[j])
seq_lens.append(lens)
seq_sites.extend(sites)
seq_diff.append(diffs)
seq_sites = list(set(seq_sites))
#Test code
#print "# of Diff Sites: " + str(len(seq_sites))
#print "Seq Diffs: " + str(seq_diff)
# Calculate Stats #
if len(seq_sites) > 0:
# Nucleotide Diversity: See https://en.wikipedia.org/wiki/Nucleotide_diversity
seq_frequency = 1/float(seq_num)
sum_diffs = sum([float(seq_diff[i])/float(seq_lens[i]) for i in range(len(seq_lens)) if seq_lens[i] > 0])
nt_diversity = 2 * (seq_frequency**2) * float(sum_diffs)
#Test Code
#print "SeqFrequency: " + str(seq_frequency)
#print "Sum Diffs: " + str(sum_diffs)
# Tajima's D: https://en.wikipedia.org/wiki/Tajima%27s_D
avg_polymorphism = float(sum(seq_diff)) / float(len(seq_diff))
expected_polymorphism = float(len(seq_sites)) / sum([1/float(i+1) for i in range(seq_num-1)])
variance = DifferenceVaraince(seq_num,len(seq_sites))
tajimas_D = (avg_polymorphism - expected_polymorphism)/variance
#Test Code
#print "Avg Poly: " + str(avg_polymorphism)
#print "Expected: " + str(expected_polymorphism)
else:
nt_diversity = "NoDiffSites"
tajimas_D = "NoDiffSites"
# Save values
stats_dict[region] = [nt_diversity,tajimas_D]
# Write output
outlines = []
for region in seq_keys:
[nt,D] = stats_dict[region]
outlines.append("_".join(list(region)) + "\t" + str(nt) + "\t" + str(D) + "\n")
output = open(sys.argv[1] + ".SeqStats","w")
output.write("Region\tNtDiversity\tTajimasD\n")
output.write("".join(outlines))
output.close()