-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleaned_gaps_alignment.py
More file actions
117 lines (100 loc) · 3.6 KB
/
cleaned_gaps_alignment.py
File metadata and controls
117 lines (100 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
'''Cleans an alignment file of columns which have a certain percentage of
gaps, then obtains substitution score vectors'''
import numpy
from Bio import pairwise2
from Bio.Seq import MutableSeq
from Bio.Seq import Seq
from Bio import SeqIO
import os
from Bio.SubsMat import MatrixInfo as matlist
filename = 'gross-alignment.fasta'
def records_to_list(filename):
records = []
with open(filename, 'rU') as handle:
for record in SeqIO.parse(handle, 'fasta'):
records.append(record)
return records
records = records_to_list(filename)
test_records = []
test_record_names = ['HRH1', 'ACM1']
for name in test_record_names:
for record in records:
if name in record.id:
test_records.append(record)
gap_list = []
for record in test_records:
gaps_row = []
for char in record.seq:
if char == '-':
gaps_row.append(0)
else:
gaps_row.append(1)
gap_list.append(gaps_row)
gap_matrix = numpy.asarray(gap_list)
'''Get the columns to remove'''
gaps_limit_ratio = float(1/(len(test_record_names)))
clean_column_index = []
total_column_len = float(len(gap_matrix[:,0]))
for n in range(len(gap_matrix[0,:])):
total_non_gaps = 0
for num in gap_matrix[:,n]:
total_non_gaps += num
ratio_non_gaps = (total_non_gaps/total_column_len)
if (ratio_non_gaps)<=gaps_limit_ratio:
clean_column_index.append(n)
#print clean_column_index, len(records[0].seq)-len(clean_column_index)
'''Remove those columns'''
cleaned_align_index = [pos for pos in range(len(test_records[0].seq)) if pos
not in clean_column_index]
cleaned_records = []
cleaned_pos_index = []
for record in test_records:
new_seq = ''
for pos, char in enumerate(record.seq):
if pos not in clean_column_index:
new_seq += char
cleaned_pos_index.append(pos)
#break
#print record.seq, len(record.seq), 'before'
record.seq = Seq(new_seq)
#print record.seq, len(record.seq), 'after'
cleaned_records.append(record)
#print cleaned_pos_index
print cleaned_records
#print 'FInished cleaning records, calculating vectors...'
def write_pairwise_scores(data, filename):
import csv
with open(filename, 'ab') as output:
csv_obj = csv.writer(output, delimiter=',')
csv_obj.writerows([data])
with open('HRH1_ACM1_gross.fasta', 'a') as handle:
for record in cleaned_records:
SeqIO.write(record, handle, 'fasta')
#write_pairwise_scores(cleaned_pos_index, 'cleaned_pos_index.csv')
#matrix = matlist.blosum62
#matrix['-', '-'] = 0
#for char in 'ARNDCQEGHILKMFPSTWYV':
# matrix[char, '-'] = -10
# matrix['-', char] = -10
#
#from itertools import combinations
#record_pairs = list(combinations([record for record in cleaned_records], 2))
##record_pairs = []
##for record1 in records:
## for record2 in records:
## if record1.id == record2.id:
## record_pairs.append([record1, record2])
#
#for record_pair in record_pairs:
# score_vector = []
# for n, sequence in enumerate(record_pair[0].seq):
# align = pairwise2.align.localdx(record_pair[0].seq[n],
# record_pair[1].seq[n], matrix, score_only=True)
# print len(score_vector)
# score_vector.append(align)
# #score_vector.append([cleaned_align_index[n], align])
# write_pairwise_scores(score_vector,
# 'pairwise_aa_substitution_vectors_new.csv' )
# #write_pairwise_scores([record1.id, record2.id, score_vector],
# #'pairwise_aa_substitution_vectors.csv' )
#