PBC2021/Challenge3.py at main · wardvanbelle/PBC2021 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
from Bio import ExPASy, SwissProt
import requests
import time

# Reference timestamp for the total-runtime report printed at the end of the script.
start_time = time.time()

def dna2tris(dna, start=0):
    """Convert a nucleotide sequence into a list of trits (0, 1 or 2).

    Every trit is looked up from the pair (previous base, current base), so
    a sequence of n bases yields n - 1 trits.

    Args:
        dna: Sliceable sequence made of the bases 'A', 'C', 'G' and 'T'.
        start: When equal to 1, one extra leading trit is emitted, computed
            as the transition from 'A' to ``dna[1]``.

    Returns:
        A list of ints, each 0, 1 or 2.

    Raises:
        KeyError: On two identical consecutive bases or a character that is
            not one of 'A', 'C', 'G', 'T'.
        IndexError: If ``start == 1`` and ``dna`` has fewer than two items.
    """
    # transitions[previous][current] -> trit; a base never follows itself.
    transitions = {
        'A': {'C': 0, 'G': 1, 'T': 2},
        'C': {'G': 0, 'T': 1, 'A': 2},
        'G': {'T': 0, 'A': 1, 'C': 2},
        'T': {'A': 0, 'C': 1, 'G': 2},
    }

    # NOTE(review): the leading trit is derived from dna[1], not dna[0] --
    # confirm this is the intended seed for the encoding.
    prefix = [transitions['A'][dna[1]]] if start == 1 else []

    return prefix + [transitions[prev][cur] for prev, cur in zip(dna, dna[1:])]


def tris_ascii(tris, mat):
    """Decode a sequence of trits into text using a code-word table.

    At each position the next five trits are tried as a code word first,
    then the next six. If neither is present in ``mat`` a single space is
    emitted and five trits are skipped. Decoding stops once fewer than five
    trits remain.

    Args:
        tris: Sequence of trits; each item is rendered with ``str()`` and
            the results are concatenated to form the code word.
        mat: Mapping from code-word strings (e.g. ``'01202'``) to the text
            they decode to.

    Returns:
        The decoded string (empty when ``tris`` has fewer than five items).
    """
    decoded = []
    k = 0
    # The bound comes from the trit sequence itself rather than from a
    # module-level variable, so the function works for any caller.
    last_start = len(tris) - 5
    while k <= last_start:
        code5 = ''.join(str(tr) for tr in tris[k:k + 5])
        if code5 in mat:
            decoded.append(mat[code5])
            k += 5
            continue

        code6 = ''.join(str(tr) for tr in tris[k:k + 6])
        if code6 in mat:
            decoded.append(mat[code6])
            k += 6
        else:
            # Unknown code word: emit a placeholder and resynchronise.
            decoded.append(' ')
            k += 5

    return ''.join(decoded)


# Build the code table: for every line, split on tabs/newlines and map the
# fourth field (the code word) to the second field (the text it decodes to).
# test.txt is a modified version of huff3.cd
huff = {}
with open('test.txt', 'r') as huff3:  # the context manager closes the file
    for line in huff3:
        temp = re.split(r'\t|\n', line)
        huff[temp[3]] = temp[1]

# Read the DNA sequence to decode. Line breaks and surrounding whitespace are
# dropped because dna2tris only accepts the characters A, C, G and T.
with open('bpc3.dna', 'r') as sequence:
    seq = ''.join(line.strip() for line in sequence)

# Decode the sequence to text and show the hidden message.
print(tris_ascii(dna2tris(seq, start=1), huff))

# Ask UniProt for the accession matching the query, then download that
# entry's Swiss-Prot record and keep its amino-acid sequence.
# NOTE(review): this is the legacy uniprot.org query URL -- confirm it is
# still served and still returns a single accession.
fullurl = 'https://www.uniprot.org/uniprot/?query=homo+sapiens+2018+length%3A%5B4+TO+4%5D&sort=score&format=list'
accession = requests.get(fullurl, timeout=60)  # never wait forever on the network
# Fail here on an HTTP error instead of using an error page as an accession.
accession.raise_for_status()

# Newlines are removed from the response text before it is used as the accession.
handle = ExPASy.get_sprot_raw(re.sub('\n', '', accession.text))
try:
    record = SwissProt.read(handle)
finally:
    handle.close()

aa_long = record.sequence

# Load every protein sequence from the FASTA file into `prots`.
# (Per the original note: this file contains all Swiss-Prot entries in FASTA format.)
prots = []
prot = None  # sequence being assembled; None until the first header is seen
with open('uniprot-proteome_UP000005640+reviewed_yes.fasta', 'r') as uniprot:
    for line in uniprot:
        if line.startswith('>'):
            # A header starts a new record: store the one just finished.
            if prot is not None:
                prots.append(prot)
            prot = ''
        elif prot is not None:
            # Join wrapped sequence lines without their line breaks, so a
            # motif spanning two lines still matches and match positions
            # are residue offsets.
            prot += line.strip()

# The final record has no following header to trigger its storage.
if prot is not None:
    prots.append(prot)

# Collect the 1-based indices of the proteins in which two consecutive
# occurrences of aa_long do not overlap and both fit inside a window of
# 100 residues, then print how many such proteins there are.
contains_twice100 = []

# Zero-width lookahead so overlapping occurrences are reported as well.
# The pattern does not depend on the protein, so it is compiled once.
motif = re.compile('(?=' + re.escape(aa_long) + ')')
min_gap = len(aa_long)        # closer starts would mean overlapping copies
max_gap = 100 - len(aa_long)  # second copy must end within 100 residues

for count, prot in enumerate(prots, start=1):
    begins = [m.start() for m in motif.finditer(prot)]

    # Record each protein at most once, however many pairs qualify, so the
    # printed length is a count of proteins rather than of pairs.
    if any(min_gap <= nxt - cur <= max_gap
           for cur, nxt in zip(begins, begins[1:])):
        contains_twice100.append(count)

print(len(contains_twice100))

# Report how long the whole script took, in seconds.
end_time = time.time()
elapsed = end_time - start_time
print(elapsed)