-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhomology.py
More file actions
118 lines (83 loc) · 5.14 KB
/
homology.py
File metadata and controls
118 lines (83 loc) · 5.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
################################################################################
# homology.py — Identify Reciprocal Best Homologs Using BLAST XML Output
# Author: Adhithi R. Raghavan
# Created: 2023
#
# Description:
# This script defines two functions:
# - getBestHomolog(xmlfilepath): extracts best homologs for each query sequence
# - getReciprocalBestHomology(xml1, xml2, output): finds reciprocal best hits (RBHs)
#
# Usage:
# Import these functions in another script or run as a standalone module
# with appropriate XML BLAST outputs from two species.
################################################################################
#Making function to obtain best homolog.
def getBestHomolog(xmlfilepath1):
"""This function reads an the output of Blast search (Xml file)
and returns the best Homolog for each protein, if it exists.
Input required: Path to blast output as an Xml file. """
#Importing modules
from Bio.Blast import NCBIXML
import pandas as pd
#Creating empty dictionary so it can be populated in the for loop below
my_dict = {}
#Parsing the XML file - used parse versus read because it has multiple records - ie one for each protein.
record = NCBIXML.parse(open(xmlfilepath1))
#Making a for loop to go over every record
for eachrecord in record:
#Because every record may not have an alignment, so saying if alignment exists then do the stuff below.
if eachrecord.alignments:
#Get TargetID title
TargetID = (eachrecord.alignments[0].title)
#Get query ID (ie is protein being searched for, for homology)
QueryID = (eachrecord.query)
#Saving it as a dictionary with QueryID as key and TargetID as values.
my_dict[QueryID]= TargetID
#Returning the dictionary outside the loop.
return(my_dict)
def getReciprocalBestHomology(xmlfilepath1, xmlfilepath2, outputpath = "homologs.txt"):
"""This function gives proteins with the best reciprocal homology in the two species.
Input required:
xmlfilepath1: Path to blast output as an Xml file for Species 1.
xmlfilepath2: Path to blast output as an Xml file for Species 2.
outputpath = default is homologs.txt
"""
#Importing modules
from Bio.Blast import NCBIXML
import pandas as pd
#Using the earlier function getBestHomolog to obtain best homolog for each species
print("Obtaining best identified homolog for each protein for Species 1 ...")
output_of_Drosophila = getBestHomolog("drosophila_celegans.xml")
print("Best homolog for each protein (if it exists) in Species 1 has been identified! ")
print("...............................................................................")
print("Obtaining best identified homolog for each protein for Species 2 ...")
output_of_Celegans = getBestHomolog("celegans_drosophila.xml")
print("Best homolog for each protein (if it exists) in Species 2 has been identified! ")
print("...............................................................................")
#Find reciprocal best homology between the two species:
print("Obtaining reciprocal best homology between Species 1 and 2...")
#Making a for loop that first iterates over every key and value in output_of_Drosophila dictionary made earlier
for eachkey_Drosophilia, eachvalue_Drosophilia in output_of_Drosophila.items():
#Making a second for loop that first iterates over every key and value in output_of_Celegans dictionary made earlier
#So basically eachkey_Drosophilia and eachvalue_Drosophilia will be checked with all keys and values in C.elegans dictionary.
for eachkey_Celegans, eachvalue_Celegans in output_of_Celegans.items():
#For matching reciprocal homology
#searching if key of Drosophilia (which was the Query ID) - matches values in C.elgans(TargetID)
if eachkey_Drosophilia == eachvalue_Celegans:
#And also nesting the otherway
#if key of C.elegans (which was the Query ID) - matches values in Drosophilia(TargetID)
if eachkey_Celegans == eachvalue_Drosophilia:
#If it does: Then asking it to open and append values to outputfile - whatever user defined/default
f = open(outputpath, "a")
#Append eachkey_Drosophilia
f.write(eachkey_Drosophilia)
#Adding a tab to separate
f.write("\t")
#Append eachkey_Drosophilia
f.write(eachvalue_Drosophilia)
#So that every reciprocal best homology is on a new line.
f.write("\n")
#Close the file
f.close()
print("Reciprocal best homology between Species 1 and 2 has been obtained, and can be found in the output file! ")