-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgffparser.py
More file actions
110 lines (73 loc) · 4.7 KB
/
gffparser.py
File metadata and controls
110 lines (73 loc) · 4.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
gffparser.py — Extract Gene Names from GFF Files by Region
Author: Adhithi R. Raghavan
Date: May 2023
Description:
This script parses a GFF file and extracts the names of the "gene" features on a given chromosome that lie entirely within an inclusive start-end coordinate range.
It supports both command-line usage and output redirection to a file.
Command-line arguments:
-i / --filepath : Path to input GFF file [Required]
-c / --chromosomenumber : Chromosome (e.g., Chr1, Chr2) [Required]
-s / --startcoordinate : Start coordinate [Required]
-e / --endcoordinate : End coordinate [Required]
-o / --outputpath : Optional path of a file to append the gene names to, one per line; if omitted, the names are printed instead
Example usage:
python gffparser.py -i example.gff -c Chr1 -s 10000 -e 50000
"""
import argparse


def parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the arguments from sys.argv.

    Returns:
        The argparse namespace with the attributes filepath,
        chromosomenumber, startcoordinate (int), endcoordinate (int) and
        outputpath (None when -o is not given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--filepath",
                        help="Please provide path to file",
                        required=True)
    parser.add_argument("-c", "--chromosomenumber",
                        help="Please provide chromosome number, eg. Chr1, Chr2, etc",
                        required=True)
    parser.add_argument("-s", "--startcoordinate",
                        help="Please provide start coordinate of region",
                        type=int, required=True)
    parser.add_argument("-e", "--endcoordinate",
                        help="Please provide end coordinate of region",
                        type=int, required=True)
    # Optional: when given, results are saved to this file; otherwise they
    # are printed.
    parser.add_argument("-o", "--outputpath",
                        help="Please provide path where the output should be stored",
                        required=False)
    return parser.parse_args(argv)


def extract_gene_name(attributes):
    """Return the gene name stored in a GFF attribute column.

    The column is a ';'-separated list of key=value pairs. The value of the
    "Name" key is used wherever it appears in the list. If there is no
    "Name" key, the value of the third pair is used, which is what the
    script historically relied on.

    Args:
        attributes: Text of the ninth GFF column.

    Returns:
        The gene name, or None when no name can be determined.
    """
    fields = attributes.split(";")
    for field in fields:
        key, separator, value = field.partition("=")
        if separator and key.strip() == "Name":
            return value
    # Positional fallback for files without a Name tag.
    if len(fields) > 2:
        _, separator, value = fields[2].partition("=")
        if separator:
            return value
    return None


def find_genes_in_region(lines, chromosome, start, end):
    """Collect the names of genes lying entirely inside a region.

    A row is selected when its first column equals chromosome, its third
    column is "gene", its start (fourth column) is >= start and its end
    (fifth column) is <= end. Rows with fewer than nine tab-separated
    columns (comments, blank lines, truncated records) are skipped.

    Args:
        lines: Iterable of GFF text lines, e.g. an open file.
        chromosome: Chromosome identifier to match, e.g. "Chr1".
        start: Inclusive lower bound of the region.
        end: Inclusive upper bound of the region.

    Returns:
        List of gene names in file order.

    Raises:
        ValueError: If a matching gene row has non-integer coordinates.
    """
    gene_names = []
    for line in lines:
        columns = line.rstrip().split("\t")
        if len(columns) < 9:
            continue
        if columns[0] != chromosome or columns[2] != "gene":
            continue
        if int(columns[3]) < start or int(columns[4]) > end:
            continue
        name = extract_gene_name(columns[8])
        if name is not None:
            gene_names.append(name)
    return gene_names


def main(argv=None):
    """Run the extraction and print or save the gene names.

    Args:
        argv: Optional list of argument strings; None means sys.argv.
    """
    args = parse_args(argv)

    # Iterate over the file directly so it is never held in memory as a whole.
    with open(args.filepath, "r") as gff_file:
        gene_names = find_genes_in_region(
            gff_file,
            args.chromosomenumber,
            args.startcoordinate,
            args.endcoordinate,
        )

    if not gene_names:
        # Nothing matched: print nothing and do not create an output file.
        return

    if args.outputpath:
        # Append mode is kept, so earlier content of the file is preserved;
        # the file is opened once instead of once per gene.
        with open(args.outputpath, "a") as output_file:
            output_file.writelines(name + "\n" for name in gene_names)
    else:
        for name in gene_names:
            print(name)


if __name__ == "__main__":
    main()