-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfasta_parser.py
More file actions
53 lines (42 loc) · 1.88 KB
/
fasta_parser.py
File metadata and controls
53 lines (42 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import csv
from Bio import SeqIO
def fasta_parser(fasta_file):
    """Read a FASTA file and map each record's name to its sequence.

    The name is the second ``|``-separated field of the record description
    (``B`` in a header of the form ``A|B|C``), with surrounding whitespace
    stripped. A description that contains no ``|`` has no second field; in
    that case ``record.id`` is used as the name instead of failing with
    ``IndexError``.

    Args:
        fasta_file: Path or handle, passed unchanged to ``SeqIO.parse``.

    Returns:
        A dict mapping each name to its sequence as a ``str``. When two
        records produce the same name, the later record replaces the
        earlier one.
    """
    sequences = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        header = record.description.split('|')
        if len(header) > 1:
            current_name = header[1].strip()
        else:
            # No pipe-delimited fields in this header: fall back to the
            # record identifier so the record is still kept.
            current_name = record.id
        sequences[current_name] = str(record.seq)
    return sequences
# File name of the input sequence file
input_file = "input.fasta"
# File name of the output TSV file
output_file = "output.tsv"

# Parse the input sequences from the file
sequences = fasta_parser(input_file)

# The average, minimum and maximum are undefined without at least one
# sequence (division by zero, min()/max() of an empty list), so stop with an
# explicit message before any output is written.
if not sequences:
    raise SystemExit(f"No sequences found in {input_file}; nothing written.")

# Generate the sequence statistics
sequence_lengths = [len(seq) for seq in sequences.values()]
total_sequences = len(sequences)
average_length = sum(sequence_lengths) / total_sequences
minimum_length = min(sequence_lengths)
maximum_length = max(sequence_lengths)

# Find the names of sequences with minimum and maximum lengths
# (several sequences may share the same extreme length)
sequence_name_min_length = [
    name for name, seq in sequences.items() if len(seq) == minimum_length
]
sequence_name_max_length = [
    name for name, seq in sequences.items() if len(seq) == maximum_length
]

# Prepare the data for TSV output: one [name, length, sequence] row each
table_data = [[name, len(seq), seq] for name, seq in sequences.items()]

# Save the output to a TSV file
# newline="" lets the csv module control line endings itself
with open(output_file, "w", newline="") as tsv_file:
    writer = csv.writer(tsv_file, delimiter="\t")
    writer.writerow(["Name", "Length", "Sequence"])
    writer.writerows(table_data)
    writer.writerows([
        [],  # Empty row separating the table from the statistics
        ["Sequence Statistics"],
        ["Total Sequences", total_sequences],
        ["Average Length", average_length],
        ["Minimum Length", minimum_length, ", ".join(sequence_name_min_length)],
        ["Maximum Length", maximum_length, ", ".join(sequence_name_max_length)],
    ])

# Print the success message
print("Output saved successfully!")