Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/count_ATs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ fi

export file=$1

fasta_stats $file | \
fasta_stats.py $file | \
grep 'unit:AT' | \
grep 'dinucleotide' | \
wc
120 changes: 60 additions & 60 deletions data/fasta_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,67 +4,67 @@
import re

if len(sys.argv) != 2:
print("Usage: fasta_dna_stats <fasta_file>")
print("This script is for informational purposes only, and requires that the input file be a DNA (As, Ts, Cs, and Gs) FASTA-formatted file.")
quit()
print("Usage: fasta_dna_stats <fasta_file>")
print("This script is for informational purposes only, and requires that the input file be a DNA (As, Ts, Cs, and Gs) FASTA-formatted file.")
quit()

####### Parse input
def fasta_hash_from_file(filename: str) -> dict:
handle = open(filename, "r")
seqs_dict = dict()
current_id = ""
current_seq_list = list()
for line in handle:
line_list = re.split(r"\s+", line.strip())
if re.match(r"^>", line_list[0]):
if current_id != "":
seqs_dict[current_id] = "".join(current_seq_list)
current_id = re.subn(r"^>", "", line_list[0])[0]
current_seq_list = list()
else:
current_seq_list.append(line_list[0])

seqs_dict[current_id] = "".join(current_seq_list)
handle.close()
return seqs_dict
handle = open(filename, "r")
seqs_dict = dict()
current_id = ""
current_seq_list = list()
for line in handle:
line_list = re.split(r"\s+", line.strip())
if re.match(r"^>", line_list[0]):
if current_id != "":
seqs_dict[current_id] = "".join(current_seq_list)
current_id = re.subn(r"^>", "", line_list[0])[0]
current_seq_list = list()
else:
current_seq_list.append(line_list[0])

seqs_dict[current_id] = "".join(current_seq_list)
handle.close()
return seqs_dict

def longest_perfect_repeat(seq: str) -> list:
result = re.finditer(r"(.{1,10}?)\1{1,}", seq)
max = 1
maxrep = seq[0]
maxunit = seq[0]
for match in result:
unit = match.group(1)
if len(match.group(0)) > max and len(set(unit)) > 1:
max = len(match.group(0))
maxrep = match.group(0)
maxunit = match.group(1)

return [max, maxrep, maxunit]
result = re.finditer(r"(.{1,10}?)\1{1,}", seq)
max = 1
maxrep = seq[0]
maxunit = seq[0]
for match in result:
unit = match.group(1)
if len(match.group(0)) > max and len(set(unit)) > 1:
max = len(match.group(0))
maxrep = match.group(0)
maxunit = match.group(1)

return [max, maxrep, maxunit]

def gc_content(seq: str) -> float:
gc_count = seq.count("G") + seq.count("g") + seq.count("C") + seq.count("c")
gc = gc_count/len(seq)
return(round(gc,3))
gc_count = seq.count("G") + seq.count("g") + seq.count("C") + seq.count("c")
gc = gc_count/len(seq)
return(round(gc,3))

def most_common_fivemer(seq: str) -> list:
mers = dict()
for index in range(0, len(seq) - 5 + 1):
fivemer = seq[index:index+5]
if fivemer in mers:
mers[fivemer] = mers[fivemer] + 1
else:
mers[fivemer] = 1

max = 0
maxmer = ""
for fivemer in mers:
if mers[fivemer] > max:
max = mers[fivemer]
maxmer = fivemer

return [maxmer, max]
mers = dict()
for index in range(0, len(seq) - 5 + 1):
fivemer = seq[index:index+5]
if fivemer in mers:
mers[fivemer] = mers[fivemer] + 1
else:
mers[fivemer] = 1

max = 0
maxmer = ""
for fivemer in mers:
if mers[fivemer] > max:
max = mers[fivemer]
maxmer = fivemer

return [maxmer, max]

filename = sys.argv[1]
seqs_dict = fasta_hash_from_file(filename)
Expand All @@ -91,13 +91,13 @@ def most_common_fivemer(seq: str) -> list:
types[10] = "decanucleotide"

for id in seqs_dict:
print(f"Processing sequence ID {id}",file=sys.stderr)
print(f"Processing sequence ID {id}",file=sys.stderr)

seq = seqs_dict[id]
print(f"{id}\t{gc_content(seq)}\t{len(seq)}", end="\t")
seq = seqs_dict[id]
print(f"{id}\t{gc_content(seq)}\t{len(seq)}", end="\t")

maxmer = most_common_fivemer(seq)
print(f"{maxmer[0]}\t{maxmer[1]}", end="\t")
longest_rep_list = longest_perfect_repeat(seq)
print(f"unit: {longest_rep_list[2]}\t{longest_rep_list[0]}\t{types[len(longest_rep_list[2])]}")
maxmer = most_common_fivemer(seq)
print(f"{maxmer[0]}\t{maxmer[1]}", end="\t")
longest_rep_list = longest_perfect_repeat(seq)
print(f"unit:{longest_rep_list[2]}\t{longest_rep_list[0]}\t{types[len(longest_rep_list[2])]}")
24 changes: 0 additions & 24 deletions data/pz_blastx_yeast_top1.txt

This file was deleted.

Loading