oneilsh · Leslie-C · Jun 16, 2025 · Jun 12, 2025 · Jun 12, 2025 · Jun 12, 2025
diff --git a/data/count_ATs.sh b/data/count_ATs.sh
@@ -8,7 +8,7 @@ fi
 
 export file=$1
 
-fasta_stats $file | \
+fasta_stats.py $file | \
     grep 'unit:AT' | \
     grep 'dinucleotide' | \
     wc
diff --git a/data/fasta_stats.py b/data/fasta_stats.py
@@ -4,67 +4,67 @@
 import re
 
 if len(sys.argv) != 2:
-	print("Usage: fasta_dna_stats <fasta_file>")
-	print("This script is for informational purposes only, and requires that the input file be a DNA (As, Ts, Cs, and Gs) FASTA-formatted file.")
-	quit()
+    print("Usage: fasta_dna_stats <fasta_file>")
+    print("This script is for informational purposes only, and requires that the input file be a DNA (As, Ts, Cs, and Gs) FASTA-formatted file.")
+    quit()
 
 ####### Parse input
 def fasta_hash_from_file(filename: str) -> dict:
-	handle = open(filename, "r")
-	seqs_dict = dict()
-	current_id = ""
-	current_seq_list = list()
-	
-	for line in handle:
-		line_list = re.split(r"\s+", line.strip())
-		if re.match(r"^>", line_list[0]):
-			if current_id != "":
-				seqs_dict[current_id] = "".join(current_seq_list)
-			current_id = re.subn(r"^>", "", line_list[0])[0]
-			current_seq_list = list()
-		else:
-			current_seq_list.append(line_list[0])
-
-	seqs_dict[current_id] = "".join(current_seq_list)
-	handle.close()
-	return seqs_dict
+    handle = open(filename, "r")
+    seqs_dict = dict()
+    current_id = ""
+    current_seq_list = list()
+    
+    for line in handle:
+        line_list = re.split(r"\s+", line.strip())
+        if re.match(r"^>", line_list[0]):
+            if current_id != "":
+                seqs_dict[current_id] = "".join(current_seq_list)
+            current_id = re.subn(r"^>", "", line_list[0])[0]
+            current_seq_list = list()
+        else:
+            current_seq_list.append(line_list[0])
+
+    seqs_dict[current_id] = "".join(current_seq_list)
+    handle.close()
+    return seqs_dict
 
 def longest_perfect_repeat(seq: str) -> list:
-	result = re.finditer(r"(.{1,10}?)\1{1,}", seq)
-	max = 1
-	maxrep = seq[0]
-	maxunit = seq[0]
-	for match in result:
-		unit = match.group(1)
-		if len(match.group(0)) > max and len(set(unit)) > 1:
-			max = len(match.group(0))
-			maxrep = match.group(0)
-			maxunit = match.group(1)
-
-	return [max, maxrep, maxunit]
+    result = re.finditer(r"(.{1,10}?)\1{1,}", seq)
+    max = 1
+    maxrep = seq[0]
+    maxunit = seq[0]
+    for match in result:
+        unit = match.group(1)
+        if len(match.group(0)) > max and len(set(unit)) > 1:
+            max = len(match.group(0))
+            maxrep = match.group(0)
+            maxunit = match.group(1)
+
+    return [max, maxrep, maxunit]
 
 def gc_content(seq: str) -> float:
-	gc_count = seq.count("G") + seq.count("g") + seq.count("C") + seq.count("c")
-	gc = gc_count/len(seq)
-	return(round(gc,3))
+    gc_count = seq.count("G") + seq.count("g") + seq.count("C") + seq.count("c")
+    gc = gc_count/len(seq)
+    return(round(gc,3))
 
 def most_common_fivemer(seq: str) -> list:
-	mers = dict()
-	for index in range(0, len(seq) - 5 + 1):
-		fivemer = seq[index:index+5]
-		if fivemer in mers:
-			mers[fivemer] = mers[fivemer] + 1
-		else:
-			mers[fivemer] = 1
-
-	max = 0
-	maxmer = ""
-	for fivemer in mers:
-		if mers[fivemer] > max:
-			max = mers[fivemer]
-			maxmer = fivemer
-
-	return [maxmer, max]
+    mers = dict()
+    for index in range(0, len(seq) - 5 + 1):
+        fivemer = seq[index:index+5]
+        if fivemer in mers:
+            mers[fivemer] = mers[fivemer] + 1
+        else:
+            mers[fivemer] = 1
+
+    max = 0
+    maxmer = ""
+    for fivemer in mers:
+        if mers[fivemer] > max:
+            max = mers[fivemer]
+            maxmer = fivemer
+
+    return [maxmer, max]
 
 filename = sys.argv[1]
 seqs_dict = fasta_hash_from_file(filename)
@@ -91,13 +91,13 @@ def most_common_fivemer(seq: str) -> list:
 types[10] = "decanucleotide"
 
 for id in seqs_dict:
-	print(f"Processing sequence ID {id}",file=sys.stderr)
+    print(f"Processing sequence ID {id}",file=sys.stderr)
 
-	seq = seqs_dict[id]
-	print(f"{id}\t{gc_content(seq)}\t{len(seq)}", end="\t")
+    seq = seqs_dict[id]
+    print(f"{id}\t{gc_content(seq)}\t{len(seq)}", end="\t")
 
-	maxmer = most_common_fivemer(seq)
-	print(f"{maxmer[0]}\t{maxmer[1]}", end="\t")
-	
-	longest_rep_list = longest_perfect_repeat(seq)
-	print(f"unit: {longest_rep_list[2]}\t{longest_rep_list[0]}\t{types[len(longest_rep_list[2])]}")
+    maxmer = most_common_fivemer(seq)
+    print(f"{maxmer[0]}\t{maxmer[1]}", end="\t")
+    
+    longest_rep_list = longest_perfect_repeat(seq)
+    print(f"unit:{longest_rep_list[2]}\t{longest_rep_list[0]}\t{types[len(longest_rep_list[2])]}")
diff --git a/data/pz_blastx_yeast_top1.txt b/data/pz_blastx_yeast_top1.txt