From 51f0e1fe56f033006450857645260eb1e11605e3 Mon Sep 17 00:00:00 2001
From: Tejas Jammihal <56969697+tejas-j@users.noreply.github.com>
Date: Tue, 9 Apr 2024 10:07:57 -0400
Subject: [PATCH] Update pangolin.py

Update the one_hot_encode function to treat every base other than A, C, G, or T as N, so that sequences containing such characters are encoded instead of causing a failure.
---
 pangolin/pangolin.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pangolin/pangolin.py b/pangolin/pangolin.py
index 194736f..a838c5e 100755
--- a/pangolin/pangolin.py
+++ b/pangolin/pangolin.py
@@ -16,8 +16,12 @@
 
 
 def one_hot_encode(seq, strand):
-    seq = seq.upper().replace('A', '1').replace('C', '2')
-    seq = seq.replace('G', '3').replace('T', '4').replace('N', '0')
+    # Convert to uppercase
+    seq = seq.upper()
+    # Replace every character that is not A, C, G, or T with 'N'
+    seq = ''.join(['N' if char not in 'ACGT' else char for char in seq])
+    # Then replace A, C, G, T with 1, 2, 3, 4, respectively, and N (everything else) with 0
+    seq = seq.replace('A', '1').replace('C', '2').replace('G', '3').replace('T', '4').replace('N', '0')
     if strand == '+':
         seq = np.asarray(list(map(int, list(seq))))
     elif strand == '-':
@@ -25,7 +29,6 @@ def one_hot_encode(seq, strand):
         seq = (5 - seq) % 5  # Reverse complement
     return IN_MAP[seq.astype('int8')]
 
-
 def compute_score(ref_seq, alt_seq, strand, d, models):
     ref_seq = one_hot_encode(ref_seq, strand).T
     ref_seq = torch.from_numpy(np.expand_dims(ref_seq, axis=0)).float()