From 51f0e1fe56f033006450857645260eb1e11605e3 Mon Sep 17 00:00:00 2001 From: Tejas Jammihal <56969697+tejas-j@users.noreply.github.com> Date: Tue, 9 Apr 2024 10:07:57 -0400 Subject: [PATCH] Update pangolin.py Updated one_hot_encode function to handle non-ATGCN bases more elegantly without failing --- pangolin/pangolin.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pangolin/pangolin.py b/pangolin/pangolin.py index 194736f..a838c5e 100755 --- a/pangolin/pangolin.py +++ b/pangolin/pangolin.py @@ -16,8 +16,12 @@ def one_hot_encode(seq, strand): - seq = seq.upper().replace('A', '1').replace('C', '2') - seq = seq.replace('G', '3').replace('T', '4').replace('N', '0') + # Convert to uppercase + seq = seq.upper() + # Replace every character that is not A, C, G, or T with 'N' + seq = ''.join(['N' if char not in 'ACGT' else char for char in seq]) + # Then replace A, C, G, T with 1, 2, 3, 4, respectively, and N (everything else) with 0 + seq = seq.replace('A', '1').replace('C', '2').replace('G', '3').replace('T', '4').replace('N', '0') if strand == '+': seq = np.asarray(list(map(int, list(seq)))) elif strand == '-': @@ -25,7 +29,6 @@ def one_hot_encode(seq, strand): seq = (5 - seq) % 5 # Reverse complement return IN_MAP[seq.astype('int8')] - def compute_score(ref_seq, alt_seq, strand, d, models): ref_seq = one_hot_encode(ref_seq, strand).T ref_seq = torch.from_numpy(np.expand_dims(ref_seq, axis=0)).float()