From 90dd78d5f5b2be6a905db4cba3bffa0f1e1d18f6 Mon Sep 17 00:00:00 2001
From: synedraacus
Date: Mon, 29 Apr 2024 14:07:04 +0200
Subject: [PATCH 1/5] Test FASTA files and input

---
 DeepSP_predict.py     | 71 +++++++++++++++++++++++++++++++++++++++++++
 test_fasta/mAb1.fasta |  4 +++
 test_fasta/mAb2.fasta |  4 +++
 test_fasta/mAb3.fasta |  4 +++
 4 files changed, 83 insertions(+)
 create mode 100755 DeepSP_predict.py
 create mode 100644 test_fasta/mAb1.fasta
 create mode 100644 test_fasta/mAb2.fasta
 create mode 100644 test_fasta/mAb3.fasta

diff --git a/DeepSP_predict.py b/DeepSP_predict.py
new file mode 100755
index 0000000..572b136
--- /dev/null
+++ b/DeepSP_predict.py
@@ -0,0 +1,71 @@
+#! /usr/bin/env python3
+
+from argparse import ArgumentParser
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from csv import DictReader
+
+
+def process_csv_input(files: list[str]) -> list[tuple[SeqRecord]]:
+    """
+    Take antibody sequences from a CSV input.
+
+    Assumes that column names are 'Name,Heavy_Chain,Light_Chain'; other columns,
+    if any, are ignored.
+    """
+    antibodies = []
+    for csv_file in files:
+        with open(csv_file) as csv_input:
+            reader = DictReader(csv_input)
+            for antibody in reader:
+                antibodies.append([SeqRecord(id=antibody['Name'],
+                                             seq=Seq(antibody['Heavy_Chain'])),
+                                   SeqRecord(id=antibody['Name'],
+                                             seq=Seq(antibody['Light_Chain']))
+                                   ])
+    return antibodies
+
+
+def process_fasta_input(files: list[str]) -> list[tuple[SeqRecord]]:
+    """
+    Take antibody sequences from FASTA input.
+
+    Assumes that each FASTA file contains two records named '{ab_id}_VH' and
+    '{ab_id}_VL'. Raises a ValueError if a record with any other name suffix
+    is found; the order of records in the file does not matter.
+    """
+    antibodies = []
+    for fasta_file in files:
+        name = ''
+        heavy = ''
+        light = ''
+        for record in SeqIO.parse(fasta_file, 'fasta'):
+            if record.id.endswith('_VH'):
+                name = '_'.join(record.id.split('_')[:-1])
+                heavy = record.seq
+            elif record.id.endswith('_VL'):
+                light = record.seq
+            else:
+                raise ValueError(f'Invalid postfix in FASTA record name {record.id}')
+        antibodies.append([SeqRecord(id=name, seq=heavy),
+                           SeqRecord(id=name, seq=light)])
+    return antibodies
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser('DeepSP prediction')
+    parser.add_argument('-i', type=str, nargs='+',
+                        help='Input file(s)')
+    parser.add_argument('--in_format', type=str, default='fasta',
+                        help='Input format (`fasta` or `csv`)')
+    parser.add_argument('-o', type=str, help='Output CSV path')
+    args = parser.parse_args()
+    if args.in_format.lower() == 'csv':
+        antibodies = process_csv_input(args.i)
+    elif args.in_format.lower() == 'fasta':
+        antibodies = process_fasta_input(args.i)
+    else:
+        raise ValueError('Only `csv` and `fasta` (case-insensitive) are valid in_format values')
+    print(antibodies)
+    print(len(antibodies))
\ No newline at end of file
diff --git a/test_fasta/mAb1.fasta b/test_fasta/mAb1.fasta
new file mode 100644
index 0000000..727ecae
--- /dev/null
+++ b/test_fasta/mAb1.fasta
@@ -0,0 +1,4 @@
+>mAb1_VH
+EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLEWVSAITWNSGHIDYADSVEGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAKVSYLSTASSLDYWGQGTLVTVSS
+>mAb1_VL
+DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDVATYYCQRYNRAPYTFGQGTKVEIK
\ No newline at end of file
diff --git a/test_fasta/mAb2.fasta b/test_fasta/mAb2.fasta
new file mode 100644
index 0000000..c170638
--- /dev/null
+++ b/test_fasta/mAb2.fasta
@@ -0,0 +1,4 @@
+>mAb2_VH
+EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLEWVAWISPYGGSTYYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARRHWPGGFDYWGQGTLVTVSA +>mAb2_VL +DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYLYHPATFGQGTKVEIK \ No newline at end of file diff --git a/test_fasta/mAb3.fasta b/test_fasta/mAb3.fasta new file mode 100644 index 0000000..116c1fe --- /dev/null +++ b/test_fasta/mAb3.fasta @@ -0,0 +1,4 @@ +>mAb3_VL +DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRLLIKYASESISGIPSRFSGSGSGTDFTLSINSVESEDIADYYCQQNNNWPTTFGAGTKLELK +>mAb3_VH +QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLEWLGVIWSGGNTDYNTPFTSRLSINKDNSKSQVFFKMNSLQSNDTAIYYCARALTYYDYEFAYWGQGTLVTVSA \ No newline at end of file From ee3d0eaf6eb02c1e4f25f24846a12e2a5554d7c0 Mon Sep 17 00:00:00 2001 From: synedraacus Date: Mon, 29 Apr 2024 15:02:03 +0200 Subject: [PATCH 2/5] Basically working script --- DeepSP_predict.py | 205 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 198 insertions(+), 7 deletions(-) diff --git a/DeepSP_predict.py b/DeepSP_predict.py index 572b136..3595866 100755 --- a/DeepSP_predict.py +++ b/DeepSP_predict.py @@ -5,7 +5,13 @@ from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq from csv import DictReader +from keras.models import model_from_json +import os +import subprocess +import sys +import pandas as pd +import numpy as np def process_csv_input(files: list[str]) -> list[tuple[SeqRecord]]: """ @@ -20,9 +26,11 @@ def process_csv_input(files: list[str]) -> list[tuple[SeqRecord]]: reader = DictReader(csv_input) for antibody in reader: antibodies.append([SeqRecord(id=antibody['Name'], - seq=Seq(antibody['Heavy_Chain'])), + seq=Seq(antibody['Heavy_Chain']), + description=''), SeqRecord(id=antibody['Name'], - seq=Seq(antibody['Light_Chain'])) + seq=Seq(antibody['Light_Chain']), + description='') ]) return antibodies @@ -48,18 +56,198 @@ def process_fasta_input(files: list[str]) -> list[tuple[SeqRecord]]: light = record.seq else: raise ValueError(f'Invalid postfix in FASTA record name {record.id}') - antibodies.append([SeqRecord(id=name, seq=heavy), - SeqRecord(id=name, seq=light)]) + antibodies.append([SeqRecord(id=name, seq=heavy, description=''), + SeqRecord(id=name, seq=light, description='')]) return antibodies +def anarci_align(antibodies: list[tuple[SeqRecord]], + output_filename: str = 'seq_aligned_HL.txt') -> str: + """ + Call ANARCI on both chains of the antibody, then merge the outputs in format + accepted by downstream analyses. Returns a filename with merged results. 
+ """ + with open('seq_H.fasta', mode='w') as h_fasta: + SeqIO.write([x[0] for x in antibodies], h_fasta, 'fasta') + with open('seq_L.fasta', mode='w') as l_fasta: + SeqIO.write([x[1] for x in antibodies], l_fasta, 'fasta') + print(f'Calling ANARCI', file=sys.stderr) + subprocess.run(['ANARCI', '-i', 'seq_H.fasta', '-o', 'seq_aligned', + '-s', 'imgt', '-r', 'heavy', '--csv']) + subprocess.run(['ANARCI', '-i', 'seq_L.fasta', '-o', 'seq_aligned', + '-s', 'imgt', '-r', 'light', '--csv']) + with open(output_filename, mode="w") as outfile: + # Process aligned sequences + # TODO: remove pandas calls, use builtin csv + infile_H = pd.read_csv('seq_aligned_H.csv') + infile_L = pd.read_csv('seq_aligned_KL.csv') + H_inclusion_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', + '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', + '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', + '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', + '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', + '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', + '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', + '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', + '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', + '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', + '111', '111A', '111B', '111C', '111D', '111E', '111F', '111G', '111H', + '112I', '112H', '112G', '112F', '112E', '112D', '112C', '112B', '112A', '112', + '113', '114', '115', '116', '117', '118', '119', '120', + '121', '122', '123', '124', '125', '126', '127', '128'] + + L_inclusion_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', + '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', + '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', + '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', + '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', + '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', + '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', + '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', + '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', + '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', + '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', + '121', '122', '123', '124', '125', '126', '127'] + + H_dict = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7, '9': 8, '10': 9, + '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '20': 19, + '21': 20, '22': 21, '23': 22, '24': 23, '25': 24, '26': 25, '27': 26, '28': 27, '29': 28, '30': 29, + '31': 30, '32': 31, '33': 32, '34': 33, '35': 34, '36': 35, '37': 36, '38': 37, '39': 38, '40': 39, + '41': 40, '42': 41, '43': 42, '44': 43, '45': 44, '46': 45, '47': 46, '48': 47, '49': 48, '50': 49, + '51': 50, '52': 51, '53': 52, '54': 53, '55': 54, '56': 55, '57': 56, '58': 57, '59': 58, '60': 59, + '61': 60, '62': 61, '63': 62, '64': 63, '65': 64, '66': 65, '67': 66, '68': 67, '69': 68, '70': 69, + '71': 70, '72': 71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '80': 79, + '81': 80, '82': 81, '83': 82, '84': 83, '85': 84, '86': 85, '87': 86, '88': 87, '89': 88, '90': 89, + '91': 90, '92': 91, '93': 92, '94': 93, '95': 94, '96': 95, '97': 96, '98': 97, '99': 98, '100': 99, + '101': 100, '102': 101, 
'103': 102, '104': 103, '105': 104, '106': 105, '107': 106, '108': 107, + '109': 108, '110': 109, + '111': 110, '111A': 111, '111B': 112, '111C': 113, '111D': 114, '111E': 115, '111F': 116, '111G': 117, + '111H': 118, + '112I': 119, '112H': 120, '112G': 121, '112F': 122, '112E': 123, '112D': 124, '112C': 125, '112B': 126, + '112A': 127, '112': 128, + '113': 129, '114': 130, '115': 131, '116': 132, '117': 133, '118': 134, '119': 135, '120': 136, + '121': 137, '122': 138, '123': 139, '124': 140, '125': 141, '126': 142, '127': 143, '128': 144} + + L_dict = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7, '9': 8, '10': 9, + '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '20': 19, + '21': 20, '22': 21, '23': 22, '24': 23, '25': 24, '26': 25, '27': 26, '28': 27, '29': 28, '30': 29, + '31': 30, '32': 31, '33': 32, '34': 33, '35': 34, '36': 35, '37': 36, '38': 37, '39': 38, '40': 39, + '41': 40, '42': 41, '43': 42, '44': 43, '45': 44, '46': 45, '47': 46, '48': 47, '49': 48, '50': 49, + '51': 50, '52': 51, '53': 52, '54': 53, '55': 54, '56': 55, '57': 56, '58': 57, '59': 58, '60': 59, + '61': 60, '62': 61, '63': 62, '64': 63, '65': 64, '66': 65, '67': 66, '68': 67, '69': 68, '70': 69, + '71': 70, '72': 71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '80': 79, + '81': 80, '82': 81, '83': 82, '84': 83, '85': 84, '86': 85, '87': 86, '88': 87, '89': 88, '90': 89, + '91': 90, '92': 91, '93': 92, '94': 93, '95': 94, '96': 95, '97': 96, '98': 97, '99': 98, '100': 99, + '101': 100, '102': 101, '103': 102, '104': 103, '105': 104, '106': 105, '107': 106, '108': 107, + '109': 108, '110': 109, + '111': 110, '112': 111, '113': 112, '114': 113, '115': 114, '116': 115, '117': 116, '118': 117, + '119': 118, '120': 119, + '121': 120, '122': 121, '123': 122, '124': 123, '125': 124, '126': 125, '127': 126, '128': 127} + + N_mAbs = len(infile_H["Id"]) + + for i in range(N_mAbs): + H_tmp = 145 * ['-'] + L_tmp = 127 * ['-'] + for col in infile_H.columns: + if (col in H_inclusion_list): + H_tmp[H_dict[col]] = infile_H.iloc[i][col] + for col in infile_L.columns: + if (col in L_inclusion_list): + L_tmp[L_dict[col]] = infile_L.iloc[i][col] + + aa_string = '' + for aa in H_tmp + L_tmp: + aa_string += aa + outfile.write(infile_H.iloc[i, 0] + " " + aa_string) + outfile.write("\n") + # Cleanup + for filename in ('seq_H.fasta', 'seq_L.fasta', + 'seq_aligned_H.csv', 'seq_aligned_KL.csv'): + os.remove(filename) + # TODO: return prepared data instead of file + # Also remove seq_aligned_HL.txt + return output_filename + + +def load_input_data(filename): + name_list = [] + seq_list = [] + with open(filename) as datafile: + for line in datafile: + line = line.strip().split() + name_list.append(line[0]) + seq_list.append(line[1]) + return name_list, seq_list + + +def one_hot_encoder(s): + d = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, '-': 20} + + x = np.zeros((len(d), len(s))) + x[[d[c] for c in s], range(len(s))] = 1 + + return x + + +def generate_features(name_list, seq_list): + # sappos + X = seq_list + X = [one_hot_encoder(s=x) for x in X] + X = np.transpose(np.asarray(X), (0, 2, 1)) + X = np.asarray(X) + json_file = open('Conv1D_regressionSAPpos.json', 'r') + loaded_model_json = json_file.read() + json_file.close() + loaded_model = model_from_json(loaded_model_json) + # load weights into model + 
loaded_model.load_weights("Conv1D_regression_SAPpos.h5") + loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae']) + sap_pos = loaded_model.predict(X) + + # scmpos + json_file = open('Conv1D_regressionSCMpos.json', 'r') + loaded_model_json = json_file.read() + json_file.close() + loaded_model = model_from_json(loaded_model_json) + # load weights into model + loaded_model.load_weights("Conv1D_regression_SCMpos.h5") + loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae']) + scm_pos = loaded_model.predict(X) + + # scmneg + json_file = open('Conv1D_regressionSCMneg.json', 'r') + loaded_model_json = json_file.read() + json_file.close() + loaded_model = model_from_json(loaded_model_json) + # load weights into model + loaded_model.load_weights("Conv1D_regression_SCMneg.h5") + loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae']) + scm_neg = loaded_model.predict(X) + + features = ['Name', 'SAP_pos_CDRH1', 'SAP_pos_CDRH2', 'SAP_pos_CDRH3', 'SAP_pos_CDRL1', 'SAP_pos_CDRL2', + 'SAP_pos_CDRL3', 'SAP_pos_CDR', 'SAP_pos_Hv', 'SAP_pos_Lv', 'SAP_pos_Fv', + 'SCM_pos_CDRH1', 'SCM_pos_CDRH2', 'SCM_pos_CDRH3', 'SCM_pos_CDRL1', 'SCM_pos_CDRL2', 'SCM_pos_CDRL3', + 'SCM_pos_CDR', 'SCM_pos_Hv', 'SCM_pos_Lv', 'SCM_pos_Fv', + 'SCM_neg_CDRH1', 'SCM_neg_CDRH2', 'SCM_neg_CDRH3', 'SCM_neg_CDRL1', 'SCM_neg_CDRL2', 'SCM_neg_CDRL3', + 'SCM_neg_CDR', 'SCM_neg_Hv', 'SCM_neg_Lv', 'SCM_neg_Fv'] + df = pd.concat([pd.DataFrame(name_list), pd.DataFrame(sap_pos), pd.DataFrame(scm_pos), pd.DataFrame(scm_neg)], + ignore_index=True, axis=1, ); + df.columns = features + return df + + + if __name__ == '__main__': parser = ArgumentParser('DeepSP prediction') parser.add_argument('-i', type=str, nargs='+', help='Input file(s)') parser.add_argument('--in_format', type=str, default='fasta', help='Input format (`fasta` or `csv`)') - parser.add_argument('-o', type=str, help='Output CSV path') + parser.add_argument('-o', type=str, default='out.csv', + help='Output CSV path') args = parser.parse_args() if args.in_format.lower() == 'csv': antibodies = process_csv_input(args.i) @@ -67,5 +255,8 @@ def process_fasta_input(files: list[str]) -> list[tuple[SeqRecord]]: antibodies = process_fasta_input(args.i) else: raise ValueError('Only `csv` and `fasta` (case-insensitive) are valid in_format values') - print(antibodies) - print(len(antibodies)) \ No newline at end of file + print(f'Found {len(antibodies)} antibodies, processing', file=sys.stderr) + anarci_align(antibodies) + name_list, seq_list = load_input_data('seq_aligned_HL.txt') + df = generate_features(name_list, seq_list) + df.to_csv(args.o, index=False) From 0ce24bffd92f86517d6983fd5a41b37f5e07ee78 Mon Sep 17 00:00:00 2001 From: synedraacus Date: Mon, 29 Apr 2024 15:48:13 +0200 Subject: [PATCH 3/5] Not using intermediate file --- DeepSP_predict.py | 186 +++++++++++++++++++++------------------------- 1 file changed, 85 insertions(+), 101 deletions(-) diff --git a/DeepSP_predict.py b/DeepSP_predict.py index 3595866..97e88f9 100755 --- a/DeepSP_predict.py +++ b/DeepSP_predict.py @@ -76,110 +76,95 @@ def anarci_align(antibodies: list[tuple[SeqRecord]], '-s', 'imgt', '-r', 'heavy', '--csv']) subprocess.run(['ANARCI', '-i', 'seq_L.fasta', '-o', 'seq_aligned', '-s', 'imgt', '-r', 'light', '--csv']) - with open(output_filename, mode="w") as outfile: - # Process aligned sequences - # TODO: remove pandas calls, use builtin csv - infile_H = pd.read_csv('seq_aligned_H.csv') - infile_L = pd.read_csv('seq_aligned_KL.csv') - H_inclusion_list = ['1', '2', '3', '4', 
'5', '6', '7', '8', '9', '10', - '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', - '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', - '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', - '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', - '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', - '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', - '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', - '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', - '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', - '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', - '111', '111A', '111B', '111C', '111D', '111E', '111F', '111G', '111H', - '112I', '112H', '112G', '112F', '112E', '112D', '112C', '112B', '112A', '112', - '113', '114', '115', '116', '117', '118', '119', '120', - '121', '122', '123', '124', '125', '126', '127', '128'] + infile_H = pd.read_csv('seq_aligned_H.csv') + infile_L = pd.read_csv('seq_aligned_KL.csv') + H_inclusion_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', + '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', + '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', + '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', + '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', + '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', + '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', + '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', + '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', + '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', + '111', '111A', '111B', '111C', '111D', '111E', '111F', '111G', '111H', + '112I', '112H', '112G', '112F', '112E', '112D', '112C', '112B', '112A', '112', + '113', '114', '115', '116', '117', '118', '119', '120', + '121', '122', '123', '124', '125', '126', '127', '128'] + + L_inclusion_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', + '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', + '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', + '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', + '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', + '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', + '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', + '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', + '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', + '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', + '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', + '121', '122', '123', '124', '125', '126', '127'] + + H_dict = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7, '9': 8, '10': 9, + '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '20': 19, + '21': 20, '22': 21, '23': 22, '24': 23, '25': 24, '26': 25, '27': 26, '28': 27, '29': 28, '30': 29, + '31': 30, '32': 31, '33': 32, '34': 33, '35': 34, '36': 35, '37': 36, '38': 37, '39': 38, '40': 39, + '41': 40, '42': 41, '43': 42, '44': 43, '45': 44, '46': 45, '47': 46, '48': 47, '49': 48, '50': 49, + '51': 50, '52': 51, '53': 52, '54': 53, '55': 54, '56': 55, '57': 56, '58': 57, '59': 58, '60': 59, + '61': 60, '62': 61, '63': 62, '64': 63, '65': 64, '66': 65, '67': 66, '68': 67, '69': 68, '70': 69, + '71': 70, '72': 
71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '80': 79, + '81': 80, '82': 81, '83': 82, '84': 83, '85': 84, '86': 85, '87': 86, '88': 87, '89': 88, '90': 89, + '91': 90, '92': 91, '93': 92, '94': 93, '95': 94, '96': 95, '97': 96, '98': 97, '99': 98, '100': 99, + '101': 100, '102': 101, '103': 102, '104': 103, '105': 104, '106': 105, '107': 106, '108': 107, + '109': 108, '110': 109, + '111': 110, '111A': 111, '111B': 112, '111C': 113, '111D': 114, '111E': 115, '111F': 116, '111G': 117, + '111H': 118, + '112I': 119, '112H': 120, '112G': 121, '112F': 122, '112E': 123, '112D': 124, '112C': 125, '112B': 126, + '112A': 127, '112': 128, + '113': 129, '114': 130, '115': 131, '116': 132, '117': 133, '118': 134, '119': 135, '120': 136, + '121': 137, '122': 138, '123': 139, '124': 140, '125': 141, '126': 142, '127': 143, '128': 144} + + L_dict = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7, '9': 8, '10': 9, + '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '20': 19, + '21': 20, '22': 21, '23': 22, '24': 23, '25': 24, '26': 25, '27': 26, '28': 27, '29': 28, '30': 29, + '31': 30, '32': 31, '33': 32, '34': 33, '35': 34, '36': 35, '37': 36, '38': 37, '39': 38, '40': 39, + '41': 40, '42': 41, '43': 42, '44': 43, '45': 44, '46': 45, '47': 46, '48': 47, '49': 48, '50': 49, + '51': 50, '52': 51, '53': 52, '54': 53, '55': 54, '56': 55, '57': 56, '58': 57, '59': 58, '60': 59, + '61': 60, '62': 61, '63': 62, '64': 63, '65': 64, '66': 65, '67': 66, '68': 67, '69': 68, '70': 69, + '71': 70, '72': 71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '80': 79, + '81': 80, '82': 81, '83': 82, '84': 83, '85': 84, '86': 85, '87': 86, '88': 87, '89': 88, '90': 89, + '91': 90, '92': 91, '93': 92, '94': 93, '95': 94, '96': 95, '97': 96, '98': 97, '99': 98, '100': 99, + '101': 100, '102': 101, '103': 102, '104': 103, '105': 104, '106': 105, '107': 106, '108': 107, + '109': 108, '110': 109, + '111': 110, '112': 111, '113': 112, '114': 113, '115': 114, '116': 115, '117': 116, '118': 117, + '119': 118, '120': 119, + '121': 120, '122': 121, '123': 122, '124': 123, '125': 124, '126': 125, '127': 126, '128': 127} - L_inclusion_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', - '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', - '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', - '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', - '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', - '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', - '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', - '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', - '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', - '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', - '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', - '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', - '121', '122', '123', '124', '125', '126', '127'] - - H_dict = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7, '9': 8, '10': 9, - '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '20': 19, - '21': 20, '22': 21, '23': 22, '24': 23, '25': 24, '26': 25, '27': 26, '28': 27, '29': 28, '30': 29, - '31': 30, '32': 31, '33': 32, '34': 33, '35': 34, '36': 35, '37': 36, '38': 37, '39': 38, '40': 39, - '41': 40, '42': 41, '43': 42, '44': 43, '45': 44, '46': 45, '47': 46, '48': 47, '49': 48, '50': 49, - 
'51': 50, '52': 51, '53': 52, '54': 53, '55': 54, '56': 55, '57': 56, '58': 57, '59': 58, '60': 59, - '61': 60, '62': 61, '63': 62, '64': 63, '65': 64, '66': 65, '67': 66, '68': 67, '69': 68, '70': 69, - '71': 70, '72': 71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '80': 79, - '81': 80, '82': 81, '83': 82, '84': 83, '85': 84, '86': 85, '87': 86, '88': 87, '89': 88, '90': 89, - '91': 90, '92': 91, '93': 92, '94': 93, '95': 94, '96': 95, '97': 96, '98': 97, '99': 98, '100': 99, - '101': 100, '102': 101, '103': 102, '104': 103, '105': 104, '106': 105, '107': 106, '108': 107, - '109': 108, '110': 109, - '111': 110, '111A': 111, '111B': 112, '111C': 113, '111D': 114, '111E': 115, '111F': 116, '111G': 117, - '111H': 118, - '112I': 119, '112H': 120, '112G': 121, '112F': 122, '112E': 123, '112D': 124, '112C': 125, '112B': 126, - '112A': 127, '112': 128, - '113': 129, '114': 130, '115': 131, '116': 132, '117': 133, '118': 134, '119': 135, '120': 136, - '121': 137, '122': 138, '123': 139, '124': 140, '125': 141, '126': 142, '127': 143, '128': 144} - - L_dict = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7, '9': 8, '10': 9, - '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '20': 19, - '21': 20, '22': 21, '23': 22, '24': 23, '25': 24, '26': 25, '27': 26, '28': 27, '29': 28, '30': 29, - '31': 30, '32': 31, '33': 32, '34': 33, '35': 34, '36': 35, '37': 36, '38': 37, '39': 38, '40': 39, - '41': 40, '42': 41, '43': 42, '44': 43, '45': 44, '46': 45, '47': 46, '48': 47, '49': 48, '50': 49, - '51': 50, '52': 51, '53': 52, '54': 53, '55': 54, '56': 55, '57': 56, '58': 57, '59': 58, '60': 59, - '61': 60, '62': 61, '63': 62, '64': 63, '65': 64, '66': 65, '67': 66, '68': 67, '69': 68, '70': 69, - '71': 70, '72': 71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '80': 79, - '81': 80, '82': 81, '83': 82, '84': 83, '85': 84, '86': 85, '87': 86, '88': 87, '89': 88, '90': 89, - '91': 90, '92': 91, '93': 92, '94': 93, '95': 94, '96': 95, '97': 96, '98': 97, '99': 98, '100': 99, - '101': 100, '102': 101, '103': 102, '104': 103, '105': 104, '106': 105, '107': 106, '108': 107, - '109': 108, '110': 109, - '111': 110, '112': 111, '113': 112, '114': 113, '115': 114, '116': 115, '117': 116, '118': 117, - '119': 118, '120': 119, - '121': 120, '122': 121, '123': 122, '124': 123, '125': 124, '126': 125, '127': 126, '128': 127} - - N_mAbs = len(infile_H["Id"]) - - for i in range(N_mAbs): - H_tmp = 145 * ['-'] - L_tmp = 127 * ['-'] - for col in infile_H.columns: - if (col in H_inclusion_list): - H_tmp[H_dict[col]] = infile_H.iloc[i][col] - for col in infile_L.columns: - if (col in L_inclusion_list): - L_tmp[L_dict[col]] = infile_L.iloc[i][col] - - aa_string = '' - for aa in H_tmp + L_tmp: - aa_string += aa - outfile.write(infile_H.iloc[i, 0] + " " + aa_string) - outfile.write("\n") + name_list = [] + seq_list = [] + N_mAbs = len(infile_H["Id"]) + for i in range(N_mAbs): + H_tmp = 145 * ['-'] + L_tmp = 127 * ['-'] + for col in infile_H.columns: + if (col in H_inclusion_list): + H_tmp[H_dict[col]] = infile_H.iloc[i][col] + for col in infile_L.columns: + if (col in L_inclusion_list): + L_tmp[L_dict[col]] = infile_L.iloc[i][col] + + name_list.append(infile_H.iloc[i, 0]) + aa_string = '' + for aa in H_tmp + L_tmp: + aa_string += aa + seq_list.append(aa_string) # Cleanup for filename in ('seq_H.fasta', 'seq_L.fasta', 'seq_aligned_H.csv', 'seq_aligned_KL.csv'): os.remove(filename) - # TODO: return prepared data instead of file 
-    # Also remove seq_aligned_HL.txt
-    return output_filename
-
-
-def load_input_data(filename):
-    name_list = []
-    seq_list = []
-    with open(filename) as datafile:
-        for line in datafile:
-            line = line.strip().split()
-            name_list.append(line[0])
-            seq_list.append(line[1])
     return name_list, seq_list
 
 
@@ -256,7 +241,6 @@ def generate_features(name_list, seq_list):
     else:
         raise ValueError('Only `csv` and `fasta` (case-insensitive) are valid in_format values')
     print(f'Found {len(antibodies)} antibodies, processing', file=sys.stderr)
-    anarci_align(antibodies)
-    name_list, seq_list = load_input_data('seq_aligned_HL.txt')
+    name_list, seq_list = anarci_align(antibodies)
     df = generate_features(name_list, seq_list)
     df.to_csv(args.o, index=False)

From f23cb3f42785219c57732fbccd18361574da0661 Mon Sep 17 00:00:00 2001
From: synedraacus
Date: Mon, 29 Apr 2024 16:06:12 +0200
Subject: [PATCH 4/5] Edited README

---
 README.md | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a2b4acb..6933da5 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,21 @@ DeepSP is an antibody-specific surrogate model that can generate 30 spatial prop
 
 # How to generate descriptors (features) using DeepSP
 
-- Prepare your input file according to the format DeepSP_input.csv
+DeepSP can be run either using an IPython notebook or as a callable script.
+
+### Using IPython notebook
+- Prepare your input file in the same format as DeepSP_input.csv
 - Run the notebook file DeepSP_predictor.ipynb
-- DeepSP structural properties for sequences inputed, would be polulated and saved to a csv file.
+- DeepSP structural properties for input sequences will be calculated and saved
+to `DeepSP_descriptors.csv`.
+
+### As a callable script
+- Activate the conda environment from `environment.yml`
+- Prepare your input either as a CSV or as a directory of FASTAs (each should
+contain one antibody, with heavy and light chain IDs postfixed `_VH` and `_VL`
+respectively)
+- Call `./DeepSP_predict.py -i <input files> --in_format <fasta|csv> -o <output.csv>`,
+e.g. `./DeepSP_predict.py -i test_fasta/*.fasta --in_format fasta -o DeepSP_descriptors.csv`
 
 # Citation
 

From 45506711c497746a4b19ab3bdaa144397653ad3a Mon Sep 17 00:00:00 2001
From: synedraacus
Date: Tue, 30 Apr 2024 13:47:50 +0200
Subject: [PATCH 5/5] Removed type hints for compatibility with older Pythons

---
 DeepSP_predict.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/DeepSP_predict.py b/DeepSP_predict.py
index 97e88f9..1e0008b 100755
--- a/DeepSP_predict.py
+++ b/DeepSP_predict.py
@@ -13,7 +13,8 @@
 import pandas as pd
 import numpy as np
 
-def process_csv_input(files: list[str]) -> list[tuple[SeqRecord]]:
+
+def process_csv_input(files):
     """
     Take antibody sequences from a CSV input.
 
@@ -35,7 +36,7 @@
     return antibodies
 
 
-def process_fasta_input(files: list[str]) -> list[tuple[SeqRecord]]:
+def process_fasta_input(files):
     """
     Take antibody sequences from FASTA input.
 
@@ -61,8 +62,7 @@
     return antibodies
 
 
-def anarci_align(antibodies: list[tuple[SeqRecord]],
-                 output_filename: str = 'seq_aligned_HL.txt') -> str:
+def anarci_align(antibodies):
     """
     Call ANARCI on both chains of the antibody, then merge the outputs in format
-    accepted by downstream analyses. Returns a filename with merged results.
+    accepted by downstream analyses. Returns merged results as (name_list, seq_list).