Lailabcode · SynedraAcus · Apr 29, 2024 · Apr 29, 2024 · Apr 29, 2024 · Apr 29, 2024
diff --git a/DeepSP_predict.py b/DeepSP_predict.py
@@ -0,0 +1,246 @@
+#! /usr/bin/env python3
+
+from argparse import ArgumentParser
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from csv import DictReader
+from keras.models import model_from_json
+
+import os
+import subprocess
+import sys
+import pandas as pd
+import numpy as np
+
+
+def process_csv_input(files):
+    """
+    Read antibody chain pairs from one or more CSV files.
+
+    Every row has to provide the 'Name', 'Heavy_Chain' and 'Light_Chain'
+    columns; any further columns are ignored. Returns a list holding one
+    [heavy, light] pair of SeqRecords per CSV row, both records using the
+    row's 'Name' as their id and an empty description.
+    """
+    def chain_record(row, column):
+        # Heavy and light records differ only in the column the sequence
+        # is read from
+        return SeqRecord(id=row['Name'], seq=Seq(row[column]), description='')
+
+    pairs = []
+    for path in files:
+        with open(path) as handle:
+            pairs.extend([chain_record(row, 'Heavy_Chain'),
+                          chain_record(row, 'Light_Chain')]
+                         for row in DictReader(handle))
+    return pairs
+
+
+def process_fasta_input(files):
+    """
+    Take antibody sequences from FASTA input.
+
+    Assumes that each FASTA file contains exactly two records named
+    '{ab_id}_VH' and '{ab_id}_VL'; order of records in the file does not
+    matter. The antibody name is taken from the id of the '_VH' record.
+
+    Returns a list holding one [heavy, light] pair of SeqRecords per file, both
+    records using the antibody name as their id.
+
+    Raises ValueError if a record id ends in neither '_VH' nor '_VL', if a
+    chain occurs more than once in a file, or if either chain is absent.
+    """
+    antibodies = []
+    # Iterate over the argument rather than the command-line namespace, so the
+    # function also works when it is imported and called from other code
+    for fasta_file in files:
+        name = None
+        heavy = None
+        light = None
+        for record in SeqIO.parse(fasta_file, 'fasta'):
+            if record.id.endswith('_VH'):
+                if heavy is not None:
+                    raise ValueError(f'More than one _VH record in {fasta_file}')
+                # Everything before the trailing '_VH' is the antibody name
+                name = record.id.rsplit('_', 1)[0]
+                heavy = record.seq
+            elif record.id.endswith('_VL'):
+                if light is not None:
+                    raise ValueError(f'More than one _VL record in {fasta_file}')
+                light = record.seq
+            else:
+                raise ValueError(f'Invalid postfix in FASTA record name {record.id}')
+        # Without this check an incomplete file would silently yield an empty
+        # sequence that only fails much later, inside the alignment step
+        if heavy is None or light is None:
+            raise ValueError(f'{fasta_file} must contain both a _VH and a _VL record')
+        antibodies.append([SeqRecord(id=name, seq=heavy, description=''),
+                           SeqRecord(id=name, seq=light, description='')])
+    return antibodies
+
+
+def anarci_align(antibodies):
+    """
+    Call ANARCI on both chains of the antibodies, then merge the outputs in
+    format accepted by downstream analyses.
+
+    `antibodies` is a list of [heavy, light] SeqRecord pairs. Both chains are
+    written to temporary FASTA files in the current working directory and
+    numbered by ANARCI (IMGT scheme, CSV output). For every row of the
+    heavy-chain output a 145-character heavy string and a 127-character light
+    string are assembled, with '-' at every position that has no column in the
+    ANARCI output, and concatenated into one 272-character string.
+
+    Returns a tuple (name_list, seq_list): the values of the first column of
+    the heavy-chain output and the merged aligned sequences, in the same order.
+    All temporary files are removed before returning, also on failure.
+
+    Raises subprocess.CalledProcessError if ANARCI exits with an error, and
+    ValueError if the heavy and light outputs do not describe the same
+    antibodies in the same order.
+    """
+    # Heavy-chain columns in output order: 1..111, insertions 111A..111H,
+    # insertions 112I..112A (lettering runs backwards here), then 112..128
+    h_positions = ([str(n) for n in range(1, 112)]
+                   + [f'111{letter}' for letter in 'ABCDEFGH']
+                   + [f'112{letter}' for letter in 'IHGFEDCBA']
+                   + [str(n) for n in range(112, 129)])
+    # Light-chain columns: plain positions 1..127, insertion columns are ignored
+    l_positions = [str(n) for n in range(1, 128)]
+    h_index = {position: slot for slot, position in enumerate(h_positions)}
+    l_index = {position: slot for slot, position in enumerate(l_positions)}
+
+    temp_files = ('seq_H.fasta', 'seq_L.fasta',
+                  'seq_aligned_H.csv', 'seq_aligned_KL.csv')
+    try:
+        with open('seq_H.fasta', mode='w') as h_fasta:
+            SeqIO.write([x[0] for x in antibodies], h_fasta, 'fasta')
+        with open('seq_L.fasta', mode='w') as l_fasta:
+            SeqIO.write([x[1] for x in antibodies], l_fasta, 'fasta')
+        print('Calling ANARCI', file=sys.stderr)
+        # check=True: stop on ANARCI failure instead of reading missing or
+        # stale CSV files afterwards
+        subprocess.run(['ANARCI', '-i', 'seq_H.fasta', '-o', 'seq_aligned',
+                        '-s', 'imgt', '-r', 'heavy', '--csv'], check=True)
+        subprocess.run(['ANARCI', '-i', 'seq_L.fasta', '-o', 'seq_aligned',
+                        '-s', 'imgt', '-r', 'light', '--csv'], check=True)
+        infile_H = pd.read_csv('seq_aligned_H.csv')
+        infile_L = pd.read_csv('seq_aligned_KL.csv')
+    finally:
+        # Cleanup; some files may not exist if an earlier step failed
+        for filename in temp_files:
+            if os.path.exists(filename):
+                os.remove(filename)
+
+    # Rows are paired by index below, so both outputs must list the same
+    # antibodies in the same order; otherwise chains would be mixed up
+    if list(infile_H['Id']) != list(infile_L['Id']):
+        raise ValueError('ANARCI heavy and light chain outputs do not contain '
+                         'the same antibodies in the same order')
+
+    # Only columns that are known numbering positions carry residues
+    h_columns = [col for col in infile_H.columns if col in h_index]
+    l_columns = [col for col in infile_L.columns if col in l_index]
+
+    name_list = []
+    seq_list = []
+    for i in range(len(infile_H)):
+        h_row = infile_H.iloc[i]
+        l_row = infile_L.iloc[i]
+        H_tmp = len(h_positions) * ['-']
+        L_tmp = len(l_positions) * ['-']
+        for col in h_columns:
+            H_tmp[h_index[col]] = h_row[col]
+        for col in l_columns:
+            L_tmp[l_index[col]] = l_row[col]
+
+        name_list.append(infile_H.iloc[i, 0])
+        seq_list.append(''.join(H_tmp + L_tmp))
+    return name_list, seq_list
+
+
+def one_hot_encoder(s):
+    """
+    One-hot encode an aligned amino acid sequence.
+
+    Returns a float array with one row per symbol of the 21-letter alphabet
+    (20 amino acids in alphabetical order of their one-letter codes, then the
+    gap '-') and one column per character of `s`; the cell matching each
+    character is set to 1. Any other character raises KeyError.
+    """
+    alphabet = 'ACDEFGHIKLMNPQRSTVWY-'
+    row_of = {symbol: row for row, symbol in enumerate(alphabet)}
+
+    encoded = np.zeros((len(row_of), len(s)))
+    rows = [row_of[c] for c in s]
+    columns = range(len(s))
+    encoded[rows, columns] = 1
+
+    return encoded
+
+
+def _load_regression_model(property_name):
+    """
+    Load the Conv1D regression model for one property ('SAPpos', 'SCMpos' or
+    'SCMneg') from its JSON architecture and HDF5 weights files, which are
+    looked up in the current working directory.
+    """
+    # The context manager closes the file even if reading or parsing fails
+    with open(f'Conv1D_regression{property_name}.json', 'r') as json_file:
+        model = model_from_json(json_file.read())
+    # load weights into model; note that the weights filenames, unlike the
+    # JSON ones, have an underscore before the property name
+    model.load_weights(f'Conv1D_regression_{property_name}.h5')
+    model.compile(optimizer='adam', loss='mae', metrics=['mae'])
+    return model
+
+
+def generate_features(name_list, seq_list):
+    """
+    Predict DeepSP features for aligned antibody sequences.
+
+    Every sequence in `seq_list` is one-hot encoded and passed through the
+    SAP_pos, SCM_pos and SCM_neg models in turn. Returns a DataFrame whose
+    first column 'Name' holds `name_list` and whose remaining 30 columns hold
+    the predictions: for each of the three properties the regions CDRH1,
+    CDRH2, CDRH3, CDRL1, CDRL2, CDRL3, CDR, Hv, Lv and Fv. Each model is
+    therefore expected to output 10 values per sequence.
+    """
+    # one_hot_encoder gives (alphabet, position) per sequence; the transpose
+    # turns the stack into (sequence, position, alphabet)
+    X = np.transpose(np.asarray([one_hot_encoder(s=seq) for seq in seq_list]),
+                     (0, 2, 1))
+
+    # Models are loaded and applied one at a time, in output column order
+    predictions = [_load_regression_model(property_name).predict(X)
+                   for property_name in ('SAPpos', 'SCMpos', 'SCMneg')]
+
+    regions = ('CDRH1', 'CDRH2', 'CDRH3', 'CDRL1', 'CDRL2', 'CDRL3',
+               'CDR', 'Hv', 'Lv', 'Fv')
+    features = ['Name'] + [f'{prefix}_{region}'
+                           for prefix in ('SAP_pos', 'SCM_pos', 'SCM_neg')
+                           for region in regions]
+    df = pd.concat([pd.DataFrame(name_list)]
+                   + [pd.DataFrame(prediction) for prediction in predictions],
+                   ignore_index=True, axis=1)
+    df.columns = features
+    return df
+
+
+
+if __name__ == '__main__':
+    # Command-line entry point: read antibodies from CSV or FASTA input, align
+    # them with ANARCI, predict the DeepSP features and write them as CSV.
+    # `description=` is passed by keyword because the first positional
+    # argument of ArgumentParser is the program name shown in the usage line.
+    parser = ArgumentParser(description='DeepSP prediction')
+    # Required: without input files there is nothing to process, and a missing
+    # `-i` would otherwise only surface later as an error on a None value
+    parser.add_argument('-i', type=str, nargs='+', required=True,
+                        help='Input file(s)')
+    parser.add_argument('--in_format', type=str, default='fasta',
+                        help='Input format (`fasta` or `csv`)')
+    parser.add_argument('-o', type=str, default='out.csv',
+                        help='Output CSV path')
+    # Deliberately kept at module level so the parsed arguments stay visible
+    # as a global of the script
+    args = parser.parse_args()
+    in_format = args.in_format.lower()
+    if in_format == 'csv':
+        antibodies = process_csv_input(args.i)
+    elif in_format == 'fasta':
+        antibodies = process_fasta_input(args.i)
+    else:
+        raise ValueError('Only `csv` and `fasta` (case-insensitive) are valid in_format values')
+    print(f'Found {len(antibodies)} antibodies, processing', file=sys.stderr)
+    name_list, seq_list = anarci_align(antibodies)
+    df = generate_features(name_list, seq_list)
+    df.to_csv(args.o, index=False)
diff --git a/README.md b/README.md
@@ -3,9 +3,20 @@ DeepSP is an antibody-specific surrogate model that can generate 30 spatial prop
 
 # How to generate descriptors (features) using DeepSP
 
-- Prepare your input file according to the format DeepSP_input.csv
+DeepSP can be run either using an IPython notebook or as a callable script.
+
+### Using IPython notebook
+- Prepare your input file in the same format as DeepSP_input.csv
 - Run the notebook file DeepSP_predictor.ipynb
-- DeepSP structural properties for sequences inputed, would be polulated and saved to a csv file.
+- DeepSP structural properties for the input sequences are calculated and saved
+to `DeepSP_descriptors.csv`.
+
+### As a callable script
+- Activate the conda environment defined in `environment.yml`
+- Prepare your input either as a CSV file or as one or more FASTA files (each
+FASTA should contain one antibody, with heavy and light chain record IDs
+postfixed `_VH` and `_VL` respectively)
+- Call `./DeepSP_predict.py -i <input files> --in_format <fasta|csv> -o <out.csv>`
 
 # Citation
 

diff --git a/test_fasta/mAb1.fasta b/test_fasta/mAb1.fasta
@@ -0,0 +1,4 @@
+>mAb1_VH
+EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLEWVSAITWNSGHIDYADSVEGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAKVSYLSTASSLDYWGQGTLVTVSS
+>mAb1_VL
+DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDVATYYCQRYNRAPYTFGQGTKVEIK
diff --git a/test_fasta/mAb2.fasta b/test_fasta/mAb2.fasta
@@ -0,0 +1,4 @@
+>mAb2_VH
+EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLEWVAWISPYGGSTYYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARRHWPGGFDYWGQGTLVTVSA
+>mAb2_VL
+DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYLYHPATFGQGTKVEIK
diff --git a/test_fasta/mAb3.fasta b/test_fasta/mAb3.fasta
@@ -0,0 +1,4 @@
+>mAb3_VL
+DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRLLIKYASESISGIPSRFSGSGSGTDFTLSINSVESEDIADYYCQQNNNWPTTFGAGTKLELK
+>mAb3_VH
+QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLEWLGVIWSGGNTDYNTPFTSRLSINKDNSKSQVFFKMNSLQSNDTAIYYCARALTYYDYEFAYWGQGTLVTVSA