diff --git a/DeepSP_predict.py b/DeepSP_predict.py
new file mode 100755
index 0000000..1e0008b
--- /dev/null
+++ b/DeepSP_predict.py
@@ -0,0 +1,246 @@
+#! /usr/bin/env python3
+
+from argparse import ArgumentParser
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from csv import DictReader
+from keras.models import model_from_json
+
+import os
+import subprocess
+import sys
+import pandas as pd
+import numpy as np
+
+
+def process_csv_input(files):
+    """
+    Take antibody sequences from a CSV input.
+
+    Assumes that column names are 'Name,Heavy_Chain,Light_Chain'; other
+    columns, if any, are ignored.
+    """
+    antibodies = []
+    for csv_file in files:
+        with open(csv_file) as csv_input:
+            reader = DictReader(csv_input)
+            for antibody in reader:
+                antibodies.append([SeqRecord(id=antibody['Name'],
+                                             seq=Seq(antibody['Heavy_Chain']),
+                                             description=''),
+                                   SeqRecord(id=antibody['Name'],
+                                             seq=Seq(antibody['Light_Chain']),
+                                             description='')])
+    return antibodies
+
+
+def process_fasta_input(files):
+    """
+    Take antibody sequences from FASTA input.
+
+    Assumes that each FASTA file contains two records named '{ab_id}_VH' and
+    '{ab_id}_VL'. Raises ValueError if either of these records is absent or
+    any additional records are found; the order of records in the file does
+    not matter.
+    """
+    antibodies = []
+    for fasta_file in files:
+        name = ''
+        heavy = ''
+        light = ''
+        for record in SeqIO.parse(fasta_file, 'fasta'):
+            if record.id.endswith('_VH'):
+                name = '_'.join(record.id.split('_')[:-1])
+                heavy = record.seq
+            elif record.id.endswith('_VL'):
+                light = record.seq
+            else:
+                raise ValueError(f'Invalid suffix in FASTA record name {record.id}')
+        if not (name and heavy and light):
+            raise ValueError(f'Missing _VH or _VL record in {fasta_file}')
+        antibodies.append([SeqRecord(id=name, seq=heavy, description=''),
+                           SeqRecord(id=name, seq=light, description='')])
+    return antibodies
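+
+
+# Illustrative examples of the two accepted input formats (sequences
+# truncated here; see test_fasta/ and DeepSP_input.csv for full examples):
+#   CSV:   Name,Heavy_Chain,Light_Chain
+#          mAb1,EVQLVESGG...,DIQMTQSPS...   (one antibody per row)
+#   FASTA: one file per antibody with exactly two records, e.g.
+#          '>mAb1_VH' and '>mAb1_VL'.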
+ """ + with open('seq_H.fasta', mode='w') as h_fasta: + SeqIO.write([x[0] for x in antibodies], h_fasta, 'fasta') + with open('seq_L.fasta', mode='w') as l_fasta: + SeqIO.write([x[1] for x in antibodies], l_fasta, 'fasta') + print(f'Calling ANARCI', file=sys.stderr) + subprocess.run(['ANARCI', '-i', 'seq_H.fasta', '-o', 'seq_aligned', + '-s', 'imgt', '-r', 'heavy', '--csv']) + subprocess.run(['ANARCI', '-i', 'seq_L.fasta', '-o', 'seq_aligned', + '-s', 'imgt', '-r', 'light', '--csv']) + infile_H = pd.read_csv('seq_aligned_H.csv') + infile_L = pd.read_csv('seq_aligned_KL.csv') + H_inclusion_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', + '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', + '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', + '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', + '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', + '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', + '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', + '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', + '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', + '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', + '111', '111A', '111B', '111C', '111D', '111E', '111F', '111G', '111H', + '112I', '112H', '112G', '112F', '112E', '112D', '112C', '112B', '112A', '112', + '113', '114', '115', '116', '117', '118', '119', '120', + '121', '122', '123', '124', '125', '126', '127', '128'] + + L_inclusion_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', + '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', + '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', + '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', + '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', + '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', + '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', + '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', + '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', + '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', + '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', + '121', '122', '123', '124', '125', '126', '127'] + + H_dict = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7, '9': 8, '10': 9, + '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '20': 19, + '21': 20, '22': 21, '23': 22, '24': 23, '25': 24, '26': 25, '27': 26, '28': 27, '29': 28, '30': 29, + '31': 30, '32': 31, '33': 32, '34': 33, '35': 34, '36': 35, '37': 36, '38': 37, '39': 38, '40': 39, + '41': 40, '42': 41, '43': 42, '44': 43, '45': 44, '46': 45, '47': 46, '48': 47, '49': 48, '50': 49, + '51': 50, '52': 51, '53': 52, '54': 53, '55': 54, '56': 55, '57': 56, '58': 57, '59': 58, '60': 59, + '61': 60, '62': 61, '63': 62, '64': 63, '65': 64, '66': 65, '67': 66, '68': 67, '69': 68, '70': 69, + '71': 70, '72': 71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '80': 79, + '81': 80, '82': 81, '83': 82, '84': 83, '85': 84, '86': 85, '87': 86, '88': 87, '89': 88, '90': 89, + '91': 90, '92': 91, '93': 92, '94': 93, '95': 94, '96': 95, '97': 96, '98': 97, '99': 98, '100': 99, + '101': 100, '102': 101, '103': 102, '104': 103, '105': 104, '106': 105, '107': 106, '108': 107, + '109': 108, '110': 109, + '111': 110, '111A': 111, 
+
+
+def one_hot_encoder(s):
+    """One-hot encode an aligned sequence into a 21 x len(s) matrix."""
+    d = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7,
+         'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
+         'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, '-': 20}
+    x = np.zeros((len(d), len(s)))
+    x[[d[c] for c in s], range(len(s))] = 1
+    return x
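+
+
+# For example, one_hot_encoder('AC-') returns a 21x3 matrix with ones at
+# (0, 0) for 'A', (1, 1) for 'C' and (20, 2) for the gap character.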
+
+
+def load_model_and_predict(json_path, weights_path, X):
+    """Load a serialized Keras model, then predict descriptors for X."""
+    with open(json_path) as json_file:
+        loaded_model = model_from_json(json_file.read())
+    loaded_model.load_weights(weights_path)
+    loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae'])
+    return loaded_model.predict(X)
+
+
+def generate_features(name_list, seq_list):
+    # One-hot encode the aligned sequences: shape (N_mAbs, 272, 21).
+    X = np.transpose(np.asarray([one_hot_encoder(s=x) for x in seq_list]),
+                     (0, 2, 1))
+    sap_pos = load_model_and_predict('Conv1D_regressionSAPpos.json',
+                                     'Conv1D_regression_SAPpos.h5', X)
+    scm_pos = load_model_and_predict('Conv1D_regressionSCMpos.json',
+                                     'Conv1D_regression_SCMpos.h5', X)
+    scm_neg = load_model_and_predict('Conv1D_regressionSCMneg.json',
+                                     'Conv1D_regression_SCMneg.h5', X)
+
+    features = ['Name',
+                'SAP_pos_CDRH1', 'SAP_pos_CDRH2', 'SAP_pos_CDRH3',
+                'SAP_pos_CDRL1', 'SAP_pos_CDRL2', 'SAP_pos_CDRL3',
+                'SAP_pos_CDR', 'SAP_pos_Hv', 'SAP_pos_Lv', 'SAP_pos_Fv',
+                'SCM_pos_CDRH1', 'SCM_pos_CDRH2', 'SCM_pos_CDRH3',
+                'SCM_pos_CDRL1', 'SCM_pos_CDRL2', 'SCM_pos_CDRL3',
+                'SCM_pos_CDR', 'SCM_pos_Hv', 'SCM_pos_Lv', 'SCM_pos_Fv',
+                'SCM_neg_CDRH1', 'SCM_neg_CDRH2', 'SCM_neg_CDRH3',
+                'SCM_neg_CDRL1', 'SCM_neg_CDRL2', 'SCM_neg_CDRL3',
+                'SCM_neg_CDR', 'SCM_neg_Hv', 'SCM_neg_Lv', 'SCM_neg_Fv']
+    df = pd.concat([pd.DataFrame(name_list), pd.DataFrame(sap_pos),
+                    pd.DataFrame(scm_pos), pd.DataFrame(scm_neg)],
+                   ignore_index=True, axis=1)
+    df.columns = features
+    return df
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser(description='DeepSP prediction')
+    parser.add_argument('-i', type=str, nargs='+', required=True,
+                        help='Input file(s)')
+    parser.add_argument('--in_format', type=str, default='fasta',
+                        help='Input format (`fasta` or `csv`)')
+    parser.add_argument('-o', type=str, default='out.csv',
+                        help='Output CSV path')
+    args = parser.parse_args()
+    if args.in_format.lower() == 'csv':
+        antibodies = process_csv_input(args.i)
+    elif args.in_format.lower() == 'fasta':
+        antibodies = process_fasta_input(args.i)
+    else:
+        raise ValueError('Only `csv` and `fasta` (case-insensitive) are '
+                         'valid in_format values')
+    print(f'Found {len(antibodies)} antibodies, processing', file=sys.stderr)
+    name_list, seq_list = anarci_align(antibodies)
+    df = generate_features(name_list, seq_list)
+    df.to_csv(args.o, index=False)
diff --git a/README.md b/README.md
index a2b4acb..6933da5 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,20 @@ DeepSP is an antibody-specific surrogate model that can generate 30 spatial prop
 # How to generate descriptors (features) using DeepSP
 
-- Prepare your input file according to the format DeepSP_input.csv
+DeepSP can be run either from the Jupyter notebook or as a command-line script.
+
+### Using the Jupyter notebook
+- Prepare your input file in the same format as DeepSP_input.csv
 - Run the notebook file DeepSP_predictor.ipynb
-DeepSP structural properties for sequences inputed, would be polulated and saved to a csv file.
+DeepSP structural properties for the input sequences will be calculated and
+saved to `DeepSP_descriptors.csv`.
+
+### As a command-line script
+- Activate the conda environment from `environment.yml`
+- Prepare your input either as a CSV file or as FASTA files, each containing
+one antibody with heavy and light chain record IDs suffixed `_VH` and `_VL`
+respectively
+- Call `./DeepSP_predict.py -i <inputs> --in_format <fasta|csv> -o <output.csv>`,
+as in the example below
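+
+For example (a minimal sketch; the environment name `deepsp` is an assumption,
+use whatever name `environment.yml` defines):
+
+```
+conda env create -f environment.yml
+conda activate deepsp
+./DeepSP_predict.py -i test_fasta/*.fasta --in_format fasta -o DeepSP_descriptors.csv
+```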
 
 # Citation
diff --git a/test_fasta/mAb1.fasta b/test_fasta/mAb1.fasta
new file mode 100644
index 0000000..727ecae
--- /dev/null
+++ b/test_fasta/mAb1.fasta
@@ -0,0 +1,4 @@
+>mAb1_VH
+EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLEWVSAITWNSGHIDYADSVEGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAKVSYLSTASSLDYWGQGTLVTVSS
+>mAb1_VL
+DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDVATYYCQRYNRAPYTFGQGTKVEIK
\ No newline at end of file
diff --git a/test_fasta/mAb2.fasta b/test_fasta/mAb2.fasta
new file mode 100644
index 0000000..c170638
--- /dev/null
+++ b/test_fasta/mAb2.fasta
@@ -0,0 +1,4 @@
+>mAb2_VH
+EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLEWVAWISPYGGSTYYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARRHWPGGFDYWGQGTLVTVSA
+>mAb2_VL
+DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYLYHPATFGQGTKVEIK
\ No newline at end of file
diff --git a/test_fasta/mAb3.fasta b/test_fasta/mAb3.fasta
new file mode 100644
index 0000000..116c1fe
--- /dev/null
+++ b/test_fasta/mAb3.fasta
@@ -0,0 +1,4 @@
+>mAb3_VL
+DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRLLIKYASESISGIPSRFSGSGSGTDFTLSINSVESEDIADYYCQQNNNWPTTFGAGTKLELK
+>mAb3_VH
+QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLEWLGVIWSGGNTDYNTPFTSRLSINKDNSKSQVFFKMNSLQSNDTAIYYCARALTYYDYEFAYWGQGTLVTVSA
\ No newline at end of file