From b3435aa7347ffeb9959d5f59c3973bcdae7b108e Mon Sep 17 00:00:00 2001 From: Matteo Tiberti Date: Mon, 13 Jan 2025 11:56:36 +0100 Subject: [PATCH 1/4] [WIP] partial implementation --- mavisp/methods.py | 122 +++++++++++++++++++++++++++------------------- mavisp/modules.py | 16 ++---- 2 files changed, 77 insertions(+), 61 deletions(-) diff --git a/mavisp/methods.py b/mavisp/methods.py index d5c7a9b..33cd0bb 100644 --- a/mavisp/methods.py +++ b/mavisp/methods.py @@ -99,6 +99,8 @@ class MutateXBinding(Method): target_chain = 'A' measure = "Binding with" complex_status = "heterodimer" + averages_filename = 'energies.csv' + stds_filename = 'energies_std.csv' def __init__(self, version, complex_status=None): @@ -109,6 +111,57 @@ def __init__(self, version, complex_status=None): self.interactors = [] + def _parse_mutatex_energy_file(self, fname, data_type, interactor): + + try: + df = pd.read_csv(fname) + except Exception as e: + this_error = f"Exception {type(e).__name__} occurred when parsing the MutateX csv file. Arguments:{e.args}" + raise MAVISpMultipleError(warning=warnings, + critical=[MAVISpCriticalError(this_error)]) + + # create residue column + df['residue'] = df['WT residue type'] + df['Residue #'].astype(str) + + # detect and handle homodimer case + chains = set(df['chain ID'].unique()) + + if self.target_chain in chains: + df = df[ df['chain ID'] == self.target_chain ] + + elif set(df['chain ID'].unique()) != self.homodimer_chains: + message = "chain ID in FoldX energy file must be either A or B (heterodimer case) or AB (homodimer case)" + raise MAVISpMultipleError(critical=[MAVISpCriticalError(message)], + warning=[]) + + df = df.drop(['WT residue type', 'Residue #', 'chain ID'], axis=1) + + # stack remaining columns + df = df.set_index('residue') + df = df.stack() + df = df.reset_index() + + # create mutation column + df['mutations'] = df['residue'] + df['level_1'] + df = df.set_index('mutations') + + # drop now useless columns, rename + df = df.drop(['residue', 'level_1'], axis=1) + + # handle space around measure + if self.measure == "": + measure = "" + else: + measure = f"{self.measure} " + + # handle data type + if data_type is None or data_type == '': + colname = f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit})" + else: + colname = f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit}, {data_type})" + + return df.rename(columns={0 : colname}) + def parse(self, dir_path): warnings = [] @@ -128,63 +181,30 @@ def parse(self, dir_path): mutatex_files = os.listdir(interactor_dir) - if len(mutatex_files) != 1: - raise MAVISpMultipleError(critical=[MAVISpCriticalError(f"zero or multiple files found in {interactor_dir}; exactly one expected")], - warning=warnings) - - mutatex_file = mutatex_files[0] - - try: - df = pd.read_csv(os.path.join(interactor_dir, mutatex_file)) - except Exception as e: - this_error = f"Exception {type(e).__name__} occurred when parsing the MutateX csv file. Arguments:{e.args}" + if self.averages_filename not in mutatex_files: + this_error = f"energies.csv file not found in {interactor_dir}" raise MAVISpMultipleError(warning=warnings, - critical=[MAVISpCriticalError(this_error)]) - - # create residue column - df['residue'] = df['WT residue type'] + df['Residue #'].astype(str) - - # detect and handle homodimer case - chains = set(df['chain ID'].unique()) - - if self.target_chain in chains: - df = df[ df['chain ID'] == self.target_chain ] - - elif set(df['chain ID'].unique()) != self.homodimer_chains: - message = "chain ID in FoldX energy file must be either A or B (heterodimer case) or AB (homodimer case)" - raise MAVISpMultipleError(critical=[MAVISpCriticalError(message)], - warning=[]) - - df = df.drop(['WT residue type', 'Residue #', 'chain ID'], axis=1) - - # stack remaining columns - df = df.set_index('residue') - df = df.stack() - df = df.reset_index() - - # create mutation column - df['mutations'] = df['residue'] + df['level_1'] - df = df.set_index('mutations') + critical=[MAVISpCriticalError(this_error)]) - # drop now useless columns, rename - df = df.drop(['residue', 'level_1'], axis=1) + averages_df = self._parse_mutatex_energy_file(os.path.join(interactor_dir, self.averages_filename), '', interactor) - # handle space around measure - if self.measure == "": - measure = "" + if self.stds_filename in mutatex_files: + stds_df = self._parse_mutatex_energy_file(os.path.join(interactor_dir, self.stds_filename), 'st. dev.', interactor) else: - measure = f"{self.measure} " - - df = df.rename(columns={0 : f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit})"}) + warnings.append(MAVISpWarningError("standard deviation file not found for MutateX data")) + stds_df = None if all_data is None: - all_data = df + all_data = averages_df else: - all_data = all_data.join(df, how='outer') + all_data = all_data.join(averages_df, how='outer') + + if stds_df is not None: + all_data = all_dat.join(stds_df, how='outer') return all_data, warnings -class MutateXDNABinding(Method): +class MutateXDNABinding(MutateXBinding): unit = "kcal/mol" type = "Local Int. With DNA" @@ -194,8 +214,6 @@ class MutateXDNABinding(Method): measure = "" complex_status = "heterodimer" - parse = MutateXBinding.parse - class RosettaDDGPredictionStability(Method): unit = "kcal/mol" @@ -349,6 +367,7 @@ def parse(self, dir_path): rosetta_file = rosetta_files[0] mutation_data = self._parse_aggregate_csv(os.path.join(interactor_dir, rosetta_file), warnings) + mutation_data_std = None elif len(rosetta_files) > 1 and all( [ os.path.isdir(os.path.join(interactor_dir, f)) for f in rosetta_files] ): mutation_data = None @@ -371,6 +390,7 @@ def parse(self, dir_path): mutation_data = mutation_data.join(conformer_data) mutation_data = pd.DataFrame(mutation_data.mean(axis=1), columns=['total_score']) + mutation_data_std = pd.DataFrame(mutation_data.std(axis=1), columns=['total_score']) else: text = f"dataset {interactor_dir} was not either a single files, or multiple directories containing one file" @@ -384,6 +404,10 @@ def parse(self, dir_path): else: all_data = all_data.join(mutation_data, how='outer') + if mutation_data_std is not None: + mutation_data_std = mutation_data_std.rename(columns={'total_score':f'{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit}, st. dev.)'}) + all_data = all_data.join(mutation_data_std, how='outer') + return all_data, warnings class AlloSigma(Method): diff --git a/mavisp/modules.py b/mavisp/modules.py index afabd52..95545c0 100644 --- a/mavisp/modules.py +++ b/mavisp/modules.py @@ -405,8 +405,7 @@ def ingest(self, mutations): except MAVISpMultipleError as e: if len(e.critical) > 0: raise - else: - e = None + warnings += e.warning module_dir_files = os.listdir(os.path.join(self.data_dir, self.module_dir)) @@ -425,12 +424,9 @@ def ingest(self, mutations): self.data = self.data.drop(columns=['res_num', 'sas_sc_rel']) - if e is None and len(warnings) > 0: + if len(warnings) > 0: raise MAVISpMultipleError(warning=warnings, critical=[]) - elif len(warnings) > 0: - e.warning.extend(warnings) - raise e def _generate_local_interactions_classification(self, row, ci, stab_co=1.0): @@ -507,8 +503,7 @@ def ingest(self, mutations): except MAVISpMultipleError as e: if len(e.critical) > 0: raise - else: - e = None + warnings += e.warning rsa = self._parse_sas(os.path.join(self.data_dir, self.module_dir, self.sas_filename), warnings) @@ -525,12 +520,9 @@ def ingest(self, mutations): self.data = self.data.drop(columns=['res_num', 'sas_sc_rel']) - if e is None and len(warnings) > 0: + if len(warnings) > 0: raise MAVISpMultipleError(warning=warnings, critical=[]) - elif len(warnings) > 0: - e.warning.extend(warnings) - raise e def _generate_local_interactions_DNA_classification(self, row, ci, stab_co=1.0): From e532801790ffd0c5f99cc1b83574720676f8ad90 Mon Sep 17 00:00:00 2001 From: Konstantina Gkopi Date: Sun, 14 Dec 2025 14:14:36 +0100 Subject: [PATCH 2/4] Added st.dev column for MutateXBinding and RosettaDDGPredictionBinding --- mavisp/methods.py | 191 ++++++++++++++++++++++++++++++---------------- 1 file changed, 126 insertions(+), 65 deletions(-) diff --git a/mavisp/methods.py b/mavisp/methods.py index b2c6fa5..66c549d 100644 --- a/mavisp/methods.py +++ b/mavisp/methods.py @@ -91,17 +91,18 @@ def parse(self, dir_path): return averages_df, stds_df, warnings -class MutateXBinding(Method): - +class MutateXBinding(Method): + #a parsing class; produces binding ΔΔG data + unit = "kcal/mol" type = "Local Int." - heterodimer_chains = set(['A']) - homodimer_chains = set(['AB']) + heterodimer_chains = set(['A']) # If chain A exists → keep A only. + homodimer_chains = set(['AB']) # If chain A does NOT exist → the ONLY valid alternative is homodimer AB. target_chain = 'A' measure = "Binding with" - complex_status = "heterodimer" averages_filename = 'energies.csv' stds_filename = 'energies_std.csv' + complex_status = "heterodimer" def __init__(self, version, complex_status=None): @@ -112,7 +113,9 @@ def __init__(self, version, complex_status=None): self.interactors = [] - def _parse_mutatex_energy_file(self, fname, data_type, interactor): + # data_type is either '' or 'st. dev.' + def _parse_mutatex_binding_file(self, fname, interactor, data_type): + """Parse a single MutateX binding file (average or std).""" try: df = pd.read_csv(fname) @@ -120,90 +123,98 @@ def _parse_mutatex_energy_file(self, fname, data_type, interactor): this_error = f"Exception {type(e).__name__} occurred when parsing the MutateX csv file. Arguments:{e.args}" raise MAVISpMultipleError(warning=warnings, critical=[MAVISpCriticalError(this_error)]) - - # create residue column + + # Create residue column df['residue'] = df['WT residue type'] + df['Residue #'].astype(str) - # detect and handle homodimer case + # Detect and handle homodimer case chains = set(df['chain ID'].unique()) - + + # If chain A exists, KEEP ONLY rows where chain ID == 'A'. if self.target_chain in chains: df = df[ df['chain ID'] == self.target_chain ] - + # If chain A does NOT exist, the ONLY valid alternative is homodimer AB. elif set(df['chain ID'].unique()) != self.homodimer_chains: message = "chain ID in FoldX energy file must be either A or B (heterodimer case) or AB (homodimer case)" raise MAVISpMultipleError(critical=[MAVISpCriticalError(message)], - warning=[]) + warning=[]) + # Drop unnecessary columns df = df.drop(['WT residue type', 'Residue #', 'chain ID'], axis=1) - # stack remaining columns - df = df.set_index('residue') - df = df.stack() - df = df.reset_index() + # Stack remaining columns + df = df.set_index('residue') # set 'residue' column as index + df = df.stack() # rotates columns downward and makes the dataframe long-format (level_1 contains the original column names and 0 contains the values) + df = df.reset_index() # reset index to turn the index into a column - # create mutation column - df['mutations'] = df['residue'] + df['level_1'] - df = df.set_index('mutations') + # Create mutation column + df['mutations'] = df['residue'] + df['level_1'] # concatenate 'residue' and 'level_1' columns to create 'mutations' column + df = df.set_index('mutations') # set 'mutations' column as index - # drop now useless columns, rename + # Drop now useless columns, rename df = df.drop(['residue', 'level_1'], axis=1) # handle space around measure if self.measure == "": - measure = "" + measure = "" else: - measure = f"{self.measure} " + measure = f"{self.measure} " - # handle data type + # rename column Local Int. (Binding with B, heterodimer, FoldX5, kcal/mol) if data_type is None or data_type == '': - colname = f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit})" + colname = f"{self.type} ({self.measure} {interactor}, {self.complex_status}, {self.version}, {self.unit})" else: - colname = f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit}, {data_type})" - - return df.rename(columns={0 : colname}) + colname = f"{self.type} ({self.measure} {interactor}, {self.complex_status}, {self.version}, {self.unit}, {data_type})" + + return df.rename(columns={0 : colname}) # rename the sinle column named 0 to the formatted name def parse(self, dir_path): - + """ reads the MutateX output files (energies.csv + energies_std.csv) for each interactor, converts them into mutation-indexed dataframes, and returns them to the Local interaction module.""" + warnings = [] + all_data = None - interactors = os.listdir(dir_path) - self.interactors = interactors + interactors = os.listdir(dir_path) #list of subfolders in dir_path + self.interactors = interactors #store interactors in the instance variable if len(interactors) == 0: raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")], warning=warnings) - all_data = None - for interactor in interactors: - + interactor_dir = os.path.join(dir_path, interactor) - mutatex_files = os.listdir(interactor_dir) + # expect energies.csv file per interactor if self.averages_filename not in mutatex_files: this_error = f"energies.csv file not found in {interactor_dir}" raise MAVISpMultipleError(warning=warnings, - critical=[MAVISpCriticalError(this_error)]) - - averages_df = self._parse_mutatex_energy_file(os.path.join(interactor_dir, self.averages_filename), '', interactor) + critical=[MAVISpCriticalError(this_error)]) + + # Parse averages file + averages_df = self._parse_mutatex_binding_file(os.path.join(interactor_dir, self.averages_filename), interactor, '') + # Parse stds file if it exists if self.stds_filename in mutatex_files: - stds_df = self._parse_mutatex_energy_file(os.path.join(interactor_dir, self.stds_filename), 'st. dev.', interactor) + stds_df = self._parse_mutatex_binding_file(os.path.join(interactor_dir, self.stds_filename), interactor, 'st. dev.') else: - warnings.append(MAVISpWarningError("standard deviation file not found for MutateX data")) + warnings.append(MAVISpWarningError("standard deviation file not found for MutateX binding data")) stds_df = None - + + # Combine averages and stds data + if stds_df is not None: + interactor_data = averages_df.join(stds_df, how='outer') + else: + interactor_data = averages_df + + # Combine data across interactors if all_data is None: - all_data = averages_df + all_data = interactor_data else: - all_data = all_data.join(averages_df, how='outer') + all_data = all_data.join(interactor_data, how='outer') - if stds_df is not None: - all_data = all_dat.join(stds_df, how='outer') - - return all_data, warnings + return all_data, warnings class MutateXDNABinding(MutateXBinding): @@ -329,7 +340,6 @@ def parse(self, dir_path): return avg_mutation_data, std_mutation_data, warnings class RosettaDDGPredictionBinding(Method): - unit = "kcal/mol" type = "Local Int." chain = 'A' @@ -350,8 +360,8 @@ def parse(self, dir_path): warnings = [] - interactors = os.listdir(dir_path) - self.interactors = interactors + interactors = os.listdir(dir_path) + self.interactors = interactors if len(interactors) == 0: raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")], @@ -361,55 +371,106 @@ def parse(self, dir_path): for interactor in interactors: - interactor_dir = os.path.join(dir_path, interactor) - rosetta_files = os.listdir(interactor_dir) - + interactor_dir = os.path.join(dir_path, interactor) + rosetta_files = os.listdir(interactor_dir) + + # Identify the correct files + agg_file = None + struct_file = None + + for f in rosetta_files: + if f.endswith('_aggregate.csv'): + agg_file = os.path.join(interactor_dir, f) + elif f.endswith('_structures.csv'): + struct_file = os.path.join(interactor_dir, f) + + + # Expect either a single file or multiple directories containing one file each or agg_file + struct_file if len(rosetta_files) == 1 and os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])): - + rosetta_file = rosetta_files[0] + # Parse single aggregate CSV file mutation_data = self._parse_aggregate_csv(os.path.join(interactor_dir, rosetta_file), warnings) - mutation_data_std = None + # Multiple directories containing one file each elif len(rosetta_files) > 1 and all( [ os.path.isdir(os.path.join(interactor_dir, f)) for f in rosetta_files] ): mutation_data = None - for c, conformer_dir in enumerate(rosetta_files): + for c, conformer_dir in enumerate(rosetta_files): - conformer_files = os.listdir(os.path.join(interactor_dir, conformer_dir)) + conformer_files = os.listdir(os.path.join(interactor_dir, conformer_dir)) if len(conformer_files) != 1: text = "only one file per conformer is supported for RosettaDDGPrediction" raise MAVISpMultipleError(critical=[MAVISpCriticalError(text)], warning=warnings) - - conformer_data = self._parse_aggregate_csv(os.path.join(interactor_dir, conformer_dir, conformer_files[0]), warnings) - + + conformer_data = self._parse_aggregate_csv(os.path.join(interactor_dir, conformer_dir, conformer_files[0]), warnings) conformer_data = conformer_data.rename(columns={'total_score' : f'total_score_{c}'}) if mutation_data is None: mutation_data = conformer_data else: mutation_data = mutation_data.join(conformer_data) - + + # Average total_score across conformers mutation_data = pd.DataFrame(mutation_data.mean(axis=1), columns=['total_score']) - mutation_data_std = pd.DataFrame(mutation_data.std(axis=1), columns=['total_score']) + + elif agg_file is not None: + mutation_data = self._parse_aggregate_csv(agg_file, warnings) + + # Parse struct_file exists + if struct_file is not None: + std_df = self._parse_structure_csv(struct_file, warnings) + + if std_df is not None: + std_df.columns = [ + f"{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit}, st. dev.)" + ] + + mutation_data = mutation_data.join(std_df, how="outer") else: text = f"dataset {interactor_dir} was not either a single files, or multiple directories containing one file" raise MAVISpMultipleError(critical=[MAVISpCriticalError(text)], warning=warnings) + mutation_data = mutation_data.rename(columns={'total_score':f'{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit})'}) if all_data is None: all_data = mutation_data else: all_data = all_data.join(mutation_data, how='outer') + + # return the combined data for all interactors + return all_data, warnings - if mutation_data_std is not None: - mutation_data_std = mutation_data_std.rename(columns={'total_score':f'{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit}, st. dev.)'}) - all_data = all_data.join(mutation_data_std, how='outer') - return all_data, warnings + def _parse_structure_csv(self, csvf, warnings): + """Parse the RosettaDDGPrediction binding structure CSV file.""" + try: + df = pd.read_csv(csvf) + except Exception as e: + this_error = f"Exception {type(e).__name__} while reading structure CSV: {e.args}" + raise MAVISpMultipleError( + warning=warnings, + critical=[MAVISpCriticalError(this_error)] + ) + + #keep only ddg rows + df = df[df["state"] == "ddg"] + + if df.empty: + warnings.append(f"{csvf}: no ddg rows found") + return None + + #group by mutation and compute stdev of total_score + std_series = df.groupby("mutation_label")["total_score"].std() + + #turn into DataFrame + std_df = std_series.to_frame(name ="total_score") + + return std_df class AlloSigma(Method): From 6accee5e0158d48c4fc67c13c3c7df42d23dfb83 Mon Sep 17 00:00:00 2001 From: Matteo Tiberti Date: Tue, 6 Jan 2026 12:45:04 +0100 Subject: [PATCH 3/4] first complete implementation for simple mode --- mavisp/methods.py | 91 ++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 49 deletions(-) diff --git a/mavisp/methods.py b/mavisp/methods.py index 66c549d..781886e 100644 --- a/mavisp/methods.py +++ b/mavisp/methods.py @@ -96,8 +96,8 @@ class MutateXBinding(Method): unit = "kcal/mol" type = "Local Int." - heterodimer_chains = set(['A']) # If chain A exists → keep A only. - homodimer_chains = set(['AB']) # If chain A does NOT exist → the ONLY valid alternative is homodimer AB. + heterodimer_chains = set(['A']) + homodimer_chains = set(['AB']) target_chain = 'A' measure = "Binding with" averages_filename = 'energies.csv' @@ -114,8 +114,8 @@ def __init__(self, version, complex_status=None): self.interactors = [] # data_type is either '' or 'st. dev.' - def _parse_mutatex_binding_file(self, fname, interactor, data_type): - """Parse a single MutateX binding file (average or std).""" + def _parse_mutatex_binding_energy_file(self, fname, interactor, data_type): + """Parse a single MutateX binding energy file (average or std).""" try: df = pd.read_csv(fname) @@ -156,15 +156,17 @@ def _parse_mutatex_binding_file(self, fname, interactor, data_type): # handle space around measure if self.measure == "": - measure = "" + measure = "" else: - measure = f"{self.measure} " + measure = f"{self.measure} " # rename column Local Int. (Binding with B, heterodimer, FoldX5, kcal/mol) if data_type is None or data_type == '': - colname = f"{self.type} ({self.measure} {interactor}, {self.complex_status}, {self.version}, {self.unit})" + data_type = "" else: - colname = f"{self.type} ({self.measure} {interactor}, {self.complex_status}, {self.version}, {self.unit}, {data_type})" + data_type = f", {data_type}" + + colname = f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit}{data_type})" return df.rename(columns={0 : colname}) # rename the sinle column named 0 to the formatted name @@ -174,8 +176,8 @@ def parse(self, dir_path): warnings = [] all_data = None - interactors = os.listdir(dir_path) #list of subfolders in dir_path - self.interactors = interactors #store interactors in the instance variable + interactors = os.listdir(dir_path) # list of subfolders in dir_path + self.interactors = interactors # store interactors in the instance variable if len(interactors) == 0: raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")], @@ -193,13 +195,13 @@ def parse(self, dir_path): critical=[MAVISpCriticalError(this_error)]) # Parse averages file - averages_df = self._parse_mutatex_binding_file(os.path.join(interactor_dir, self.averages_filename), interactor, '') + averages_df = self._parse_mutatex_binding_energy_file(os.path.join(interactor_dir, self.averages_filename), interactor, '') # Parse stds file if it exists if self.stds_filename in mutatex_files: - stds_df = self._parse_mutatex_binding_file(os.path.join(interactor_dir, self.stds_filename), interactor, 'st. dev.') + stds_df = self._parse_mutatex_binding_energy_file(os.path.join(interactor_dir, self.stds_filename), interactor, 'st. dev.') else: - warnings.append(MAVISpWarningError("standard deviation file not found for MutateX binding data")) + warnings.append(MAVISpWarningError("standard deviation file not found for MutateX binding energy data")) stds_df = None # Combine averages and stds data @@ -339,13 +341,14 @@ def parse(self, dir_path): return avg_mutation_data, std_mutation_data, warnings -class RosettaDDGPredictionBinding(Method): +class RosettaDDGPredictionBinding(RosettaDDGPredictionStability): + unit = "kcal/mol" type = "Local Int." chain = 'A' complex_status = 'heterodimer' - - _parse_aggregate_csv = RosettaDDGPredictionStability._parse_aggregate_csv + aggregate_fname = 'ddg_mutations_aggregate.csv' + structures_fname = 'ddg_mutations_structures.csv' def __init__(self, version, complex_status=None): @@ -365,7 +368,7 @@ def parse(self, dir_path): if len(interactors) == 0: raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")], - warning=[]) + warning=[]) all_data = None @@ -378,20 +381,28 @@ def parse(self, dir_path): agg_file = None struct_file = None - for f in rosetta_files: - if f.endswith('_aggregate.csv'): - agg_file = os.path.join(interactor_dir, f) - elif f.endswith('_structures.csv'): - struct_file = os.path.join(interactor_dir, f) - - - # Expect either a single file or multiple directories containing one file each or agg_file + struct_file - if len(rosetta_files) == 1 and os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])): - + # Expect either a single file or multiple directories containing one file each + if len(rosetta_files) == 1 and os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and rosetta_files[0] == self.aggregate_fname: rosetta_file = rosetta_files[0] # Parse single aggregate CSV file mutation_data = self._parse_aggregate_csv(os.path.join(interactor_dir, rosetta_file), warnings) + # or structures file (which we can use to calculate average and stdev) + elif (len(rosetta_files) == 1 and\ + os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and \ + rosetta_files[0] == self.structures_fname) or\ + (set(rosetta_files) == set([self.aggregate_fname, self.structures_fname]) and\ + os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and\ + os.path.isfile(os.path.join(interactor_dir, rosetta_files[1]))): + + if len(rosetta_files) == 2: + warnings.append(MAVISpWarningError(f"for {interactor}, both Rosetta aggregate and structures file were found; the aggregate file will be ignored")) + + mutation_data, std_df = self._parse_structure_csv(os.path.join(interactor_dir, self.structures_fname), warnings) + + std_df = std_df.rename(columns={'total_score' : f"{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit}, st. dev.)"}) + mutation_data = mutation_data.join(std_df, how="outer") + # Multiple directories containing one file each elif len(rosetta_files) > 1 and all( [ os.path.isdir(os.path.join(interactor_dir, f)) for f in rosetta_files] ): mutation_data = None @@ -415,26 +426,11 @@ def parse(self, dir_path): # Average total_score across conformers mutation_data = pd.DataFrame(mutation_data.mean(axis=1), columns=['total_score']) - elif agg_file is not None: - mutation_data = self._parse_aggregate_csv(agg_file, warnings) - - # Parse struct_file exists - if struct_file is not None: - std_df = self._parse_structure_csv(struct_file, warnings) - - if std_df is not None: - std_df.columns = [ - f"{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit}, st. dev.)" - ] - - mutation_data = mutation_data.join(std_df, how="outer") - else: - text = f"dataset {interactor_dir} was not either a single files, or multiple directories containing one file" + text = f"dataset {interactor_dir} did not contain an expected folder structure" raise MAVISpMultipleError(critical=[MAVISpCriticalError(text)], warning=warnings) - mutation_data = mutation_data.rename(columns={'total_score':f'{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit})'}) if all_data is None: @@ -445,7 +441,6 @@ def parse(self, dir_path): # return the combined data for all interactors return all_data, warnings - def _parse_structure_csv(self, csvf, warnings): """Parse the RosettaDDGPrediction binding structure CSV file.""" try: @@ -460,17 +455,15 @@ def _parse_structure_csv(self, csvf, warnings): #keep only ddg rows df = df[df["state"] == "ddg"] - if df.empty: - warnings.append(f"{csvf}: no ddg rows found") - return None - #group by mutation and compute stdev of total_score std_series = df.groupby("mutation_label")["total_score"].std() + average_series = df.groupby("mutation_label")["total_score"].mean() #turn into DataFrame std_df = std_series.to_frame(name ="total_score") + average_df = average_series.to_frame(name ="total_score") - return std_df + return average_df, std_df class AlloSigma(Method): From e6c9b5bd2232bce6405d90d0a08797ad5d61ade7 Mon Sep 17 00:00:00 2001 From: Matteo Tiberti Date: Tue, 6 Jan 2026 13:31:21 +0100 Subject: [PATCH 4/4] moved location of function definition --- mavisp/methods.py | 53 +++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/mavisp/methods.py b/mavisp/methods.py index 781886e..f14dbd4 100644 --- a/mavisp/methods.py +++ b/mavisp/methods.py @@ -171,7 +171,10 @@ def _parse_mutatex_binding_energy_file(self, fname, interactor, data_type): return df.rename(columns={0 : colname}) # rename the sinle column named 0 to the formatted name def parse(self, dir_path): - """ reads the MutateX output files (energies.csv + energies_std.csv) for each interactor, converts them into mutation-indexed dataframes, and returns them to the Local interaction module.""" + """ + reads the MutateX output files (energies.csv + energies_std.csv) for each interactor, + converts them into mutation-indexed dataframes, and returns them to the Local interaction module. + """ warnings = [] all_data = None @@ -359,6 +362,30 @@ def __init__(self, version, complex_status=None): self.interactors = [] + def _parse_structure_csv(self, csvf, warnings): + """Parse the RosettaDDGPrediction binding structure CSV file.""" + try: + df = pd.read_csv(csvf) + except Exception as e: + this_error = f"Exception {type(e).__name__} while reading structure CSV: {e.args}" + raise MAVISpMultipleError( + warning=warnings, + critical=[MAVISpCriticalError(this_error)] + ) + + #keep only ddg rows + df = df[df["state"] == "ddg"] + + #group by mutation and compute stdev of total_score + std_series = df.groupby("mutation_label")["total_score"].std() + average_series = df.groupby("mutation_label")["total_score"].mean() + + #turn into DataFrame + std_df = std_series.to_frame(name ="total_score") + average_df = average_series.to_frame(name ="total_score") + + return average_df, std_df + def parse(self, dir_path): warnings = [] @@ -441,30 +468,6 @@ def parse(self, dir_path): # return the combined data for all interactors return all_data, warnings - def _parse_structure_csv(self, csvf, warnings): - """Parse the RosettaDDGPrediction binding structure CSV file.""" - try: - df = pd.read_csv(csvf) - except Exception as e: - this_error = f"Exception {type(e).__name__} while reading structure CSV: {e.args}" - raise MAVISpMultipleError( - warning=warnings, - critical=[MAVISpCriticalError(this_error)] - ) - - #keep only ddg rows - df = df[df["state"] == "ddg"] - - #group by mutation and compute stdev of total_score - std_series = df.groupby("mutation_label")["total_score"].std() - average_series = df.groupby("mutation_label")["total_score"].mean() - - #turn into DataFrame - std_df = std_series.to_frame(name ="total_score") - average_df = average_series.to_frame(name ="total_score") - - return average_df, std_df - class AlloSigma(Method): name = "AlloSigma"