diff --git a/mavisp/methods.py b/mavisp/methods.py index 6fdfa97..f14dbd4 100644 --- a/mavisp/methods.py +++ b/mavisp/methods.py @@ -91,14 +91,17 @@ def parse(self, dir_path): return averages_df, stds_df, warnings -class MutateXBinding(Method): - +class MutateXBinding(Method): + #a parsing class; produces binding ΔΔG data + unit = "kcal/mol" type = "Local Int." heterodimer_chains = set(['A']) homodimer_chains = set(['AB']) target_chain = 'A' measure = "Binding with" + averages_filename = 'energies.csv' + stds_filename = 'energies_std.csv' complex_status = "heterodimer" def __init__(self, version, complex_status=None): @@ -110,82 +113,115 @@ def __init__(self, version, complex_status=None): self.interactors = [] - def parse(self, dir_path): - - warnings = [] - - interactors = os.listdir(dir_path) - self.interactors = interactors - - if len(interactors) == 0: - raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")], - warning=warnings) - - all_data = None + # data_type is either '' or 'st. dev.' + def _parse_mutatex_binding_energy_file(self, fname, interactor, data_type): + """Parse a single MutateX binding energy file (average or std).""" - for interactor in interactors: - - interactor_dir = os.path.join(dir_path, interactor) - - mutatex_files = os.listdir(interactor_dir) + try: + df = pd.read_csv(fname) + except Exception as e: + this_error = f"Exception {type(e).__name__} occurred when parsing the MutateX csv file. Arguments:{e.args}" + raise MAVISpMultipleError(warning=warnings, + critical=[MAVISpCriticalError(this_error)]) + + # Create residue column + df['residue'] = df['WT residue type'] + df['Residue #'].astype(str) - if len(mutatex_files) != 1: - raise MAVISpMultipleError(critical=[MAVISpCriticalError(f"zero or multiple files found in {interactor_dir}; exactly one expected")], - warning=warnings) + # Detect and handle homodimer case + chains = set(df['chain ID'].unique()) + + # If chain A exists, KEEP ONLY rows where chain ID == 'A'. + if self.target_chain in chains: + df = df[ df['chain ID'] == self.target_chain ] + # If chain A does NOT exist, the ONLY valid alternative is homodimer AB. + elif set(df['chain ID'].unique()) != self.homodimer_chains: + message = "chain ID in FoldX energy file must be either A or B (heterodimer case) or AB (homodimer case)" + raise MAVISpMultipleError(critical=[MAVISpCriticalError(message)], + warning=[]) + + # Drop unnecessary columns + df = df.drop(['WT residue type', 'Residue #', 'chain ID'], axis=1) - mutatex_file = mutatex_files[0] + # Stack remaining columns + df = df.set_index('residue') # set 'residue' column as index + df = df.stack() # rotates columns downward and makes the dataframe long-format (level_1 contains the original column names and 0 contains the values) + df = df.reset_index() # reset index to turn the index into a column - try: - df = pd.read_csv(os.path.join(interactor_dir, mutatex_file)) - except Exception as e: - this_error = f"Exception {type(e).__name__} occurred when parsing the MutateX csv file. Arguments:{e.args}" - raise MAVISpMultipleError(warning=warnings, - critical=[MAVISpCriticalError(this_error)]) + # Create mutation column + df['mutations'] = df['residue'] + df['level_1'] # concatenate 'residue' and 'level_1' columns to create 'mutations' column + df = df.set_index('mutations') # set 'mutations' column as index - # create residue column - df['residue'] = df['WT residue type'] + df['Residue #'].astype(str) + # Drop now useless columns, rename + df = df.drop(['residue', 'level_1'], axis=1) - # detect and handle homodimer case - chains = set(df['chain ID'].unique()) + # handle space around measure + if self.measure == "": + measure = "" + else: + measure = f"{self.measure} " - if self.target_chain in chains: - df = df[ df['chain ID'] == self.target_chain ] + # rename column Local Int. (Binding with B, heterodimer, FoldX5, kcal/mol) + if data_type is None or data_type == '': + data_type = "" + else: + data_type = f", {data_type}" - elif set(df['chain ID'].unique()) != self.homodimer_chains: - message = "chain ID in FoldX energy file must be either A or B (heterodimer case) or AB (homodimer case)" - raise MAVISpMultipleError(critical=[MAVISpCriticalError(message)], - warning=[]) + colname = f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit}{data_type})" + + return df.rename(columns={0 : colname}) # rename the sinle column named 0 to the formatted name - df = df.drop(['WT residue type', 'Residue #', 'chain ID'], axis=1) + def parse(self, dir_path): + """ + reads the MutateX output files (energies.csv + energies_std.csv) for each interactor, + converts them into mutation-indexed dataframes, and returns them to the Local interaction module. + """ + + warnings = [] + all_data = None - # stack remaining columns - df = df.set_index('residue') - df = df.stack() - df = df.reset_index() + interactors = os.listdir(dir_path) # list of subfolders in dir_path + self.interactors = interactors # store interactors in the instance variable - # create mutation column - df['mutations'] = df['residue'] + df['level_1'] - df = df.set_index('mutations') + if len(interactors) == 0: + raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")], + warning=warnings) - # drop now useless columns, rename - df = df.drop(['residue', 'level_1'], axis=1) + for interactor in interactors: + + interactor_dir = os.path.join(dir_path, interactor) + mutatex_files = os.listdir(interactor_dir) - # handle space around measure - if self.measure == "": - measure = "" + # expect energies.csv file per interactor + if self.averages_filename not in mutatex_files: + this_error = f"energies.csv file not found in {interactor_dir}" + raise MAVISpMultipleError(warning=warnings, + critical=[MAVISpCriticalError(this_error)]) + + # Parse averages file + averages_df = self._parse_mutatex_binding_energy_file(os.path.join(interactor_dir, self.averages_filename), interactor, '') + + # Parse stds file if it exists + if self.stds_filename in mutatex_files: + stds_df = self._parse_mutatex_binding_energy_file(os.path.join(interactor_dir, self.stds_filename), interactor, 'st. dev.') else: - measure = f"{self.measure} " - - df = df.rename(columns={0 : f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit})"}) - + warnings.append(MAVISpWarningError("standard deviation file not found for MutateX binding energy data")) + stds_df = None + + # Combine averages and stds data + if stds_df is not None: + interactor_data = averages_df.join(stds_df, how='outer') + else: + interactor_data = averages_df + + # Combine data across interactors if all_data is None: - all_data = df + all_data = interactor_data else: - all_data = all_data.join(df, how='outer') + all_data = all_data.join(interactor_data, how='outer') - return all_data, warnings + return all_data, warnings -class MutateXDNABinding(Method): +class MutateXDNABinding(MutateXBinding): unit = "kcal/mol" type = "Local Int. With DNA" @@ -195,8 +231,6 @@ class MutateXDNABinding(Method): measure = "" complex_status = "heterodimer" - parse = MutateXBinding.parse - class RosettaDDGPredictionStability(Method): unit = "kcal/mol" @@ -310,14 +344,14 @@ def parse(self, dir_path): return avg_mutation_data, std_mutation_data, warnings -class RosettaDDGPredictionBinding(Method): +class RosettaDDGPredictionBinding(RosettaDDGPredictionStability): unit = "kcal/mol" type = "Local Int." chain = 'A' complex_status = 'heterodimer' - - _parse_aggregate_csv = RosettaDDGPredictionStability._parse_aggregate_csv + aggregate_fname = 'ddg_mutations_aggregate.csv' + structures_fname = 'ddg_mutations_structures.csv' def __init__(self, version, complex_status=None): @@ -328,53 +362,99 @@ def __init__(self, version, complex_status=None): self.interactors = [] + def _parse_structure_csv(self, csvf, warnings): + """Parse the RosettaDDGPrediction binding structure CSV file.""" + try: + df = pd.read_csv(csvf) + except Exception as e: + this_error = f"Exception {type(e).__name__} while reading structure CSV: {e.args}" + raise MAVISpMultipleError( + warning=warnings, + critical=[MAVISpCriticalError(this_error)] + ) + + #keep only ddg rows + df = df[df["state"] == "ddg"] + + #group by mutation and compute stdev of total_score + std_series = df.groupby("mutation_label")["total_score"].std() + average_series = df.groupby("mutation_label")["total_score"].mean() + + #turn into DataFrame + std_df = std_series.to_frame(name ="total_score") + average_df = average_series.to_frame(name ="total_score") + + return average_df, std_df + def parse(self, dir_path): warnings = [] - interactors = os.listdir(dir_path) - self.interactors = interactors + interactors = os.listdir(dir_path) + self.interactors = interactors if len(interactors) == 0: raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")], - warning=[]) + warning=[]) all_data = None for interactor in interactors: - interactor_dir = os.path.join(dir_path, interactor) - rosetta_files = os.listdir(interactor_dir) - - if len(rosetta_files) == 1 and os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])): - + interactor_dir = os.path.join(dir_path, interactor) + rosetta_files = os.listdir(interactor_dir) + + # Identify the correct files + agg_file = None + struct_file = None + + # Expect either a single file or multiple directories containing one file each + if len(rosetta_files) == 1 and os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and rosetta_files[0] == self.aggregate_fname: rosetta_file = rosetta_files[0] + # Parse single aggregate CSV file mutation_data = self._parse_aggregate_csv(os.path.join(interactor_dir, rosetta_file), warnings) + # or structures file (which we can use to calculate average and stdev) + elif (len(rosetta_files) == 1 and\ + os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and \ + rosetta_files[0] == self.structures_fname) or\ + (set(rosetta_files) == set([self.aggregate_fname, self.structures_fname]) and\ + os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and\ + os.path.isfile(os.path.join(interactor_dir, rosetta_files[1]))): + + if len(rosetta_files) == 2: + warnings.append(MAVISpWarningError(f"for {interactor}, both Rosetta aggregate and structures file were found; the aggregate file will be ignored")) + + mutation_data, std_df = self._parse_structure_csv(os.path.join(interactor_dir, self.structures_fname), warnings) + + std_df = std_df.rename(columns={'total_score' : f"{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit}, st. dev.)"}) + mutation_data = mutation_data.join(std_df, how="outer") + + # Multiple directories containing one file each elif len(rosetta_files) > 1 and all( [ os.path.isdir(os.path.join(interactor_dir, f)) for f in rosetta_files] ): mutation_data = None - for c, conformer_dir in enumerate(rosetta_files): + for c, conformer_dir in enumerate(rosetta_files): - conformer_files = os.listdir(os.path.join(interactor_dir, conformer_dir)) + conformer_files = os.listdir(os.path.join(interactor_dir, conformer_dir)) if len(conformer_files) != 1: text = "only one file per conformer is supported for RosettaDDGPrediction" raise MAVISpMultipleError(critical=[MAVISpCriticalError(text)], warning=warnings) - - conformer_data = self._parse_aggregate_csv(os.path.join(interactor_dir, conformer_dir, conformer_files[0]), warnings) - + + conformer_data = self._parse_aggregate_csv(os.path.join(interactor_dir, conformer_dir, conformer_files[0]), warnings) conformer_data = conformer_data.rename(columns={'total_score' : f'total_score_{c}'}) if mutation_data is None: mutation_data = conformer_data else: mutation_data = mutation_data.join(conformer_data) - + + # Average total_score across conformers mutation_data = pd.DataFrame(mutation_data.mean(axis=1), columns=['total_score']) else: - text = f"dataset {interactor_dir} was not either a single files, or multiple directories containing one file" + text = f"dataset {interactor_dir} did not contain an expected folder structure" raise MAVISpMultipleError(critical=[MAVISpCriticalError(text)], warning=warnings) @@ -384,7 +464,8 @@ def parse(self, dir_path): all_data = mutation_data else: all_data = all_data.join(mutation_data, how='outer') - + + # return the combined data for all interactors return all_data, warnings class AlloSigma(Method): diff --git a/mavisp/modules.py b/mavisp/modules.py index 36d518d..d28b7f7 100644 --- a/mavisp/modules.py +++ b/mavisp/modules.py @@ -526,8 +526,7 @@ def ingest(self, mutations): except MAVISpMultipleError as e: if len(e.critical) > 0: raise - else: - e = None + warnings += e.warning module_dir_files = os.listdir(os.path.join(self.data_dir, self.module_dir)) @@ -546,12 +545,9 @@ def ingest(self, mutations): self.data = self.data.drop(columns=['res_num', 'sas_sc_rel']) - if e is None and len(warnings) > 0: + if len(warnings) > 0: raise MAVISpMultipleError(warning=warnings, critical=[]) - elif len(warnings) > 0: - e.warning.extend(warnings) - raise e def _generate_local_interactions_classification(self, row, ci, stab_co=1.0): @@ -628,8 +624,7 @@ def ingest(self, mutations): except MAVISpMultipleError as e: if len(e.critical) > 0: raise - else: - e = None + warnings += e.warning rsa = self._parse_sas(os.path.join(self.data_dir, self.module_dir, self.sas_filename), warnings) @@ -646,12 +641,9 @@ def ingest(self, mutations): self.data = self.data.drop(columns=['res_num', 'sas_sc_rel']) - if e is None and len(warnings) > 0: + if len(warnings) > 0: raise MAVISpMultipleError(warning=warnings, critical=[]) - elif len(warnings) > 0: - e.warning.extend(warnings) - raise e def _generate_local_interactions_DNA_classification(self, row, ci, stab_co=1.0):