diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py index 85309ff..52e377f 100644 --- a/ms2rescore/__main__.py +++ b/ms2rescore/__main__.py @@ -6,6 +6,7 @@ import json import logging import sys +from datetime import datetime from pathlib import Path from typing import Union @@ -196,7 +197,13 @@ def profile(fnc, filepath): def inner(*args, **kwargs): with cProfile.Profile() as profiler: return_value = fnc(*args, **kwargs) - profiler.dump_stats(filepath + ".profile.prof") + + # Add timestamp to profiler output filename + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + profile_filename = f"{filepath}.profile_{timestamp}.prof" + profiler.dump_stats(profile_filename) + LOGGER.info(f"Profile data written to: {profile_filename}") + return return_value return inner @@ -248,6 +255,7 @@ def main(tims=False): # Run MS²Rescore try: if config["ms2rescore"]["profile"]: + LOGGER.info("Profiling enabled") profiled_rescore = profile(rescore, config["ms2rescore"]["output_path"]) profiled_rescore(configuration=config) else: diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 4ea8103..6784ee1 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -13,7 +13,10 @@ from ms2rescore.parse_spectra import add_precursor_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence +from ms2rescore.rescoring_engines.mokapot import ( + add_peptide_confidence, + add_psm_confidence, +) logger = logging.getLogger(__name__) @@ -34,7 +37,9 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"Running MS²Rescore with following configuration: {json.dumps(configuration, indent=4)}" ) config = configuration["ms2rescore"] - output_file_root = config["output_path"] + output_file_root = config["output_path"].split(".intermidiate.")[ + 0 + ] # if no intermediate, takes full name # Write full configuration including defaults to file with open(output_file_root + ".full-config.json", "w") as f: @@ -59,6 +64,19 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logger.debug( f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) + # ckeck if all features are already present + for fgen_name, fgen_config in list(config["feature_generators"].items()): + # conf = config.copy() + # conf.update(fgen_config) + fgen_features = FEATURE_GENERATORS[fgen_name]().feature_names + if set(fgen_features).issubset(psm_list_feature_names): + logger.debug( + f"Skipping feature generator {fgen_name} because all features are already " + "present in the PSM file." + ) + feature_names[fgen_name] = set(fgen_features) + feature_names["psm_file"] = psm_list_feature_names - set(fgen_features) + del config["feature_generators"][fgen_name] # Add missing precursor info from spectrum file if needed required_ms_data = { @@ -89,9 +107,17 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: "files or disable the feature generator." ) continue - - # Add features - fgen.add_features(psm_list) + try: + fgen.add_features(psm_list) + except (Exception, KeyboardInterrupt) as e: + logger.error( + f"Error while adding features from {fgen_name}: {e}, writing intermediary output..." + ) + # Write intermediate TSV + psm_utils.io.write_file( + psm_list, output_file_root + ".intermediate.psms.tsv", filetype="tsv" + ) + raise e logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) @@ -114,6 +140,12 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: ) psm_list = psm_list[psms_with_features] + if "mumble" in config["psm_generator"]: + from ms2rescore.utils import filter_mumble_psms + + # Remove PSMs where matched_ions_pct drops 25% below the original hit + psm_list = filter_mumble_psms(psm_list, threshold=0.75) + # Write feature names to file _write_feature_names(feature_names, output_file_root) @@ -160,10 +192,12 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: protein_kwargs=protein_kwargs, **config["rescoring_engine"]["mokapot"], ) - except exceptions.RescoringError as e: + except (Exception, KeyboardInterrupt) as e: # Write output - logger.info(f"Writing intermediary output to {output_file_root}.psms.tsv...") - psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") + logger.info(f"Writing intermediary output to {output_file_root}.intermediate.psms.tsv...") + psm_utils.io.write_file( + psm_list, output_file_root + ".intermediate.psms.tsv", filetype="tsv" + ) # Reraise exception raise e @@ -219,7 +253,10 @@ def _write_feature_names(feature_names, output_file_root): def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int: """Log #PSMs identified before rescoring.""" id_psms_before = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) + (psm_list["qvalue"] <= 0.01) + & (psm_list["rank"] <= max_rank) + & (~psm_list["is_decoy"]) + & ([metadata.get("original_psm", True) for metadata in psm_list["metadata"]]) ).sum() logger.info( f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before " @@ -285,7 +322,9 @@ def _calculate_confidence(psm_list: PSMList) -> PSMList: ) # Recalculate confidence - new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"]) + new_confidence = lin_psm_data.assign_confidence( + scores=list(psm_list["score"]) + ) # explicity make it a list to avoid TypingError: Failed in nopython mode pipeline (step: nopython frontend) in mokapot # Add new confidence estimations to PSMList add_psm_confidence(psm_list, new_confidence) diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index ba5492a..68b1b43 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError): """Error while rescoring PSMs.""" pass + + +class ParseSpectrumError(MS2RescoreError): + """Error while rescoring PSMs.""" + + pass diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index d89f087..4956868 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -7,14 +7,14 @@ from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator -from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator +from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator FEATURE_GENERATORS: dict[str, type[FeatureGeneratorBase]] = { "basic": BasicFeatureGenerator, "ms2pip": MS2PIPFeatureGenerator, "deeplc": DeepLCFeatureGenerator, - "maxquant": MaxQuantFeatureGenerator, "ionmob": IonMobFeatureGenerator, "im2deep": IM2DeepFeatureGenerator, + "ms2": MS2FeatureGenerator, } diff --git a/ms2rescore/feature_generators/basic.py b/ms2rescore/feature_generators/basic.py index bc2222e..3e9aa37 100644 --- a/ms2rescore/feature_generators/basic.py +++ b/ms2rescore/feature_generators/basic.py @@ -31,13 +31,24 @@ def __init__(self, *args, **kwargs) -> None: """ super().__init__(*args, **kwargs) - self._feature_names = None @property def feature_names(self) -> List[str]: - if self._feature_names is None: - raise ValueError("Feature names have not been set yet. First run `add_features`.") - return self._feature_names + return [ + "charge_n", + "charge_1", + "charge_2", + "charge_3", + "charge_4", + "charge_5", + "charge_6", + "abs_ms1_error_ppm", + "search_engine_score", + "theoretical_mass", + "experimental_mass", + "mass_error", + "pep_len", + ] def add_features(self, psm_list: PSMList) -> None: """ @@ -51,11 +62,10 @@ def add_features(self, psm_list: PSMList) -> None: """ logger.info("Adding basic features to PSMs.") - self._feature_names = [] # Reset feature names - charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list]) precursor_mzs = psm_list["precursor_mz"] scores = psm_list["score"] + peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list]) has_charge = None not in charge_states has_mz = None not in precursor_mzs and has_charge @@ -63,16 +73,16 @@ def add_features(self, psm_list: PSMList) -> None: if has_charge: charge_n = charge_states - charge_one_hot, one_hot_names = _one_hot_encode_charge(charge_states) - self._feature_names.extend(["charge_n"] + one_hot_names) + charge_one_hot, _ = _one_hot_encode_charge(charge_states) if has_mz: # Charge also required for theoretical m/z theo_mz = np.array([psm.peptidoform.theoretical_mz for psm in psm_list]) abs_ms1_error_ppm = np.abs((precursor_mzs - theo_mz) / theo_mz * 10**6) - self._feature_names.append("abs_ms1_error_ppm") - if has_score: - self._feature_names.append("search_engine_score") + if has_mz and has_charge: + experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812) + theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812) + mass_error = experimental_mass - theoretical_mass for i, psm in enumerate(psm_list): psm.rescoring_features.update( @@ -81,6 +91,10 @@ def add_features(self, psm_list: PSMList) -> None: **charge_one_hot[i] if has_charge else {}, **{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {}, **{"search_engine_score": scores[i]} if has_score else {}, + **{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {}, + **{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {}, + **{"mass_error": mass_error[i]} if has_mz and has_charge else {}, + **{"pep_len": peptide_lengths[i]}, ) ) @@ -88,15 +102,18 @@ def add_features(self, psm_list: PSMList) -> None: def _one_hot_encode_charge( charge_states: np.ndarray, ) -> Tuple[Iterable[Dict[str, int]], List[str]]: - """One-hot encode charge states.""" + """One-hot encode charge states with fixed range 1-6.""" n_entries = len(charge_states) - min_charge = np.min(charge_states) - max_charge = np.max(charge_states) + heading = [f"charge_{i}" for i in range(1, 7)] - mask = np.zeros((n_entries, max_charge - min_charge + 1), dtype=bool) - mask[np.arange(n_entries), charge_states - min_charge] = 1 - one_hot = mask.view("i1") + # Create mask for charges 1-6 + mask = np.zeros((n_entries, 6), dtype=bool) - heading = [f"charge_{i}" for i in range(min_charge, max_charge + 1)] + # Set the appropriate charge position to 1 for each entry + for i, charge in enumerate(charge_states): + if charge is not None and 1 <= charge <= 6: + mask[i, int(charge) - 1] = 1 + + one_hot = mask.view("i1") return [dict(zip(heading, row)) for row in one_hot], heading diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index f372815..4c2f0b4 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -15,23 +15,22 @@ """ -import contextlib import logging -import os -from collections import defaultdict -from inspect import getfullargspec -from itertools import chain from typing import List, Union import numpy as np from psm_utils import PSMList +from deeplc.core import predict, finetune +from deeplc.calibration import SplineTransformerCalibration from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logger = logging.getLogger(__name__) +# Suppress verbose onnx2torch logging +logging.getLogger("onnx2torch").setLevel(logging.WARNING) + class DeepLCFeatureGenerator(FeatureGeneratorBase): """DeepLC retention time-based feature generator.""" @@ -41,7 +40,6 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase): def __init__( self, *args, - lower_score_is_better: bool = False, calibration_set_size: Union[int, float, None] = None, processes: int = 1, **kwargs, @@ -54,8 +52,7 @@ def __init__( Parameters ---------- - lower_score_is_better - Whether a lower PSM score denotes a better matching PSM. Default: False + calibration_set_size: int or float Amount of best PSMs to use for DeepLC calibration. If this value is lower than the number of available PSMs, all PSMs will be used. (default: 0.15) @@ -72,34 +69,42 @@ def __init__( """ super().__init__(*args, **kwargs) - self.lower_psm_score_better = lower_score_is_better self.calibration_set_size = calibration_set_size - self.processes = processes self.deeplc_kwargs = kwargs or {} self._verbose = logger.getEffectiveLevel() <= logging.DEBUG - # Lazy-load DeepLC - from deeplc import DeepLC + self.model = self.deeplc_kwargs.get("model", None) - self.DeepLC = DeepLC + self.calibration = None - # Remove any kwargs that are not DeepLC arguments - self.deeplc_kwargs = { - k: v for k, v in self.deeplc_kwargs.items() if k in getfullargspec(DeepLC).args - } - self.deeplc_kwargs.update({"config_file": None}) + # Prepare DeepLC predict kwargs + self.predict_kwargs = { + k: v for k, v in self.deeplc_kwargs.items() if k in ["device", "batch_size"] + } # getfullargspec(predict).args does not work on this outer predict function + self.predict_kwargs["num_workers"] = processes - # Set default DeepLC arguments + # Prepare DeepLC finetune kwargs if "deeplc_retrain" not in self.deeplc_kwargs: self.deeplc_kwargs["deeplc_retrain"] = False - - self.deeplc_predictor = None - if "path_model" in self.deeplc_kwargs: - self.user_model = self.deeplc_kwargs.pop("path_model") - logging.debug(f"Using user-provided DeepLC model {self.user_model}.") - else: - self.user_model = None + return # skip the rest of the init if no retraining + + if self.deeplc_kwargs["deeplc_retrain"]: + self.finetune_kwargs = { + k: v + for k, v in self.deeplc_kwargs.items() + if k + in [ + "epochs", + "device", + "batch_size", + "learning_rate", + "patience", + "trainable_layers", + "validation_split", + ] + } + self.finetune_kwargs["num_workers"] = processes @property def feature_names(self) -> List[str]: @@ -116,131 +121,231 @@ def add_features(self, psm_list: PSMList) -> None: """Add DeepLC-derived features to PSMs.""" logger.info("Adding DeepLC-derived features to PSMs.") + psm_list_df = psm_list.to_dataframe() + psm_list_df = psm_list_df[ + [ + "peptidoform", + "retention_time", + "run", + "qvalue", + "is_decoy", + ] + ] + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.modified_sequence) - # Get easy-access nested version of PSMList - psm_dict = psm_list.get_psm_dict() - - # Run DeepLC for each spectrum file - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - - for runs in psm_dict.values(): - # Reset DeepLC predictor for each collection of runs - self.deeplc_predictor = None - self.selected_model = None - for run, psms in runs.items(): - peptide_rt_diff_dict = defaultdict( - lambda: { - "observed_retention_time_best": np.inf, - "predicted_retention_time_best": np.inf, - "rt_diff_best": np.inf, - } - ) - logger.info( - f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..." - ) + if self.deeplc_kwargs["deeplc_retrain"]: + # Filter high-confidence target PSMs once for transfer learning + target_mask = (psm_list["qvalue"] <= 0.01) & (~psm_list["is_decoy"]) + target_psms = psm_list[target_mask] - # Disable wild logging to stdout by Tensorflow, unless in debug mode - - with ( - contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) - if not self._verbose - else contextlib.nullcontext() - ): - # Make new PSM list for this run (chain PSMs per spectrum to flat list) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - - psm_list_calibration = self._get_calibration_psms(psm_list_run) - logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") - self.deeplc_predictor = self.DeepLC( - n_jobs=self.processes, - verbose=self._verbose, - path_model=self.selected_model or self.user_model, - **self.deeplc_kwargs, - ) - self.deeplc_predictor.calibrate_preds(psm_list_calibration) - # Still calibrate for each run, but do not try out all model options. - # Just use model that was selected based on first run - if not self.selected_model: - self.selected_model = list(self.deeplc_predictor.model.keys()) - self.deeplc_kwargs["deeplc_retrain"] = False - logger.debug( - f"Selected DeepLC model {self.selected_model} based on " - "calibration of first run. Using this model (after new " - "calibrations) for the remaining runs." - ) - - logger.debug("Predicting retention times...") - predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run)) - observations = psm_list_run["retention_time"] - rt_diffs_run = np.abs(predictions - observations) - - logger.debug("Adding features to PSMs...") - for i, psm in enumerate(psm_list_run): - psm["rescoring_features"].update( - { - "observed_retention_time": observations[i], - "predicted_retention_time": predictions[i], - "rt_diff": rt_diffs_run[i], - } - ) - peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge - if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]: - peptide_rt_diff_dict[peptide] = { - "observed_retention_time_best": observations[i], - "predicted_retention_time_best": predictions[i], - "rt_diff_best": rt_diffs_run[i], - } - for psm in psm_list_run: - psm["rescoring_features"].update( - peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]] - ) - current_run += 1 - - def _get_calibration_psms(self, psm_list: PSMList): - """Get N best scoring target PSMs for calibration.""" - psm_list_targets = psm_list[~psm_list["is_decoy"]] - if self.calibration_set_size: - n_psms = self._get_number_of_calibration_psms(psm_list_targets) - indices = np.argsort(psm_list_targets["score"]) - indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:] - return psm_list_targets[indices] - else: - identified_psms = psm_list_targets[psm_list_targets["qvalue"] <= 0.01] - if len(identified_psms) == 0: - raise ValueError( - "No target PSMs with q-value <= 0.01 found. Please set calibration set size for calibrating deeplc." - ) - elif (len(identified_psms) < 500) & (self.deeplc_kwargs["deeplc_retrain"]): - logger.warning( - " Less than 500 target PSMs with q-value <= 0.01 found for retraining. Consider turning of deeplc_retrain, as this is likely not enough data for retraining." - ) - return identified_psms + # Determine best run for transfer learning + best_run = self._best_run_by_shared_proteoforms( + target_psms["run"], + target_psms["peptidoform"], + ) + + # Fine-tune on best run + best_run_psms = target_psms[target_psms["run"] == best_run] + logger.debug( + f"Fine-tuning DeepLC on run '{best_run}'... with {len(best_run_psms)} PSMs" + ) + self.model = finetune( + best_run_psms, + model=self.model, + train_kwargs=self.finetune_kwargs, + ) + + # Predict retention times for all PSMs + logger.info("Predicting retention times with DeepLC...") + psm_list_df["predicted_retention_time"] = predict( + psm_list, model=self.model, predict_kwargs=self.predict_kwargs + ) + + # Calibrate predictions per run + logger.info("Calibrating predicted retention times per run...") + for run in psm_list_df["run"].unique(): + run_df = psm_list_df[psm_list_df["run"] == run].copy() + + # Get calibration data (target PSMs only) + observed_rt_calibration, predicted_rt_calibration = self._get_calibration_data(run_df) + + # Fit calibration and transform all predictions for this run + calibration = SplineTransformerCalibration() + calibration.fit( + target=observed_rt_calibration, + source=predicted_rt_calibration, + ) + + calibrated_rt = calibration.transform(run_df["predicted_retention_time"].values) + + # Update predictions with calibrated values + psm_list_df.loc[psm_list_df["run"] == run, "predicted_retention_time"] = calibrated_rt + + psm_list_df["observed_retention_time"] = psm_list_df["retention_time"] + psm_list_df["rt_diff"] = ( + psm_list_df["observed_retention_time"] - psm_list_df["predicted_retention_time"] + ).abs() + psm_list_df_best = psm_list_df.loc[ + psm_list_df.groupby(["run", "sequence"])["rt_diff"].idxmin() + ] + + psm_list_df = psm_list_df.merge( + psm_list_df_best[ + [ + "run", + "sequence", + "observed_retention_time", + "predicted_retention_time", + "rt_diff", + ] + ], + on=["run", "sequence"], + how="left", + suffixes=("", "_best"), + ) + + psm_list_feature_dicts = psm_list_df[self.feature_names].to_dict(orient="records") + # Add features to PSMs + logger.debug("Adding features to PSMs...") + for psm, features in zip(psm_list, psm_list_feature_dicts): + psm.rescoring_features.update(features) + + def _get_calibration_data(self, run_df) -> tuple[np.ndarray, np.ndarray]: + """Get calibration data (observed and predicted RTs) from run dataframe. + + Only target (non-decoy) PSMs are used for calibration. - def _get_number_of_calibration_psms(self, psm_list): - """Get number of calibration PSMs given `calibration_set_size` and total number of PSMs.""" + Parameters + ---------- + run_df : pd.DataFrame + Dataframe containing PSMs for a single run, with columns: + 'retention_time', 'predicted_retention_time', 'qvalue', 'is_decoy' + + Returns + ------- + tuple[np.ndarray, np.ndarray] + Observed and predicted retention times for calibration + """ + # Filter to target PSMs only + target_df = run_df[~run_df["is_decoy"]].copy() + target_df = target_df.sort_values("qvalue", ascending=True) + + # Determine number of calibration PSMs if isinstance(self.calibration_set_size, float): if not 0 < self.calibration_set_size <= 1: raise ValueError( "If `calibration_set_size` is a float, it cannot be smaller than " "or equal to 0 or larger than 1." ) - else: - num_calibration_psms = round(len(psm_list) * self.calibration_set_size) + num_calibration_psms = round(len(target_df) * self.calibration_set_size) elif isinstance(self.calibration_set_size, int): - if self.calibration_set_size > len(psm_list): + if self.calibration_set_size > len(target_df): logger.warning( - f"Requested number of calibration PSMs ({self.calibration_set_size}" - f") is larger than total number of PSMs ({len(psm_list)}). Using " - "all PSMs for calibration." + f"Requested number of calibration PSMs ({self.calibration_set_size}) " + f"is larger than total number of target PSMs ({len(target_df)}). Using " + "all target PSMs for calibration." ) - num_calibration_psms = len(psm_list) + num_calibration_psms = len(target_df) else: num_calibration_psms = self.calibration_set_size else: - raise TypeError( - "Expected float or int for `calibration_set_size`. Got " - f"{type(self.calibration_set_size)} instead. " + # Use PSMs with q-value <= 0.01 + num_calibration_psms = (target_df["qvalue"] <= 0.01).sum() + + logger.debug(f"Using {num_calibration_psms} target PSMs for calibration") + + # Select calibration PSMs (assuming they are sorted by q-value) + calibration_df = target_df.head(num_calibration_psms) + + return ( + calibration_df["retention_time"].values, + calibration_df["predicted_retention_time"].values, + ) + + @staticmethod + def _best_run_by_shared_proteoforms(runs, proteoforms): + """ + Return the run whose proteoform set has the largest total overlap with all other runs: + score(run_i) = sum_{j != i} |P_i ∩ P_j| + + Tie / degenerate handling (per request): + - If no run shares anything (all scores == 0): warn and return the first run (by first appearance). + - If multiple runs are tied for best score: return the first among them (by first appearance). + """ + logger.debug("Determining best run for transfer learning based on shared proteoforms.") + runs = np.asarray(runs) + proteoforms = np.asarray(proteoforms) + if runs.shape[0] != proteoforms.shape[0]: + raise ValueError("runs and proteoforms must have the same length.") + if runs.size == 0: + raise ValueError("Empty input: runs/proteoforms must contain at least one entry.") + + # Preserve run order by first appearance + run_to_idx = {} + run_order = [] + run_idx = np.empty(runs.shape[0], dtype=np.int64) + for i, r in enumerate(runs): + if r not in run_to_idx: + run_to_idx[r] = len(run_order) + run_order.append(r) + run_idx[i] = run_to_idx[r] + + # Fast path: sparse incidence matrix + try: + from scipy.sparse import coo_matrix + except ImportError: + # Fallback (slower): set-based + run_sets = {} + for r, p in zip(runs, proteoforms): + run_sets.setdefault(r, set()).add(p) + + scores = [] + for r in run_order: + Pi = run_sets[r] + s = 0 + for r2 in run_order: + if r2 is r: + continue + s += len(Pi & run_sets[r2]) + scores.append(s) + + scores = np.asarray(scores, dtype=np.int64) + max_score = scores.max() + best_candidates = np.flatnonzero(scores == max_score) + + if max_score == 0: + logger.warning( + "No runs share any identified proteoforms with other runs; transfer learning might not be as effective." + ) + return run_order[0] + + return run_order[int(best_candidates[0])] + + # Encode proteoforms (order does not matter for correctness) + _, prot_idx = np.unique(proteoforms, return_inverse=True) + + # De-duplicate (run, proteoform) pairs + pairs = np.unique(np.stack([run_idx, prot_idx], axis=1), axis=0) + r = pairs[:, 0] + p = pairs[:, 1] + + M = coo_matrix( + (np.ones(len(r), dtype=np.int8), (r, p)), + shape=(len(run_order), int(prot_idx.max()) + 1), + ).tocsr() + + # Overlap matrix O[i,j] = |P_i ∩ P_j| + overlap = (M @ M.T).toarray() + np.fill_diagonal(overlap, 0) + + scores = overlap.sum(axis=1).astype(np.int64) + max_score = int(scores.max()) + best_candidates = np.flatnonzero(scores == max_score) + + if max_score == 0: + logger.warning( + "No runs share any identified proteoforms with other runs; transfer learning might not be as effective." ) - logger.debug(f"Using {num_calibration_psms} PSMs for calibration") - return num_calibration_psms + return run_order[0] + + return run_order[int(best_candidates[0])] diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index 3ddbaff..2380060 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -8,23 +8,18 @@ """ -import contextlib import logging -import os -from inspect import getfullargspec -from itertools import chain -from typing import List +from typing import List, Union import numpy as np -import pandas as pd -from im2deep.utils import im2ccs -from im2deep.im2deep import predict_ccs from psm_utils import PSMList +from im2deep.core import predict +from im2deep.calibration import LinearCCSCalibration, get_default_reference +from im2deep.utils import im2ccs from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logger = logging.getLogger(__name__) @@ -35,6 +30,8 @@ class IM2DeepFeatureGenerator(FeatureGeneratorBase): def __init__( self, + multi: bool = False, + calibration_set_size: Union[int, float] = None, *args, processes: int = 1, **kwargs, @@ -52,14 +49,24 @@ def __init__( """ super().__init__(*args, **kwargs) - self._verbose = logger.getEffectiveLevel() <= logging.DEBUG + self.multi = multi + if self.multi: + raise NotImplementedError( + "Multi-IM mode is not yet implemented for IM2DeepFeatureGenerator." + ) + + self.calibration_set_size = calibration_set_size - # Remove any kwargs that are not IM2Deep arguments self.im2deep_kwargs = kwargs or {} - self.im2deep_kwargs = { - k: v for k, v in self.im2deep_kwargs.items() if k in getfullargspec(predict_ccs).args + self._verbose = logger.getEffectiveLevel() <= logging.DEBUG + + self.model = self.im2deep_kwargs.get("model", None) + + # Prepare IM2Deep predict kwargs + self.predict_kwargs = { + k: v for k, v in self.im2deep_kwargs.items() if k in ["device", "batch_size"] } - self.im2deep_kwargs["n_jobs"] = processes + self.predict_kwargs["num_workers"] = processes @property def feature_names(self) -> List[str]: @@ -73,100 +80,125 @@ def feature_names(self) -> List[str]: def add_features(self, psm_list: PSMList) -> None: """Add IM2Deep-derived features to PSMs""" + logger.info("Adding IM2Deep-derived features to PSMs") - # Get easy-access nested version of PSMlist - psm_dict = psm_list.get_psm_dict() + psm_list_df = psm_list.to_dataframe() + psm_list_df = psm_list_df[ + [ + "peptidoform", + "ion_mobility", + "precursor_mz", + "run", + "qvalue", + "is_decoy", + "metadata", + ] + ] - # Run IM2Deep for each spectrum file - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.modified_sequence) + psm_list_df["charge"] = [pep.precursor_charge for pep in psm_list_df["peptidoform"]] + psm_list_df["ccs_observed_im2deep"] = im2ccs( + psm_list_df["ion_mobility"], + psm_list_df["precursor_mz"], + psm_list_df["charge"], + ) - for runs in psm_dict.values(): - # Reset IM2Deep predictor for each collection of runs - for run, psms in runs.items(): - logger.info( - f"Running IM2Deep for PSMs from run ({current_run}/{total_runs}): `{run}`..." - ) + # Make predictions with IM2Deep + logger.info("Predicting CCS values with IM2Deep...") + psm_list_df["predicted_CCS_uncalibrated"] = predict( + psm_list, model=self.model, predict_kwargs=self.predict_kwargs + ) - # Disable wild logging to stdout by TensorFlow, unless in debug mode - with ( - contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) - if not self._verbose - else contextlib.nullcontext() - ): - # Make new PSM list for this run (chain PSMs per spectrum to flat list) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - - logger.debug("Calibrating IM2Deep...") - - # Convert ion mobility to CCS and calibrate CCS values - psm_list_run_df = psm_list_run.to_dataframe() - psm_list_run_df["charge"] = [ - pep.precursor_charge for pep in psm_list_run_df["peptidoform"] - ] - psm_list_run_df["ccs_observed"] = im2ccs( - psm_list_run_df["ion_mobility"], - psm_list_run_df["precursor_mz"], - psm_list_run_df["charge"], - ) - - # Create dataframe with high confidence hits for calibration - cal_psm_df = self.make_calibration_df(psm_list_run_df) - - # Make predictions with IM2Deep - logger.debug("Predicting CCS values...") - predictions = predict_ccs( - psm_list_run, cal_psm_df, write_output=False, **self.im2deep_kwargs - ) - - # Add features to PSMs - logger.debug("Adding features to PSMs...") - observations = psm_list_run_df["ccs_observed"] - ccs_diffs_run = np.abs(predictions - observations) - for i, psm in enumerate(psm_list_run): - psm["rescoring_features"].update( - { - "ccs_observed_im2deep": observations[i], - "ccs_predicted_im2deep": predictions[i], - "ccs_error_im2deep": ccs_diffs_run[i], - "abs_ccs_error_im2deep": np.abs(ccs_diffs_run[i]), - "perc_ccs_error_im2deep": np.abs(ccs_diffs_run[i]) - / observations[i] - * 100, - } - ) - - current_run += 1 - - @staticmethod - def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame: - """ - Make dataframe for calibration of IM2Deep predictions. + # getting reference CCS values for calibration + source_dataframe = get_default_reference(multi=self.multi) + + # Create dataframe with high confidence hits for calibration + logger.info("Calibrating predicted CCS values per run...") + for run in psm_list_df["run"].unique(): + run_df = psm_list_df[psm_list_df["run"] == run].copy() + + calibration_df = self._get_im_calibration_data(run_df) + + calibration = LinearCCSCalibration() + calibration.fit( + psm_df_target=calibration_df, + psm_df_source=source_dataframe, + ) + + calibrated_im = calibration.transform( + run_df[["peptidoform", "predicted_CCS_uncalibrated"]] + ) + + # Update predictions with calibrated values + psm_list_df.loc[psm_list_df["run"] == run, "predicted_CCS_uncalibrated"] = ( + calibrated_im + ) + + # Apply calibration shifts + psm_list_df.rename( + columns={"predicted_CCS_uncalibrated": "ccs_predicted_im2deep"}, inplace=True + ) + psm_list_df["ccs_error_im2deep"] = ( + psm_list_df["ccs_predicted_im2deep"] - psm_list_df["ccs_observed_im2deep"] + ) + psm_list_df["abs_ccs_error_im2deep"] = np.abs(psm_list_df["ccs_error_im2deep"]) + psm_list_df["perc_ccs_error_im2deep"] = ( + np.abs(psm_list_df["ccs_error_im2deep"]) / psm_list_df["ccs_observed_im2deep"] * 100 + ) + + psm_list_feature_dicts = psm_list_df[self.feature_names].to_dict(orient="records") + # Add features to PSMs + logger.debug("Adding features to PSMs...") + for psm, features in zip(psm_list, psm_list_feature_dicts): + psm.rescoring_features.update(features) + + def _get_im_calibration_data(self, run_df) -> tuple[np.ndarray, np.ndarray]: + """Get calibration data (observed and predicted CCS values) from run dataframe. + + Only target (non-decoy) PSMs are used for calibration. Parameters ---------- - psm_list_df - DataFrame with PSMs. - threshold - Percentage of highest scoring identified target PSMs to use for calibration, - default 0.25. - + run_df : pd.DataFrame + Dataframe containing PSMs for a single run, with columns: + 'ccs_observed_im2deep', 'ccs_predicted_im2deep', 'qvalue', 'is_decoy' Returns ------- - pd.DataFrame - DataFrame with high confidence hits for calibration. - + tuple[np.ndarray, np.ndarray] + Observed and predicted CCS values for calibration """ - identified_psms = psm_list_df[ - (psm_list_df["qvalue"] < 0.01) - & (~psm_list_df["is_decoy"]) - & (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep - ] - calibration_psms = identified_psms[ - identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) - ] - logger.debug( - f"Number of high confidence hits for calculating shift: {len(calibration_psms)}" + # Filter to target PSMs only + target_df = run_df[~run_df["is_decoy"]].copy() + target_df = target_df.sort_values("qvalue", ascending=True) + + # Determine number of calibration PSMs + if isinstance(self.calibration_set_size, float): + if not 0 < self.calibration_set_size <= 1: + raise ValueError( + "If `calibration_set_size` is a float, it cannot be smaller than " + "or equal to 0 or larger than 1." + ) + num_calibration_psms = round(len(target_df) * self.calibration_set_size) + elif isinstance(self.calibration_set_size, int): + if self.calibration_set_size > len(target_df): + logger.warning( + f"Requested number of calibration PSMs ({self.calibration_set_size}) " + f"is larger than total number of target PSMs ({len(target_df)}). Using " + "all target PSMs for calibration." + ) + num_calibration_psms = len(target_df) + else: + num_calibration_psms = self.calibration_set_size + else: + # Use PSMs with q-value <= 0.01 + num_calibration_psms = (target_df["qvalue"] <= 0.01).sum() + + logger.debug(f"Using {num_calibration_psms} target PSMs for calibration") + + # Select calibration PSMs (assuming they are sorted by q-value) + calibration_df = target_df.head(num_calibration_psms) + + return calibration_df[["peptidoform", "ccs_observed_im2deep"]].rename( + columns={"ccs_observed_im2deep": "CCS"} ) - return calibration_psms diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py index d6b4a88..7154e6b 100644 --- a/ms2rescore/feature_generators/ionmob.py +++ b/ms2rescore/feature_generators/ionmob.py @@ -29,7 +29,11 @@ try: from ionmob import __file__ as ionmob_file from ionmob.preprocess.data import to_tf_dataset_inference - from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, reduced_mobility_to_ccs + from ionmob.utilities.chemistry import ( + VARIANT_DICT, + calculate_mz, + reduced_mobility_to_ccs, + ) from ionmob.utilities.tokenization import tokenizer_from_json from ionmob.utilities.utility import get_ccs_shift except ImportError: diff --git a/ms2rescore/feature_generators/maxquant.py b/ms2rescore/feature_generators/maxquant.py deleted file mode 100644 index 45205c2..0000000 --- a/ms2rescore/feature_generators/maxquant.py +++ /dev/null @@ -1,208 +0,0 @@ -""" -Feature generator for PSMs from the MaxQuant search engine. - -MaxQuant msms.txt files contain various metrics from peptide-spectrum matching that can be used -to generate rescoring features. These include features related to the mass errors of the seven -fragment ions with the highest intensities, and features related to the ion current of the -identified fragment ions. - -""" - -import logging -from typing import List, Tuple - -import numpy as np -from psm_utils import PSMList - -from ms2rescore.exceptions import MS2RescoreError -from ms2rescore.feature_generators.base import FeatureGeneratorBase - -logger = logging.getLogger(__name__) - - -class MaxQuantFeatureGenerator(FeatureGeneratorBase): - """Generate MaxQuant-derived features.""" - - available_features = [ - "mean_error_top7", - "sq_mean_error_top7", - "stdev_error_top7", - "ln_explained_ion_current", - "ln_nterm_ion_current_ratio", - "ln_cterm_ion_current_ratio", - "ln_ms2_ion_current", - ] - - def __init__(self, *args, **kwargs) -> None: - """ - Generate MaxQuant-derived features. - - Attributes - ---------- - feature_names: list[str] - Names of the features that will be added to the PSMs. - - Raises - ------ - MissingMetadataError - If the required metadata entries are not present in the PSMs. - - """ - super().__init__(*args, **kwargs) - self._feature_names = self.available_features.copy() - - @property - def feature_names(self) -> List[str]: - return self._feature_names - - def add_features(self, psm_list: PSMList): - """ - Add MaxQuant-derived features to PSMs. - - Parameters - ---------- - psm_list - PSMs to add features to. - - """ - # Check if all PSMs are from MaxQuant - if not self._all_psms_from_maxquant(psm_list): - self._feature_names = [] # Set feature names to empty list to indicate none added - logger.warning("Not all PSMs are from MaxQuant. Skipping MaxQuant feature generation.") - return - else: - self._feature_names = self.available_features # Reset feature names - logger.info("Adding MaxQuant-derived features to PSMs.") - - # Infer mass deviations column name - for column_name in [ - "Mass deviations [Da]", - "Mass Deviations [Da]", - "Mass deviations [ppm]", - "Mass Deviations [ppm]", - ]: - if column_name in psm_list[0]["metadata"].keys(): - self._mass_deviations_key = column_name - break - else: - raise MissingMetadataError( - "No mass deviations entry in PSM metadata. Cannot compute MaxQuant features." - ) - - # Check other columns - for column_name in ["Intensities", "Matches", "Intensity coverage"]: - if column_name not in psm_list[0]["metadata"].keys(): - raise MissingMetadataError( - f"Missing {column_name} entry in PSM metadata. Cannot compute MaxQuant features." - ) - - # Add features to PSMs - for psm in psm_list: - psm["rescoring_features"].update(self._compute_features(psm["metadata"])) - - @staticmethod - def _all_psms_from_maxquant(psm_list): - """Check if the PSMs are from MaxQuant.""" - return (psm_list["source"] == "msms").all() - - def _compute_features(self, psm_metadata): - """Compute features from derived from intensities and mass errors.""" - features = {} - if all(k in psm_metadata.keys() for k in ["Intensities", self._mass_deviations_key]): - ( - features["mean_error_top7"], - features["sq_mean_error_top7"], - features["stdev_error_top7"], - ) = self._calculate_top7_peak_features( - psm_metadata["Intensities"], psm_metadata[self._mass_deviations_key] - ) - - if all(k in psm_metadata.keys() for k in ["Intensities", "Matches", "Intensity coverage"]): - ( - features["ln_explained_ion_current"], - features["ln_nterm_ion_current_ratio"], - features["ln_cterm_ion_current_ratio"], - features["ln_ms2_ion_current"], - ) = self._calculate_ion_current_features( - psm_metadata["Matches"], - psm_metadata["Intensities"], - psm_metadata["Intensity coverage"], - ) - - return features - - @staticmethod - def _calculate_top7_peak_features(intensities: str, mass_errors: str) -> Tuple[np.ndarray]: - """ - Calculate "top 7 peak"-related search engine features. - The following features are calculated: - - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the - highest intensities - - sq_mean_error_top7: Squared MeanErrorTop7 - - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion - peaks with the highest intensities - """ - try: - intensities = [float(i) for i in intensities.split(";")] - mass_errors = [float(i) for i in mass_errors.split(";")] - except ValueError: - return 0.0, 0.0, 0.0 - - indices_most_intens = np.array(intensities).argsort()[-1:-8:-1] - mass_errors_top7 = [(mass_errors[i]) for i in indices_most_intens] - mean_error_top7 = np.mean(mass_errors_top7) - sq_mean_error_top7 = mean_error_top7**2 - stdev_error_top7 = np.std(mass_errors_top7) - - return mean_error_top7, sq_mean_error_top7, stdev_error_top7 - - @staticmethod - def _calculate_ion_current_features( - matches: str, intensities: str, intensity_coverage: str - ) -> Tuple[np.ndarray]: - """ - Calculate ion current related search engine features. - The following features are calculated: - - ln_explained_ion_current: Summed intensity of identified fragment ions, - divided by that of all fragment ions, logged - - ln_nterm_ion_current_ratio: Summed intensity of identified N-terminal - fragments, divided by that of all identified fragments, logged - - ln_cterm_ion_current_ratio: Summed intensity of identified N-terminal - fragments, divided by that of all identified fragments, logged - - ln_ms2_ion_current: Summed intensity of all observed fragment ions, logged - """ - pseudo_count = 0.00001 - try: - ln_explained_ion_current = float(intensity_coverage) + pseudo_count - summed_intensities = sum([float(i) for i in intensities.split(";")]) - except ValueError: - return 0.0, 0.0, 0.0, 0.0 - - # Calculate ratio between matched b- and y-ion intensities - y_ion_int = sum( - [ - float(intensities.split(";")[i]) - for i, m in enumerate(matches.split(";")) - if m.startswith("y") - ] - ) - y_int_ratio = y_ion_int / summed_intensities - - ln_nterm_ion_current_ratio = (y_int_ratio + pseudo_count) * ln_explained_ion_current - ln_cterm_ion_current_ratio = (1 - y_int_ratio + pseudo_count) * ln_explained_ion_current - ln_ms2_ion_current = summed_intensities / ln_explained_ion_current - - out = [ - ln_explained_ion_current, - ln_nterm_ion_current_ratio, - ln_cterm_ion_current_ratio, - ln_ms2_ion_current, - ] - - return tuple([np.log(x) for x in out]) - - -class MissingMetadataError(MS2RescoreError): - """Exception raised when a required metadata entry is missing.""" - - pass diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py new file mode 100644 index 0000000..e0574b7 --- /dev/null +++ b/ms2rescore/feature_generators/ms2.py @@ -0,0 +1,88 @@ +""" +MS2-based feature generator. + +""" + +import logging + +from typing import List, Optional + +from psm_utils import PSMList +from ms2rescore_rs import ms2_features_from_ms2spectra +from ms2rescore.feature_generators.base import FeatureGeneratorBase + +logger = logging.getLogger(__name__) + + +class MS2FeatureGenerator(FeatureGeneratorBase): + """DeepLC retention time-based feature generator.""" + + def __init__( + self, + *args, + spectrum_path: Optional[str] = None, + spectrum_id_pattern: str = "(.*)", + fragmentation_model: str = "cidhcd", + mass_mode: str = "monoisotopic", + processes: int = 1, + calculate_hyperscore: bool = True, + **kwargs, + ) -> None: + """ + Generate MS2-based features for rescoring. + + Parameters + ---------- + + Attributes + ---------- + feature_names: list[str] + Names of the features that will be added to the PSMs. + + """ + super().__init__(*args, **kwargs) + + self.spectrum_path = spectrum_path + self.spectrum_id_pattern = spectrum_id_pattern + self.fragmentation_model = fragmentation_model.lower() + self.mass_mode = mass_mode.lower() + self.processes = processes + self.calculate_hyperscore = calculate_hyperscore + + @property + def feature_names(self) -> List[str]: + return [ + "ln_explained_intensity", + "ln_total_intensity", + "ln_explained_intensity_ratio", + "ln_explained_b_ion_ratio", + "ln_explained_y_ion_ratio", + "longest_b_ion_sequence", + "longest_y_ion_sequence", + "matched_b_ions", + "matched_b_ions_pct", + "matched_y_ions", + "matched_y_ions_pct", + "matched_ions_pct", + "hyperscore", + ] + + def add_features(self, psm_list: PSMList) -> None: + logger.info("Adding MS2-derived features to PSMs.") + + spectra = psm_list["spectrum"] + # Keep parity with your current behavior: + proformas = [psm.peptidoform.proforma.split("/")[0] for psm in psm_list] + seq_lens = [len(psm.peptidoform.sequence) for psm in psm_list] + + feature_dicts = ms2_features_from_ms2spectra( + spectra=spectra, + proformas=proformas, + seq_lens=seq_lens, + fragmentation_model=self.fragmentation_model, + mass_mode=self.mass_mode, + calculate_hyperscore=self.calculate_hyperscore, + ) + + for psm, feats in zip(psm_list, feature_dicts): + psm.rescoring_features.update(feats) diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index c830a79..b51a9dd 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -24,23 +24,15 @@ """ import logging -import multiprocessing -import os -import warnings -from itertools import chain -from typing import List, Optional, Union +from typing import Optional + +from ms2pip import process_MS2_spectra +from ms2rescore_rs import ms2pip_features_from_prediction_peak_arrays -import numpy as np -import pandas as pd -from ms2pip import correlate -from ms2pip.exceptions import NoMatchingSpectraFound -from ms2pip.result import ProcessingResult from psm_utils import PSMList -from rich.progress import track -from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) @@ -58,7 +50,7 @@ def __init__( spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", model_dir: Optional[str] = None, - processes: 1, + processes: int = 1, **kwargs, ) -> None: """ @@ -182,213 +174,35 @@ def add_features(self, psm_list: PSMList) -> None: """ logger.info("Adding MS²PIP-derived features to PSMs.") - psm_dict = psm_list.get_psm_dict() - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - - for runs in psm_dict.values(): - for run, psms in runs.items(): - logger.info( - f"Running MS²PIP for PSMs from run ({current_run}/{total_runs}) `{run}`..." - ) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - logger.debug(f"Using spectrum file `{spectrum_filename}`") - try: - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - ms2pip_results = correlate( - psms=psm_list_run, - spectrum_file=str(spectrum_filename), - spectrum_id_pattern=self.spectrum_id_pattern, - model=self.model, - ms2_tolerance=self.ms2_tolerance, - compute_correlations=False, - model_dir=self.model_dir, - processes=self.processes, - ) - except NoMatchingSpectraFound as e: - raise FeatureGeneratorException( - f"Could not find any matching spectra for PSMs from run `{run}`. " - "Please check that the `spectrum_id_pattern` and `psm_id_pattern` " - "options are configured correctly. See " - "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#mapping-psms-to-spectra" - " for more information." - ) from e - self._calculate_features(psm_list_run, ms2pip_results) - current_run += 1 - - def _calculate_features( - self, psm_list: PSMList, ms2pip_results: List[ProcessingResult] - ) -> None: - """Calculate features from all MS²PIP results and add to PSMs.""" - logger.debug("Calculating features from predicted spectra") - with multiprocessing.Pool(int(self.processes)) as pool: - # Use imap, so we can use a progress bar - counts_failed = 0 - for result, features in zip( - ms2pip_results, - track( - pool.imap(self._calculate_features_single, ms2pip_results, chunksize=1000), - total=len(ms2pip_results), - description="Calculating features...", - transient=True, - ), - ): - if features: - # Cannot use result.psm directly, as it is a copy from MS²PIP multiprocessing - try: - psm_list[result.psm_index]["rescoring_features"].update(features) - except (AttributeError, TypeError): - psm_list[result.psm_index]["rescoring_features"] = features - else: - counts_failed += 1 - - if counts_failed > 0: - logger.warning(f"Failed to calculate features for {counts_failed} PSMs") - - def _calculate_features_single(self, processing_result: ProcessingResult) -> Union[dict, None]: - """Calculate MS²PIP-based features for single PSM.""" - if ( - processing_result.observed_intensity is None - or processing_result.predicted_intensity is None - ): - return None - - # Suppress RuntimeWarnings about invalid values - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - - # Convert intensities to arrays - target_b = processing_result.predicted_intensity["b"].clip(np.log2(0.001)) - target_y = processing_result.predicted_intensity["y"].clip(np.log2(0.001)) - target_all = np.concatenate([target_b, target_y]) - prediction_b = processing_result.observed_intensity["b"].clip(np.log2(0.001)) - prediction_y = processing_result.observed_intensity["y"].clip(np.log2(0.001)) - prediction_all = np.concatenate([prediction_b, prediction_y]) - - # Prepare 'unlogged' intensity arrays - target_b_unlog = 2**target_b - 0.001 - target_y_unlog = 2**target_y - 0.001 - target_all_unlog = 2**target_all - 0.001 - prediction_b_unlog = 2**prediction_b - 0.001 - prediction_y_unlog = 2**prediction_y - 0.001 - prediction_all_unlog = 2**prediction_all - 0.001 - - # Calculate absolute differences - abs_diff_b = np.abs(target_b - prediction_b) - abs_diff_y = np.abs(target_y - prediction_y) - abs_diff_all = np.abs(target_all - prediction_all) - abs_diff_b_unlog = np.abs(target_b_unlog - prediction_b_unlog) - abs_diff_y_unlog = np.abs(target_y_unlog - prediction_y_unlog) - abs_diff_all_unlog = np.abs(target_all_unlog - prediction_all_unlog) - - # Compute features - feature_values = [ - # Features between spectra in log space - np.corrcoef(target_all, prediction_all)[0][1], # Pearson all ions - np.corrcoef(target_b, prediction_b)[0][1], # Pearson b ions - np.corrcoef(target_y, prediction_y)[0][1], # Pearson y ions - _mse(target_all, prediction_all), # MSE all ions - _mse(target_b, prediction_b), # MSE b ions - _mse(target_y, prediction_y), # MSE y ions - np.min(abs_diff_all), # min_abs_diff_norm - np.max(abs_diff_all), # max_abs_diff_norm - np.quantile(abs_diff_all, 0.25), # abs_diff_Q1_norm - np.quantile(abs_diff_all, 0.5), # abs_diff_Q2_norm - np.quantile(abs_diff_all, 0.75), # abs_diff_Q3_norm - np.mean(abs_diff_all), # mean_abs_diff_norm - np.std(abs_diff_all), # std_abs_diff_norm - np.min(abs_diff_b), # ionb_min_abs_diff_norm - np.max(abs_diff_b), # ionb_max_abs_diff_norm - np.quantile(abs_diff_b, 0.25), # ionb_abs_diff_Q1_norm - np.quantile(abs_diff_b, 0.5), # ionb_abs_diff_Q2_norm - np.quantile(abs_diff_b, 0.75), # ionb_abs_diff_Q3_norm - np.mean(abs_diff_b), # ionb_mean_abs_diff_norm - np.std(abs_diff_b), # ionb_std_abs_diff_norm - np.min(abs_diff_y), # iony_min_abs_diff_norm - np.max(abs_diff_y), # iony_max_abs_diff_norm - np.quantile(abs_diff_y, 0.25), # iony_abs_diff_Q1_norm - np.quantile(abs_diff_y, 0.5), # iony_abs_diff_Q2_norm - np.quantile(abs_diff_y, 0.75), # iony_abs_diff_Q3_norm - np.mean(abs_diff_y), # iony_mean_abs_diff_norm - np.std(abs_diff_y), # iony_std_abs_diff_norm - np.dot(target_all, prediction_all), # Dot product all ions - np.dot(target_b, prediction_b), # Dot product b ions - np.dot(target_y, prediction_y), # Dot product y ions - _cosine_similarity(target_all, prediction_all), # Cos similarity all ions - _cosine_similarity(target_b, prediction_b), # Cos similarity b ions - _cosine_similarity(target_y, prediction_y), # Cos similarity y ions - # Same features in normal space - np.corrcoef(target_all_unlog, prediction_all_unlog)[0][1], # Pearson all - np.corrcoef(target_b_unlog, prediction_b_unlog)[0][1], # Pearson b - np.corrcoef(target_y_unlog, prediction_y_unlog)[0][1], # Pearson y - _spearman(target_all_unlog, prediction_all_unlog), # Spearman all ions - _spearman(target_b_unlog, prediction_b_unlog), # Spearman b ions - _spearman(target_y_unlog, prediction_y_unlog), # Spearman y ions - _mse(target_all_unlog, prediction_all_unlog), # MSE all ions - _mse(target_b_unlog, prediction_b_unlog), # MSE b ions - _mse(target_y_unlog, prediction_y_unlog), # MSE y ions, - # Ion type with min absolute difference - 0 if np.min(abs_diff_b_unlog) <= np.min(abs_diff_y_unlog) else 1, - # Ion type with max absolute difference - 0 if np.max(abs_diff_b_unlog) >= np.max(abs_diff_y_unlog) else 1, - np.min(abs_diff_all_unlog), # min_abs_diff - np.max(abs_diff_all_unlog), # max_abs_diff - np.quantile(abs_diff_all_unlog, 0.25), # abs_diff_Q1 - np.quantile(abs_diff_all_unlog, 0.5), # abs_diff_Q2 - np.quantile(abs_diff_all_unlog, 0.75), # abs_diff_Q3 - np.mean(abs_diff_all_unlog), # mean_abs_diff - np.std(abs_diff_all_unlog), # std_abs_diff - np.min(abs_diff_b_unlog), # ionb_min_abs_diff - np.max(abs_diff_b_unlog), # ionb_max_abs_diff_norm - np.quantile(abs_diff_b_unlog, 0.25), # ionb_abs_diff_Q1 - np.quantile(abs_diff_b_unlog, 0.5), # ionb_abs_diff_Q2 - np.quantile(abs_diff_b_unlog, 0.75), # ionb_abs_diff_Q3 - np.mean(abs_diff_b_unlog), # ionb_mean_abs_diff - np.std(abs_diff_b_unlog), # ionb_std_abs_diff - np.min(abs_diff_y_unlog), # iony_min_abs_diff - np.max(abs_diff_y_unlog), # iony_max_abs_diff - np.quantile(abs_diff_y_unlog, 0.25), # iony_abs_diff_Q1 - np.quantile(abs_diff_y_unlog, 0.5), # iony_abs_diff_Q2 - np.quantile(abs_diff_y_unlog, 0.75), # iony_abs_diff_Q3 - np.mean(abs_diff_y_unlog), # iony_mean_abs_diff - np.std(abs_diff_y_unlog), # iony_std_abs_diff - np.dot(target_all_unlog, prediction_all_unlog), # Dot product all ions - np.dot(target_b_unlog, prediction_b_unlog), # Dot product b ions - np.dot(target_y_unlog, prediction_y_unlog), # Dot product y ions - _cosine_similarity(target_all_unlog, prediction_all_unlog), # Cos similarity all - _cosine_similarity(target_b_unlog, prediction_b_unlog), # Cos similarity b ions - _cosine_similarity(target_y_unlog, prediction_y_unlog), # Cos similarity y ions - ] - - features = dict( - zip( - self.feature_names, - [0.0 if np.isnan(ft) else ft for ft in feature_values], - ) + ms2pip_results = process_MS2_spectra( + psms=psm_list, + model=self.model, + model_dir=self.model_dir, + processes=self.processes, ) - - return features - - -def _spearman(x: np.ndarray, y: np.ndarray) -> float: - """Spearman rank correlation.""" - x = np.array(x) - y = np.array(y) - x_rank = pd.Series(x).rank() - y_rank = pd.Series(y).rank() - return np.corrcoef(x_rank, y_rank)[0][1] - - -def _mse(x: np.ndarray, y: np.ndarray) -> float: - """Mean squared error""" - x = np.array(x) - y = np.array(y) - return np.mean((x - y) ** 2) - - -def _cosine_similarity(x: np.ndarray, y: np.ndarray) -> float: - """Cosine similarity""" - x = np.array(x) - y = np.array(y) - return np.dot(x, y) / (np.linalg.norm(x, 2) * np.linalg.norm(y, 2)) + self._calculate_features(psm_list, ms2pip_results) + + def _calculate_features(self, psm_list, ms2pip_results): + idx = [] + pred_b = [] + pred_y = [] + obs_b = [] + obs_y = [] + + for r in ms2pip_results: + if r.observed_intensity is None or r.predicted_intensity is None: + continue + idx.append(r.psm_index) + pred_b.append(r.predicted_intensity["b"]) + pred_y.append(r.predicted_intensity["y"]) + obs_b.append(r.observed_intensity["b"]) + obs_y.append(r.observed_intensity["y"]) + + results = ms2pip_features_from_prediction_peak_arrays(idx, pred_b, pred_y, obs_b, obs_y) + + for psm_index, feats in results: + if feats: + try: + psm_list[psm_index]["rescoring_features"].update(feats) + except (AttributeError, TypeError): + psm_list[psm_index]["rescoring_features"] = feats diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py index 583a856..aa0724b 100644 --- a/ms2rescore/gui/__main__.py +++ b/ms2rescore/gui/__main__.py @@ -1,5 +1,6 @@ """Entrypoint for MS²Rescore GUI.""" +import contextlib import multiprocessing import os import sys diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 805be2c..29042e9 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -12,6 +12,7 @@ }, "maxquant": {} }, + "psm_generator": {}, "rescoring_engine": { "mokapot": { "train_fdr": 0.01, diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index 3840a9e..c56edbc 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -62,6 +62,18 @@ "mokapot": {} } }, + "psm_generator": { + "description": "PSM generator and their configuration.", + "type": "object", + "minProperties": 0, + "maxProperties": 1, + "patternProperties": { + ".*": { "$ref": "#/definitions/psm_generator" }, + "mumble": { + "$ref": "#/definitions/mumble" + } + } + }, "config_file": { "description": "Path to configuration file", "oneOf": [{ "type": "string" }, { "type": "null" }] @@ -189,6 +201,7 @@ "oneOf": [{ "type": "boolean" }, { "type": "null" }], "default": false } + } } }, @@ -203,6 +216,11 @@ "type": "object", "additionalProperties": true }, + "psm_generator": { + "description": "Additional PSM feature generator configuration", + "type": "object", + "additionalProperties": true + }, "basic": { "$ref": "#/definitions/feature_generator", "description": "Basic feature generator configuration", @@ -283,6 +301,30 @@ } } }, + "mumble": { + "$ref": "#/definitions/psm_generator", + "description": "Mumble PSM generator configuration using Mubmle", + "type": "object", + "additionalProperties": true, + "properties": { + "aa_combinations": { + "description": "Additional amino acid combinations to consider as mass shift", + "type": "integer", + "default": 0 + }, + "fasta_file": { + "description": "Maximum number of modifications per peptide", + "oneOf": [{ "type": "string" }, { "type": "null" }], + "default": false + }, + "mass_error": { + "description": "mass error in Da", + "type": "number", + "default": 0.2 + } + } + }, + "mokapot": { "$ref": "#/definitions/rescoring_engine", "description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.", diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 65b3dcd..0f5df3c 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -84,7 +84,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: { "before_rescoring_score": psm.score, "before_rescoring_qvalue": psm.qvalue, - "before_rescoring_pep": psm.pep, + "before_rescoring_pep": psm.pep + if psm.pep is not None + else float("nan"), # until fixed in psm_utils "before_rescoring_rank": psm.rank, } ) @@ -109,6 +111,41 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: psm_list.add_fixed_modifications(config["fixed_modifications"]) psm_list.apply_fixed_modifications() + if config["psm_id_pattern"]: + pattern = re.compile(config["psm_id_pattern"]) + logger.debug("Applying 'psm_id_pattern'...") + logger.debug( + f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'" + ) + new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] + psm_list["spectrum_id"] = new_ids + + # Addition of Modifications for mass shifts in the PSMs with Mumble + if "mumble" in config["psm_generator"]: + try: + from mumble import PSMHandler + except ImportError: + raise MS2RescoreConfigurationError( + "mumble is not installed. Please install it with: pip install ms2rescore[mumble]" + ) + logger.debug("Applying modifications for mass shifts using Mumble...") + # set inlcude original psm to True and include decoy psm to true + config["psm_generator"]["mumble"]["include_original_psm"] = True + config["psm_generator"]["mumble"]["include_decoy_psm"] = True + mumble_config = config["psm_generator"]["mumble"] + + # Check if psm_list[0].rescoring_features is empty or not + if psm_list[0].rescoring_features: + logger.debug("Removing psm_file rescoring features from PSMs...") + # psm_list.remove_rescoring_features() # TODO add this to psm_utils + for psm in psm_list: + psm.rescoring_features = {} + + psm_handler = PSMHandler( + **mumble_config, + ) + psm_list = psm_handler.add_modified_psms(psm_list) + return psm_list @@ -120,7 +157,7 @@ def _read_psms(config, psm_list): psm_list = [] for current_file, psm_file in enumerate(config["psm_file"]): logger.info( - f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..." + f"Reading PSMs from PSM file ({current_file + 1}/{total_files}): '{psm_file}'..." ) psm_list.extend( psm_utils.io.read_file( @@ -186,7 +223,7 @@ def _parse_values_from_spectrum_id( ["retention_time", "ion_mobility"], ): if pattern: - logger.debug(f"Parsing {label} from spectrum_id with regex pattern " f"{pattern}") + logger.debug(f"Parsing {label} from spectrum_id with regex pattern {pattern}") try: pattern = re.compile(pattern) psm_list[key] = [ diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 103619c..bedbe7d 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -3,11 +3,12 @@ import logging import re from enum import Enum -from itertools import chain -from typing import Optional, Set, Tuple +from typing import Optional, Set import numpy as np -from ms2rescore_rs import Precursor, get_precursor_info +from ms2rescore_rs import Precursor, get_ms2_spectra, MS2Spectrum +from rich.progress import track + from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError @@ -24,7 +25,6 @@ class MSDataType(str, Enum): precursor_mz = "precursor m/z" ms2_spectra = "MS2 spectra" - # Mimic behavior of StrEnum (Python >=3.11) def __str__(self): return self.value @@ -69,7 +69,7 @@ def add_precursor_values( # Check which data types are missing # Missing if: all values are 0, OR any values are None/NaN missing_data_types = set() - if spectrum_path is None: + if MSDataType.ms2_spectra in required_data_types: missing_data_types.add(MSDataType.ms2_spectra) rt_values = np.asarray(psm_list["retention_time"]) @@ -101,39 +101,67 @@ def add_precursor_values( "Spectrum path must be provided to parse precursor values that are not present in the" " PSM list." ) + else: + LOGGER.debug( + "Missing required data types: %s. Parsing from spectrum files.", + ", ".join(str(dt) for dt in data_types_to_parse), + ) # Get precursor values from spectrum files LOGGER.info("Parsing precursor info from spectrum files...") - mz, rt, im = _get_precursor_values(psm_list, spectrum_path, spectrum_id_pattern) + _add_precursor_values(psm_list, spectrum_path, spectrum_id_pattern) + + # Extract precursor values from MS2 spectrum objects in a single pass + precursor_data = [ + (ms2.precursor.rt, ms2.precursor.im, ms2.precursor.mz) for ms2 in psm_list["spectrum"] + ] + rts, ims, mzs = map(np.array, zip(*precursor_data)) # Determine which data types were successfully found in spectrum files - # ms2rescore_rs always returns 0.0 for missing values - found_data_types = {MSDataType.ms2_spectra} # MS2 spectra available when processing files - if np.all(rt != 0.0): + found_data_types = {MSDataType.ms2_spectra} + + # Add found data types: if missing and all zeros, raise error + if not np.all(rts == 0.0): found_data_types.add(MSDataType.retention_time) - if np.all(im != 0.0): + if MSDataType.retention_time in data_types_to_parse: + LOGGER.debug( + "Missing retention time values in PSM list. Updating from spectrum files." + ) + psm_list["retention_time"] = rts + elif MSDataType.retention_time in data_types_to_parse: + raise SpectrumParsingError( + "Retention time values are required but not available in spectrum files " + "(all values are zero)." + ) + + if not np.all(ims == 0.0): found_data_types.add(MSDataType.ion_mobility) - if np.all(mz != 0.0): + if MSDataType.ion_mobility in data_types_to_parse: + LOGGER.debug("Missing ion mobility values in PSM list. Updating from spectrum files.") + psm_list["ion_mobility"] = ims + elif MSDataType.ion_mobility in data_types_to_parse: + raise SpectrumParsingError( + "Ion mobility values are required but not available in spectrum files " + "(all values are zero)." + ) + + if np.all(mzs == 0.0): found_data_types.add(MSDataType.precursor_mz) + if MSDataType.precursor_mz in data_types_to_parse: + LOGGER.debug("Missing precursor m/z values in PSM list. Updating from spectrum files.") + psm_list["precursor_mz"] = mzs + elif MSDataType.precursor_mz in data_types_to_parse: + raise SpectrumParsingError( + "Precursor m/z values are required but not available in spectrum files " + "(all values are zero)." + ) - # Update PSM list with missing precursor values that were found - update_types = data_types_to_parse & found_data_types - - if MSDataType.retention_time in update_types: - LOGGER.debug("Missing retention time values in PSM list. Updating from spectrum files.") - psm_list["retention_time"] = rt - if MSDataType.ion_mobility in update_types: - LOGGER.debug("Missing ion mobility values in PSM list. Updating from spectrum files.") - psm_list["ion_mobility"] = im - if MSDataType.precursor_mz in update_types: - LOGGER.debug("Missing precursor m/z values in PSM list. Updating from spectrum files.") - psm_list["precursor_mz"] = mz - elif ( + # Check if precursor m/z values are consistent between PSMs and spectrum files + if ( MSDataType.precursor_mz not in missing_data_types and MSDataType.precursor_mz in found_data_types ): - # Check if precursor m/z values are consistent between PSMs and spectrum files - mz_diff = np.abs(psm_list["precursor_mz"] - mz) + mz_diff = np.abs(psm_list["precursor_mz"] - mzs) if np.mean(mz_diff) > 1e-2: LOGGER.warning( "Mismatch between precursor m/z values in PSM list and spectrum files (mean " @@ -149,20 +177,23 @@ def add_precursor_values( return available_data_types -def _apply_spectrum_id_pattern( - precursors: dict[str, Precursor], pattern: str +def _acquire_observed_spectra_dict( + ms2: list[MS2Spectrum], pattern: str, spectrum_ids: list[str] ) -> dict[str, Precursor]: """Apply spectrum ID pattern to precursor IDs.""" # Map precursor IDs using regex pattern compiled_pattern = re.compile(pattern) - id_mapping = { - match.group(1): spectrum_id - for spectrum_id in precursors.keys() - if (match := compiled_pattern.search(spectrum_id)) is not None + spectrum_ids_set = set(spectrum_ids) # For faster lookup + + ms2_observed_spectra_mapping = { + match.group(1): ms2_spectrum + for ms2_spectrum in ms2 + if (match := compiled_pattern.search(str(ms2_spectrum.identifier))) is not None + and match.group(1) in spectrum_ids_set } # Validate that any IDs were matched - if not id_mapping: + if not ms2_observed_spectra_mapping: raise MS2RescoreConfigurationError( "'spectrum_id_pattern' did not match any spectrum-file IDs. Please check and try " "again. See " @@ -170,64 +201,33 @@ def _apply_spectrum_id_pattern( "for more information." ) - # Validate that the same number of unique IDs were matched - elif len(id_mapping) != len(precursors): - new_id, old_id = next(iter(id_mapping.items())) - raise MS2RescoreConfigurationError( - "'spectrum_id_pattern' resulted in a different number of unique spectrum IDs. This " - "indicates issues with the regex pattern. Please check and try again. " - f"Example old ID: '{old_id}' -> new ID: '{new_id}'. " - "See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " - "for more information." - ) - - precursors = {new_id: precursors[orig_id] for new_id, orig_id in id_mapping.items()} + return ms2_observed_spectra_mapping - return precursors - -def _get_precursor_values( +def _add_precursor_values( psm_list: PSMList, spectrum_path: str, spectrum_id_pattern: Optional[str] = None -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: +) -> None: """Get precursor m/z, RT, and IM from spectrum files.""" # Iterate over different runs in PSM list - precursor_dict = dict() - psm_dict = psm_list.get_psm_dict() - for runs in psm_dict.values(): - for run_name, psms in runs.items(): - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_file = infer_spectrum_path(spectrum_path, run_name) - - LOGGER.debug("Reading spectrum file: '%s'", spectrum_file) - precursors: dict[str, Precursor] = get_precursor_info(str(spectrum_file)) - - # Parse spectrum IDs with regex pattern if provided - if spectrum_id_pattern: - precursors = _apply_spectrum_id_pattern(precursors, spectrum_id_pattern) - - # Ensure all PSMs have precursor values - for psm in psm_list_run: - if psm.spectrum_id not in precursors: - raise MS2RescoreConfigurationError( - "Mismatch between PSM and spectrum file IDs. Could not find precursor " - f"values for PSM with ID {psm.spectrum_id} in run {run_name}.\n" - "Please check that the `spectrum_id_pattern` and `psm_id_pattern` options " - "are configured correctly. See " - "https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra" - " for more information.\n" - f"Example ID from PSM file: {psm.spectrum_id}\n" - f"Example ID from spectrum file: {list(precursors.keys())[0]}" - ) - - # Store precursor values in dictionary - precursor_dict[run_name] = precursors - - # Reshape precursor values into arrays matching PSM list - mzs = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].mz for psm in psm_list), float) - rts = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].rt for psm in psm_list), float) - ims = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].im for psm in psm_list), float) - - return mzs, rts, ims + if spectrum_id_pattern is None: + spectrum_id_pattern = r"^(.*)$" # Match entire identifier if no pattern provided + + for run_name in track(set(psm_list["run"])): + run_mask = psm_list["run"] == run_name + psm_list_run = psm_list[run_mask] + spectrum_file = infer_spectrum_path(spectrum_path, run_name) + + LOGGER.debug("Reading spectrum file: '%s'", spectrum_file) + ms2_spectra: list[MS2Spectrum] = get_ms2_spectra(str(spectrum_file)) + + # Parse spectrum IDs with regex pattern if provided + ms2_spectra_dict = _acquire_observed_spectra_dict( + ms2_spectra, spectrum_id_pattern, psm_list_run["spectrum_id"] + ) + + psm_list_run["spectrum"] = [ + ms2_spectra_dict[spec_id] for spec_id in psm_list_run["spectrum_id"] + ] class SpectrumParsingError(MS2RescoreError): diff --git a/ms2rescore/report/__main__.py b/ms2rescore/report/__main__.py index 86fd5f2..6ab5e6d 100644 --- a/ms2rescore/report/__main__.py +++ b/ms2rescore/report/__main__.py @@ -1,6 +1,8 @@ import logging +from pathlib import Path import click +import psm_utils.io from rich.logging import RichHandler from ms2rescore.report.generate import generate_report @@ -9,8 +11,19 @@ @click.command() -@click.argument("output_prefix", type=str) -def main(**kwargs): +@click.argument("psm_file", type=click.Path(exists=True)) +@click.option( + "--output", + "-o", + type=click.Path(), + default=None, + help="Output path for the report HTML file. If not provided, will be based on PSM file name.", +) +def main(psm_file, output): + """Generate MS²Rescore report from a PSM TSV file. + + PSM_FILE: Path to the PSM TSV file (e.g., output.psms.tsv) + """ logging.getLogger("mokapot").setLevel(logging.WARNING) logging.basicConfig( level=logging.INFO, @@ -19,7 +32,38 @@ def main(**kwargs): ) try: - generate_report(kwargs["output_prefix"]) + psm_file_path = Path(psm_file) + + # Determine output path + if output: + output_path = Path(output) + else: + # Try to infer from ms2rescore naming convention + if ".ms2rescore.psms.tsv" in psm_file_path.name: + output_prefix = str(psm_file_path).replace(".psms.tsv", "") + else: + # Use the PSM file name without extension + output_prefix = str(psm_file_path.with_suffix("")) + output_path = Path(output_prefix + ".report.html") + + logger.info(f"Reading PSMs from {psm_file_path}...") + psm_list = psm_utils.io.read_file(psm_file_path, filetype="tsv", show_progressbar=True) + + logger.info("Generating report...") + # Try to infer output prefix for finding other files + if ".ms2rescore.psms.tsv" in psm_file_path.name: + output_prefix = str(psm_file_path).replace(".psms.tsv", "") + else: + output_prefix = str(psm_file_path.with_suffix("")) + + generate_report( + output_path_prefix=output_prefix, + psm_list=psm_list, + output_file=output_path, + ) + + logger.info(f"✓ Report generated: {output_path}") + except Exception as e: logger.exception(e) exit(1) diff --git a/ms2rescore/report/charts.py b/ms2rescore/report/charts.py index 669bcc6..71db605 100644 --- a/ms2rescore/report/charts.py +++ b/ms2rescore/report/charts.py @@ -1,7 +1,9 @@ """Collection of Plotly-based charts for reporting results of MS²Rescore.""" +import importlib.resources import warnings from collections import defaultdict +from pathlib import Path from typing import Dict, List, Optional, Tuple, Union import mokapot @@ -635,3 +637,369 @@ def feature_ecdf_auc_bar( }, color_discrete_map=color_discrete_map, ) + + +def rt_scatter( + df: pd.DataFrame, + predicted_column: str = "Predicted retention time", + observed_column: str = "Observed retention time", + xaxis_label: str = "Observed retention time", + yaxis_label: str = "Predicted retention time", + plot_title: str = "Predicted vs. observed retention times", +) -> go.Figure: + """ + Plot a scatter plot of the predicted vs. observed retention times. + + Parameters + ---------- + df : pd.DataFrame + Dataframe containing the predicted and observed retention times. + predicted_column : str, optional + Name of the column containing the predicted retention times, by default + ``Predicted retention time``. + observed_column : str, optional + Name of the column containing the observed retention times, by default + ``Observed retention time``. + xaxis_label : str, optional + X-axis label, by default ``Observed retention time``. + yaxis_label : str, optional + Y-axis label, by default ``Predicted retention time``. + plot_title : str, optional + Scatter plot title, by default ``Predicted vs. observed retention times`` + + """ + # Draw scatter + fig = px.scatter( + df, + x=observed_column, + y=predicted_column, + opacity=0.3, + ) + + # Draw diagonal line + fig.add_scatter( + x=[min(df[observed_column]), max(df[observed_column])], + y=[min(df[observed_column]), max(df[observed_column])], + mode="lines", + line=dict(color="red", width=3, dash="dash"), + ) + + # Hide legend + fig.update_layout( + title=plot_title, + showlegend=False, + xaxis_title=xaxis_label, + yaxis_title=yaxis_label, + ) + + return fig + + +def rt_distribution_baseline( + df: pd.DataFrame, + predicted_column: str = "Predicted retention time", + observed_column: str = "Observed retention time", +) -> go.Figure: + """ + Plot a distribution plot of the relative mean absolute error of the current + DeepLC performance compared to the baseline performance. + + Parameters + ---------- + df : pd.DataFrame + Dataframe containing the predicted and observed retention times. + predicted_column : str, optional + Name of the column containing the predicted retention times, by default + ``Predicted retention time``. + observed_column : str, optional + Name of the column containing the observed retention times, by default + ``Observed retention time``. + + """ + # Get baseline data from deeplc package + try: + import deeplc.package_data + + baseline_path = ( + Path(importlib.resources.files(deeplc.package_data)) + / "baseline_performance" + / "baseline_predictions.csv" + ) + baseline_df = pd.read_csv(baseline_path) + except (ImportError, FileNotFoundError): + # If deeplc is not installed or baseline data not found, return empty figure + fig = go.Figure() + fig.add_annotation( + text="DeepLC baseline data not available. Install DeepLC to view performance comparison.", + showarrow=False, + ) + return fig + + baseline_df["rel_mae_best"] = baseline_df[ + ["rel_mae_transfer_learning", "rel_mae_new_model", "rel_mae_calibrate"] + ].min(axis=1) + baseline_df.fillna(0.0, inplace=True) + + # Calculate current RMAE and percentile compared to baseline + mae = sum(abs(df[observed_column] - df[predicted_column])) / len(df.index) + mae_rel = (mae / max(df[observed_column])) * 100 + percentile = round((baseline_df["rel_mae_transfer_learning"] < mae_rel).mean() * 100, 1) + + # Calculate x-axis range with 5% padding + all_values = np.append(baseline_df["rel_mae_transfer_learning"].values, mae_rel) + padding = (all_values.max() - all_values.min()) / 20 # 5% padding + x_min = all_values.min() - padding + x_max = all_values.max() + padding + + # Make labels human-readable + hover_label_mapping = { + "train_number": "Training dataset size", + "rel_mae_transfer_learning": "RMAE with transfer learning", + "rel_mae_new_model": "RMAE with new model from scratch", + "rel_mae_calibrate": "RMAE with calibrating existing model", + "rel_mae_best": "RMAE with best method", + } + label_mapping = hover_label_mapping.copy() + label_mapping.update({"Unnamed: 0": "Dataset"}) + + # Generate plot + fig = px.histogram( + data_frame=baseline_df, + x="rel_mae_best", + marginal="rug", + hover_data=hover_label_mapping.keys(), + hover_name="Unnamed: 0", + labels=label_mapping, + opacity=0.8, + ) + fig.add_vline( + x=mae_rel, + line_width=3, + line_dash="dash", + line_color="red", + annotation_text=f"Current performance (percentile {percentile}%)", + annotation_position="top left", + name="Current performance", + row=1, + ) + fig.update_xaxes(range=[x_min, x_max]) + fig.update_layout( + title=(f"Current DeepLC performance compared to {len(baseline_df.index)} datasets"), + xaxis_title="Relative mean absolute error (%)", + ) + + return fig + + +def score_scatter_plot_df( + psm_df: pd.DataFrame, + fdr_threshold: float = 0.01, +) -> go.Figure: + """ + Plot PSM scores before and after rescoring from a dataframe. + + Parameters + ---------- + psm_df + Dataframe with PSM information including score_before, score_after, + qvalue_before, qvalue_after, and is_decoy columns. + fdr_threshold + FDR threshold for drawing threshold lines. + + Returns + ------- + go.Figure + Plotly figure with score comparison. + """ + if "score_before" not in psm_df.columns or "score_after" not in psm_df.columns: + figure = go.Figure() + figure.add_annotation( + text="No before/after score data available for comparison.", + showarrow=False, + ) + return figure + + # Prepare data + plot_df = psm_df.copy() + plot_df["PSM type"] = plot_df["is_decoy"].map({True: "decoy", False: "target"}) + + # Get score thresholds + try: + score_threshold_before = ( + plot_df[plot_df["qvalue_before"] <= fdr_threshold] + .sort_values("qvalue_before", ascending=False)["score_before"] + .iloc[0] + ) + except (IndexError, KeyError): + score_threshold_before = None + + try: + score_threshold_after = ( + plot_df[plot_df["qvalue_after"] <= fdr_threshold] + .sort_values("qvalue_after", ascending=False)["score_after"] + .iloc[0] + ) + except (IndexError, KeyError): + score_threshold_after = None + + # Plot + fig = px.scatter( + data_frame=plot_df, + x="score_before", + y="score_after", + color="PSM type", + marginal_x="histogram", + marginal_y="histogram", + opacity=0.1, + labels={ + "score_before": "PSM score (before rescoring)", + "score_after": "PSM score (after rescoring)", + }, + ) + + # Draw FDR thresholds + if score_threshold_before: + fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1) + fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1) + if score_threshold_after: + fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1) + fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2) + + return fig + + +def fdr_plot_comparison_df( + psm_df: pd.DataFrame, +) -> go.Figure: + """ + Plot number of identifications in function of FDR threshold before/after rescoring from dataframe. + + Parameters + ---------- + psm_df + Dataframe with PSM information including qvalue_before, qvalue_after, and is_decoy columns. + + Returns + ------- + go.Figure + Plotly figure with FDR comparison. + """ + if "qvalue_before" not in psm_df.columns or "qvalue_after" not in psm_df.columns: + figure = go.Figure() + figure.add_annotation( + text="No before/after q-value data available for comparison.", + showarrow=False, + ) + return figure + + # Filter targets only + targets = psm_df[~psm_df["is_decoy"]].copy() + + # Prepare data in long format + plot_data = pd.concat( + [ + targets[["qvalue_before"]] + .rename(columns={"qvalue_before": "q-value"}) + .assign(**{"before/after": "before rescoring"}), + targets[["qvalue_after"]] + .rename(columns={"qvalue_after": "q-value"}) + .assign(**{"before/after": "after rescoring"}), + ] + ) + + # Plot + fig = px.ecdf( + data_frame=plot_data, + x="q-value", + color="before/after", + log_x=True, + ecdfnorm=None, + labels={ + "q-value": "FDR threshold", + "before/after": "", + }, + color_discrete_map={ + "before rescoring": "#316395", + "after rescoring": "#319545", + }, + ) + fig.add_vline(x=0.01, line_dash="dash", line_color="black") + fig.update_layout(yaxis_title="Identified PSMs") + return fig + + +def identification_overlap_df( + psm_df: pd.DataFrame, + fdr_threshold: float = 0.01, +) -> go.Figure: + """ + Plot stacked bar charts of removed, retained, and gained PSMs and peptides from dataframe. + + Parameters + ---------- + psm_df + Dataframe with PSM information including qvalue_before, qvalue_after, + is_decoy, and peptidoform columns. + fdr_threshold + FDR threshold for counting identifications. + + Returns + ------- + go.Figure + Plotly figure with identification overlap. + """ + if "qvalue_before" not in psm_df.columns or "qvalue_after" not in psm_df.columns: + figure = go.Figure() + figure.add_annotation( + text="No before/after q-value data available for comparison.", + showarrow=False, + ) + return figure + + overlap_data = defaultdict(dict) + + # PSM level + targets = psm_df[~psm_df["is_decoy"]] + psms_before = set(targets[targets["qvalue_before"] <= fdr_threshold].index) + psms_after = set(targets[targets["qvalue_after"] <= fdr_threshold].index) + + overlap_data["removed"]["psms"] = -len(psms_before - psms_after) + overlap_data["retained"]["psms"] = len(psms_after.intersection(psms_before)) + overlap_data["gained"]["psms"] = len(psms_after - psms_before) + + # Peptide level + if "peptidoform" in psm_df.columns: + peptides_before = set( + targets[targets["qvalue_before"] <= fdr_threshold]["peptidoform"].unique() + ) + peptides_after = set( + targets[targets["qvalue_after"] <= fdr_threshold]["peptidoform"].unique() + ) + + overlap_data["removed"]["peptides"] = -len(peptides_before - peptides_after) + overlap_data["retained"]["peptides"] = len(peptides_after.intersection(peptides_before)) + overlap_data["gained"]["peptides"] = len(peptides_after - peptides_before) + + colors = ["#953331", "#316395", "#319545"] + levels = list(overlap_data["retained"].keys()) + fig = plotly.subplots.make_subplots(rows=len(levels), cols=1) + + for i, level in enumerate(levels): + for (item, data), color in zip(overlap_data.items(), colors): + if level not in data: + continue + fig.add_trace( + go.Bar( + y=[level], + x=[data[level]], + marker={"color": color}, + orientation="h", + name=item, + showlegend=True if i == 0 else False, + ), + row=i + 1, + col=1, + ) + fig.update_layout(barmode="relative") + + return fig diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py index 4a5d411..1b82c91 100644 --- a/ms2rescore/report/generate.py +++ b/ms2rescore/report/generate.py @@ -4,7 +4,6 @@ import json import logging from datetime import datetime -from itertools import cycle from pathlib import Path from typing import Dict, Optional @@ -24,8 +23,11 @@ import ms2rescore.report.charts as charts import ms2rescore.report.templates as templates from ms2rescore.report.utils import ( + calculate_fdr_stats, + create_psm_dataframe, get_confidence_estimates, get_feature_values, + infer_feature_names_from_psm_list, read_feature_names, ) @@ -50,6 +52,8 @@ def generate_report( psm_list: Optional[psm_utils.PSMList] = None, feature_names: Optional[Dict[str, set]] = None, use_txt_log: bool = False, + output_file: Optional[Path] = None, + use_mokapot: bool = False, ): """ Generate the report. @@ -68,6 +72,11 @@ def generate_report( use_txt_log If True, the log file will be read from ``output_path_prefix + ".log.txt"`` instead of ``output_path_prefix + ".log.html"``. + output_file + Path to the output HTML file. If not provided, will be ``output_path_prefix + ".report.html"``. + use_mokapot + If True, use mokapot LinearConfidence objects for overview charts (legacy mode). + If False (default), use PSM dataframe directly. """ files = _collect_files(output_path_prefix, use_txt_log=use_txt_log) @@ -80,27 +89,53 @@ def generate_report( else: raise FileNotFoundError("PSM file not found and no PSM list provided.") - # Read config - config = json.loads(files["configuration"].read_text()) + # Create comprehensive dataframe from PSM list + logger.debug("Creating PSM dataframe...") + psm_df = create_psm_dataframe(psm_list) - logger.debug("Recalculating confidence estimates...") - fasta_file = config["ms2rescore"]["fasta_file"] - confidence_before, confidence_after = get_confidence_estimates(psm_list, fasta_file) + # Try to read config, but don't fail if it doesn't exist + config = None + if files["configuration"] and files["configuration"].is_file(): + try: + config = json.loads(files["configuration"].read_text()) + except (json.JSONDecodeError, FileNotFoundError): + logger.warning("Could not read configuration file. Proceeding without it.") + config = {"ms2rescore": {}} + else: + logger.info("No configuration file found. Proceeding without it.") + config = {"ms2rescore": {}} + + # Generate overview context + if use_mokapot and config.get("ms2rescore", {}).get("fasta_file"): + logger.debug("Recalculating confidence estimates with mokapot...") + fasta_file = config["ms2rescore"]["fasta_file"] + confidence_before, confidence_after = get_confidence_estimates(psm_list, fasta_file) + overview_context = _get_overview_context(confidence_before, confidence_after) + else: + logger.debug("Generating overview from PSM dataframe...") + overview_context = _get_overview_context_df(psm_df) - overview_context = _get_overview_context(confidence_before, confidence_after) target_decoy_context = _get_target_decoy_context(psm_list) features_context = _get_features_context(psm_list, files, feature_names=feature_names) config_context = _get_config_context(config) log_context = _get_log_context(files) + # Get PSM filename(s) for metadata + if config.get("ms2rescore", {}).get("psm_file"): + psm_filenames = "\n".join( + [Path(id_file).name for id_file in config["ms2rescore"]["psm_file"]] + ) + elif files["PSMs"]: + psm_filenames = files["PSMs"].name + else: + psm_filenames = "Unknown" + context = { "plotlyjs_version": get_plotlyjs_version(), "metadata": { "generated_on": datetime.now().strftime("%d/%m/%Y %H:%M:%S"), "ms2rescore_version": ms2rescore.__version__, # TODO: Write during run? - "psm_filename": "\n".join( - [Path(id_file).name for id_file in config["ms2rescore"]["psm_file"]] - ), + "psm_filename": psm_filenames, }, "main_tabs": [ { @@ -136,7 +171,7 @@ def generate_report( ], } - _render_and_write(output_path_prefix, **context) + _render_and_write(output_path_prefix, output_file=output_file, **context) def _collect_files(output_path_prefix, use_txt_log=False): @@ -197,6 +232,88 @@ def _get_stats_context(confidence_before, confidence_after): return stats +def _get_stats_context_df(psm_df: pd.DataFrame, fdr_threshold: float = 0.01) -> list: + """Return context for overview statistics pane from dataframe.""" + stats = [] + + if "qvalue_before" not in psm_df.columns or "qvalue_after" not in psm_df.columns: + return stats + + targets = psm_df[~psm_df["is_decoy"]] + + # PSM level stats + psms_before = len(targets[targets["qvalue_before"] <= fdr_threshold]) + psms_after = len(targets[targets["qvalue_after"] <= fdr_threshold]) + + if psms_before > 0: + increase = (psms_after - psms_before) / psms_before * 100 + stats.append( + { + "item": "PSMs", + "card_color": "card-bg-blue", + "number": psms_after, + "diff": f"({psms_after - psms_before:+})", + "percentage": f"{increase:.1f}%", + "is_increase": increase > 0, + "bar_percentage": psms_before / psms_after * 100 + if increase > 0 + else psms_after / psms_before * 100, + "bar_color": "#24a143" if increase > 0 else "#a12424", + } + ) + + # Peptide level stats + if "peptidoform" in psm_df.columns: + peptides_before = targets[targets["qvalue_before"] <= fdr_threshold][ + "peptidoform" + ].nunique() + peptides_after = targets[targets["qvalue_after"] <= fdr_threshold]["peptidoform"].nunique() + + if peptides_before > 0: + increase = (peptides_after - peptides_before) / peptides_before * 100 + stats.append( + { + "item": "Peptides", + "card_color": "card-bg-green", + "number": peptides_after, + "diff": f"({peptides_after - peptides_before:+})", + "percentage": f"{increase:.1f}%", + "is_increase": increase > 0, + "bar_percentage": peptides_before / peptides_after * 100 + if increase > 0 + else peptides_after / peptides_before * 100, + "bar_color": "#24a143" if increase > 0 else "#a12424", + } + ) + + return stats + + +def _get_overview_context_df(psm_df: pd.DataFrame) -> dict: + """Return context for overview tab from dataframe.""" + logger.debug("Generating overview charts from PSM dataframe...") + return { + "stats": _get_stats_context_df(psm_df), + "charts": [ + { + "title": TEXTS["charts"]["score_comparison"]["title"], + "description": TEXTS["charts"]["score_comparison"]["description"], + "chart": charts.score_scatter_plot_df(psm_df).to_html(**PLOTLY_HTML_KWARGS), + }, + { + "title": TEXTS["charts"]["fdr_comparison"]["title"], + "description": TEXTS["charts"]["fdr_comparison"]["description"], + "chart": charts.fdr_plot_comparison_df(psm_df).to_html(**PLOTLY_HTML_KWARGS), + }, + { + "title": TEXTS["charts"]["identification_overlap"]["title"], + "description": TEXTS["charts"]["identification_overlap"]["description"], + "chart": charts.identification_overlap_df(psm_df).to_html(**PLOTLY_HTML_KWARGS), + }, + ], + } + + def _get_overview_context(confidence_before, confidence_after) -> dict: """Return context for overview tab.""" logger.debug("Generating overview charts...") @@ -260,38 +377,67 @@ def _get_features_context( context: dict[str, list] = {"charts": []} # Get feature names, mapping with generator, and flat list + if feature_names is None or not feature_names: + # Try to read from file first + feature_names = read_feature_names(files.get("feature names")) + + # If file doesn't exist or is empty, infer from PSM list + if not feature_names: + logger.info("Feature names file not found. Inferring from PSM list...") + feature_names = infer_feature_names_from_psm_list(psm_list) + + # If still no features, return empty context if not feature_names: - feature_names = read_feature_names(files["feature names"]) + logger.warning("No features found in PSM list. Skipping feature charts.") + return context + + # Convert sets to lists if needed (for compatibility with both sources) + feature_names = {k: list(v) if isinstance(v, set) else v for k, v in feature_names.items()} + feature_names_flat = [f_name for f_list in feature_names.values() for f_name in f_list] feature_names_inv = {name: gen for gen, f_list in feature_names.items() for name in f_list} - # Get fixed color map for feature generators - color_map = dict(zip(feature_names.keys(), cycle(px.colors.qualitative.Plotly))) + # Fixed color map for feature generators + FEATURE_GENERATOR_COLORS = { + "ms2pip": "#16BA27", + "deeplc": "#EC4807", + "im2deep": "#1E25EA", + "ionmob": "#AB63FA", # remove ionmob? + "basic": "#000000", + "psm_file": "#F1ED06", + "other": "#FF6692", + } + color_map = {fg: FEATURE_GENERATOR_COLORS.get(fg, "#FFFFFF") for fg in feature_names.keys()} # feature weights - if not files["feature weights"]: - logger.warning("Could not find feature weights files. Skipping feature weights plot.") + if not files.get("feature weights") or not files["feature weights"].is_file(): + logger.info("Feature weights file not found. Skipping feature weights plot.") else: - feature_weights = pd.read_csv(files["feature weights"], sep="\t").melt( - var_name="feature", value_name="weight" - ) - feature_weights["feature"] = feature_weights["feature"].str.replace( - r"^(feature:)?", "", regex=True - ) - feature_weights["feature_generator"] = feature_weights["feature"].map(feature_names_inv) + try: + feature_weights = pd.read_csv(files["feature weights"], sep="\t").melt( + var_name="feature", value_name="weight" + ) + feature_weights["feature"] = feature_weights["feature"].str.replace( + r"^(feature:)?", "", regex=True + ) + feature_weights["feature_generator"] = feature_weights["feature"].map( + feature_names_inv + ) - context["charts"].append( - { - "title": TEXTS["charts"]["feature_usage"]["title"], - "description": TEXTS["charts"]["feature_usage"]["description"], - "chart": charts.feature_weights_by_generator( - feature_weights, color_discrete_map=color_map - ).to_html(**PLOTLY_HTML_KWARGS) - + charts.feature_weights(feature_weights, color_discrete_map=color_map).to_html( - **PLOTLY_HTML_KWARGS - ), - } - ) + context["charts"].append( + { + "title": TEXTS["charts"]["feature_usage"]["title"], + "description": TEXTS["charts"]["feature_usage"]["description"], + "chart": charts.feature_weights_by_generator( + feature_weights, color_discrete_map=color_map + ).to_html(**PLOTLY_HTML_KWARGS) + + charts.feature_weights( + feature_weights, color_discrete_map=color_map + ).to_html(**PLOTLY_HTML_KWARGS), + } + ) + except Exception as e: + logger.warning(f"Could not generate feature weights plot: {e}") # Individual feature performance features = get_feature_values(psm_list, feature_names_flat) @@ -322,14 +468,12 @@ def _get_features_context( # DeepLC specific charts if "deeplc" in feature_names: - import deeplc.plot - - scatter_chart = deeplc.plot.scatter( + scatter_chart = charts.rt_scatter( df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)], predicted_column="predicted_retention_time_best", observed_column="observed_retention_time_best", ) - baseline_chart = deeplc.plot.distribution_baseline( + baseline_chart = charts.rt_distribution_baseline( df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)], predicted_column="predicted_retention_time_best", observed_column="observed_retention_time_best", @@ -345,9 +489,7 @@ def _get_features_context( # IM2Deep specific charts if "im2deep" in feature_names: - import deeplc.plot - - scatter_chart = deeplc.plot.scatter( + scatter_chart = charts.rt_scatter( df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)], predicted_column="ccs_predicted_im2deep", observed_column="ccs_observed_im2deep", @@ -366,31 +508,22 @@ def _get_features_context( # ionmob specific charts if "ionmob" in feature_names: - try: - import deeplc.plot - - scatter_chart = deeplc.plot.scatter( - df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)], - predicted_column="ccs_predicted", - observed_column="ccs_observed", - xaxis_label="Observed CCS", - yaxis_label="Predicted CCS", - plot_title="Predicted vs. observed CCS - ionmob", - ) - - context["charts"].append( - { - "title": TEXTS["charts"]["ionmob_performance"]["title"], - "description": TEXTS["charts"]["ionmob_performance"]["description"], - "chart": scatter_chart.to_html(**PLOTLY_HTML_KWARGS), - } - ) + scatter_chart = charts.rt_scatter( + df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)], + predicted_column="ccs_predicted", + observed_column="ccs_observed", + xaxis_label="Observed CCS", + yaxis_label="Predicted CCS", + plot_title="Predicted vs. observed CCS - ionmob", + ) - # TODO: for now, ionmob plot will only be available if deeplc is installed. Since ionmob does not have a dependency on deeplc, this should be changed in the future. - except ImportError: - logger.warning( - "Could not import deeplc.plot, skipping ionmob CCS prediction performance plot. Please install DeepLC to generate this plot." - ) + context["charts"].append( + { + "title": TEXTS["charts"]["ionmob_performance"]["title"], + "description": TEXTS["charts"]["ionmob_performance"]["description"], + "chart": scatter_chart.to_html(**PLOTLY_HTML_KWARGS), + } + ) return context @@ -416,9 +549,13 @@ def _get_log_context(files: Dict[str, Path]) -> dict: return {"log": "Log file format not recognized."} -def _render_and_write(output_path_prefix: str, **context): +def _render_and_write(output_path_prefix: str, output_file: Optional[Path] = None, **context): """Render template with context and write to HTML file.""" - report_path = Path(output_path_prefix + ".report.html").resolve() + if output_file: + report_path = Path(output_file).resolve() + else: + report_path = Path(output_path_prefix + ".report.html").resolve() + logger.info("Writing report to %s", report_path.as_posix()) # Use importlib.resources for PyInstaller compatibility diff --git a/ms2rescore/report/utils.py b/ms2rescore/report/utils.py index d5fbfb8..3cf8219 100644 --- a/ms2rescore/report/utils.py +++ b/ms2rescore/report/utils.py @@ -4,7 +4,7 @@ from collections import defaultdict from csv import DictReader from pathlib import Path -from typing import Optional, Tuple +from typing import Dict, Optional, Tuple import pandas as pd import psm_utils @@ -15,13 +15,52 @@ logger = logging.getLogger(__name__) -def read_feature_names(feature_names_path: Path) -> dict: +def read_feature_names(feature_names_path: Optional[Path]) -> dict: """Read feature names and mapping with feature generator from file.""" feature_names = defaultdict(list) - with open(feature_names_path) as f: - reader = DictReader(f, delimiter="\t") - for line in reader: - feature_names[line["feature_generator"]].append(line["feature_name"]) + if not feature_names_path or not feature_names_path.is_file(): + return feature_names + + try: + with open(feature_names_path) as f: + reader = DictReader(f, delimiter="\t") + for line in reader: + feature_names[line["feature_generator"]].append(line["feature_name"]) + except (FileNotFoundError, KeyError, ValueError) as e: + logger.warning(f"Could not read feature names file: {e}") + + return feature_names + + +def infer_feature_names_from_psm_list(psm_list: psm_utils.PSMList) -> Dict[str, list]: + """Infer feature names and generators from PSM list when no feature_names file exists.""" + feature_names = defaultdict(list) + + if not psm_list or not psm_list[0].rescoring_features: + return feature_names + + # Get all feature names from the first PSM + all_features = list(psm_list[0].rescoring_features.keys()) + + # Try to infer generator from feature name patterns + for fname in all_features: + fname_lower = fname.lower() + + # Pattern matching for common feature generators + if any(x in fname_lower for x in ["ms2pip", "spec_pearson", "spec_spearman", "ionmatch"]): + feature_names["ms2pip"].append(fname) + elif any(x in fname_lower for x in ["deeplc", "retention_time", "rt_diff"]): + feature_names["deeplc"].append(fname) + elif any(x in fname_lower for x in ["im2deep", "ccs_predicted_im2deep", "ccs_observed_im2deep"]): + feature_names["im2deep"].append(fname) + elif any(x in fname_lower for x in ["ionmob", "ccs_predicted", "ccs_observed"]) and "im2deep" not in fname_lower: + feature_names["ionmob"].append(fname) + elif any(x in fname_lower for x in ["basic", "charge", "missed_cleavages"]): + feature_names["basic"].append(fname) + else: + # Unknown generator - use "other" category + feature_names["other"].append(fname) + return feature_names @@ -75,3 +114,122 @@ def get_confidence_estimates( logger.warning("Could not assign confidence estimates for %s rescoring.", when) return confidence["before"], confidence["after"] + + +def create_psm_dataframe(psm_list: psm_utils.PSMList) -> pd.DataFrame: + """ + Create a comprehensive dataframe from PSM list with all necessary information. + + This dataframe includes: + - Basic PSM information (peptide, score, qvalue, is_decoy, etc.) + - Before rescoring scores from provenance data + - All rescoring features + + Parameters + ---------- + psm_list + PSM list to convert to dataframe. + + Returns + ------- + pd.DataFrame + Dataframe with all PSM information. + """ + # Start with basic PSM dataframe + psm_df = psm_list.to_dataframe() + + # Add before rescoring scores from provenance data + try: + provenance_df = pd.DataFrame.from_records(psm_list["provenance_data"]) + if "before_rescoring_score" in provenance_df.columns: + psm_df["score_before"] = provenance_df["before_rescoring_score"].astype(float) + if "before_rescoring_qvalue" in provenance_df.columns: + psm_df["qvalue_before"] = provenance_df["before_rescoring_qvalue"].astype(float) + except (KeyError, ValueError) as e: + logger.warning("Could not extract before rescoring scores from provenance data: %s", e) + psm_df["score_before"] = None + psm_df["qvalue_before"] = None + + # Add rescoring features + if psm_list[0].rescoring_features: + features_df = get_feature_values(psm_list) + # Merge features with PSM dataframe (they should have same index) + psm_df = pd.concat([psm_df, features_df], axis=1) + + # Rename current score/qvalue to score_after/qvalue_after for clarity + psm_df["score_after"] = psm_df["score"] + psm_df["qvalue_after"] = psm_df["qvalue"] + + return psm_df + + +def calculate_fdr_stats( + psm_df: pd.DataFrame, + score_column: str = "score", + fdr_threshold: float = 0.01, +) -> Dict[str, int]: + """ + Calculate FDR statistics from PSM dataframe. + + Parameters + ---------- + psm_df + Dataframe with PSM information including score, qvalue, and is_decoy columns. + score_column + Column name for scores to use. + fdr_threshold + FDR threshold for counting identifications. + + Returns + ------- + dict + Dictionary with counts of identifications at different levels. + """ + stats = {} + + # PSM level + targets = psm_df[~psm_df["is_decoy"]] + stats["psms"] = len(targets[targets["qvalue"] <= fdr_threshold]) + + # Peptide level (unique peptide sequences) + if "peptidoform" in psm_df.columns: + unique_peptides = targets[targets["qvalue"] <= fdr_threshold]["peptidoform"].nunique() + stats["peptides"] = unique_peptides + + return stats + + +def get_score_threshold_at_fdr( + psm_df: pd.DataFrame, + score_column: str = "score", + qvalue_column: str = "qvalue", + fdr_threshold: float = 0.01, +) -> Optional[float]: + """ + Get score threshold at a given FDR threshold. + + Parameters + ---------- + psm_df + Dataframe with PSM information. + score_column + Column name for scores. + qvalue_column + Column name for q-values. + fdr_threshold + FDR threshold. + + Returns + ------- + float or None + Score threshold at the FDR threshold, or None if no PSMs pass threshold. + """ + try: + threshold = ( + psm_df[psm_df[qvalue_column] <= fdr_threshold] + .sort_values(qvalue_column, ascending=False)[score_column] + .iloc[0] + ) + return threshold + except (IndexError, KeyError): + return None diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py index 192950b..32d856c 100644 --- a/ms2rescore/rescoring_engines/percolator.py +++ b/ms2rescore/rescoring_engines/percolator.py @@ -19,8 +19,8 @@ import logging import subprocess -from typing import Any, Dict, Optional from copy import deepcopy +from typing import Any, Dict, Optional import psm_utils diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 1fc686b..0780e81 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -3,8 +3,10 @@ from glob import glob from pathlib import Path from typing import Optional, Union +import numpy as np from ms2rescore_rs import is_supported_file_type +from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -92,3 +94,57 @@ def _is_minitdf(spectrum_file: str) -> bool: files = set(Path(spectrum_file).glob("*ms2spectrum.bin")) files.update(Path(spectrum_file).glob("*ms2spectrum.parquet")) return len(files) >= 2 + + +def filter_mumble_psms(psm_list: PSMList, threshold=1) -> PSMList: + """ + Filter out mumble PSMs with `matched_ions_pct` lower than the original hit. + + Parameters + ---------- + psm_list : PSMList + List of PSMs to filter + threshold : float, optional + Threshold to lower the maximum matched_ions_pct of the original hit + """ + # Extract relevant fields from the PSM list + original_hit = np.array([metadata.get("original_psm") for metadata in psm_list["metadata"]]) + spectrum_indices = np.array([psm.spectrum_id for psm in psm_list]) + runs = np.array([psm.run for psm in psm_list]) + + # Check if matched_ions_pct exists + if "matched_ions_pct" not in psm_list[0].rescoring_features: + return psm_list + + matched_ions = np.array([psm.rescoring_features["matched_ions_pct"] for psm in psm_list]) + + # Create unique keys for each (run, spectrum_id) + unique_keys = np.core.defchararray.add(runs.astype(str), spectrum_indices.astype(str)) + unique_keys_indices, inverse_indices = np.unique(unique_keys, return_inverse=True) + + # Initialize an array to store the `matched_ions_pct` of original hits per group + original_matched_ions_pct = np.full( + len(unique_keys_indices), -np.inf + ) # Default to -inf for groups without original hits + + # Assign the `matched_ions_pct` of original hits to their groups + np.maximum.at( + original_matched_ions_pct, inverse_indices[original_hit], matched_ions[original_hit] + ) + + # lower the maximum with the threshold + original_matched_ions_pct = original_matched_ions_pct * threshold + + # Broadcast the original `matched_ions_pct` back to all PSMs in each group + original_matched_ions_for_all = original_matched_ions_pct[inverse_indices] + + # Determine the filtering condition + keep = np.logical_or( + original_hit, # Always keep original hits + matched_ions + >= original_matched_ions_for_all, # Keep hits with `matched_ions_pct` >= the original + ) + + # Filter PSMs + logger.debug(f"Filtered out {len(psm_list) - np.sum(keep)} mumble PSMs.") + return psm_list[keep] diff --git a/pyproject.toml b/pyproject.toml index 380982b..059fb30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,6 @@ dependencies = [ "click>=7", "customtkinter>=5,<6", "deeplc>=3.1", - "deeplcretrainer", "im2deep>=1.1", "jinja2>=3", "lxml>=4.5", @@ -56,6 +55,7 @@ dependencies = [ [project.optional-dependencies] ionmob = ["ionmob>=0.2", "tensorflow"] +mumble = ["rustyms>=0.8.3", "mumble>=0.2.0"] idxml = ["pyopenms>=3.3"] [dependency-groups]