diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 09a9cc13..9af97a39 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -28,6 +28,8 @@ jobs: cargo build -p rustyms --no-default-features --features rand cargo build -p rustyms --no-default-features --features rayon cargo build -p rustyms --no-default-features --features mzdata + cargo build -p rustyms --no-default-features --features glycan-render + cargo build -p rustyms --no-default-features --features glycan-render-bitmap fmt: runs-on: ubuntu-latest diff --git a/.github/workflows/scripts/update-all-databases.sh b/.github/workflows/scripts/update-all-databases.sh index 06e6dbd3..51f37bf8 100644 --- a/.github/workflows/scripts/update-all-databases.sh +++ b/.github/workflows/scripts/update-all-databases.sh @@ -41,11 +41,9 @@ function make-ontologies { curl https://raw.githubusercontent.com/HUPO-PSI/mzIdentML/master/cv/XLMOD.obo \ > ${db_data}/XLMOD.obo curl -L http://purl.obolibrary.org/obo/gno.obo \ - | sed '/(property_value: GNO:00000(022|023|041|042|101|102) .*$\n)|(def: .*$\n)/d' \ - | gzip -c \ - > ${db_data}/GNOme.obo.gz + > ${db_data}/GNOme.obo curl -L https://glycosmos.org/download/glycosmos_glycans_list.csv \ - | gzip -c > ${db_data}/glycosmos_glycans_list.csv.gz + > ${db_data}/glycosmos_glycans_list.csv echo "Serializing the other databases..." 
diff --git a/.gitignore b/.gitignore index aeaa2116..259ef0c0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ Cargo.lock errors.dat *.dat.Z *.dat +*.html GNOme.obo glycosmos_glycans_list.csv .venv/ diff --git a/Cargo.toml b/Cargo.toml index 2b0e5225..65e82702 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,15 +27,17 @@ codegen-units = 1 [workspace.dependencies] afl = "0.15" +base64 = "0.22" bincode = "1.3" clap = { version = "4.5", features = ["derive", "cargo"] } directories = "6.0" flate2 = "1.0" iai-callgrind = "0.14" itertools = "0.14" -mzdata = "0.44" +mzdata = {version="0.49", default-features = false, features = ["miniz_oxide"]} ndarray = "0.16" ordered-float = { version = "4.6", features = ["serde"] } +png = "0.17" probability = "0.20" pyo3 = "0.23" rand = "0.9" @@ -45,8 +47,10 @@ roxmltree = "0.20" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" similar = "2.7" +swash = "0.2" thin-vec = { version = "0.2", features = ["serde"] } -uom = { version = "0.36", features = ["use_serde", "usize", "isize"] } +uom = { version = "0.36", default-features = false, features = ["use_serde", "usize", "isize", "f64"] } +zeno = {version = "0.3.2" } [workspace.lints.rust] unexpected_cfgs = { level = "allow", check-cfg = [ diff --git a/examples/de-novo-align/Cargo.toml b/examples/de-novo-align/Cargo.toml index 5d6392dd..0c96605b 100644 --- a/examples/de-novo-align/Cargo.toml +++ b/examples/de-novo-align/Cargo.toml @@ -7,7 +7,7 @@ license.workspace = true publish = false [dependencies] -rustyms = { path = "../../rustyms" } +rustyms = { path = "../../rustyms", default-features=false, features = ["align", "identification"] } clap = { workspace = true } itertools = { workspace = true } rayon = { workspace = true } diff --git a/examples/ion-explorer/Cargo.toml b/examples/ion-explorer/Cargo.toml new file mode 100644 index 00000000..ee4a8b1e --- /dev/null +++ b/examples/ion-explorer/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "ion-explorer" 
+version = "0.1.0" +authors.workspace = true +edition.workspace = true +license.workspace = true +publish = false + +[dependencies] +clap = { workspace = true } +directories = { workspace = true } +itertools = { workspace = true } +mzdata = { workspace = true, features = ["mgf", "mzml", "thermo"] } +rustyms = { path = "../../rustyms", default-features = false, features = [ + "mzdata", + "identification", +] } +serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/examples/ion-explorer/README.md b/examples/ion-explorer/README.md new file mode 100644 index 00000000..a5826f94 --- /dev/null +++ b/examples/ion-explorer/README.md @@ -0,0 +1,7 @@ +# Ion explorer + +``` +cargo run -p ion-explorer --release -- --in-path /home/douwe/Downloads/raw_data/COVID19_IgG_14_DENOVO_5.csv --raw-file-directory /home/douwe/Downloads/raw_data/ --out-path . +``` + +This takes an identified peptides file and extracts and bins all peaks surrounding the main fragment series. This helps in discovering which fragments and neutral losses are actually generated by the used fragmentation method. It returns a separate csv file for all fragments with the mz, count, and average intensity for all detected peaks. 
\ No newline at end of file diff --git a/examples/ion-explorer/figure.r b/examples/ion-explorer/figure.r new file mode 100644 index 00000000..e861c9ba --- /dev/null +++ b/examples/ion-explorer/figure.r @@ -0,0 +1,33 @@ +library(ggplot2) + +for (file in c("start", "fragment_b", "fragment_y", "fragment_v_0", "fragment_w_0", "fragment_w_1", "fragment_w_2", "fragment_w_3", "fragment_w_4", "fragment_w_5", "fragment_w_6", "fragment_precursor")) { + data = read.csv(paste(file, ".csv", sep=""), header=TRUE); + # data = data[data$count > 1000,]; + + plot = ggplot(data, aes(x=mz, y=count, size=avg_intensity, colour=avg_intensity)) + + geom_point() + + scale_size_continuous(range = c(0.25, 3)) + + xlab("Difference to theoretical ion") + + theme_bw() + + if (file == "fragment_y") { + plot = plot + geom_vline(xintercept=0, linetype="dashed") + geom_vline(xintercept=+18.011-0.984, linetype="dashed") + geom_vline(xintercept=-25.979+18.011, linetype="dashed") + } + + ggsave(paste(file, ".png", sep=""), plot) +} + +for (file in c("comparison_b_N_N[U:Deamidated]", "comparison_y_N_N[U:Deamidated]")) { + data = read.csv(paste(file, ".csv", sep=""), header=TRUE); + data = data[data$count_a > 20 | data$count_b > 20,]; + max_a = max(data$avg_intensity_a); + max_b = max(data$avg_intensity_b); + + ggplot(data, aes(x=mz, y=log2(count_a/count_b), colour=rgb(avg_intensity_a / max_a, 0, avg_intensity_b / max_b), size=avg_intensity_a)) + + geom_point() + + scale_size_continuous(range = c(0.25, 3)) + + theme_bw() + + guides(colour="none") + + ggsave(paste(file, ".png", sep="")) +} diff --git a/examples/ion-explorer/src/main.rs b/examples/ion-explorer/src/main.rs new file mode 100644 index 00000000..d68f2a5a --- /dev/null +++ b/examples/ion-explorer/src/main.rs @@ -0,0 +1,386 @@ +#![allow(non_snake_case)] // charge_independent_Y needs the capital as it means the glycan fragmentation +use std::{ + collections::BTreeMap, + fs::File, + io::{BufReader, BufWriter}, + path::Path, +}; + +use 
clap::Parser; +use directories::ProjectDirs; +use itertools::Itertools; +use mzdata::{ + io::{MZFileReader, SpectrumSource}, + mzpeaks::{CentroidPeak, PeakCollection}, + prelude::SpectrumLike, + spectrum::{MultiLayerSpectrum, RefPeakDataLevel}, +}; +use rustyms::{ + fragment::{FragmentKind, FragmentType}, + identification::{SpectrumId, SpectrumIds}, + model::{ChargeRange, PrimaryIonSeries, SatelliteIonSeries, SatelliteLocation}, + modification::{Ontology, SimpleModification}, + *, +}; + +#[derive(Parser)] +struct Cli { + /// The input identified peptides file + #[arg(short, long)] + in_path: String, + /// The output path to output the resulting csv file + #[arg(short, long)] + out_path: String, + /// The raw file to use for any file without a raw file + #[arg(long)] + raw_file: Option, + /// The directory where to find raw files that are named in the peptides file + #[arg(long)] + raw_file_directory: String, + /// To turn off loading the custom modifications database from the Annotator (if installed) + #[arg(long)] + no_custom_mods: bool, + /// The bin width of the mz bins + #[arg(long, default_value = "0.25")] + resolution: f64, + /// The high end of the range for looking at diagnostic/immonium ions, selects all peaks in 0..=`max_start` + #[arg(long, default_value = "200.0")] + max_start: f64, + /// The number of Thompson to select before a fragment ion + #[arg(long, default_value = "100.0")] + before_fragment: f64, + /// The number of Thompson to select after a fragment ion + #[arg(long, default_value = "100.0")] + after_fragment: f64, +} + +fn main() { + let args = Cli::parse(); + + let model = FragmentationModel::none() + .clone() + .b(PrimaryIonSeries::default()) + .d(SatelliteIonSeries::default().location(SatelliteLocation { + rules: Vec::new(), + base: Some(6), + })) + .v(SatelliteIonSeries::default().location(SatelliteLocation { + rules: Vec::new(), + base: Some(6), + })) + .w(SatelliteIonSeries::default().location(SatelliteLocation { + rules: Vec::new(), + 
base: Some(6), + })) + .y(PrimaryIonSeries::default()) + .precursor(Vec::new(), Vec::new(), (0, None), ChargeRange::PRECURSOR); + + let path = ProjectDirs::from("com", "com.snijderlab.annotator", "") + .unwrap() + .config_dir() + .join("../custom_modifications.json"); + let custom_database = if args.no_custom_mods || !path.exists() { + None + } else { + Some(serde_json::from_reader(BufReader::new(File::open(path).unwrap())).unwrap()) + }; + let peptides = rustyms::identification::open_identified_peptides_file( + &args.in_path, + custom_database.as_ref(), + ) + .expect("Could not open identified peptides file") + .filter_map(|a| a.ok()) + .into_group_map_by(|l| match l.scans() { + SpectrumIds::FileKnown(spectra) => spectra.first().map(|s| s.0.clone()), + _ => None, + }); + + let mut stack = Stack::default(); + + for (file, peptides) in peptides { + let mut file = mzdata::io::MZReaderType::open_path(file.map(|f|Path::new(&args.raw_file_directory).join(f)).or(args.raw_file.as_ref().map(|p| p.into())).expect("The raw file parameter has to be defined if there are peptides without a defined raw file")).unwrap(); + + for peptide in peptides { + if peptide.charge().is_none() { + continue; + } + if let Some(cpi) = peptide.peptide() { + let id = match peptide.scans() { + SpectrumIds::FileKnown(spectra) => { + spectra.first().and_then(|s| s.1.first().cloned()) + } + SpectrumIds::FileNotKnown(ids) => ids.first().cloned(), + _ => None, + }; + if let Some(spectrum) = match id { + Some(SpectrumId::Index(i)) => file.get_spectrum_by_index(i), + Some(SpectrumId::Native(n)) => file.get_spectrum_by_id(&n), + _ => continue, + } { + let cpi = cpi.compound_peptidoform(); + let fragments = + cpi.generate_theoretical_fragments(peptide.charge().unwrap(), &model); + extract_and_merge( + &mut stack, + &spectrum, + &fragments, + &[( + (AminoAcid::Asparagine, Vec::new()), + ( + AminoAcid::Asparagine, + vec![Ontology::Unimod.find_id(7, None).unwrap()], + ), + )], + &cpi, + &args, + ); + } + } + 
} + } + stack.store(Path::new(&args.out_path)); +} + +type ComparisonKey = (AminoAcid, Vec); + +fn extract_and_merge( + stack: &mut Stack, + spectrum: &MultiLayerSpectrum, + fragments: &[Fragment], + comparisons: &[(ComparisonKey, ComparisonKey)], + peptidoform: &CompoundPeptidoformIon, + args: &Cli, +) { + let spectrum = match spectrum.peaks() { + RefPeakDataLevel::Centroid(c) => c, + _ => return, + }; + for fragment in fragments { + if let Some(mz) = fragment.mz(MassMode::Monoisotopic) { + let key = match fragment.ion { + FragmentType::d(_, _, d, _, _) + | FragmentType::v(_, _, d, _) + | FragmentType::w(_, _, d, _, _) => (fragment.ion.kind(), Some(d)), + _ => (fragment.ion.kind(), None), + }; + let low = mz.value - args.before_fragment; + let high = mz.value + args.after_fragment; + let sub_spectrum = spectrum.between(low, high, mzdata::prelude::Tolerance::Da(0.0)); + merge_stack( + stack.fragments.entry(key).or_default(), + sub_spectrum, + mz.value, + args.resolution, + ); + // Comparison + let (kind, pos) = match fragment.ion { + FragmentType::a(pos, _) + | FragmentType::b(pos, _) + | FragmentType::c(pos, _) + | FragmentType::x(pos, _) + | FragmentType::y(pos, _) + | FragmentType::z(pos, _) => (fragment.ion.kind(), pos), + _ => continue, + }; + let seq = &peptidoform.peptidoform_ions() + [fragment.peptidoform_ion_index.unwrap_or_default()] + .peptidoforms()[fragment.peptidoform_index.unwrap_or_default()][pos.sequence_index]; + let key: ComparisonKey = ( + seq.aminoacid.aminoacid(), + seq.modifications + .iter() + .filter_map(|m| m.simple()) + .cloned() + .collect(), + ); + for (a, b) in comparisons { + if key == *a { + merge_comparison_stack( + stack + .comparison + .entry((kind, a.clone(), b.clone())) + .or_default(), + sub_spectrum, + mz.value, + args.resolution, + true, + ); + } else if key == *b { + merge_comparison_stack( + stack + .comparison + .entry((kind, a.clone(), b.clone())) + .or_default(), + sub_spectrum, + mz.value, + args.resolution, + false, + 
); + } + } + } + } + // Get start + let sub_spectrum = spectrum.between(0.0, args.max_start, mzdata::prelude::Tolerance::Da(0.0)); + merge_stack(&mut stack.start, sub_spectrum, 0.0, args.resolution); +} + +fn merge_stack(points: &mut Vec, slice: &[CentroidPeak], center: f64, resolution: f64) { + for found_peak in slice { + let normalised_mz = ((found_peak.mz - center) / resolution).round() * resolution; + match points.binary_search_by(|p| p.mz.total_cmp(&normalised_mz)) { + Ok(index) => { + points[index].count += 1; + points[index].total_intensity += found_peak.intensity as f64; + } + Err(index) => { + points.insert( + index, + Point { + mz: normalised_mz, + count: 1, + total_intensity: found_peak.intensity as f64, + }, + ); + } + } + } +} + +fn merge_comparison_stack( + points: &mut Vec, + slice: &[CentroidPeak], + center: f64, + resolution: f64, + is_a: bool, +) { + for found_peak in slice { + let normalised_mz = ((found_peak.mz - center) / resolution).round() * resolution; + match points.binary_search_by(|p| p.mz.total_cmp(&normalised_mz)) { + Ok(index) => { + if is_a { + points[index].count_a += 1; + points[index].total_intensity_a += found_peak.intensity as f64; + } else { + points[index].count_b += 1; + points[index].total_intensity_b += found_peak.intensity as f64; + } + } + Err(index) => { + points.insert( + index, + if is_a { + ComparisonPoint { + mz: normalised_mz, + count_a: 1, + count_b: 0, + total_intensity_a: found_peak.intensity as f64, + total_intensity_b: 0.0, + } + } else { + ComparisonPoint { + mz: normalised_mz, + count_a: 0, + count_b: 1, + total_intensity_a: 0.0, + total_intensity_b: found_peak.intensity as f64, + } + }, + ); + } + } + } +} + +#[derive(Default, Debug)] +struct Stack { + start: Vec, + fragments: BTreeMap<(FragmentKind, Option), Vec>, + comparison: BTreeMap<(FragmentKind, ComparisonKey, ComparisonKey), Vec>, +} + +impl Stack { + fn store(&self, base_path: &Path) { + write_stack(&base_path.join("start.csv"), &self.start); + for 
(key, stack) in self.fragments.iter() { + let path = match key { + (i, None) => base_path.join(format!("fragment_{i}.csv")), + (i, Some(d)) => base_path.join(format!("fragment_{i}_{d}.csv")), + }; + write_stack(&path, stack); + } + for ((fragment, a, b), stack) in self.comparison.iter() { + let a = format!("{}{}", a.0, a.1.iter().map(|m| format!("[{m}]")).join("")); + let b = format!("{}{}", b.0, b.1.iter().map(|m| format!("[{m}]")).join("")); + let path = base_path.join(format!("comparison_{fragment}_{a}_{b}.csv")); + write_comparison_stack(&path, stack); + } + } +} + +fn write_stack(path: &Path, points: &[Point]) { + let out_file = BufWriter::new(File::create(path).unwrap()); + rustyms::csv::write_csv( + out_file, + points.iter().map(|p| { + [ + ("mz".to_string(), p.mz.to_string()), + ("count".to_string(), p.count.to_string()), + ( + "avg_intensity".to_string(), + (p.total_intensity / p.count as f64).to_string(), + ), + ] + }), + ) + .unwrap(); +} + +fn write_comparison_stack(path: &Path, points: &[ComparisonPoint]) { + let out_file = BufWriter::new(File::create(path).unwrap()); + rustyms::csv::write_csv( + out_file, + points.iter().map(|p| { + [ + ("mz".to_string(), p.mz.to_string()), + ("count_a".to_string(), p.count_a.to_string()), + ("count_b".to_string(), p.count_b.to_string()), + ( + "avg_intensity_a".to_string(), + if p.count_a == 0 { + 0.0 + } else { + p.total_intensity_a / p.count_a as f64 + } + .to_string(), + ), + ( + "avg_intensity_b".to_string(), + if p.count_b == 0 { + 0.0 + } else { + p.total_intensity_b / p.count_b as f64 + } + .to_string(), + ), + ] + }), + ) + .unwrap(); +} + +#[derive(Debug)] +struct Point { + mz: f64, + count: usize, + total_intensity: f64, +} + +#[derive(Debug)] +struct ComparisonPoint { + mz: f64, + count_a: usize, + count_b: usize, + total_intensity_a: f64, + total_intensity_b: f64, +} diff --git a/examples/multi-annotator/src/main.rs b/examples/multi-annotator/src/main.rs index f3051d99..68d8ab07 100644 --- 
a/examples/multi-annotator/src/main.rs +++ b/examples/multi-annotator/src/main.rs @@ -13,24 +13,22 @@ use itertools::Itertools; use mzdata::io::{MZFileReader, SpectrumSource}; use rayon::prelude::*; use rustyms::{ + model::MatchingParameters, spectrum::{Score, Scores}, - system::{e, usize::Charge, Mass}, + system::{e, usize::Charge}, *, }; use spectrum::{AnnotatedPeak, PeakSpectrum}; #[derive(Parser)] struct Cli { - /// The input csv file, should have the following columns: 'path', 'scan_index', 'z', 'sequence', and can have 'fragmentation' (etd/td_etd/ethcd/etcad/hot eacid/eacid/ead/hcd/cid/all/none, defaults to the global model) + /// The input csv file, should have the following columns: 'path', 'scan_index', 'z', 'sequence', and can have 'fragmentation' (etd/td_etd/ethcd/etcad/eacid/ead/hcd/cid/all/none, defaults to the global model) #[arg(short, long)] in_path: String, /// The output path to output the resulting csv file #[arg(short, long)] out_path: String, - /// The tolerance for matching fragments, use `ppm` or `da` to control the unit, e.g. 
`10.0ppm` or `2.3da` - #[arg(short, long, default_value_t = Tolerance::new_ppm(20.0), value_parser=mass_tolerance_parse)] - pub tolerance: Tolerance, - /// Global model, will be overruled by line specific models (etd/td_etd/ethcd/etcad/hot eacid/eacid/ead/hcd/cid/all/none) + /// Global model, will be overruled by line specific models (etd/td_etd/ethcd/etcad/eacid/ead/hcd/cid/all/none) #[arg(long, default_value_t = String::from("all"))] model: String, /// Turns on reporting of glycan Y-ions in a charge independent manner @@ -47,27 +45,24 @@ struct Cli { no_custom_mods: bool, } -fn mass_tolerance_parse(input: &str) -> Result, &'static str> { - input.parse().map_err(|()| "Invalid tolerance parameter") -} - -fn select_model(text: &str, default: &Model) -> Model { +fn select_model(text: &str, default: &'static FragmentationModel) -> &'static FragmentationModel { match text.to_ascii_lowercase().as_str() { - "etd" => Model::etd(), - "td_etd" => Model::td_etd(), - "ethcd" | "etcad" => Model::ethcd(), - "hot eacid" | "eacid" => Model::hot_eacid(), - "ead" => Model::ead(), - "hcd" | "cid" => Model::cid_hcd(), - "all" => Model::all(), - "none" => Model::none(), - _ => default.clone(), + "etd" => FragmentationModel::etd(), + "td_etd" => FragmentationModel::td_etd(), + "ethcd" | "etcad" => FragmentationModel::ethcd(), + "eacid" => FragmentationModel::eacid(), + "ead" => FragmentationModel::ead(), + "hcd" | "cid" => FragmentationModel::cid_hcd(), + "all" => FragmentationModel::all(), + "none" => FragmentationModel::none(), + _ => default, } } fn main() { let args = Cli::parse(); - let model = select_model(&args.model, &Model::all()); + let model = select_model(&args.model, FragmentationModel::all()); + let parameters = MatchingParameters::default(); let path = ProjectDirs::from("com", "com.snijderlab.annotator", "") .unwrap() .config_dir() @@ -112,19 +107,19 @@ fn main() { .unwrap(); let selected_model = line .index_column("fragmentation") - .map_or_else(|_| model.clone(), 
|(text, _)| select_model(text, &model)); + .map_or_else(|_| model, |(text, _)| select_model(text, model)); if let Some(spectrum) = file.get_spectrum_by_index(scan_index) { let fragments = - peptide.generate_theoretical_fragments(Charge::new::(z), &model); + peptide.generate_theoretical_fragments(Charge::new::(z), selected_model); let annotated = spectrum.annotate( peptide, &fragments, - &selected_model, + &parameters, MassMode::Monoisotopic, ); let scores: &Scores = &annotated - .scores(&fragments, &selected_model, MassMode::Monoisotopic) + .scores(&fragments, &parameters, MassMode::Monoisotopic) .1[0][0]; let mut row: BTreeMap<_, _> = line.into(); @@ -202,7 +197,7 @@ fn main() { .map(|(i, _)| { if annotated.spectrum().any(|p: &AnnotatedPeak| { p.annotation.iter().any(|a: &Fragment| { - matches!(a.ion, FragmentType::w(s) | FragmentType::d(s) if s.sequence_index + matches!(a.ion, FragmentType::w(s, _, 0, _, _) | FragmentType::d(s, _, 0, _, _) if s.sequence_index == SequencePosition::Index(i)) }) }) { diff --git a/rustyms-generate-databases/src/gnome.rs b/rustyms-generate-databases/src/gnome.rs index 6c114088..09959e21 100644 --- a/rustyms-generate-databases/src/gnome.rs +++ b/rustyms-generate-databases/src/gnome.rs @@ -117,7 +117,7 @@ impl std::str::FromStr for GnoSubsumption { } fn parse_gnome() -> HashMap { - let obo = OboOntology::from_file("rustyms-generate-databases/data/GNOme.obo.gz") + let obo = OboOntology::from_file("rustyms-generate-databases/data/GNOme.obo") .expect("Not a valid obo file"); let mut mods = HashMap::new(); @@ -221,7 +221,7 @@ fn parse_gnome_structures() -> HashMap { let mut glycans = HashMap::new(); let mut errors = 0; for line in parse_csv( - "rustyms-generate-databases/data/glycosmos_glycans_list.csv.gz", + "rustyms-generate-databases/data/glycosmos_glycans_list.csv", b',', None, ) diff --git a/rustyms-generate-databases/src/main.rs b/rustyms-generate-databases/src/main.rs index 98574e91..8eb1c271 100644 --- 
a/rustyms-generate-databases/src/main.rs +++ b/rustyms-generate-databases/src/main.rs @@ -54,6 +54,16 @@ include!("../../rustyms/src/shared/neutral_loss.rs"); include!("../../rustyms/src/shared/modification.rs"); include!("../../rustyms/src/shared/aminoacid.rs"); +mod fragment { + use super::*; + use serde::{Deserialize, Serialize}; + /// The index in the branches as stored in the structure + pub type GlycanBranchIndex = usize; + /// The index in the branches when the branches are sorted on mass, this is used to properly render the names of the branches for human consumption + pub type GlycanBranchMassIndex = usize; + include!("../../rustyms/src/shared/glycan_position.rs"); +} + impl crate::Element { pub fn is_valid(self, _isotope: Option) -> bool { true diff --git a/rustyms-generate-imgt/src/structs.rs b/rustyms-generate-imgt/src/structs.rs index 3f1bb907..f6901152 100644 --- a/rustyms-generate-imgt/src/structs.rs +++ b/rustyms-generate-imgt/src/structs.rs @@ -4,7 +4,7 @@ use std::str::FromStr; use crate::imgt_gene::IMGTGene; use crate::shared::{AnnotatedSequence, Gene, Species}; -use rustyms::AminoAcid; +use rustyms::{AminoAcid, IsAminoAcid}; #[derive(Debug)] pub struct DataItem { @@ -57,7 +57,12 @@ impl Display for Region { // self.found_seq.0, self.found_seq .as_ref() - .map(|seq| seq.1 .0.iter().map(|a| a.char()).collect::()) + .map(|seq| seq + .1 + .0 + .iter() + .map(|a| a.pro_forma_definition()) + .collect::()) .unwrap_or_else(|e| format!(": {e}")), ) } @@ -237,7 +242,10 @@ impl std::fmt::Debug for AASequence { write!( f, "[{}]", - self.0.iter().map(|a| a.char()).collect::() + self.0 + .iter() + .map(|a| a.pro_forma_definition()) + .collect::() ) } } diff --git a/rustyms-py/src/lib.rs b/rustyms-py/src/lib.rs index 13c23e5f..ae1910fa 100644 --- a/rustyms-py/src/lib.rs +++ b/rustyms-py/src/lib.rs @@ -6,7 +6,7 @@ use std::num::NonZeroU16; use ordered_float::OrderedFloat; use pyo3::{exceptions::PyValueError, prelude::*, types::PyType}; -use 
rustyms::{AnnotatableSpectrum, Chemical, Linked, MultiChemical}; +use rustyms::{AnnotatableSpectrum, Chemical, IsAminoAcid, Linked, MultiChemical, Tolerance}; /// Mass mode enum. #[pyclass(eq, eq_int)] @@ -440,7 +440,7 @@ impl AminoAcid { } fn __str__(&self) -> String { - self.0.char().to_string() + self.0.pro_forma_definition().to_string() } fn __repr__(&self) -> String { @@ -763,15 +763,56 @@ enum FragmentationModel { CidHcd, Etd, Ethcd, + Ead, + Eacid, + Uvpd, } /// Helper function to match a [`FragmentationModel`] to a rustyms Model. -fn match_model(model: &FragmentationModel) -> PyResult { +fn match_model(model: &FragmentationModel) -> PyResult { match model { - FragmentationModel::All => Ok(rustyms::Model::all()), - FragmentationModel::CidHcd => Ok(rustyms::Model::cid_hcd()), - FragmentationModel::Etd => Ok(rustyms::Model::etd()), - FragmentationModel::Ethcd => Ok(rustyms::Model::ethcd()), + FragmentationModel::All => Ok(rustyms::FragmentationModel::all().clone()), + FragmentationModel::CidHcd => Ok(rustyms::FragmentationModel::cid_hcd().clone()), + FragmentationModel::Etd => Ok(rustyms::FragmentationModel::etd().clone()), + FragmentationModel::Ethcd => Ok(rustyms::FragmentationModel::ethcd().clone()), + FragmentationModel::Ead => Ok(rustyms::FragmentationModel::ead().clone()), + FragmentationModel::Eacid => Ok(rustyms::FragmentationModel::eacid().clone()), + FragmentationModel::Uvpd => Ok(rustyms::FragmentationModel::uvpd().clone()), + } +} + +/// Parameters for matching theoretical fragments to measured data +/// +/// Parameters +/// ---------- +/// parameters : MatchingParameters +/// The parameters +/// +#[pyclass] +#[derive(Clone)] +pub struct MatchingParameters(rustyms::model::MatchingParameters); + +#[pymethods] +impl MatchingParameters { + /// Create default parameters + #[staticmethod] + fn new() -> Self { + MatchingParameters(rustyms::model::MatchingParameters::default()) + } + + /// Set the tolerance to a certain ppm value + #[setter] + fn 
tolerance_ppm(&mut self, tolerance: f64) { + self.0.tolerance = rustyms::Tolerance::new_ppm(tolerance); + } + + /// Set the tolerance to a certain absolute Thompson value + #[setter] + fn tolerance_thompson(&mut self, tolerance: f64) { + self.0.tolerance = + rustyms::Tolerance::new_absolute(rustyms::system::MassOverCharge::new::< + rustyms::system::mz, + >(tolerance)); } } @@ -1124,7 +1165,7 @@ impl Peptidoform { self.0 .sequence() .iter() - .map(|x| x.aminoacid.char()) + .map(|x| x.aminoacid.pro_forma_definition()) .collect() } @@ -1487,6 +1528,8 @@ impl RawSpectrum { /// The peptidoform to annotate the spectrum with. /// model : FragmentationModel /// The model to use for the fragmentation. + /// parameters : MatchingParameters + /// The parameters to use for the matching. /// mode : MassMode /// The mode to use for the mass. /// @@ -1500,11 +1543,12 @@ impl RawSpectrum { /// ValueError /// If the model is not one of the valid models. /// - #[pyo3(signature = (peptidoform, model, mode=&MassMode::Monoisotopic))] + #[pyo3(signature = (peptidoform, model, parameters, mode=&MassMode::Monoisotopic))] fn annotate( &self, peptidoform: CompoundPeptidoformIon, model: &FragmentationModel, + parameters: &MatchingParameters, mode: &MassMode, ) -> PyResult { let rusty_model = match_model(model)?; @@ -1517,7 +1561,7 @@ impl RawSpectrum { Ok(AnnotatedSpectrum(self.0.annotate( peptidoform.0, &fragments, - &rusty_model, + &parameters.0, match mode { MassMode::Monoisotopic => rustyms::MassMode::Monoisotopic, MassMode::Average => rustyms::MassMode::Average, diff --git a/rustyms/Cargo.toml b/rustyms/Cargo.toml index c15bd34c..d9ad91a1 100644 --- a/rustyms/Cargo.toml +++ b/rustyms/Cargo.toml @@ -13,7 +13,6 @@ repository = "https://github.com/snijderlab/rustyms" readme = "README.md" include = [ "src/**/*", - "databases/**/*.gz", "README.md", "build.rs", "benches/**/*", @@ -32,12 +31,17 @@ rayon = { workspace = true, optional = true } regex = { workspace = true } serde = { workspace = true 
} +similar = { workspace = true } +swash = {workspace = true, optional = true} thin-vec = { workspace = true } uom = { workspace = true } +zeno = { workspace = true, optional = true } [dev-dependencies] +base64 = { workspace = true } iai-callgrind = { workspace = true } +png = { workspace = true } serde_json = { workspace = true } +directories = {workspace = true} [features] default = [ @@ -48,11 +52,15 @@ default = [ "isotopes", "rand", "mzdata", + "glycan-render", + "glycan-render-bitmap", ] imgt = [] align = [] identification = [] isotopes = ["probability", "ndarray"] +glycan-render = [] +glycan-render-bitmap = ["zeno", "swash", "glycan-render"] [[bench]] name = "iai" diff --git a/rustyms/README.md b/rustyms/README.md index 05b0fe3d..3d1c422a 100644 --- a/rustyms/README.md +++ b/rustyms/README.md @@ -29,18 +29,19 @@ this crate enables the reading of [mgf](rawfile::mgf), doing [spectrum annotatio ```rust # fn main() -> Result<(), rustyms::error::CustomError> { # let raw_file_path = "data/annotated_example.mgf"; -use rustyms::{*, system::{usize::Charge, e}}; +use rustyms::{*, model::*, system::{usize::Charge, e}}; // Open example raw data (this is the built in mgf reader, look into mzdata for more advanced raw file readers) let spectrum = rawfile::mgf::open(raw_file_path)?; // Parse the given ProForma definition let peptide = CompoundPeptidoformIon::pro_forma("[Gln->pyro-Glu]-QVQEVSERTHGGNFD", None)?; // Generate theoretical fragments for this peptide given EThcD fragmentation -let model = Model::ethcd(); +let model = FragmentationModel::ethcd(); let fragments = peptide.generate_theoretical_fragments(Charge::new::(2), &model); +let parameters = MatchingParameters::default(); // Annotate the raw data with the theoretical fragments -let annotated = spectrum[0].annotate(peptide, &fragments, &model, MassMode::Monoisotopic); +let annotated = spectrum[0].annotate(peptide, &fragments, &parameters, MassMode::Monoisotopic); // Calculate a peak false discovery rate for this 
annotation -let (fdr, _) = annotated.fdr(&fragments, &model, MassMode::Monoisotopic); +let (fdr, _) = annotated.fdr(&fragments, &parameters, MassMode::Monoisotopic); // This is the incorrect sequence for this spectrum so the peak FDR will indicate this # dbg!(&fdr, fdr.peaks_sigma(), fdr.peaks_fdr(), fdr.peaks_score()); assert!(fdr.peaks_sigma() > 2.0); @@ -80,3 +81,5 @@ It has multiple features which allow you to slim it down if needed (all are enab * `rand` - allows the generation of random peptides. * `rayon` - enables parallel iterators using rayon, mostly for `imgt` but also in consecutive align. * `mzdata` - enables integration with [mzdata](https://github.com/mobiusklein/mzdata) which has more advanced raw file support. +* `glycan-render` - enables the rendering to SVGs for glycans and glycan fragments +* `glycan-render-bitmap` - enables the rendering to bitmaps for glycans, by enabling the optional dependencies zeno and swash diff --git a/rustyms/data/glycan.mgf b/rustyms/data/glycan.mgf index f57eab92..2d484f60 100644 --- a/rustyms/data/glycan.mgf +++ b/rustyms/data/glycan.mgf @@ -2,7 +2,7 @@ BEGIN IONS PEPMASS=660.2457879192369 CHARGE=1+ TITLE=MS/MS scan at 1.535 min with Intensity: 604.0 -SEQUENCE=N[GlycanStructure:Hex(Hex,HexNAc)] +SEQUENCE=N[G:G01141WK] 189.48956 5050.0 283.62076 5050.0 diff --git a/rustyms/images/glycan_root.svg b/rustyms/images/glycan_root.svg new file mode 100644 index 00000000..579ca532 --- /dev/null +++ b/rustyms/images/glycan_root.svg @@ -0,0 +1 @@ +pepNArg \ No newline at end of file diff --git a/rustyms/src/align/alignment.rs b/rustyms/src/align/alignment.rs index 3375a4fd..10ffcdfc 100644 --- a/rustyms/src/align/alignment.rs +++ b/rustyms/src/align/alignment.rs @@ -14,6 +14,7 @@ use super::scoring::*; use crate::align::mass_alignment::determine_final_score; use crate::align::mass_alignment::score_pair; use crate::helper_functions::next_num; +use crate::model::GlycanModel; use crate::peptidoform::AtMax; use 
crate::peptidoform::Linear; use crate::system::Mass; @@ -210,6 +211,7 @@ impl<'lifetime, A: AtMax, B: AtMax> Alignment<'lifet false, SequencePosition::Index(index_a), 0, + &GlycanModel::DISALLOW, ) .0 .iter() @@ -223,6 +225,7 @@ impl<'lifetime, A: AtMax, B: AtMax> Alignment<'lifet false, SequencePosition::Index(index_b), 0, + &GlycanModel::DISALLOW, ) .0 .iter() @@ -403,6 +406,7 @@ impl, B: AtMax> Alignment<'_, A, B> { false, SequencePosition::Index(index), 0, + &GlycanModel::DISALLOW, ) .0 }) @@ -428,6 +432,7 @@ impl, B: AtMax> Alignment<'_, A, B> { false, SequencePosition::Index(index), 0, + &GlycanModel::DISALLOW, ) .0 }) diff --git a/rustyms/src/align/mass_alignment.rs b/rustyms/src/align/mass_alignment.rs index a748a316..44ea5f45 100644 --- a/rustyms/src/align/mass_alignment.rs +++ b/rustyms/src/align/mass_alignment.rs @@ -1,6 +1,7 @@ use std::fmt::Debug; use crate::{ + model::GlycanModel, peptidoform::{AtMax, SimpleLinear}, system::Mass, MassMode, MolecularFormula, Multi, Peptidoform, SequenceElement, SequencePosition, @@ -304,6 +305,7 @@ fn calculate_masses( false, SequencePosition::Index(i), 0, + &GlycanModel::DISALLOW, ) .0 }) @@ -483,9 +485,10 @@ impl std::ops::IndexMut<[usize; 2]> for Matrix { #[expect(clippy::missing_panics_doc)] mod tests { use super::score; - use crate::align::scoring::AlignScoring; - use crate::{CheckedAminoAcid, SequencePosition}; - use crate::{MolecularFormula, Multi, SequenceElement}; + use crate::{ + align::scoring::AlignScoring, model::GlycanModel, CheckedAminoAcid, MolecularFormula, + Multi, SequenceElement, SequencePosition, + }; #[test] fn pair() { @@ -505,7 +508,8 @@ mod tests { &mut Vec::new(), false, SequencePosition::default(), - 0 + 0, + &GlycanModel::DISALLOW, ) .0) .sum::>()[0] @@ -522,7 +526,8 @@ mod tests { &mut Vec::new(), false, SequencePosition::default(), - 0 + 0, + &GlycanModel::DISALLOW, ) .0) .sum::>()[0] diff --git a/rustyms/src/align/multi_alignment.rs b/rustyms/src/align/multi_alignment.rs index 
24e72a8f..6359de3d 100644 --- a/rustyms/src/align/multi_alignment.rs +++ b/rustyms/src/align/multi_alignment.rs @@ -36,7 +36,7 @@ impl MultiAlignmentLine<'_, Complexity> { { print!( "{}{}", - piece.1.aminoacid.char(), + piece.1.aminoacid, "·".repeat(piece.0.step as usize - 1) ); } diff --git a/rustyms/src/aminoacid/aminoacid.rs b/rustyms/src/aminoacid/aminoacid.rs new file mode 100644 index 00000000..fbadb7fe --- /dev/null +++ b/rustyms/src/aminoacid/aminoacid.rs @@ -0,0 +1,685 @@ +//! Module used to define the implementations for the [`IsAminoAcid`] trait + +use std::{borrow::Cow, collections::HashMap}; + +use serde::{Deserialize, Serialize}; + +use crate::{ + formula::MolecularFormula, + fragment::{Fragment, FragmentKind, FragmentType, PeptidePosition, SatelliteLabel}, + model::*, + molecular_charge::CachedCharge, + Multi, MultiChemical, SequencePosition, +}; + +use super::is_amino_acid::IsAminoAcid; + +impl std::fmt::Display for dyn IsAminoAcid { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.pro_forma_definition()) + } +} + +include!("../shared/aminoacid.rs"); + +impl std::fmt::Display for AminoAcid { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.pro_forma_definition()) + } +} + +impl IsAminoAcid for AminoAcid { + /// Get the single letter representation of the amino acid + fn one_letter_code(&self) -> Option { + Some(match self { + Self::Alanine => 'A', + Self::AmbiguousAsparagine => 'B', + Self::Cysteine => 'C', + Self::AsparticAcid => 'D', + Self::GlutamicAcid => 'E', + Self::Phenylalanine => 'F', + Self::Glycine => 'G', + Self::Histidine => 'H', + Self::Isoleucine => 'I', + Self::AmbiguousLeucine => 'J', + Self::Lysine => 'K', + Self::Leucine => 'L', + Self::Methionine => 'M', + Self::Asparagine => 'N', + Self::Pyrrolysine => 'O', + Self::Proline => 'P', + Self::Glutamine => 'Q', + Self::Arginine => 'R', + Self::Serine => 'S', + Self::Threonine => 'T', + 
Self::Selenocysteine => 'U', + Self::Valine => 'V', + Self::Tryptophan => 'W', + Self::Unknown => 'X', + Self::Tyrosine => 'Y', + Self::AmbiguousGlutamine => 'Z', + }) + } + + fn pro_forma_definition(&self) -> Cow<'_, str> { + Cow::Borrowed(match self { + Self::Alanine => "A", + Self::AmbiguousAsparagine => "B", + Self::Cysteine => "C", + Self::AsparticAcid => "D", + Self::GlutamicAcid => "E", + Self::Phenylalanine => "F", + Self::Glycine => "G", + Self::Histidine => "H", + Self::Isoleucine => "I", + Self::AmbiguousLeucine => "J", + Self::Lysine => "K", + Self::Leucine => "L", + Self::Methionine => "M", + Self::Asparagine => "N", + Self::Pyrrolysine => "O", + Self::Proline => "P", + Self::Glutamine => "Q", + Self::Arginine => "R", + Self::Serine => "S", + Self::Threonine => "T", + Self::Selenocysteine => "U", + Self::Valine => "V", + Self::Tryptophan => "W", + Self::Unknown => "X", + Self::Tyrosine => "Y", + Self::AmbiguousGlutamine => "Z", + }) + } + + /// Get the 3 letter code for the amino acid + fn three_letter_code(&self) -> Option> { + Some(Cow::Borrowed(match self { + Self::Alanine => "Ala", + Self::AmbiguousAsparagine => "Asx", + Self::Cysteine => "Cys", + Self::AsparticAcid => "Asp", + Self::GlutamicAcid => "Glu", + Self::Phenylalanine => "Phe", + Self::Glycine => "Gly", + Self::Histidine => "His", + Self::Isoleucine => "Ile", + Self::AmbiguousLeucine => "Xle", + Self::Lysine => "Lys", + Self::Leucine => "Leu", + Self::Methionine => "Met", + Self::Asparagine => "Asn", + Self::Pyrrolysine => "Pyl", + Self::Proline => "Pro", + Self::Glutamine => "Gln", + Self::Arginine => "Arg", + Self::Serine => "Ser", + Self::Threonine => "Thr", + Self::Selenocysteine => "Sec", + Self::Valine => "Val", + Self::Tryptophan => "Trp", + Self::Unknown => "Xaa", + Self::Tyrosine => "Tyr", + Self::AmbiguousGlutamine => "Glx", + })) + } + + /// Get the full name for the amino acid + fn name(&self) -> Cow<'_, str> { + Cow::Borrowed(match self { + Self::Alanine => "Alanine", + 
Self::AmbiguousAsparagine => "AmbiguousAsparagine", + Self::Cysteine => "Cysteine", + Self::AsparticAcid => "AsparticAcid", + Self::GlutamicAcid => "GlutamicAcid", + Self::Phenylalanine => "Phenylalanine", + Self::Glycine => "Glycine", + Self::Histidine => "Histidine", + Self::Isoleucine => "Isoleucine", + Self::AmbiguousLeucine => "AmbiguousLeucine", + Self::Lysine => "Lysine", + Self::Leucine => "Leucine", + Self::Methionine => "Methionine", + Self::Asparagine => "Asparagine", + Self::Pyrrolysine => "Pyrrolysine", + Self::Proline => "Proline", + Self::Glutamine => "Glutamine", + Self::Arginine => "Arginine", + Self::Serine => "Serine", + Self::Threonine => "Threonine", + Self::Selenocysteine => "Selenocysteine", + Self::Valine => "Valine", + Self::Tryptophan => "Tryptophan", + Self::Unknown => "Unknown", + Self::Tyrosine => "Tyrosine", + Self::AmbiguousGlutamine => "AmbiguousGlutamine", + }) + } + + fn side_chain( + &self, + sequence_index: SequencePosition, + peptidoform_index: usize, + ) -> Cow<'_, Multi> { + let crate::SequencePosition::Index(sequence_index) = sequence_index else { + return Cow::Owned(Multi::default()); + }; + Cow::Owned(match self { + Self::Alanine => molecular_formula!(H 3 C 1).into(), + Self::Arginine => molecular_formula!(H 10 C 4 N 3).into(), // One of the H's counts as the charge carrier and is added later + Self::Asparagine => molecular_formula!(H 4 C 2 O 1 N 1).into(), + Self::AsparticAcid => molecular_formula!(H 3 C 2 O 2).into(), + Self::AmbiguousAsparagine => vec![ + molecular_formula!(H 4 C 2 O 1 N 1 (crate::AmbiguousLabel::AminoAcid{option: Self::Asparagine, sequence_index, peptidoform_index})), + molecular_formula!(H 3 C 2 O 2 (crate::AmbiguousLabel::AminoAcid{option: Self::AsparticAcid, sequence_index, peptidoform_index})), + ] + .into(), + Self::Cysteine => molecular_formula!(H 3 C 1 S 1).into(), + Self::Glutamine => molecular_formula!(H 6 C 3 O 1 N 1).into(), + Self::GlutamicAcid => molecular_formula!(H 5 C 3 O 2).into(), + 
Self::AmbiguousGlutamine => vec![ + molecular_formula!(H 6 C 3 O 1 N 1 (crate::AmbiguousLabel::AminoAcid{option: Self::Glutamine, sequence_index, peptidoform_index})), + molecular_formula!(H 5 C 3 O 2 (crate::AmbiguousLabel::AminoAcid{option: Self::GlutamicAcid, sequence_index, peptidoform_index})), + ] + .into(), + Self::Glycine => molecular_formula!(H 1).into(), + Self::Histidine => molecular_formula!(H 5 C 4 N 2).into(), + Self::AmbiguousLeucine | Self::Isoleucine | Self::Leucine => { + molecular_formula!(H 9 C 4).into() + } + Self::Lysine => molecular_formula!(H 10 C 4 N 1).into(), + Self::Methionine => molecular_formula!(H 7 C 3 S 1).into(), + Self::Phenylalanine => molecular_formula!(H 7 C 7).into(), + Self::Proline => molecular_formula!(H 5 C 3).into(), + Self::Pyrrolysine => molecular_formula!(H 17 C 9 O 1 N 2).into(), + Self::Selenocysteine => molecular_formula!(H 3 C 1 Se 1).into(), + Self::Serine => molecular_formula!(H 3 C 1 O 1).into(), + Self::Threonine => molecular_formula!(H 5 C 2 O 1).into(), + Self::Tryptophan => molecular_formula!(H 8 C 9 N 1).into(), + Self::Tyrosine => molecular_formula!(H 7 C 7 O 1).into(), + Self::Valine => molecular_formula!(H 7 C 3).into(), + Self::Unknown => molecular_formula!().into(), + }) + } + + // TODO: Take side chain mutations into account (maybe define pyrrolysine as a mutation) + fn satellite_ion_fragments( + &self, + sequence_index: SequencePosition, + peptidoform_index: usize, + ) -> Option>> { + let crate::SequencePosition::Index(sequence_index) = sequence_index else { + return None; + }; + + match self { + Self::Alanine + | Self::Glycine + | Self::Histidine + | Self::Phenylalanine + | Self::Proline + | Self::Tryptophan + | Self::Tyrosine + | Self::Unknown => None, + Self::Arginine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 9 C 2 N 2), + )])), + Self::Asparagine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 2 C 1 N 1 O 1), + )])), + Self::AsparticAcid => 
Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 1 C 1 O 2), + )])), + Self::AmbiguousAsparagine => Some(Cow::Owned(vec![ + ( + SatelliteLabel::None, + molecular_formula!(H 2 C 1 N 1 O 1 (crate::AmbiguousLabel::AminoAcid{option: Self::Asparagine, sequence_index, peptidoform_index})), + ), + ( + SatelliteLabel::None, + molecular_formula!(H 1 C 1 O 2 (crate::AmbiguousLabel::AminoAcid{option: Self::AsparticAcid, sequence_index, peptidoform_index})), + ), + ])), + Self::Cysteine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 1 S 1), + )])), + Self::Glutamine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 4 C 2 N 1 O 1), + )])), + Self::GlutamicAcid => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 3 C 2 O 2), + )])), + Self::AmbiguousGlutamine => Some(Cow::Owned(vec![ + ( + SatelliteLabel::None, + molecular_formula!(H 4 C 2 N 1 O 1 (crate::AmbiguousLabel::AminoAcid{option: Self::Glutamine, sequence_index, peptidoform_index})), + ), + ( + SatelliteLabel::None, + molecular_formula!(H 3 C 2 O 2 (crate::AmbiguousLabel::AminoAcid{option: Self::GlutamicAcid, sequence_index, peptidoform_index})), + ), + ])), + Self::Isoleucine => Some(Cow::Owned(vec![ + (SatelliteLabel::A, molecular_formula!(H 3 C 1)), + (SatelliteLabel::B, molecular_formula!(H 5 C 2)), + ])), + Self::Leucine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 7 C 3), + )])), + Self::AmbiguousLeucine => Some(Cow::Owned(vec![ + ( + SatelliteLabel::A, + molecular_formula!(H 3 C 1 (crate::AmbiguousLabel::AminoAcid{option: Self::Isoleucine, sequence_index, peptidoform_index})), + ), + ( + SatelliteLabel::B, + molecular_formula!(H 5 C 2 (crate::AmbiguousLabel::AminoAcid{option: Self::Isoleucine, sequence_index, peptidoform_index})), + ), + ( + SatelliteLabel::None, + molecular_formula!(H 7 C 3 (crate::AmbiguousLabel::AminoAcid{option: Self::Leucine, sequence_index, peptidoform_index})), + ), + ])), 
+ Self::Lysine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 8 C 3 N 1), + )])), + Self::Methionine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 5 C 2 S 1), + )])), + Self::Pyrrolysine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 15 C 9 N 2 O 1), + )])), + Self::Selenocysteine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(Se 1), + )])), + Self::Serine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 1 O 1), + )])), + Self::Threonine => Some(Cow::Owned(vec![ + (SatelliteLabel::None, molecular_formula!(H 1 O 1)), + (SatelliteLabel::None, molecular_formula!(H 3 C 1)), + ])), + Self::Valine => Some(Cow::Owned(vec![( + SatelliteLabel::None, + molecular_formula!(H 3 C 1), + )])), // Technically two options, but both have the same mass + } + } +} + +impl AminoAcid { + /// All amino acids with a unique mass (no I/L in favour of J, no B, no Z, and no X) + pub const UNIQUE_MASS_AMINO_ACIDS: &'static [Self] = &[ + Self::Glycine, + Self::Alanine, + Self::Arginine, + Self::Asparagine, + Self::AsparticAcid, + Self::Cysteine, + Self::Glutamine, + Self::GlutamicAcid, + Self::Histidine, + Self::AmbiguousLeucine, + Self::Lysine, + Self::Methionine, + Self::Phenylalanine, + Self::Proline, + Self::Serine, + Self::Threonine, + Self::Tryptophan, + Self::Tyrosine, + Self::Valine, + Self::Selenocysteine, + Self::Pyrrolysine, + ]; + + /// All 20 canonical amino acids + pub const CANONICAL_AMINO_ACIDS: &'static [Self] = &[ + Self::Glycine, + Self::Alanine, + Self::Arginine, + Self::Asparagine, + Self::AsparticAcid, + Self::Cysteine, + Self::Glutamine, + Self::GlutamicAcid, + Self::Histidine, + Self::Leucine, + Self::Isoleucine, + Self::Lysine, + Self::Methionine, + Self::Phenylalanine, + Self::Proline, + Self::Serine, + Self::Threonine, + Self::Tryptophan, + Self::Tyrosine, + Self::Valine, + ]; + + // TODO: generalise over used storage type, so using 
molecularformula, monoisotopic mass, or average mass, also make sure that AAs can return these numbers in a const fashion + #[expect(clippy::too_many_lines, clippy::too_many_arguments)] + pub(crate) fn fragments( + self, + n_term: &( + Multi, + HashMap>, + ), + c_term: &( + Multi, + HashMap>, + ), + modifications: &( + Multi, + HashMap>, + ), + charge_carriers: &mut CachedCharge, + sequence_index: SequencePosition, + sequence_length: usize, + ions: &PossibleIons, + peptidoform_ion_index: usize, + peptidoform_index: usize, + allow_terminal: (bool, bool), + ) -> Vec { + let mut base_fragments = Vec::with_capacity(ions.size_upper_bound()); + let n_pos = PeptidePosition::n(sequence_index, sequence_length); + let c_pos = PeptidePosition::c(sequence_index, sequence_length); + + if allow_terminal.0 { + if let Some(settings) = &ions.a { + base_fragments.extend(Fragment::generate_series( + &(self.formulas_inner(sequence_index, peptidoform_index) + * (modifications + .1 + .get(&FragmentKind::a) + .unwrap_or(&modifications.0) + - molecular_formula!(H 1 C 1 O 1))), + peptidoform_ion_index, + peptidoform_index, + &FragmentType::a(n_pos, 0), + n_term.1.get(&FragmentKind::a).unwrap_or(&n_term.0), + charge_carriers, + settings, + )); + } + if let Some(settings) = &ions.b { + base_fragments.extend(Fragment::generate_series( + &(self.formulas_inner(sequence_index, peptidoform_index) + * (modifications + .1 + .get(&FragmentKind::b) + .unwrap_or(&modifications.0) + - molecular_formula!(H 1))), + peptidoform_ion_index, + peptidoform_index, + &FragmentType::b(n_pos, 0), + n_term.1.get(&FragmentKind::b).unwrap_or(&n_term.0), + charge_carriers, + settings, + )); + } + if let Some(settings) = &ions.c { + base_fragments.extend(Fragment::generate_series( + &(self.formulas_inner(sequence_index, peptidoform_index) + * (modifications + .1 + .get(&FragmentKind::c) + .unwrap_or(&modifications.0) + + molecular_formula!(H 2 N 1))), + peptidoform_ion_index, + peptidoform_index, + 
&FragmentType::c(n_pos, 0), + n_term.1.get(&FragmentKind::c).unwrap_or(&n_term.0), + charge_carriers, + settings, + )); + } + for (aa, distance) in &ions.d.0 { + if let Some(satellite_fragments) = + aa.satellite_ion_fragments(sequence_index - *distance, peptidoform_index) + { + for (label, formula) in satellite_fragments.iter() { + base_fragments.extend(Fragment::generate_series( + &(modifications + .1 + .get(&FragmentKind::d) + .unwrap_or(&modifications.0) + * self.formulas_inner(sequence_index, peptidoform_index) + + molecular_formula!(H 1 C 1 O 1) + - formula), + peptidoform_ion_index, + peptidoform_index, + &FragmentType::d(n_pos, *aa, *distance, 0, *label), + n_term.1.get(&FragmentKind::d).unwrap_or(&n_term.0), + charge_carriers, + &ions.d.1, + )); + } + } + } + } + if allow_terminal.1 { + for (aa, distance) in &ions.v.0 { + base_fragments.extend(Fragment::generate_series( + &(self.formulas_inner(sequence_index, peptidoform_index) + * -aa.formulas_inner(sequence_index + *distance, peptidoform_index) + + molecular_formula!(H 3 C 2 N 1 O 1)), + peptidoform_ion_index, + peptidoform_index, + &FragmentType::v(c_pos, *aa, *distance, 0), + c_term.1.get(&FragmentKind::v).unwrap_or(&c_term.0), + charge_carriers, + &ions.v.1, + )); + } + for (aa, distance) in &ions.w.0 { + if let Some(satellite_fragments) = + aa.satellite_ion_fragments(sequence_index - *distance, peptidoform_index) + { + for (label, formula) in satellite_fragments.iter() { + base_fragments.extend(Fragment::generate_series( + &(modifications + .1 + .get(&FragmentKind::w) + .unwrap_or(&modifications.0) + * self.formulas_inner(sequence_index, peptidoform_index) + + molecular_formula!(H 2 N 1) + - formula), + peptidoform_ion_index, + peptidoform_index, + &FragmentType::w(c_pos, *aa, *distance, 0, *label), + c_term.1.get(&FragmentKind::w).unwrap_or(&c_term.0), + charge_carriers, + &ions.w.1, + )); + } + } + } + if let Some(settings) = &ions.x { + base_fragments.extend(Fragment::generate_series( + 
&(self.formulas_inner(sequence_index, peptidoform_index) + * (modifications + .1 + .get(&FragmentKind::x) + .unwrap_or(&modifications.0) + + molecular_formula!(C 1 O 1) + - molecular_formula!(H 1))), + peptidoform_ion_index, + peptidoform_index, + &FragmentType::x(c_pos, 0), + c_term.1.get(&FragmentKind::x).unwrap_or(&c_term.0), + charge_carriers, + settings, + )); + } + if let Some(settings) = &ions.y { + base_fragments.extend(Fragment::generate_series( + &(self.formulas_inner(sequence_index, peptidoform_index) + * (modifications + .1 + .get(&FragmentKind::y) + .unwrap_or(&modifications.0) + + molecular_formula!(H 1))), + peptidoform_ion_index, + peptidoform_index, + &FragmentType::y(c_pos, 0), + c_term.1.get(&FragmentKind::y).unwrap_or(&c_term.0), + charge_carriers, + settings, + )); + } + if let Some(settings) = &ions.z { + base_fragments.extend(Fragment::generate_series( + &(self.formulas_inner(sequence_index, peptidoform_index) + * (modifications + .1 + .get(&FragmentKind::z) + .unwrap_or(&modifications.0) + - molecular_formula!(H 2 N 1))), + peptidoform_ion_index, + peptidoform_index, + &FragmentType::z(c_pos, 0), + c_term.1.get(&FragmentKind::z).unwrap_or(&c_term.0), + charge_carriers, + settings, + )); + } + } + + if allow_terminal.0 && allow_terminal.1 { + if let Some((charge, losses)) = &ions.immonium { + base_fragments.extend(Fragment::generate_all( + &(self.formulas_inner(sequence_index, peptidoform_index) + * (modifications + .1 + .get(&FragmentKind::immonium) + .unwrap_or(&modifications.0) + - molecular_formula!(C 1 O 1))), + peptidoform_ion_index, + peptidoform_index, + &FragmentType::Immonium(n_pos, self.into()), // TODO: get the actual sequence element here + &Multi::default(), + &losses + .iter() + .filter(|(aa, _)| aa.contains(&self)) + .flat_map(|(_, l)| l.iter()) + .map(|l| vec![l.clone()]) + .collect::>(), + charge_carriers, + *charge, + )); + } + } + base_fragments + } + + /// Check if two amino acids are considered identical. 
X is identical to anything, J to IL, B to ND, Z to EQ. + pub(crate) fn canonical_identical(self, rhs: Self) -> bool { + match (self, rhs) { + (a, b) if a == b => true, + (Self::Unknown, _) + | (_, Self::Unknown) + | (Self::AmbiguousLeucine, Self::Leucine | Self::Isoleucine) + | (Self::Leucine | Self::Isoleucine, Self::AmbiguousLeucine) + | (Self::AmbiguousAsparagine, Self::Asparagine | Self::AsparticAcid) + | (Self::Asparagine | Self::AsparticAcid, Self::AmbiguousAsparagine) + | (Self::AmbiguousGlutamine, Self::Glutamine | Self::GlutamicAcid) + | (Self::Glutamine | Self::GlutamicAcid, Self::AmbiguousGlutamine) => true, + _ => false, + } + } +} + +#[cfg(test)] +#[expect(clippy::unreadable_literal, clippy::missing_panics_doc)] +mod tests { + use super::*; + + #[test] + fn mass() { + let weight_ala = AminoAcid::Alanine.formulas()[0].average_weight(); + let mass_ala = AminoAcid::Alanine.formulas()[0].monoisotopic_mass(); + assert_ne!(weight_ala, mass_ala); + assert!((weight_ala.value - 71.07793).abs() < 1e-5); + assert!((mass_ala.value - 71.037113783).abs() < 1e-5); + } + + #[test] + fn mass_lysine() { + let weight_lys = AminoAcid::Lysine.formulas()[0].average_weight(); + let mass_lys = AminoAcid::Lysine.formulas()[0].monoisotopic_mass(); + assert_ne!(weight_lys, mass_lys); + assert!((weight_lys.value - 128.17240999999999).abs() < 1e-5); + assert!((mass_lys.value - 128.094963010536).abs() < 1e-5); + } + + #[test] + fn masses() { + let known = &[ + ('A', 71.03711, 71.08), + ('R', 156.10111, 156.2), + ('N', 114.04293, 114.1), + ('D', 115.02694, 115.1), + ('C', 103.00919, 103.1), + ('E', 129.04259, 129.1), + ('Q', 128.05858, 128.1), + ('G', 57.02146, 57.05), + ('H', 137.05891, 137.1), + ('I', 113.08406, 113.2), + ('L', 113.08406, 113.2), + ('K', 128.09496, 128.2), + ('M', 131.04049, 131.2), + ('F', 147.06841, 147.2), + ('P', 97.05276, 97.12), + ('S', 87.03203, 87.08), + ('T', 101.04768, 101.1), + ('W', 186.07931, 186.2), + ('Y', 163.06333, 163.2), + ('V', 99.06841, 
99.13), + ]; + + for (aa, mono_mass, average_weight) in known { + let aa = AminoAcid::try_from(*aa).unwrap(); + let (mono, weight) = ( + aa.formulas()[0].monoisotopic_mass().value, + aa.formulas()[0].average_weight().value, + ); + println!( + "{}: {} {} {} {}", + aa.pro_forma_definition(), + mono, + mono_mass, + weight, + average_weight + ); + assert!((mono - *mono_mass).abs() < 1e-5); + assert!((weight - *average_weight).abs() < 1e-1); + } + } + + #[test] + fn read_aa() { + assert_eq!( + AminoAcid::try_from('B').unwrap(), + AminoAcid::AmbiguousAsparagine + ); + assert_eq!( + AminoAcid::try_from(b'B').unwrap(), + AminoAcid::AmbiguousAsparagine + ); + assert_eq!(AminoAcid::try_from('c'), Ok(AminoAcid::Cysteine)); + assert_eq!(AminoAcid::try_from('🦀'), Err(())); + } +} diff --git a/rustyms/src/aminoacid/is_amino_acid.rs b/rustyms/src/aminoacid/is_amino_acid.rs new file mode 100644 index 00000000..1ea0bc9d --- /dev/null +++ b/rustyms/src/aminoacid/is_amino_acid.rs @@ -0,0 +1,63 @@ +//! Module used to create the [`IsAminoAcid`] trait + +use crate::{ + formula::MolecularFormula, fragment::SatelliteLabel, system::Mass, MassMode, Multi, + MultiChemical, SequencePosition, +}; + +use std::borrow::Cow; + +/// A general trait to define amino acids. +pub trait IsAminoAcid: MultiChemical { + /// The full name for this amino acid. + fn name(&self) -> Cow<'_, str>; + /// The three letter code for this amino acid. Or None if there is no common three letter + /// definition for this amino acid. + fn three_letter_code(&self) -> Option>; + /// The one letter code for this amino acid. Or None if there is no common single character + /// definition for this amino acid. + #[doc(alias = "code")] + fn one_letter_code(&self) -> Option; + /// The ProForma definition for this amino acid. If this is not a simple amino acid it can be + /// defined as an amino acid with an additional modification. For example `X[H9C2N2]` could be + /// used if Arginine was not defined as `R` in ProForma. 
+ fn pro_forma_definition(&self) -> Cow<'_, str>; + /// The monoisotopic mass of this amino acid. Should be redefined for better performance. + fn monoisotopic_mass(&self) -> Cow<'_, Multi> { + Cow::Owned( + self.formulas() + .iter() + .map(MolecularFormula::monoisotopic_mass) + .collect(), + ) + } + /// The average weight of this amino acid. Should be redefined for better performance. + fn average_weight(&self) -> Cow<'_, Multi> { + Cow::Owned( + self.formulas() + .iter() + .map(MolecularFormula::average_weight) + .collect(), + ) + } + /// The mass with a given mass mode for this amino acid. Should be redefined for better performance. + fn mass(&self, mode: MassMode) -> Cow<'_, Multi> { + Cow::Owned(self.formulas().iter().map(|f| f.mass(mode)).collect()) + } + /// The molecular formula of the side chain of the amino acid. The `sequence_index` and + /// `peptidoform_index` are used to keep track of ambiguous amino acids. + fn side_chain( + &self, + sequence_index: SequencePosition, + peptidoform_index: usize, + ) -> Cow<'_, Multi>; + /// The molecular formulas that can fragment for satellite ions (d and w). Commonly the fragment + /// after the second carbon into the side chain. `None` can be returned + /// if no satellite ions are possible. The `sequence_index` and `peptidoform_index` are used to + /// keep track of ambiguous amino acids. 
+ fn satellite_ion_fragments( + &self, + sequence_index: SequencePosition, + peptidoform_index: usize, + ) -> Option>>; +} diff --git a/rustyms/src/aminoacid/mod.rs b/rustyms/src/aminoacid/mod.rs new file mode 100644 index 00000000..63b0fada --- /dev/null +++ b/rustyms/src/aminoacid/mod.rs @@ -0,0 +1,7 @@ +mod aminoacid; +mod is_amino_acid; +pub mod pka; +pub mod properties; + +pub use aminoacid::*; +pub use is_amino_acid::*; diff --git a/rustyms/src/aminoacid/pka.rs b/rustyms/src/aminoacid/pka.rs new file mode 100644 index 00000000..f605e95c --- /dev/null +++ b/rustyms/src/aminoacid/pka.rs @@ -0,0 +1,367 @@ +//! Module used to store and calculate pKa and isoelectric point values for a given [`AminoAcid`] or [Peptidoform] respectively + +use serde::{Deserialize, Serialize}; + +use crate::{ + aminoacid::properties::ChargeClass, modification::SimpleModificationInner, AminoAcid, AtMax, + Peptidoform, SemiAmbiguous, +}; + +use super::is_amino_acid::IsAminoAcid; + +/// A source for pKa values, which can be used to calculate the pKa for peptidoforms. +pub trait PKaSource { + /// Get the pKa values for the given amino acid and modifications. + #[allow(non_snake_case)] + fn pKa( + amino_acid: AA, + side_chain_modifications: impl Iterator>, + n_terminal_modifications: Option>>, + c_terminal_modifications: Option>>, + ) -> Option; +} + +impl> Peptidoform { + /// Get the calculated isoelectric point (pI) for the peptidoform, or None if any sequence elements lack pKa values. + /// + /// The isoelectric point is the pH at which the net charge of the peptidoform is zero. This is determined using a binary + /// search between pH 0 and 14. The charge at each pH is computed using the Henderson-Hasselbalch equation with pKa values + /// from the provided `PKaSource`, considering N-terminal, C-terminal, and sidechain ionizable groups. 
+ /// + /// # Example + /// ```rust + /// # use rustyms::{Peptidoform, aminoacid::pka::{PKaSource, PKaLide1991}}; + /// // Create a SemiAmbiguous Peptidoform for the sequence EMEVEESPEK + /// let peptidoform = Peptidoform::pro_forma(&"EMEVEESPEK", None).unwrap().into_semi_ambiguous().unwrap(); + /// let pi = peptidoform.isoelectic_point::(); + /// // The calculated pI is approximately 3.57 based on Lide 1991 pKa values + /// assert_eq!(pi.map(|v| (v * 100.0).round() / 100.0), Some(3.57)); + /// ``` + /// + /// # Shortcomings + /// - **Naive Approach**: Does not account for interactions between ionizable groups. + /// - **Modifications Ignored**: Modifications affecting pKa are not considered. + /// - **Environmental Factors**: Assumes pKa values are independent of sequence and environment. + /// + /// Returns `None` for an empty sequence, when the source has no pKa for a sequence element, or when an ionizable side chain has an unknown charge class. + #[allow(non_snake_case)] + pub fn isoelectic_point>(&self) -> Option { + let sequence = self.sequence(); + if sequence.is_empty() { + return None; + } + + // Collect all ionizable groups with their pKa values + let mut ionizable = Vec::with_capacity(sequence.len() + 2); + + // Handle N-terminal + let first = sequence.first()?; + ionizable.push(( + ChargeClass::Positive, + Source::pKa( + first.aminoacid.aminoacid(), + first.modifications.iter().filter_map(|m| m.simple()), + Some(self.get_n_term().iter().filter_map(|m| m.simple())), + (self.len() == 1).then_some(self.get_c_term().iter().filter_map(|m| m.simple())), + )? + .n_term(), + )); // N-terminal is always positive + + // Handle C-terminal + let last = sequence.last()?; + ionizable.push(( + ChargeClass::Negative, + Source::pKa( + last.aminoacid.aminoacid(), + last.modifications.iter().filter_map(|m| m.simple()), + (self.len() == 1).then_some(self.get_n_term().iter().filter_map(|m| m.simple())), + Some(self.get_c_term().iter().filter_map(|m| m.simple())), + )? 
+ .c_term(), + )); // C-terminal is always negative + + // Handle sidechains; N-terminal modifications are passed only for the first residue and C-terminal modifications only for the last + for (index, aa) in sequence.iter().enumerate() { + if let Some(sidechain) = Source::pKa( + aa.aminoacid.aminoacid(), + aa.modifications.iter().filter_map(|m| m.simple()), + (index == 0).then_some(self.get_n_term().iter().filter_map(|m| m.simple())), + (index == self.len() - 1) + .then_some(self.get_c_term().iter().filter_map(|m| m.simple())), + )? + .sidechain() + { + let charge_class = aa.aminoacid.aminoacid().charge_class(); + match charge_class { + ChargeClass::Positive | ChargeClass::Negative => { + ionizable.push((charge_class, sidechain)); + } + ChargeClass::Unknown => return None, + ChargeClass::Uncharged => (), + } + } + } + + // Binary search between pH 0-14 to find isoelectric point + let mut low = 0.0; + let mut high = 14.0; + let mut new_pi = 7.775; + const EPSILON: f64 = 0.0001; + + while (high - low) > EPSILON { + new_pi = (low + high) / 2.0; + let charge = calculate_charge(new_pi, &ionizable); + + if charge > 0.0 { + low = new_pi; + } else { + high = new_pi; + } + } + + Some(new_pi) + } +} + +fn calculate_charge(pH: f64, ionizable: &[(ChargeClass, f64)]) -> f64 { + let mut charge = 0.0; + + for (class, pka) in ionizable { + match class { + ChargeClass::Positive => charge += 1.0 / (10.0_f64.powf(pH - pka) + 1.0), + ChargeClass::Negative => charge -= 1.0 / (10.0_f64.powf(pka - pH) + 1.0), + _ => {} + } + } + + charge +} +/// The pKa for a specific Amino Acid +#[derive(Copy, Clone, PartialEq, PartialOrd, Debug, Serialize, Deserialize)] +pub struct AminoAcidPKa { + n_term: f64, + sidechain: Option, + c_term: f64, +} + +impl AminoAcidPKa { + const fn new(n_term: f64, sidechain: Option, c_term: f64) -> Self { + Self { + n_term, + sidechain, + c_term, + } + } + + /// Get the pKa value for the n-term of the Amino acid + pub const fn n_term(self) -> f64 { + self.n_term + } + + /// Get the pKa value for the side-chain group of the Amino acid + pub const fn sidechain(self) -> Option 
{ + self.sidechain + } + + /// Get the pKa value for the c-term of the Amino acid + pub const fn c_term(self) -> f64 { + self.c_term + } +} + +/// pKa values from Lide, D. R. (1991). Handbook of Chemistry and Physics: A Ready Reference Book of Chemical and Physical Data. +pub struct PKaLide1991; + +impl PKaSource for PKaLide1991 { + fn pKa( + amino_acid: AminoAcid, + mut side_chain_modifications: impl Iterator>, + n_terminal_modifications: Option>>, + c_terminal_modifications: Option>>, + ) -> Option { + if side_chain_modifications.next().is_some() + || n_terminal_modifications.is_some_and(|mut m| m.next().is_some()) + || c_terminal_modifications.is_some_and(|mut m| m.next().is_some()) + { + return None; + } + match amino_acid { + AminoAcid::Arginine => Some(AminoAcidPKa::new(9.00, Some(12.10), 2.03)), + AminoAcid::Histidine => Some(AminoAcidPKa::new(9.09, Some(6.04), 1.70)), + AminoAcid::Lysine => Some(AminoAcidPKa::new(9.16, Some(10.67), 2.15)), + AminoAcid::AsparticAcid => Some(AminoAcidPKa::new(9.66, Some(3.71), 1.95)), + AminoAcid::GlutamicAcid => Some(AminoAcidPKa::new(9.58, Some(4.15), 2.16)), + AminoAcid::Tyrosine => Some(AminoAcidPKa::new(9.04, Some(10.10), 2.24)), + AminoAcid::Cysteine => Some(AminoAcidPKa::new(10.28, Some(8.14), 1.91)), + AminoAcid::Alanine => Some(AminoAcidPKa::new(9.71, None, 2.33)), + AminoAcid::Glycine => Some(AminoAcidPKa::new(9.58, None, 2.34)), + AminoAcid::Proline => Some(AminoAcidPKa::new(10.47, None, 1.95)), + AminoAcid::Serine => Some(AminoAcidPKa::new(9.05, None, 2.13)), + AminoAcid::Threonine => Some(AminoAcidPKa::new(8.96, None, 2.20)), + AminoAcid::Methionine => Some(AminoAcidPKa::new(9.08, None, 2.16)), + AminoAcid::Phenylalanine => Some(AminoAcidPKa::new(9.09, None, 2.18)), + AminoAcid::Tryptophan => Some(AminoAcidPKa::new(9.34, None, 2.38)), + AminoAcid::Valine => Some(AminoAcidPKa::new(9.52, None, 2.27)), + AminoAcid::Isoleucine => Some(AminoAcidPKa::new(9.60, None, 2.26)), + AminoAcid::Leucine => 
Some(AminoAcidPKa::new(9.58, None, 2.32)), + AminoAcid::Glutamine => Some(AminoAcidPKa::new(9.00, None, 2.18)), + AminoAcid::Asparagine => Some(AminoAcidPKa::new(8.73, None, 2.16)), + _ => None, + } + } +} + +/// pKa values from Lehninger, A. L., Nelson, D. L., & Cox, M. M. (2005). Lehninger Principles of Biochemistry. Macmillan. +pub struct PKaLehninger; + +impl PKaSource for PKaLehninger { + fn pKa( + amino_acid: AminoAcid, + mut side_chain_modifications: impl Iterator>, + n_terminal_modifications: Option>>, + c_terminal_modifications: Option>>, + ) -> Option { + if side_chain_modifications.next().is_some() + || n_terminal_modifications.is_some_and(|mut m| m.next().is_some()) + || c_terminal_modifications.is_some_and(|mut m| m.next().is_some()) + { + return None; + } + match amino_acid { + AminoAcid::Arginine => Some(AminoAcidPKa::new(9.04, Some(12.48), 2.17)), + AminoAcid::Histidine => Some(AminoAcidPKa::new(9.17, Some(6.00), 1.82)), + AminoAcid::Lysine => Some(AminoAcidPKa::new(8.95, Some(10.53), 2.18)), + AminoAcid::AsparticAcid => Some(AminoAcidPKa::new(9.60, Some(3.65), 1.88)), + AminoAcid::GlutamicAcid => Some(AminoAcidPKa::new(9.67, Some(4.25), 2.19)), + AminoAcid::Tyrosine => Some(AminoAcidPKa::new(9.11, Some(10.07), 2.20)), + AminoAcid::Cysteine => Some(AminoAcidPKa::new(10.28, Some(8.18), 1.96)), + AminoAcid::Alanine => Some(AminoAcidPKa::new(9.69, None, 2.34)), + AminoAcid::Glycine => Some(AminoAcidPKa::new(9.60, None, 2.34)), + AminoAcid::Proline => Some(AminoAcidPKa::new(10.96, None, 1.99)), + AminoAcid::Serine => Some(AminoAcidPKa::new(9.15, None, 2.21)), + AminoAcid::Threonine => Some(AminoAcidPKa::new(9.62, None, 2.11)), + AminoAcid::Methionine => Some(AminoAcidPKa::new(9.21, None, 2.28)), + AminoAcid::Phenylalanine => Some(AminoAcidPKa::new(9.13, None, 1.83)), + AminoAcid::Tryptophan => Some(AminoAcidPKa::new(9.39, None, 2.38)), + AminoAcid::Valine => Some(AminoAcidPKa::new(9.62, None, 2.32)), + AminoAcid::Isoleucine => 
Some(AminoAcidPKa::new(9.68, None, 2.36)), + AminoAcid::Leucine => Some(AminoAcidPKa::new(9.60, None, 2.36)), + AminoAcid::Glutamine => Some(AminoAcidPKa::new(9.13, None, 2.17)), + AminoAcid::Asparagine => Some(AminoAcidPKa::new(8.80, None, 2.02)), + _ => None, + } + } +} + +#[cfg(test)] +#[expect(clippy::float_cmp, clippy::missing_panics_doc)] +mod tests { + use super::*; + use crate::{modification::SimpleModification, Peptidoform, SemiAmbiguous}; + + // Helper to create a Peptidoform from a list of amino acids + fn create_peptidoform(aas: &str) -> Peptidoform { + Peptidoform::pro_forma(aas, None) + .unwrap() + .into_semi_ambiguous() + .unwrap() + } + + // Helper function to test pKa values for a given source + fn test_pka>( + test_cases: &[(AminoAcid, Option<(f64, Option, f64)>)], + ) { + for (aa, maybe_values) in test_cases { + if let Some((n_term, sidechain, c_term)) = maybe_values { + let pka = Source::pKa( + *aa, + std::iter::empty::(), + None::>, + None::>, + ) + .unwrap_or_else(|| panic!("Missing pKa for {aa:?}")); + let round = |v: f64| (v * 100.0).round() / 100.0; + + assert_eq!(round(pka.n_term()), *n_term, "N-term mismatch for {aa:?}"); + assert_eq!( + pka.sidechain().map(round), + *sidechain, + "Sidechain mismatch for {aa:?}" + ); + assert_eq!(round(pka.c_term()), *c_term, "C-term mismatch for {aa:?}"); + } else { + assert!(maybe_values.is_none(), "Expected None for {aa:?}"); + } + } + } + + // Helper function to test an isoelectric point value given a source + fn test_isoelectric_point>(cases: &[(&str, Option)]) { + for &(seq, expected) in cases { + let peptide = create_peptidoform(seq); + let round = |v: f64| (v * 100.0).round() / 100.0; + let iso = peptide.isoelectic_point::(); + assert_eq!( + iso.map(round), + expected, + "Isoelectric point mismatch for peptide: {seq}" + ); + } + } + + #[test] + fn test_pka_lide1991() { + let test_cases = [ + (AminoAcid::Arginine, Some((9.00, Some(12.10), 2.03))), + (AminoAcid::GlutamicAcid, Some((9.58, Some(4.15), 
2.16))), + (AminoAcid::Alanine, Some((9.71, None, 2.33))), + (AminoAcid::Histidine, Some((9.09, Some(6.04), 1.70))), + (AminoAcid::Unknown, None), + ]; + + test_pka::(&test_cases); + } + + #[test] + fn test_pka_lehninger() { + let test_cases = [ + (AminoAcid::Cysteine, Some((10.28, Some(8.18), 1.96))), + (AminoAcid::AsparticAcid, Some((9.60, Some(3.65), 1.88))), + (AminoAcid::Isoleucine, Some((9.68, None, 2.36))), + (AminoAcid::Tryptophan, Some((9.39, None, 2.38))), + (AminoAcid::Selenocysteine, None), + ]; + + test_pka::(&test_cases); + } + + #[test] + fn test_isoelectric_point_lide1991() { + let test_cases = [ + ("E", Some(3.16)), + ("A", Some(6.02)), + ("DE", Some(2.85)), + ("HR", Some(10.6)), + ("KDEH", Some(5.17)), + ("AXRT", None), + ("AXRT[Oxidation]", None), + ]; + + test_isoelectric_point::(&test_cases); + } + + #[test] + fn test_isoelectric_point_lehninger() { + let test_cases = [ + ("G", Some(5.97)), + ("Y", Some(5.65)), + ("CQ", Some(6.23)), + ("KP", Some(9.74)), + ("FIVS", Some(5.67)), + ("TKLB", None), + ("TK[Oxidation]LB", None), + ]; + + test_isoelectric_point::(&test_cases); + } +} diff --git a/rustyms/src/aminoacid_properties.rs b/rustyms/src/aminoacid/properties.rs similarity index 100% rename from rustyms/src/aminoacid_properties.rs rename to rustyms/src/aminoacid/properties.rs diff --git a/rustyms/src/aminoacids.rs b/rustyms/src/aminoacids.rs deleted file mode 100644 index 687ec960..00000000 --- a/rustyms/src/aminoacids.rs +++ /dev/null @@ -1,693 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use crate::{ - formula::MolecularFormula, - fragment::{Fragment, FragmentType, PeptidePosition}, - model::*, - molecular_charge::CachedCharge, - system::Mass, - MassMode, Multi, MultiChemical, NeutralLoss, SequencePosition, -}; - -use std::borrow::Cow; - -/// A general trait to define amino acids. -pub trait IsAminoAcid { - /// The full name for this amino acid. - fn name(&self) -> Cow<'_, str>; - /// The three letter code for this amino acid. 
Or None if there is no common three letter - /// definition for this amino acid. - fn three_letter_code(&self) -> Option>; - /// The one letter code for this amino acid. Or None if there is no common single character - /// definition for this amino acid. - #[doc(alias = "code")] - fn one_letter_code(&self) -> Option; - /// The ProForma definition for this amino acid. If this is not a simple amino acid it can be - /// defined as an amino acid with an additional modification. For example `X[H9C2N2]` could be - /// used if Arginine was not defined as `R` in ProForma. - fn pro_forma_definition(&self) -> Cow<'_, str>; - /// The full molecular formula for this amino acid. It allows multiple molecular formulas to - /// allow ambiguous amino acids such as B and Z. - fn formulas(&self) -> Cow<'_, Multi>; - /// The monoisotopic mass of this amino acid. Should be redefined for better performance. - fn monoisotopic_mass(&self) -> Cow<'_, Multi> { - Cow::Owned( - self.formulas() - .iter() - .map(MolecularFormula::monoisotopic_mass) - .collect(), - ) - } - /// The average weight of this amino acid. Should be redefined for better performance. - fn average_weight(&self) -> Cow<'_, Multi> { - Cow::Owned( - self.formulas() - .iter() - .map(MolecularFormula::average_weight) - .collect(), - ) - } - /// The mass with a given mass mode for this amino acid. Should be redefined for better performance. - fn mass(&self, mode: MassMode) -> Cow<'_, Multi> { - Cow::Owned(self.formulas().iter().map(|f| f.mass(mode)).collect()) - } - /// The molecular formula of the side chain of the amino acid. - fn side_chain(&self) -> Cow<'_, Multi>; - /// The molecular formulas that can fragment for satellite ions (d and w). Commonly the fragment - /// after the second carbon into the side chain. `MolecularFormula::default()` can be returned - /// if no satellite ions are possible. - fn satellite_ion_fragments(&self) -> Option>>; - /// Common neutral losses for the immonium ion of this amino acid. 
- fn immonium_losses(&self) -> Cow<'_, [NeutralLoss]>; -} - -include!("shared/aminoacid.rs"); - -impl AminoAcid { - /// All amino acids with a unique mass (no I/L in favour of J, no B, no Z, and no X) - pub const UNIQUE_MASS_AMINO_ACIDS: &'static [Self] = &[ - Self::Glycine, - Self::Alanine, - Self::Arginine, - Self::Asparagine, - Self::AsparticAcid, - Self::Cysteine, - Self::Glutamine, - Self::GlutamicAcid, - Self::Histidine, - Self::AmbiguousLeucine, - Self::Lysine, - Self::Methionine, - Self::Phenylalanine, - Self::Proline, - Self::Serine, - Self::Threonine, - Self::Tryptophan, - Self::Tyrosine, - Self::Valine, - Self::Selenocysteine, - Self::Pyrrolysine, - ]; - - /// All 20 canonical amino acids - pub const CANONICAL_AMINO_ACIDS: &'static [Self] = &[ - Self::Glycine, - Self::Alanine, - Self::Arginine, - Self::Asparagine, - Self::AsparticAcid, - Self::Cysteine, - Self::Glutamine, - Self::GlutamicAcid, - Self::Histidine, - Self::Leucine, - Self::Isoleucine, - Self::Lysine, - Self::Methionine, - Self::Phenylalanine, - Self::Proline, - Self::Serine, - Self::Threonine, - Self::Tryptophan, - Self::Tyrosine, - Self::Valine, - ]; - - // TODO: Take side chain mutations into account (maybe define pyrrolysine as a mutation) - /// # Panics - /// When the sequence index is terminal. 
- pub(crate) fn satellite_ion_fragments( - self, - sequence_index: SequencePosition, - peptidoform_index: usize, - ) -> Multi { - let crate::SequencePosition::Index(sequence_index) = sequence_index else { - panic!("Not allowed to call satellite ion fragments with a terminal sequence index") - }; - match self { - Self::Alanine - | Self::Glycine - | Self::Histidine - | Self::Phenylalanine - | Self::Proline - | Self::Tryptophan - | Self::Tyrosine - | Self::Unknown => Multi::default(), - Self::Arginine => molecular_formula!(H 9 C 2 N 2).into(), - Self::Asparagine => molecular_formula!(H 2 C 1 N 1 O 1).into(), - Self::AsparticAcid => molecular_formula!(H 1 C 1 O 2).into(), - Self::AmbiguousAsparagine => vec![ - molecular_formula!(H 2 C 1 N 1 O 1 (crate::AmbiguousLabel::AminoAcid{option: Self::Asparagine, sequence_index, peptidoform_index})), - molecular_formula!(H 1 C 1 O 2 (crate::AmbiguousLabel::AminoAcid{option: Self::AsparticAcid, sequence_index, peptidoform_index})), - ] - .into(), - Self::Cysteine => molecular_formula!(H 1 S 1).into(), - Self::Glutamine => molecular_formula!(H 4 C 2 N 1 O 1).into(), - Self::GlutamicAcid => molecular_formula!(H 3 C 2 O 2).into(), - Self::AmbiguousGlutamine => vec![ - molecular_formula!(H 4 C 2 N 1 O 1 (crate::AmbiguousLabel::AminoAcid{option: Self::Glutamine, sequence_index, peptidoform_index})), - molecular_formula!(H 3 C 2 O 2 (crate::AmbiguousLabel::AminoAcid{option: Self::GlutamicAcid, sequence_index, peptidoform_index})), - ] - .into(), - Self::Isoleucine => vec![ - molecular_formula!(H 3 C 1), - molecular_formula!(H 5 C 2), - ] - .into(), - Self::Leucine => molecular_formula!(H 7 C 3).into(), - Self::AmbiguousLeucine => vec![ - molecular_formula!(H 3 C 1 (crate::AmbiguousLabel::AminoAcid{option: Self::Isoleucine, sequence_index, peptidoform_index})), - molecular_formula!(H 5 C 2 (crate::AmbiguousLabel::AminoAcid{option: Self::Isoleucine, sequence_index, peptidoform_index})), - molecular_formula!(H 7 C 3 
(crate::AmbiguousLabel::AminoAcid{option: Self::Leucine, sequence_index, peptidoform_index})), - ] - .into(), - Self::Lysine => molecular_formula!(H 8 C 3 N 1).into(), - Self::Methionine => molecular_formula!(H 5 C 2 S 1).into(), - Self::Pyrrolysine => molecular_formula!(H 15 C 9 N 2 O 1).into(), - Self::Selenocysteine => molecular_formula!(Se 1).into(), - Self::Serine => molecular_formula!(H 1 O 1).into(), - Self::Threonine => vec![ - molecular_formula!(H 1 O 1), - molecular_formula!(H 3 C 1), - ] - .into(), - Self::Valine => molecular_formula!(H 3 C 1).into(), // Technically two options, but both have the same mass - } - } - - /// All losses from the base immonium ions. Compiled from the sources below. - /// - /// | AA | [Wikipedia](https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Amino_acid_fragment_ions.png/400px-Amino_acid_fragment_ions.png) | 0.1016/1044-0305(93)87006-X | [ionsource](https://www.ionsource.com/Card/immon/immon.htm) | [10.1002/chin.199624319](http://dx.doi.org/10.1002/chin.199624319) | [Prospector (MS-Comp)](https://prospector.ucsf.edu/prospector/cgi-bin/msform.cgi?form=mscomp) | [10.1186/1477-5956-9-2](http://dx.doi.org/10.1186/1477-5956-9-2) | 10.1016/j.ymeth.2004.08.013 | 10.1385/1597452750 (table 5) | 10.1021/ac902712f | [Prospector (MS-Product)](https://prospector.ucsf.edu/prospector/cgi-bin/msform.cgi?form=msproduct) | [ThermoFisher](https://tools.thermofisher.com/content/sfs/brochures/cms_040030.pdf) | 10.1074/mcp.O113.035915 | 10.1074/mcp.O113.035915 | 10.1021/ac902712f | [Prospector (MS-Product)](https://prospector.ucsf.edu/prospector/cgi-bin/msform.cgi?form=msproduct) | | 10.1385/1597452750 (table 5) | Sources | Best mass | Best formula | Loss | Loss formula | Interpreted loss | Interpreted formula | Final | - /// 
|-------|---------------------------------------------------------------------------------------------------------------------------|-----------------------------|------------------------------------------------|------------------------------------------|-----------------------------------------------------------------------|-----------------------------------------|-----------------------------|------------------------------|-------------------|--------------------------------------------------------------------------|---------------------------------------------------------------------|-------------------------|-------------------------|-------------------|--------------------------------------------------------------------------|------------------------------|---------|----------:|--------------|----------|--------------|------------------|-------------------------|------------| - /// | A | 44 | 44 | | 44 | | 44 | | 44.05 | | | 44.0500 | | | | | | 6 | 44.0500 | | | | | | | - /// | R | 129 | 129 | | 129 | | 129 | | 129.11 | | | 129.1140 | 129.1135 | C5H13N4+ | | | | 8 | 129.1138 | C5H13N4+ | | | | | | - /// | | | | 185 | | 185 | | | | | | | | | | | | 2 | 185 | | -55.8862 | | C-2O-2 | | C-2O-2 | - /// | | | | | | | | | 115.09 | | | | | | | | | 1 | 115.09 | | 14.0238 | | C1H2 | | C1H2 | - /// | | 112 | 112 | 112 | 112 | 112 | 112 | 112.09 | 112.09 | | 112.0869 | 112.0875 | | | | C5H10N3+ | C5H10N3+ | 12 | 112.0872 | C5H10N3+ | 17.0266 | H3N1 | | | H3N1 | - /// | | 100 | 100 | 100 | 100 | 100 | 100 | 100.09 | | | 100.0869 | 100.0875 | | | | C4H10N3+ | | 10 | 100.0872 | C4H10N3+ | 29.0266 | C1H3N1 | | | C1H3N1 | - /// | | 87 | 87 | 87 | 87 | 87 | 87 | 87.09 | 87.09 | 87.0922 | 87.0917 | | | | C4H11N2+ | C4H11N2+ | | 12 | 87.0920 | C4H11N2+ | 42.0218 | C2H2N2 | | | C2H2N2 | - /// | | 73 | 73 | 73 | | 73 | 72 | 73.00 | | 73.0640 | | | | | C2H7N3+ | | | 8 | 73.0640 | C2H7N3+ | 56.0498 | C3H6N1 | | | C3H6N1 | - /// | | 70 | 70 | 70 | 70 | 70 | 70 | 70.07 | 70.07 | 
70.0657 | 70.0651 | 70.0657 | | | C4H8N1+ | C4H8N1+ | | 13 | 70.0655 | C4H8N1+ | 59.0483 | C1H5N3 | | | C1H5N3 | - /// | | | | | | | | | 60.06 | | | | | | | | | 1 | 60.06 | | 69.0538 | | C3H4N2O-1 | C2H6N1O1+ | C3H4N2O-1 | - /// | | | 59 | | | | 59 | | | 59.0483 | | | | | CH5N3+ | | | 4 | 59.0483 | CH5N3+ | 70.0655 | C4H8N1 | | | C4H8N1 | - /// | | | | | | | | | | 43.0296 | | | | | C1H3N2+ | | | 2 | 43.0296 | C1H3N2+ | 86.0842 | C4H10N2 | | | C4H10N2 | - /// | | 29 | | | | | | | | | | | | | | | | 1 | 29 | | 100.1138 | | | H1N2/C1H1O1/C1H3N1/C2H5 | | - /// | N | 87 | 87 | 87 | 87 | 87 | 87 | 87.09 | 87.06 | | 87.0553 | 87.0558 | | | | C3H7N2O1+ | | 11 | 87.0556 | C3H7N2O1+ | | | | | | - /// | | 70 | 70 | 70 | 70 | | 70 | | 70.03 | | | 70.0293 | | | | | C3H4N1O1+ | 8 | 70.0293 | C3H4N1O1+ | 17.0263 | H3N1 | | | H3N1 | - /// | D | 88 | 88 | 88 | 88 | 88 | 88 | 88.04 | 88.04 | 88.0399 | 88.0393 | 88.0399 | | | C3H6N1O2+ | C3H6N1O2+ | | 13 | 88.0397 | C3H6N1O2+ | | | | | | - /// | | 70 | | 70 | 70 | | 70 | | 70.03 | | | 70.0293 | | | | | C3H4N1O1+ | 7 | 70.0293 | C3H4N1O1+ | 18.0104 | H2O1 | | | H2O1 | - /// | C | 76 | | | 76 | | 76 | | | | | 76.0221 | | | | | | 4 | 76.0221 | | | | | | | - /// | E | 102 | 102 | | 102 | 102 | 102 | 102.06 | 102.05 | 102.0555 | 102.0550 | 102.0555 | 102.0550 | C4H8N1O2+ | C4H8N1O2+ | C4H8N1O2+ | | 14 | 102.0553 | C4H8N1O2+ | | | | | | - /// | | | | | 91 | | | | | | | | | | | | | 1 | 91 | | 11.0553 | | | | | - /// | | | | | 84 | | | | 84.04 | | | 84.0449 | | | | | C4H6N1O1+ | 4 | 84.0449 | C4H6N1O1+ | 18.0104 | H2O1 | | | H2O1 | - /// | Q | 101 | 101 | 101 | 101 | 101 | 101 | 101.11 | 101.11 | | 101.0709 | 101.0715 | 101.0709 | C4H9N2O1+ | | C4H9N2O1+ | | 13 | 101.0711 | C4H9N2O1+ | | | | | | - /// | | 129 | 129 | 129 | 129 | 129 | 129 | 129.1 | 129.11 | | 129.0659 | 129.1028 | | | | C5H9N2O2+ | | 11 | 129.0844 | C5H9N2O2+ | -28.0133 | C-1O-1 | | | C-1O-1 | - /// | | 84 | 84 | 84 | 84 | 84 | 84 | 84.08 | 84.04 | 84.0813 | 84.0444 | 84.0449 
| | | C5H10N1+ | C4H6N1O1+ | C4H6N1O1+ | 14 | 84.0569 | C5H10N1+ | 17.0142 | H3N1 | | | H3N1 | - /// | | 56 | | | 56 | | 56 | | 56.05 | | | 56.0500 | | | | | | 5 | 56.0500 | | 45.0211 | | C1H3N1O1 | | C1H3N1O1 | - /// | G | 30 | 30 | | 30 | | 30 | | 30.03 | 30.0344 | | 30.0344 | | | C1H4N1+ | | | 8 | 30.0344 | C1H4N1+ | | | | | | - /// | H | 110 | 110 | 110 | 110 | 110 | 110 | 110.07 | 110.07 | 110.0718 | 110.0713 | 110.0718 | 110.0713 | C5H8N3+ | C5H8N3+ | C5H8N3+ | | 15 | 110.0716 | C5H8N3+ | | | | | | - /// | | 166 | 166 | | | | 166 | | | | | | | | | | | 3 | 166 | | -55.9284 | | C-2O-2 | | C-2O-2 | - /// | | 138 | 138 | | | | 138 | 138.07 | | | 138.0662 | | | | | C6H8N3O1+ | | 6 | 138.0662 | C6H8N3O1+ | -27.9946 | C-1O-1 | | | C-1O-1 | - /// | | 123 | 123 | | | | 123 | | | | | | | | | | | 3 | 123 | | -12.9284 | | H3O-1 | | H3O-1 | - /// | | 121 | 121 | | | | 121 | | | | | | | | | | | 3 | 121 | | -10.9284 | | H5O-1 | | H5O-1 | - /// | | 82 | 82 | | | | 82 | | | 82.0531 | | | | | C4H6N2+ | | | 5 | 82.0531 | C4H6N2+ | 28.0185 | C1H2N1 | | | C1H2N1 | - /// | I/L/J | 86 | 86 | 86 | 86 | 86 | 86 | 86.1 | 86.10 | 86.0970 | 86.0964 | 86.0970 | | | C5H12N+ | C5H12N1+ | | 13 | 86.0968 | C5H12N+ | | | | | | - /// | | 72 | 72 | | 72 | | 72 | | | | | 72.0449 | | | | | | 5 | 72.0449 | | 14.0519 | | C1H2 | | C1H2 | - /// | | 44 | | | 44 | | 44 | | | | | 44.0500 | | | | | | 4 | 44.0500 | | 42.0468 | | C3H6 | | C3H6 | - /// | K | 101 | 101 | 101 | 101 | 101 | 101 | 101.11 | 101.11 | | 101.1073 | 101.1079 | | | | C5H13N2+ | | 11 | 101.1076 | C5H13N2+ | | | | | | - /// | | 129 | 129 | 129 | 129 | 129 | 129 | 129.1 | 129.11 | | 129.1022 | | 129.1022 | C6H13N2O1+ | | C6H13N2O1+ | | 12 | 129.1022 | C6H13N2O1+ | -27.9946 | C-1O-1 | | | C-1O-1 | - /// | | | | | | | | | | | 126.0913 | | | | | C7H12N1O1+ | | 2 | 126.0913 | C7H12N1O1+ | -24.9837 | C-2H1N1O-1 | | | C-2H1N1O-1 | - /// | | 112 | 112 | | 112 | | 112 | | | | | | | | | | | 4 | 112 | | -10.8924 | | H5O-1 | | H5O-1 | - /// | | 84 
| 84 | 84 | 84 | 84 | 84 | 84.08 | 84.08 | 84.0813 | 84.0808 | 84.0813 | | | C5H10N1+ | C5H10N1+ | C5H10N1+ | 14 | 84.0811 | C5H10N1+ | 17.0265 | H3N1 | | | H3N1 | - /// | | 70 | 70 | | | | 70 | | | | | | | | | | | 3 | 70 | | 31.1076 | | C1H5N1 | | C1H5N1 | - /// | | | | | | | | | 56.05 | | | 56.0500 | | | | | | 2 | 56.0500 | | 45.0576 | | C2H7N1 | | C2H7N1 | - /// | M | 104 | 104 | 104 | 104 | 104 | 104 | 104.05 | 104.06 | | 104.0528 | 104.0534 | | | | C4H10N1S1+ | | 11 | 104.0531 | C4H10N1S1+ | | | | | | - /// | | | | | | | 70 | | | | | | | | | | | | 70 | | 34.0531 | | H2S1 | | H2S1 | - /// | | 61 | 61 | | | | 61 | | | 61.0112 | | | | | C2H5S1+ | | | 5 | 61.0112 | C2H5S1+ | 43.0419 | C2H3N1 | | | C2H3N1 | - /// | | | | | | | | | | | | | | | | | C3H6N1+ | 1 | ?? | C3H6N1+ | ?? | C1H4S1 | | | C1H4S1 | - /// | F | 120 | 120 | 120 | 120 | 120 | 120 | 120.08 | 120.08 | 120.0813 | 120.0808 | 120.0813 | 120.0808 | C8H10N+ | C8H10N1+ | C8H10N1+ | | 15 | 120.0811 | C8H10N+ | | | | | | - /// | | 91 | 91 | | 91 | | 91 | | | | | 91.0548 | | | | | | 5 | 91.0548 | | 29.0263 | | C1H3N1 | C7H7+ | C1H3N1 | - /// | P | 70 | 70 | 70 | 70 | 70 | 70 | 70.07 | 70.07 | 70.0657 | 70.0651 | 70.0657 | | | C4H8N1+ | C4H8N1+ | | 13 | 70.0655 | C4H8N1+ | | | | | | - /// | | | | 126 | | 126 | | 126.06 | | | 126.055 | | | | | C6H8N1O2+ | | 5 | 126.0550 | C6H8N1O2+ | -55.9895 | C-2O-2 | | | C-2O-2 | - /// | S | 60 | 60 | 60 | 60 | 60 | 60 | 60.04 | 60.04 | | 60.0444 | 60.0449 | | | | C2H6N1O1+ | | 11 | 60.0447 | C2H6N1O1+ | | | | | | - /// | | | | | | | | | | | | | | | | | C2H4N1+ | 1 | ?? | C2H4N1+ | ?? | H2O1 | | | H2O1 | - /// | T | 74 | 74 | 74 | 74 | 74 | 74 | | 74.06 | 74.0606 | 74.0600 | 74.0606 | | | C3H8N1O1+ | C3H8N1O1+ | | 12 | 74.0604 | C3H8N1O1+ | | | | | | - /// | | | | | | | | | | | | | | | | | C3H6N1O1+ | 1 | ?? | C3H6N1O1+ | ?? 
| H2N1 | | | H2N1 | - /// | W | 159 | 159 | 159 | 159 | 159 | 159 | 159.09 | 159.09 | | 159.0917 | 159.0922 | 159.0917 | C10H11N2+ | | C10H11N2+ | | 13 | 159.0919 | C10H11N2+ | | | | | | - /// | | | 171 | | | | 171 | | | | | | | | | | | 2 | 171 | | -11.9081 | | H4O-1 | | H4O-1 | - /// | | 170 | 170 | 170 | | 170 | 170 | | | | | | | | | | | 5 | 170 | | -10.9081 | | H5O-1 | | H5O-1 | - /// | | 132 | | | 132 | | 132 | | 132.08 | | | 132.0813 | | | | | | 5 | 132.0813 | | 27.0106 | | C1H1N1 | | C1H1N1 | - /// | | 130 | 130 | 130 | 130 | 130 | 130 | | 130.07 | | | 130.0657 | | | | | | 8 | 130.0657 | | 29.0262 | | C1H3N1 | | C1H3N1 | - /// | | 117 | 117 | 117 | 117 | 117 | 117 | | | | | 117.0578 | | | | | | 7 | 117.0578 | | 42.0341 | | C2H4N1 | | C2H4N1 | - /// | | 100 | | | | | | | | | | | | | | | | 1 | 100 | | 59.0919 | | C3H9N1/C2H7N2 | | | - /// | | | | | 77 | | 77 | | | | | 77.0391 | | | | | | 3 | 77.0391 | | 82.0528 | | C4H6N2 | C6H5 | C4H6N2 | - /// | | 11 | | | | | | | | | | | | | | | | 1 | 11 | | 148.0919 | | | | | - /// | Y | 136 | 136 | 136 | 136 | 136 | 136 | 136.08 | 136.08 | 136.0762 | 136.0757 | 136.0762 | 136.0757 | C8H10N1O1+ | C8H10N1O1+ | C8H10N1O1+ | | 15 | 136.0760 | C8H10N1O1+ | | | | | | - /// | | 107 | 107 | | 107 | | 107 | | | | | 107.0497 | | | | | | 5 | 107.0497 | | 29.0263 | | C1H3N1 | | C1H3N1 | - /// | | 91 | 91 | | 91 | | 91 | | | | | 91.0548 | | | | | | 5 | 91.0548 | | 45.0212 | | C1H3N1O1 | | C1H3N1O1 | - /// | | | | | | | | | | 55.0184 | | | | | C3H3O1+ | | | 2 | 55.0184 | C3H3O1+ | 81.0576 | C5H7N1 | | | C5H7N1 | - /// | V | 72 | 72 | 72 | 72 | 72 | 72 | 72.08 | 72.08 | 72.0813 | 72.0808 | 72.0813 | | | C4H10N1+ | C4H10N1+ | | 13 | 72.0811 | C4H10N1+ | | | | | | - /// | | 69 | | | 69 | | 69 | | | | | 69.0704 | | | | | | 4 | 69.0704 | | 3.0107 | | C1H1O-1 | | C1H1O-1 | - /// | | 55 | | | 55 | | 55 | | | | | 55.0548 | | | | | | 4 | 55.0548 | | 17.0263 | | H3N1 | | H3N1 | - /// | | 44 | | | | | | | | | | | | | | | | 1 | 44 | | 28.0811 | | 
C1H2N1 | | C1H2N1 | - /// | | | | | 41 | | 41 | | | | | 41.0391 | | | | | | 3 | 41.0391 | | 31.0420 | | C1H5N1 | | C1H5N1 | - fn immonium_losses(self) -> Vec { - // TODO: For B/Z there are common immonium ions, but the mass is the same (meaning the loss is different), find a way of representing that - match self { - Self::Arginine => vec![ - NeutralLoss::Gain(molecular_formula!(C 2 O 2)), - NeutralLoss::Loss(molecular_formula!(C 1 H 2)), - NeutralLoss::Loss(molecular_formula!(H 3 N 1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1)), - NeutralLoss::Loss(molecular_formula!(C 2 H 2 N 2)), - NeutralLoss::Loss(molecular_formula!(C 3 H 6 N 2)), - NeutralLoss::Loss(molecular_formula!(C 1 H 5 N 3)), - NeutralLoss::Loss(molecular_formula!(C 3 H 4 N 2 O -1)), - NeutralLoss::Loss(molecular_formula!(C 4 H 8 N 1)), - NeutralLoss::Loss(molecular_formula!(C 4 H 10 N 2)), - ], - Self::Asparagine => vec![NeutralLoss::Loss(molecular_formula!(H 3 N 1))], - Self::AsparticAcid | Self::GlutamicAcid | Self::Serine => { - vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))] - } - Self::Glutamine => vec![ - NeutralLoss::Gain(molecular_formula!(C 1 O 1)), - NeutralLoss::Loss(molecular_formula!(H 3 N 1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1 O 1)), - ], - Self::Histidine => vec![ - NeutralLoss::Gain(molecular_formula!(C 2 O 2)), - NeutralLoss::Gain(molecular_formula!(C 1 O 1)), - NeutralLoss::Loss(molecular_formula!(H 3 O -1)), - NeutralLoss::Loss(molecular_formula!(H 5 O -1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 2 N 1)), - ], - Self::Leucine | Self::Isoleucine | Self::AmbiguousLeucine => vec![ - NeutralLoss::Loss(molecular_formula!(C 1 H 2)), - NeutralLoss::Loss(molecular_formula!(C 3 H 6)), - ], - Self::Lysine => vec![ - NeutralLoss::Gain(molecular_formula!(C 1 O 1)), - NeutralLoss::Loss(molecular_formula!(C -2 H 1 N 1 O -1)), - NeutralLoss::Loss(molecular_formula!(H 5 O -1)), - NeutralLoss::Loss(molecular_formula!(H 3 N 1)), - 
NeutralLoss::Loss(molecular_formula!(C 1 H 5 N 1)), - NeutralLoss::Loss(molecular_formula!(C 2 H 7 N 1)), - ], - Self::Methionine => vec![ - NeutralLoss::Loss(molecular_formula!(H 2 S 1)), - NeutralLoss::Loss(molecular_formula!(C 2 H 3 N 1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 4 S 1)), - ], - Self::Phenylalanine => vec![NeutralLoss::Gain(molecular_formula!(C 2 O 2))], - Self::Threonine => vec![NeutralLoss::Loss(molecular_formula!(H 2 N 1))], - Self::Tryptophan => vec![ - NeutralLoss::Loss(molecular_formula!(H 4 O -1)), - NeutralLoss::Loss(molecular_formula!(H 5 O -1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 1 N 1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1)), - NeutralLoss::Loss(molecular_formula!(C 2 H 4 N 1)), - NeutralLoss::Loss(molecular_formula!(C 4 H 6 N 2)), - ], - Self::Tyrosine => vec![ - NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1 O 1)), - NeutralLoss::Loss(molecular_formula!(C 5 H 7 N 1)), - ], - Self::Valine => vec![ - NeutralLoss::Loss(molecular_formula!(C 1 H 1 O -1)), - NeutralLoss::Loss(molecular_formula!(H 3 N 1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 2 N 1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 5 N 1)), - ], - _ => Vec::new(), - } - } - - // TODO: generalise over used storage type, so using molecularformula, monoisotopic mass, or average mass, also make sure that AAs can return these numbers in a const fashion - #[expect(clippy::too_many_lines, clippy::too_many_arguments)] - pub(crate) fn fragments( - self, - n_term: &Multi, - c_term: &Multi, - modifications: &Multi, - charge_carriers: &mut CachedCharge, - sequence_index: SequencePosition, - sequence_length: usize, - ions: &PossibleIons, - peptidoform_ion_index: usize, - peptidoform_index: usize, - allow_terminal: (bool, bool), - ) -> Vec { - let mut base_fragments = Vec::with_capacity(ions.size_upper_bound()); - let n_pos = PeptidePosition::n(sequence_index, sequence_length); - let c_pos = 
PeptidePosition::c(sequence_index, sequence_length); - - if ions.a.0 && allow_terminal.0 { - base_fragments.extend(Fragment::generate_all( - &(self.formulas_inner(sequence_index, peptidoform_index) - * (modifications - molecular_formula!(H 1 C 1 O 1))), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::a(n_pos), - n_term, - ions.a.1, - charge_carriers, - ions.a.2, - )); - } - if ions.b.0 && allow_terminal.0 { - base_fragments.extend(Fragment::generate_all( - &(self.formulas_inner(sequence_index, peptidoform_index) - * (modifications - molecular_formula!(H 1))), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::b(n_pos), - n_term, - ions.b.1, - charge_carriers, - ions.b.2, - )); - } - if ions.c.0 && allow_terminal.0 { - base_fragments.extend(Fragment::generate_all( - &(self.formulas_inner(sequence_index, peptidoform_index) - * (modifications + molecular_formula!(H 2 N 1))), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::c(n_pos), - n_term, - ions.c.1, - charge_carriers, - ions.c.2, - )); - } - if ions.d.0 && allow_terminal.0 { - base_fragments.extend(Fragment::generate_all( - &(-self.satellite_ion_fragments(sequence_index, peptidoform_index) - * modifications - * self.formulas_inner(sequence_index, peptidoform_index) - + molecular_formula!(H 1 C 1 O 1)), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::d(n_pos), - n_term, - ions.d.1, - charge_carriers, - ions.d.2, - )); - } - if ions.v.0 && allow_terminal.1 { - base_fragments.extend(Fragment::generate_all( - &molecular_formula!(H 3 C 2 N 1 O 1).into(), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::v(c_pos), - c_term, - ions.v.1, - charge_carriers, - ions.v.2, - )); - } - if ions.w.0 && allow_terminal.1 { - base_fragments.extend(Fragment::generate_all( - &(-self.satellite_ion_fragments(sequence_index, peptidoform_index) - * modifications - * self.formulas_inner(sequence_index, peptidoform_index) - + molecular_formula!(H 2 N 1)), - 
peptidoform_ion_index, - peptidoform_index, - &FragmentType::w(c_pos), - c_term, - ions.w.1, - charge_carriers, - ions.w.2, - )); - } - if ions.x.0 && allow_terminal.1 { - base_fragments.extend(Fragment::generate_all( - &(self.formulas_inner(sequence_index, peptidoform_index) - * (modifications + molecular_formula!(C 1 O 1) - molecular_formula!(H 1))), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::x(c_pos), - c_term, - ions.x.1, - charge_carriers, - ions.x.2, - )); - } - if ions.y.0 && allow_terminal.1 { - base_fragments.extend(Fragment::generate_all( - &(self.formulas_inner(sequence_index, peptidoform_index) - * (modifications + molecular_formula!(H 1))), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::y(c_pos), - c_term, - ions.y.1, - charge_carriers, - ions.y.2, - )); - } - if ions.z.0 && allow_terminal.1 { - base_fragments.extend(Fragment::generate_all( - &(self.formulas_inner(sequence_index, peptidoform_index) - * (modifications - molecular_formula!(H 2 N 1))), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::z(c_pos), - c_term, - ions.z.1, - charge_carriers, - ions.z.2, - )); - base_fragments.extend(Fragment::generate_all( - &(self.formulas_inner(sequence_index, peptidoform_index) - * (modifications - molecular_formula!(H 1 N 1))), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::zยท(c_pos), - c_term, - ions.z.1, - charge_carriers, - ions.z.2, - )); - } - - if ions.immonium.0 && allow_terminal.0 && allow_terminal.1 { - base_fragments.extend(Fragment::generate_all( - &(self.formulas_inner(sequence_index, peptidoform_index) - * (modifications - molecular_formula!(C 1 O 1))), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::Immonium(n_pos, self.into()), // TODO: get the actual sequenceelement here - &Multi::default(), - self.immonium_losses().as_slice(), - charge_carriers, - ions.immonium.1, - )); - } - base_fragments - } - - /// Get the single letter representation of the amino acid - pub 
const fn char(self) -> char { - match self { - Self::Alanine => 'A', - Self::AmbiguousAsparagine => 'B', - Self::Cysteine => 'C', - Self::AsparticAcid => 'D', - Self::GlutamicAcid => 'E', - Self::Phenylalanine => 'F', - Self::Glycine => 'G', - Self::Histidine => 'H', - Self::Isoleucine => 'I', - Self::AmbiguousLeucine => 'J', - Self::Lysine => 'K', - Self::Leucine => 'L', - Self::Methionine => 'M', - Self::Asparagine => 'N', - Self::Pyrrolysine => 'O', - Self::Proline => 'P', - Self::Glutamine => 'Q', - Self::Arginine => 'R', - Self::Serine => 'S', - Self::Threonine => 'T', - Self::Selenocysteine => 'U', - Self::Valine => 'V', - Self::Tryptophan => 'W', - Self::Unknown => 'X', - Self::Tyrosine => 'Y', - Self::AmbiguousGlutamine => 'Z', - } - } - - /// Get the 3 letter code for the amino acid - pub const fn code(self) -> &'static str { - match self { - Self::Alanine => "Ala", - Self::AmbiguousAsparagine => "Asx", - Self::Cysteine => "Cys", - Self::AsparticAcid => "Asp", - Self::GlutamicAcid => "Glu", - Self::Phenylalanine => "Phe", - Self::Glycine => "Gly", - Self::Histidine => "His", - Self::Isoleucine => "Ile", - Self::AmbiguousLeucine => "Xle", - Self::Lysine => "Lys", - Self::Leucine => "Leu", - Self::Methionine => "Met", - Self::Asparagine => "Asn", - Self::Pyrrolysine => "Pyl", - Self::Proline => "Pro", - Self::Glutamine => "Gln", - Self::Arginine => "Arg", - Self::Serine => "Ser", - Self::Threonine => "Thr", - Self::Selenocysteine => "Sec", - Self::Valine => "Val", - Self::Tryptophan => "Trp", - Self::Unknown => "Xaa", - Self::Tyrosine => "Tyr", - Self::AmbiguousGlutamine => "Glx", - } - } - - /// Get the full name for the amino acid - pub const fn name(self) -> &'static str { - match self { - Self::Alanine => "Alanine", - Self::AmbiguousAsparagine => "AmbiguousAsparagine", - Self::Cysteine => "Cysteine", - Self::AsparticAcid => "AsparticAcid", - Self::GlutamicAcid => "GlutamicAcid", - Self::Phenylalanine => "Phenylalanine", - Self::Glycine => "Glycine", - 
Self::Histidine => "Histidine", - Self::Isoleucine => "Isoleucine", - Self::AmbiguousLeucine => "AmbiguousLeucine", - Self::Lysine => "Lysine", - Self::Leucine => "Leucine", - Self::Methionine => "Methionine", - Self::Asparagine => "Asparagine", - Self::Pyrrolysine => "Pyrrolysine", - Self::Proline => "Proline", - Self::Glutamine => "Glutamine", - Self::Arginine => "Arginine", - Self::Serine => "Serine", - Self::Threonine => "Threonine", - Self::Selenocysteine => "Selenocysteine", - Self::Valine => "Valine", - Self::Tryptophan => "Tryptophan", - Self::Unknown => "Unknown", - Self::Tyrosine => "Tyrosine", - Self::AmbiguousGlutamine => "AmbiguousGlutamine", - } - } - - /// Check if two amino acids are considered identical. X is identical to anything, J to IL, B to ND, Z to EQ. - pub(crate) fn canonical_identical(self, rhs: Self) -> bool { - match (self, rhs) { - (a, b) if a == b => true, - (Self::Unknown, _) - | (_, Self::Unknown) - | (Self::AmbiguousLeucine, Self::Leucine | Self::Isoleucine) - | (Self::Leucine | Self::Isoleucine, Self::AmbiguousLeucine) - | (Self::AmbiguousAsparagine, Self::Asparagine | Self::AsparticAcid) - | (Self::Asparagine | Self::AsparticAcid, Self::AmbiguousAsparagine) - | (Self::AmbiguousGlutamine, Self::Glutamine | Self::GlutamicAcid) - | (Self::Glutamine | Self::GlutamicAcid, Self::AmbiguousGlutamine) => true, - _ => false, - } - } -} - -impl std::fmt::Display for AminoAcid { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.char()) - } -} - -#[cfg(test)] -#[expect(clippy::unreadable_literal, clippy::missing_panics_doc)] -mod tests { - use super::*; - - #[test] - fn mass() { - let weight_ala = AminoAcid::Alanine.formulas()[0].average_weight(); - let mass_ala = AminoAcid::Alanine.formulas()[0].monoisotopic_mass(); - assert_ne!(weight_ala, mass_ala); - assert!((weight_ala.value - 71.07793).abs() < 1e-5); - assert!((mass_ala.value - 71.037113783).abs() < 1e-5); - } - - #[test] - fn mass_lysine() { - 
let weight_lys = AminoAcid::Lysine.formulas()[0].average_weight(); - let mass_lys = AminoAcid::Lysine.formulas()[0].monoisotopic_mass(); - assert_ne!(weight_lys, mass_lys); - assert!((weight_lys.value - 128.17240999999999).abs() < 1e-5); - assert!((mass_lys.value - 128.094963010536).abs() < 1e-5); - } - - #[test] - fn masses() { - let known = &[ - ('A', 71.03711, 71.08), - ('R', 156.10111, 156.2), - ('N', 114.04293, 114.1), - ('D', 115.02694, 115.1), - ('C', 103.00919, 103.1), - ('E', 129.04259, 129.1), - ('Q', 128.05858, 128.1), - ('G', 57.02146, 57.05), - ('H', 137.05891, 137.1), - ('I', 113.08406, 113.2), - ('L', 113.08406, 113.2), - ('K', 128.09496, 128.2), - ('M', 131.04049, 131.2), - ('F', 147.06841, 147.2), - ('P', 97.05276, 97.12), - ('S', 87.03203, 87.08), - ('T', 101.04768, 101.1), - ('W', 186.07931, 186.2), - ('Y', 163.06333, 163.2), - ('V', 99.06841, 99.13), - ]; - - for (aa, mono_mass, average_weight) in known { - let aa = AminoAcid::try_from(*aa).unwrap(); - let (mono, weight) = ( - aa.formulas()[0].monoisotopic_mass().value, - aa.formulas()[0].average_weight().value, - ); - println!( - "{}: {} {} {} {}", - aa.char(), - mono, - mono_mass, - weight, - average_weight - ); - assert!((mono - *mono_mass).abs() < 1e-5); - assert!((weight - *average_weight).abs() < 1e-1); - } - } - - #[test] - fn read_aa() { - assert_eq!( - AminoAcid::try_from('B').unwrap(), - AminoAcid::AmbiguousAsparagine - ); - assert_eq!( - AminoAcid::try_from(b'B').unwrap(), - AminoAcid::AmbiguousAsparagine - ); - assert_eq!(AminoAcid::try_from('c'), Ok(AminoAcid::Cysteine)); - assert_eq!(AminoAcid::try_from('๐Ÿฆ€'), Err(())); - } -} diff --git a/rustyms/src/checked_aminoacid.rs b/rustyms/src/checked_aminoacid.rs index f709c828..7fd74c0c 100644 --- a/rustyms/src/checked_aminoacid.rs +++ b/rustyms/src/checked_aminoacid.rs @@ -3,7 +3,8 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use crate::{ - AminoAcid, Chemical, MolecularFormula, Multi, MultiChemical, 
SemiAmbiguous, UnAmbiguous, + fragment::SatelliteLabel, AminoAcid, Chemical, IsAminoAcid, MolecularFormula, Multi, + MultiChemical, SemiAmbiguous, UnAmbiguous, }; /// A checked amino acid. This wraps an [`AminoAcid`] to keep track of the maximal complexity of @@ -279,24 +280,44 @@ impl CheckedAminoAcid { self.aminoacid.canonical_identical(rhs.aminoacid) } - /// Get the description of the amino acid as a single character - pub const fn char(self) -> char { - self.aminoacid.char() + /// Get the underlying (unchecked) amino acid + pub const fn aminoacid(self) -> AminoAcid { + self.aminoacid + } +} + +impl IsAminoAcid for CheckedAminoAcid { + fn name(&self) -> std::borrow::Cow<'_, str> { + self.aminoacid.name() } - /// Get the 3 letter code for the amino acid - pub const fn code(self) -> &'static str { - self.aminoacid.code() + fn three_letter_code(&self) -> Option> { + self.aminoacid.three_letter_code() } - /// Get the full name of the amino acid - pub const fn name(self) -> &'static str { - self.aminoacid.name() + fn one_letter_code(&self) -> Option { + self.aminoacid.one_letter_code() } - /// Get the underlying (unchecked) amino acid - pub const fn aminoacid(self) -> AminoAcid { + fn pro_forma_definition(&self) -> std::borrow::Cow<'_, str> { + self.aminoacid.pro_forma_definition() + } + + fn satellite_ion_fragments( + &self, + sequence_index: crate::SequencePosition, + peptidoform_index: usize, + ) -> Option>> { self.aminoacid + .satellite_ion_fragments(sequence_index, peptidoform_index) + } + + fn side_chain( + &self, + sequence_index: crate::SequencePosition, + peptidoform_index: usize, + ) -> std::borrow::Cow<'_, Multi> { + self.aminoacid.side_chain(sequence_index, peptidoform_index) } } @@ -398,7 +419,7 @@ impl Default for CheckedAminoAcid { impl std::fmt::Display for CheckedAminoAcid { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.char()) + write!(f, "{}", self.pro_forma_definition()) } } diff --git 
a/rustyms/src/databases/gnome.dat b/rustyms/src/databases/gnome.dat index 7ddd75b6..f43397aa 100644 Binary files a/rustyms/src/databases/gnome.dat and b/rustyms/src/databases/gnome.dat differ diff --git a/rustyms/src/databases/xlmod.dat b/rustyms/src/databases/xlmod.dat index a35224ac..239d85cd 100644 Binary files a/rustyms/src/databases/xlmod.dat and b/rustyms/src/databases/xlmod.dat differ diff --git a/rustyms/src/element.rs b/rustyms/src/element.rs index 015d8def..451b6af0 100644 --- a/rustyms/src/element.rs +++ b/rustyms/src/element.rs @@ -82,7 +82,7 @@ impl Element { let mut max = None; for iso in &elemental_data()[self as usize - 1].2 { let chance = iso.2 * f64::from(n); - if max.map_or(true, |m: (Mass, f64)| chance > m.1) { + if max.is_none_or(|m: (Mass, f64)| chance > m.1) { max = Some((iso.1, chance)); } } diff --git a/rustyms/src/formula.rs b/rustyms/src/formula.rs index c85739f0..57719799 100644 --- a/rustyms/src/formula.rs +++ b/rustyms/src/formula.rs @@ -1,4 +1,5 @@ use crate::{ + fragment::GlycanPosition, system::{da, fraction, Mass, OrderedMass, Ratio}, MassMode, }; @@ -9,6 +10,7 @@ use std::fmt::Write; mod formula_shared; pub use formula_shared::*; +use itertools::Itertools; impl From<&MolecularFormula> for OrderedMass { /// Create an ordered mass from the monoisotopic mass (needed for [`Multi`](crate::Multi)) @@ -130,17 +132,33 @@ impl std::fmt::Display for AmbiguousLabel { option, sequence_index, peptidoform_index, - } => write!(f, "{option}@p{peptidoform_index}i{sequence_index}"), + } => write!( + f, + "{option}@p{}i{}", + peptidoform_index + 1, + sequence_index + 1 + ), Self::Modification { id, sequence_index, peptidoform_index, - } => write!(f, "\x23{id}@p{peptidoform_index}i{sequence_index}"), + } => write!(f, "\x23{id}@p{}i{}", peptidoform_index + 1, sequence_index), Self::ChargeCarrier(formula) => write!(f, "[{}]", formula.hill_notation()), Self::CrossLinkBound(name) => write!(f, "intact{name}"), Self::CrossLinkBroken(name, formula) => { 
write!(f, "broken{name}@{}", formula.hill_notation()) } + Self::GlycanFragment(bonds) => { + write!(f, "Y{}", bonds.iter().map(GlycanPosition::label).join("Y")) + } + Self::GlycanFragmentComposition(composition) => write!( + f, + "Y{}", + composition + .iter() + .map(|(sugar, amount)| format!("{sugar}{amount}")) + .join("") + ), } } } diff --git a/rustyms/src/fragment.rs b/rustyms/src/fragment.rs index b93b362e..f5d6b197 100644 --- a/rustyms/src/fragment.rs +++ b/rustyms/src/fragment.rs @@ -2,6 +2,7 @@ use std::{ borrow::Cow, + cmp::Ordering, fmt::{Debug, Display}, }; @@ -9,31 +10,33 @@ use itertools::Itertools; use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; +#[cfg(feature = "glycan-render")] +use crate::glycan::GlycanSelection; use crate::{ - glycan::MonoSaccharide, - model::ChargeRange, + glycan::{GlycanBranchIndex, GlycanBranchMassIndex, MonoSaccharide}, + model::{ChargeRange, PossiblePrimaryIons}, molecular_charge::{CachedCharge, MolecularCharge}, system::{ f64::{MassOverCharge, Ratio}, usize::Charge, OrderedMassOverCharge, }, - AmbiguousLabel, AminoAcid, Chemical, MassMode, Modification, MolecularFormula, Multi, - NeutralLoss, SemiAmbiguous, SequenceElement, SequencePosition, Tolerance, + AmbiguousLabel, AminoAcid, Chemical, IsAminoAcid, MassMode, Modification, MolecularFormula, + Multi, NeutralLoss, SemiAmbiguous, SequenceElement, SequencePosition, Tolerance, }; -/// A theoretical fragment of a peptide +/// A theoretical fragment #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Serialize, Deserialize, Default)] pub struct Fragment { /// The theoretical composition pub formula: Option, /// The charge pub charge: Charge, - /// All possible annotations for this fragment saved as a tuple of peptide index and its type + /// The annotation for this fragment pub ion: FragmentType, - /// The peptidoform this fragment comes from, saved as the index into the list of peptidoform in the overarching [`crate::CompoundPeptidoform`] struct 
+ /// The peptidoform this fragment comes from, saved as the index into the list of peptidoform in the overarching [`crate::CompoundPeptidoformIon`] struct pub peptidoform_ion_index: Option, - /// The peptide this fragment comes from, saved as the index into the list of peptides in the overarching [`crate::Peptidoform`] struct + /// The peptide this fragment comes from, saved as the index into the list of peptides in the overarching [`crate::PeptidoformIon`] struct pub peptidoform_index: Option, /// Any neutral losses applied pub neutral_loss: Vec, @@ -46,6 +49,175 @@ pub struct Fragment { } impl Fragment { + /// Write the fragment as an mzPAF string + pub fn to_mzPAF(&self) -> String { + let mut output = String::new(); + if self.auxiliary { + output.push('&'); + } + // Push the ion type info (plus maybe some neutral losses if needed) + output.push_str(&match &self.ion { + FragmentType::a(pos, variant) + | FragmentType::b(pos, variant) + | FragmentType::c(pos, variant) + | FragmentType::x(pos, variant) + | FragmentType::y(pos, variant) => format!( + "{}{}{}", + self.ion.kind(), + pos.series_number, + if *variant == 0 { + String::new() + } else { + format!("{variant:+}H") + } + ), + FragmentType::z(pos, variant) => format!( + "{}{}{}", + self.ion.kind(), + pos.series_number, + if *variant == 1 { + String::new() + } else { + format!("{:+}H", variant - 1) + } + ), + FragmentType::d(pos, _, distance, variant, label) => { + if *distance == 0 { + format!( + "d{label}{}{}", + pos.series_number, + if *variant == 0 { + String::new() + } else { + format!("{variant:+}H") + } + ) + } else { + format!( + "a{}-{}{}", + pos.series_number, + "C1", // TODO: get specific formula + if *variant == 0 { + String::new() + } else { + format!("{variant:+}H") + } + ) + } + } + FragmentType::v(pos, _, distance, variant) => { + if *distance == 0 { + format!( + "v{}{}", + pos.series_number, + if *variant == 0 { + String::new() + } else { + format!("{variant:+}H") + } + ) + } else { + format!( 
+ "y{}-{}{}", + pos.series_number, + "C1", // TODO: get specific formula + if *variant == 0 { + String::new() + } else { + format!("{variant:+}H") + } + ) + } + } + FragmentType::w(pos, _, distance, variant, label) => { + if *distance == 0 { + format!( + "w{label}{}{}", + pos.series_number, + if *variant == 0 { + String::new() + } else { + format!("{variant:+}H") + } + ) + } else { + format!( + "z{}-{}{}", + pos.series_number, + "C1", // TODO: get specific formula + if *variant == 0 { + String::new() + } else { + format!("{variant:+}H") + } + ) + } + } + FragmentType::Precursor => "p".to_string(), + FragmentType::PrecursorSideChainLoss(_, aa) => format!("p-r[sidechain_{aa}]"), + FragmentType::Immonium(_, seq) => format!( + "I{}{}", + seq.aminoacid, + seq.modifications.iter().map(|m| format!("[{m}]")).join("") // TODO: how to handle ambiguous mods? maybe store somewhere which where applied for this fragment + ), + FragmentType::Unknown(num) => { + format!("?{}", num.map_or(String::new(), |u| u.to_string())) + } + FragmentType::Diagnostic(_) + | FragmentType::B { .. } + | FragmentType::BComposition(_, _) + | FragmentType::Y(_) + | FragmentType::YComposition(_, _) => self + .formula + .as_ref() + .map_or_else(|| "?".to_string(), |f| format!("f{{{f}}}")), // TODO: better way of storing? 
+ FragmentType::Internal(name, a, b) => name.as_ref().map_or_else( + || format!("m{}:{}", a.sequence_index + 1, b.sequence_index + 1), + |name| { + format!( + "m{}:{}{}", + a.sequence_index + 1, + b.sequence_index + 1, + match name { + (BackboneNFragment::a, BackboneCFragment::x) + | (BackboneNFragment::b, BackboneCFragment::y) + | (BackboneNFragment::c, BackboneCFragment::z) => "", + (BackboneNFragment::a, BackboneCFragment::y) => "-CO", + (BackboneNFragment::a, BackboneCFragment::z) => "-CHNO", + (BackboneNFragment::b, BackboneCFragment::x) => "+CO", + (BackboneNFragment::b, BackboneCFragment::z) => "-NH", + (BackboneNFragment::c, BackboneCFragment::x) => "+CHNO", + (BackboneNFragment::c, BackboneCFragment::y) => "+NH", + } + ) + }, + ), + }); + // More losses + for loss in &self.neutral_loss { + output.push_str(&match loss { + NeutralLoss::SideChainLoss(_, aa) => format!("-r[sidechain_{aa}]"), + l => l.to_string(), + }); + } + // Isotopes: not handled + // Charge state + if self.charge.value != 1 { + output.push_str(&format!("^{}", self.charge.value)); + } + // Deviation + match self.deviation { + Some(Tolerance::Absolute(abs)) => output.push_str(&format!("/{}", abs.value)), + Some(Tolerance::Relative(ppm)) => output.push_str(&format!("/{}ppm", ppm.value)), + None => (), + } + // Confidence + if let Some(confidence) = self.confidence { + output.push_str(&format!("*{confidence}")); + } + output + } + /// Get the mz pub fn mz(&self, mode: MassMode) -> Option { self.formula.as_ref().map(|f| { @@ -96,7 +268,7 @@ impl Fragment { peptidoform_index: usize, annotation: &FragmentType, termini: &Multi, - neutral_losses: &[NeutralLoss], + neutral_losses: &[Vec], charge_carriers: &mut CachedCharge, charge_range: ChargeRange, ) -> Vec { @@ -105,17 +277,60 @@ impl Fragment { .cartesian_product(theoretical_mass.iter()) .cartesian_product(charge_carriers.range(charge_range)) .cartesian_product(std::iter::once(None).chain(neutral_losses.iter().map(Some))) - .map(|(((term, mass), 
charge), loss)| Self { + .map(|(((term, mass), charge), losses)| Self { formula: Some( term + mass + charge.formula_inner(SequencePosition::default(), peptidoform_index) - + loss.unwrap_or(&NeutralLoss::Gain(MolecularFormula::default())), + + losses + .iter() + .flat_map(|l| l.iter()) + .sum::(), ), charge: Charge::new::(charge.charge().value.try_into().unwrap()), ion: annotation.clone(), peptidoform_ion_index: Some(peptidoform_ion_index), peptidoform_index: Some(peptidoform_index), - neutral_loss: loss.map(|l| vec![l.clone()]).unwrap_or_default(), + neutral_loss: losses.map(Clone::clone).unwrap_or_default(), + deviation: None, + confidence: None, + auxiliary: false, + }) + .collect() + } + /// Generate a list of possible fragments from the list of possible preceding termini and neutral losses + /// # Panics + /// When the charge range results in a negative charge + #[must_use] + pub fn generate_series( + theoretical_mass: &Multi, + peptidoform_ion_index: usize, + peptidoform_index: usize, + annotation: &FragmentType, + termini: &Multi, + charge_carriers: &mut CachedCharge, + settings: &PossiblePrimaryIons, + ) -> Vec { + termini + .iter() + .cartesian_product(theoretical_mass.iter()) + .cartesian_product(charge_carriers.range(settings.1)) + .cartesian_product(std::iter::once(None).chain(settings.0.iter().map(Some))) + .cartesian_product(settings.2.iter()) + .map(|((((term, mass), charge), losses), variant)| Self { + formula: Some( + term + mass + + charge.formula_inner(SequencePosition::default(), peptidoform_index) + + losses + .iter() + .flat_map(|l| l.iter()) + .sum::() + + molecular_formula!(H 1) * variant, + ), + charge: Charge::new::(charge.charge().value.try_into().unwrap()), + ion: annotation.with_variant(*variant), + peptidoform_ion_index: Some(peptidoform_ion_index), + peptidoform_index: Some(peptidoform_index), + neutral_loss: losses.map(|l| l.clone()).unwrap_or_default(), deviation: None, confidence: None, auxiliary: false, @@ -265,18 +480,7 @@ impl 
PeptidePosition { } } -/// The definition of the position of an ion inside a glycan -#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] -pub struct GlycanPosition { - /// The depth starting at the amino acid - pub inner_depth: usize, - /// The series number (from the ion series terminal) - pub series_number: usize, - /// The branch naming - pub branch: Vec, - /// The aminoacid index where this glycan is attached - pub attachment: Option<(AminoAcid, usize)>, -} +include!("shared/glycan_position.rs"); impl GlycanPosition { /// Get the branch names @@ -286,7 +490,7 @@ impl GlycanPosition { self.branch .iter() .enumerate() - .map(|(i, b)| { + .map(|(i, (_, b))| { if i == 0 { char::from_u32( (0x03B1..=0x03C9) @@ -324,7 +528,7 @@ pub enum DiagnosticPosition { /// A position on a glycan Glycan(GlycanPosition, MonoSaccharide), /// A position on a compositional glycan (attachment AA + sequence index + the sugar) - GlycanCompositional(MonoSaccharide, Option<(AminoAcid, usize)>), + GlycanCompositional(MonoSaccharide, Option<(AminoAcid, SequencePosition)>), /// A position on a peptide Peptide(PeptidePosition, AminoAcid), /// Labile modification @@ -333,34 +537,60 @@ pub enum DiagnosticPosition { Reporter, } +/// A label for a satellite ion, none for most amino acids but a or b for Thr and Ile +#[derive( + Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize, Default, +)] +pub enum SatelliteLabel { + /// No label needed + #[default] + None, + /// Heaviest of the two options + A, + /// Lightest of the two options + B, +} + +impl std::fmt::Display for SatelliteLabel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + match self { + Self::None => "", + Self::A => "a", + Self::B => "b", + } + ) + } +} + /// The possible types of fragments -#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize, Default)] +#[derive(Clone, Eq, PartialEq, Hash, 
Debug, Serialize, Deserialize, Default)] #[expect(non_camel_case_types)] pub enum FragmentType { /// a - a(PeptidePosition), + a(PeptidePosition, i8), /// b - b(PeptidePosition), + b(PeptidePosition, i8), /// c - c(PeptidePosition), - /// d - d(PeptidePosition), + c(PeptidePosition, i8), + /// d a position, originating amino acid, distance from a break, variant, + d(PeptidePosition, AminoAcid, u8, i8, SatelliteLabel), /// v - v(PeptidePosition), + v(PeptidePosition, AminoAcid, u8, i8), /// w - w(PeptidePosition), + w(PeptidePosition, AminoAcid, u8, i8, SatelliteLabel), /// x - x(PeptidePosition), + x(PeptidePosition, i8), /// y - y(PeptidePosition), + y(PeptidePosition, i8), /// z - z(PeptidePosition), - /// zยท - zยท(PeptidePosition), + z(PeptidePosition, i8), // glycan A fragment (Never generated) //A(GlycanPosition), /// glycan B fragment - B(GlycanPosition), + // B(GlycanPosition), // glycan C fragment (Never generated) //C(GlycanPosition), // glycan X fragment (Never generated) @@ -369,12 +599,25 @@ pub enum FragmentType { Y(Vec), // glycan Z fragment (Never generated) // Z(GlycanPosition), - /// Internal glycan fragment, meaning both a B and Y breakages (and potentially multiple of both), resulting in a set of monosaccharides - Oxonium(Vec), + /// B glycan fragment, potentially with additional Y breakages + B { + /// The root break + b: GlycanPosition, + /// The branch breakages + y: Vec, + /// All branches that are not broken + end: Vec, + }, /// A B or internal glycan fragment for a glycan where only the composition is known, also saves the attachment (AA + sequence index) - OxoniumComposition(Vec<(MonoSaccharide, isize)>, Option<(AminoAcid, usize)>), + BComposition( + Vec<(MonoSaccharide, isize)>, + Option<(AminoAcid, SequencePosition)>, + ), /// A B or internal glycan fragment for a glycan where only the composition is known, also saves the attachment (AA + sequence index) - YComposition(Vec<(MonoSaccharide, isize)>, Option<(AminoAcid, usize)>), + 
YComposition( + Vec<(MonoSaccharide, isize)>, + Option<(AminoAcid, SequencePosition)>, + ), /// Immonium ion Immonium(PeptidePosition, SequenceElement), /// Precursor with amino acid side chain loss @@ -394,20 +637,134 @@ pub enum FragmentType { Precursor, } +impl std::cmp::Ord for FragmentType { + fn cmp(&self, other: &Self) -> Ordering { + // Sort of type first (precursor/abcxyz/dw/v) + match (self, other) { + // Peptide + (Self::Precursor, Self::Precursor) => Ordering::Equal, + (Self::Precursor, _) => Ordering::Less, + (_, Self::Precursor) => Ordering::Greater, + (Self::a(s, sv), Self::a(o, ov)) => s.cmp(o).then(sv.cmp(ov)), + (Self::a(_, _), _) => Ordering::Less, + (_, Self::a(_, _)) => Ordering::Greater, + (Self::b(s, sv), Self::b(o, ov)) => s.cmp(o).then(sv.cmp(ov)), + (Self::b(_, _), _) => Ordering::Less, + (_, Self::b(_, _)) => Ordering::Greater, + (Self::c(s, sv), Self::c(o, ov)) => s.cmp(o).then(sv.cmp(ov)), + (Self::c(_, _), _) => Ordering::Less, + (_, Self::c(_, _)) => Ordering::Greater, + (Self::x(s, sv), Self::x(o, ov)) => s.cmp(o).then(sv.cmp(ov)), + (Self::x(_, _), _) => Ordering::Less, + (_, Self::x(_, _)) => Ordering::Greater, + (Self::y(s, sv), Self::y(o, ov)) => s.cmp(o).then(sv.cmp(ov)), + (Self::y(_, _), _) => Ordering::Less, + (_, Self::y(_, _)) => Ordering::Greater, + (Self::z(s, sv), Self::z(o, ov)) => s.cmp(o).then(sv.cmp(ov)), + (Self::z(_, _), _) => Ordering::Less, + (_, Self::z(_, _)) => Ordering::Greater, + (Self::d(s, _, sd, sv, sl), Self::d(o, _, od, ov, ol)) => { + s.cmp(o).then(sd.cmp(od)).then(sv.cmp(ov)).then(sl.cmp(ol)) + } + (Self::d(_, _, _, _, _), _) => Ordering::Less, + (_, Self::d(_, _, _, _, _)) => Ordering::Greater, + (Self::w(s, _, sd, sv, sl), Self::w(o, _, od, ov, ol)) => { + s.cmp(o).then(sd.cmp(od)).then(sv.cmp(ov)).then(sl.cmp(ol)) + } + (Self::w(_, _, _, _, _), _) => Ordering::Less, + (_, Self::w(_, _, _, _, _)) => Ordering::Greater, + (Self::v(s, _, sd, sv), Self::v(o, _, od, ov)) => { + 
s.cmp(o).then(sd.cmp(od)).then(sv.cmp(ov)) + } + (Self::v(_, _, _, _), _) => Ordering::Less, + (_, Self::v(_, _, _, _)) => Ordering::Greater, + (Self::Immonium(s, _), Self::Immonium(o, _)) => s.cmp(o), + (Self::Immonium(_, _), _) => Ordering::Less, + (_, Self::Immonium(_, _)) => Ordering::Greater, + (Self::PrecursorSideChainLoss(s, _), Self::PrecursorSideChainLoss(o, _)) => s.cmp(o), + (Self::PrecursorSideChainLoss(_, _), _) => Ordering::Less, + (_, Self::PrecursorSideChainLoss(_, _)) => Ordering::Greater, + (Self::Internal(st, sa, sb), Self::Internal(ot, oa, ob)) => { + sa.cmp(oa).then(sb.cmp(ob)).then(st.cmp(ot)) + } + (Self::Internal(_, _, _), _) => Ordering::Less, + (_, Self::Internal(_, _, _)) => Ordering::Greater, + // Glycans + (Self::B { b: sb, y: sy, .. }, Self::B { b: ob, y: oy, .. }) => { + sy.len().cmp(&oy.len()).then(sb.cmp(ob)) + } + (Self::Y(s), Self::Y(o)) => s.len().cmp(&o.len()), + (Self::B { y: sy, .. }, Self::Y(o)) => { + (sy.len() + 1).cmp(&o.len()).then(Ordering::Greater) + } + (Self::Y(s), Self::B { y: oy, .. }) => { + s.len().cmp(&(oy.len() + 1)).then(Ordering::Less) + } + (Self::B { .. }, _) => Ordering::Less, + (_, Self::B { .. 
}) => Ordering::Greater, + (Self::Y(_), _) => Ordering::Less, + (_, Self::Y(_)) => Ordering::Greater, + (Self::BComposition(s, sl), Self::BComposition(o, ol)) + | (Self::YComposition(s, sl), Self::YComposition(o, ol)) => { + s.len().cmp(&o.len()).then(sl.cmp(ol)) + } + (Self::BComposition(s, sl), Self::YComposition(o, ol)) => s + .len() + .cmp(&o.len()) + .then(sl.cmp(ol)) + .then(Ordering::Greater), + (Self::YComposition(s, sl), Self::BComposition(o, ol)) => { + s.len().cmp(&o.len()).then(sl.cmp(ol)).then(Ordering::Less) + } + (Self::BComposition(_, _), _) => Ordering::Less, + (_, Self::BComposition(_, _)) => Ordering::Greater, + (Self::YComposition(_, _), _) => Ordering::Less, + (_, Self::YComposition(_, _)) => Ordering::Greater, + // Other + (Self::Diagnostic(s), Self::Diagnostic(o)) => s.cmp(o), + (Self::Diagnostic(_), _) => Ordering::Less, + (_, Self::Diagnostic(_)) => Ordering::Greater, + (Self::Unknown(s), Self::Unknown(o)) => s.cmp(o), + } + } +} + +impl std::cmp::PartialOrd for FragmentType { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + impl FragmentType { + /// Get a main ion series fragment with the specified variant, or pass the fragment type through unchanged + #[must_use] + pub fn with_variant(&self, variant: i8) -> Self { + match self { + Self::a(p, _) => Self::a(*p, variant), + Self::b(p, _) => Self::b(*p, variant), + Self::c(p, _) => Self::c(*p, variant), + Self::d(p, a, d, _, l) => Self::d(*p, *a, *d, variant, *l), + Self::v(p, a, d, _) => Self::v(*p, *a, *d, variant), + Self::w(p, a, d, _, l) => Self::w(*p, *a, *d, variant, *l), + Self::x(p, _) => Self::x(*p, variant), + Self::y(p, _) => Self::y(*p, variant), + Self::z(p, _) => Self::z(*p, variant), + other => other.clone(), + } + } + /// Get the position of this ion (or None if it is a precursor ion) pub const fn position(&self) -> Option<&PeptidePosition> { match self { - Self::a(n) - | Self::b(n) - | Self::c(n) - | Self::d(n) - | Self::v(n) - | Self::w(n) 
- | Self::x(n) - | Self::y(n) - | Self::z(n) - | Self::zยท(n) + Self::a(n, _) + | Self::b(n, _) + | Self::c(n, _) + | Self::d(n, _, _, _, _) + | Self::v(n, _, _, _) + | Self::w(n, _, _, _, _) + | Self::x(n, _) + | Self::y(n, _) + | Self::z(n, _) | Self::Diagnostic(DiagnosticPosition::Peptide(n, _)) | Self::Immonium(n, _) | Self::PrecursorSideChainLoss(n, _) => Some(n), @@ -415,10 +772,33 @@ impl FragmentType { } } - /// Get the glycan position of this ion (or None not applicable) + /// Get the root glycan position of this ion (or None if not applicable), Y is not defined as it does not have a root break pub const fn glycan_position(&self) -> Option<&GlycanPosition> { match self { - Self::B(n) | Self::Diagnostic(DiagnosticPosition::Glycan(n, _)) => Some(n), + Self::Diagnostic(DiagnosticPosition::Glycan(b, _)) | Self::B { b, .. } => Some(b), + _ => None, + } + } + + /// Get the glycan break positions of this ion (or None if not applicable), gives the sequence index, the root break, and the branch breaks. + /// Only available with feature 'glycan-render'. + #[cfg(feature = "glycan-render")] + pub fn glycan_break_positions( + &self, + ) -> Option<(Option, GlycanSelection<'_>)> { + match self { + Self::Diagnostic(DiagnosticPosition::Glycan(n, _)) => Some(( + n.attachment.map(|(_, p)| p), + GlycanSelection::SingleSugar(n), + )), + Self::Y(breaks) => Some(( + breaks.first().and_then(|p| p.attachment.map(|(_, p)| p)), + GlycanSelection::Subtree(None, breaks), + )), + Self::B { b, y, .. 
} => Some(( + b.attachment.map(|(_, p)| p), + GlycanSelection::Subtree(Some(b), y), + )), _ => None, } } @@ -426,28 +806,29 @@ impl FragmentType { /// Get the position label, unless it is a precursor ion pub fn position_label(&self) -> Option { match self { - Self::a(n) - | Self::b(n) - | Self::c(n) - | Self::d(n) - | Self::v(n) - | Self::w(n) - | Self::x(n) - | Self::y(n) - | Self::z(n) - | Self::zยท(n) + Self::a(n, _) + | Self::b(n, _) + | Self::c(n, _) + | Self::d(n, _, _, _, _) + | Self::v(n, _, _, _) + | Self::w(n, _, _, _, _) + | Self::x(n, _) + | Self::y(n, _) + | Self::z(n, _) | Self::Diagnostic(DiagnosticPosition::Peptide(n, _)) | Self::Immonium(n, _) | Self::PrecursorSideChainLoss(n, _) => Some(n.series_number.to_string()), - Self::B(n) | Self::Diagnostic(DiagnosticPosition::Glycan(n, _)) => Some(n.label()), - Self::Y(bonds) => Some(bonds.iter().map(GlycanPosition::label).join("")), - Self::Oxonium(breakages) => Some( - breakages - .iter() - .map(std::string::ToString::to_string) - .join(""), + Self::Diagnostic(DiagnosticPosition::Glycan(n, _)) => Some(n.label()), + Self::Y(bonds) => Some(bonds.iter().map(GlycanPosition::label).join("Y")), + Self::B { b, y, end } => Some( + b.label() + + "Y" + + &y.iter() + .chain(end.iter()) + .map(GlycanPosition::label) + .join("Y"), ), - Self::YComposition(sugars, _) | Self::OxoniumComposition(sugars, _) => Some( + Self::YComposition(sugars, _) | Self::BComposition(sugars, _) => Some( sugars .iter() .map(|(sugar, amount)| format!("{sugar}{amount}")) @@ -466,64 +847,125 @@ impl FragmentType { } } - /// Get the label for this fragment type - pub fn label(&self) -> Cow { - match self { - Self::a(_) => Cow::Borrowed("a"), - Self::b(_) => Cow::Borrowed("b"), - Self::c(_) => Cow::Borrowed("c"), - Self::d(_) => Cow::Borrowed("d"), - Self::v(_) => Cow::Borrowed("v"), - Self::w(_) => Cow::Borrowed("w"), - Self::x(_) => Cow::Borrowed("x"), - Self::y(_) => Cow::Borrowed("y"), - Self::z(_) => Cow::Borrowed("z"), - Self::zยท(_) => 
Cow::Borrowed("zยท"), - Self::B(_) => Cow::Borrowed("B"), - Self::Y(_) | Self::YComposition(_, _) => Cow::Borrowed("Y"), - Self::Diagnostic(DiagnosticPosition::Peptide(_, aa)) => { - Cow::Owned(format!("d{}", aa.char())) + /// Get the label for this fragment type, the first argument is the optional superscript prefix, the second is the main label + pub fn label(&self) -> (Option, Cow) { + let get_label = |ion: &'static str, v: i8| { + if v == 0 { + Cow::Borrowed(ion) + } else { + Cow::Owned(format!( + "{ion}{}", + if v < 0 { + "\'".repeat((-v) as usize) + } else { + "ยท".repeat(v as usize) + } + )) } - Self::Diagnostic(DiagnosticPosition::Reporter) => Cow::Borrowed("r"), - Self::Diagnostic(DiagnosticPosition::Labile(m)) => Cow::Owned(format!("d{m}")), + }; + + match self { + Self::a(_, v) => (None, get_label("a", *v)), + Self::b(_, v) => (None, get_label("b", *v)), + Self::c(_, v) => (None, get_label("c", *v)), + Self::d(_, _, n, v, l) => ( + (*n != 0).then_some(n.to_string()), + Cow::Owned(format!( + "d{l}{}", + if *v < 0 { + "\'".repeat((-v) as usize) + } else { + "ยท".repeat(*v as usize) + } + )), + ), + Self::v(_, _, n, v) => ((*n != 0).then_some(n.to_string()), get_label("v", *v)), + Self::w(_, _, n, v, l) => ( + (*n != 0).then_some(n.to_string()), + Cow::Owned(format!( + "w{l}{}", + if *v < 0 { + "\'".repeat((-v) as usize) + } else { + "ยท".repeat(*v as usize) + } + )), + ), + Self::x(_, v) => (None, get_label("x", *v)), + Self::y(_, v) => (None, get_label("y", *v)), + Self::z(_, v) => (None, get_label("z", *v)), + Self::B { .. 
} | Self::BComposition(_, _) => (None, Cow::Borrowed("B")), + Self::Y(_) | Self::YComposition(_, _) => (None, Cow::Borrowed("Y")), + Self::Diagnostic(DiagnosticPosition::Peptide(_, aa)) => ( + None, + Cow::Owned( + aa.one_letter_code() + .map(|c| format!("d{c}")) + .or_else(|| aa.three_letter_code().map(|c| format!("d{c}"))) + .unwrap_or_else(|| format!("d{}", aa.name())), + ), + ), + Self::Diagnostic(DiagnosticPosition::Reporter) => (None, Cow::Borrowed("r")), + Self::Diagnostic(DiagnosticPosition::Labile(m)) => (None, Cow::Owned(format!("d{m}"))), Self::Diagnostic( DiagnosticPosition::Glycan(_, sug) | DiagnosticPosition::GlycanCompositional(sug, _), - ) => Cow::Owned(format!("d{sug}")), - Self::Oxonium(_) | Self::OxoniumComposition(_, _) => Cow::Borrowed("oxonium"), - Self::Immonium(_, aa) => Cow::Owned(format!("i{}", aa.aminoacid.char())), - Self::PrecursorSideChainLoss(_, aa) => Cow::Owned(format!("p-s{}", aa.char())), - Self::Precursor => Cow::Borrowed("p"), - Self::Internal(fragmentation, _, _) => Cow::Owned(format!( - "m{}", - fragmentation.map_or(String::new(), |(n, c)| format!("{n}:{c}")), - )), - Self::Unknown(series) => Cow::Owned(format!( - "?{}", - series.map_or(String::new(), |s| s.to_string()), - )), + ) => (None, Cow::Owned(format!("d{sug}"))), + Self::Immonium(_, aa) => ( + None, + Cow::Owned( + aa.aminoacid + .one_letter_code() + .map(|c| format!("i{c}")) + .or_else(|| aa.aminoacid.three_letter_code().map(|c| format!("i{c}"))) + .unwrap_or_else(|| format!("i{}", aa.aminoacid.name())), + ), + ), + Self::PrecursorSideChainLoss(_, aa) => ( + None, + Cow::Owned( + aa.one_letter_code() + .map(|c| format!("p-s{c}")) + .or_else(|| aa.three_letter_code().map(|c| format!("p-s{c}"))) + .unwrap_or_else(|| format!("p-s{}", aa.name())), + ), + ), + Self::Precursor => (None, Cow::Borrowed("p")), + Self::Internal(fragmentation, _, _) => ( + None, + Cow::Owned(format!( + "m{}", + fragmentation.map_or(String::new(), |(n, c)| format!("{n}:{c}")), + )), + ), + 
Self::Unknown(series) => ( + None, + Cow::Owned(format!( + "?{}", + series.map_or(String::new(), |s| s.to_string()), + )), + ), } } /// Get the kind of fragment, easier to match against pub const fn kind(&self) -> FragmentKind { match self { - Self::a(_) => FragmentKind::a, - Self::b(_) => FragmentKind::b, - Self::c(_) => FragmentKind::c, - Self::d(_) => FragmentKind::d, - Self::v(_) => FragmentKind::v, - Self::w(_) => FragmentKind::w, - Self::x(_) => FragmentKind::x, - Self::y(_) => FragmentKind::y, - Self::z(_) | Self::zยท(_) => FragmentKind::z, + Self::a(_, _) => FragmentKind::a, + Self::b(_, _) => FragmentKind::b, + Self::c(_, _) => FragmentKind::c, + Self::d(_, _, _, _, _) => FragmentKind::d, + Self::v(_, _, _, _) => FragmentKind::v, + Self::w(_, _, _, _, _) => FragmentKind::w, + Self::x(_, _) => FragmentKind::x, + Self::y(_, _) => FragmentKind::y, + Self::z(_, _) => FragmentKind::z, Self::Y(_) | Self::YComposition(_, _) => FragmentKind::Y, Self::Diagnostic( DiagnosticPosition::Glycan(_, _) | DiagnosticPosition::GlycanCompositional(_, _), ) - | Self::B(_) - | Self::Oxonium(_) - | Self::OxoniumComposition(_, _) => FragmentKind::Oxonium, + | Self::B { .. 
} + | Self::BComposition(_, _) => FragmentKind::B, Self::Diagnostic(_) => FragmentKind::diagnostic, Self::Immonium(_, _) => FragmentKind::immonium, Self::PrecursorSideChainLoss(_, _) => FragmentKind::precursor_side_chain_loss, @@ -536,10 +978,12 @@ impl FragmentType { impl Display for FragmentType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let (sup, label) = self.label(); write!( f, - "{}{}", - self.label(), + "{}{}{}", + sup.unwrap_or_default(), + label, self.position_label().unwrap_or_default() ) } @@ -622,7 +1066,7 @@ pub enum FragmentKind { /// glycan Y fragment, generated by one or more branches broken Y, /// B or glycan diagnostic ion or Internal glycan fragment, meaning both a B and Y breakages (and potentially multiple of both), resulting in a set of monosaccharides - Oxonium, + B, /// Immonium ion immonium, /// Precursor with amino acid side chain loss @@ -653,7 +1097,7 @@ impl Display for FragmentKind { Self::w => "w", Self::z => "z", Self::Y => "Y", - Self::Oxonium => "oxonium", + Self::B => "oxonium", Self::immonium => "immonium", Self::precursor_side_chain_loss => "precursor side chain loss", Self::diagnostic => "diagnostic", diff --git a/rustyms/src/fragmentation_tests.rs b/rustyms/src/fragmentation_tests.rs index af798037..755754ae 100644 --- a/rustyms/src/fragmentation_tests.rs +++ b/rustyms/src/fragmentation_tests.rs @@ -36,13 +36,14 @@ fn triple_a() { (74.036231, "zยท+1"), (232.129183, "precursor"), ]; - let model = Model::none() + let model = FragmentationModel::none() + .clone() .a(PrimaryIonSeries::default()) .b(PrimaryIonSeries::default()) .c(PrimaryIonSeries::default()) .x(PrimaryIonSeries::default()) .y(PrimaryIonSeries::default()) - .z(PrimaryIonSeries::default()); + .z(PrimaryIonSeries::default().variants(vec![0, 1])); test( theoretical_fragments, Peptidoform::pro_forma("AAA", None) @@ -84,13 +85,14 @@ fn with_modifications() { (150.034512, "zยท1"), (419.159491, "precursor"), ]; - let model = Model::none() + 
let model = FragmentationModel::none() + .clone() .a(PrimaryIonSeries::default()) .b(PrimaryIonSeries::default()) .c(PrimaryIonSeries::default()) .x(PrimaryIonSeries::default()) .y(PrimaryIonSeries::default()) - .z(PrimaryIonSeries::default()); + .z(PrimaryIonSeries::default().variants(vec![0, 1])); test( theoretical_fragments, Peptidoform::pro_forma("[Gln->pyro-Glu]-QAAM[Oxidation]", None).unwrap(), @@ -174,7 +176,9 @@ fn higher_charges() { (147.058660, "a2+1"), (62.424038, "precursor"), ]; - let model = Model::none().a(PrimaryIonSeries::default()); + let model = FragmentationModel::none() + .clone() + .a(PrimaryIonSeries::default()); test( theoretical_fragments, Peptidoform::pro_forma("ACD", None) @@ -328,13 +332,14 @@ fn all_aminoacids() { (102.067531, "z·1"), (2395.132183, "precursor"), ]; - let model = Model::none() + let model = FragmentationModel::none() + .clone() .a(PrimaryIonSeries::default()) .b(PrimaryIonSeries::default()) .c(PrimaryIonSeries::default()) .x(PrimaryIonSeries::default()) .y(PrimaryIonSeries::default()) - .z(PrimaryIonSeries::default()); + .z(PrimaryIonSeries::default().variants(vec![0, 1])); test( theoretical_fragments, Peptidoform::pro_forma("ARNDCQEGHILKMFPSTWYV", None) @@ -415,7 +420,9 @@ fn glycan_structure_fragmentation() { ), ]; - let model = Model::none().glycan(GlycanModel::DISALLOW.allow_structural(true)); + let model = FragmentationModel::none() + .clone() + .glycan(GlycanModel::DISALLOW.allow_structural(true)); test( theoretical_fragments, Peptidoform::pro_forma("MVSHHN[GNO:G43728NL]LTTGATLINEQWLLTTAK", None) @@ -495,7 +502,9 @@ fn glycan_composition_fragmentation() { "OxY1βY3αB5/Y2βY2αB5/Y2βY3αB6/Y3βY1αB5/End1βY4αB5:N1H3", ), ]; - let model = Model::none().glycan(GlycanModel::DISALLOW.compositional_range(0..=10)); + let model = FragmentationModel::none() + .clone() + .glycan(GlycanModel::DISALLOW.compositional_range(0..=10)); test( theoretical_fragments,
Peptidoform::pro_forma("MVSHHN[Glycan:N4H5S1]LTTGATLINEQWLLTTAK", None) @@ -575,7 +584,8 @@ fn intra_link() { let peptide = CompoundPeptidoformIon::pro_forma("K[C:DSSO#XL1]GK[#XL1]FLK", Some(&custom_database())) .unwrap(); - let model = Model::none() + let model = FragmentationModel::none() + .clone() .b(PrimaryIonSeries::default()) .y(PrimaryIonSeries::default()) .allow_cross_link_cleavage(true); @@ -601,7 +611,8 @@ fn ensure_no_double_xl_labels_breaking() { let peptide = CompoundPeptidoformIon::pro_forma("EVQLVESGGGLVQPGGSLRLSC[C:Disulfide#XL1]AASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYC[#XL1]SRWGGDGFYAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGC[C:Disulfide#XL2]LVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYIC[#XL2]NVNHKPSNTKVDKKVEPKSC[C:Disulfide#XL3]DKT//DIQMTQSPSSLSASVGDRVTITC[C:Disulfide#XL4]RASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYC[#XL4]QQHYTTPPTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVC[C:Disulfide#XL5]LLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYAC[#XL5]EVTHQGLSSPVTKSFNRGEC[#XL3]", Some(&custom_database())) .unwrap(); - let model = Model::none() + let model = FragmentationModel::none() + .clone() .b(PrimaryIonSeries::default()) .y(PrimaryIonSeries::default()) .allow_cross_link_cleavage(true); @@ -631,7 +642,8 @@ fn ensure_no_double_xl_labels_non_breaking() { let peptide = CompoundPeptidoformIon::pro_forma("EVQLVESGGGLVQPGGSLRLSC[C:Disulfide#XL1]AASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYC[#XL1]SRWGGDGFYAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGC[C:Disulfide#XL2]LVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYIC[#XL2]NVNHKPSNTKVDKKVEPKSC[C:Disulfide#XL3]DKT//DIQMTQSPSSLSASVGDRVTITC[C:Disulfide#XL4]RASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYC[#XL4]QQHYTTPPTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVC[C:Disulfide#XL5]LLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYAC[#XL5]EVTHQGLSSPVTKSFNRGEC[#XL3]", Some(&custom_database())) 
.unwrap(); - let model = Model::none() + let model = FragmentationModel::none() + .clone() .b(PrimaryIonSeries::default()) .y(PrimaryIonSeries::default()) .allow_cross_link_cleavage(false); @@ -663,7 +675,8 @@ fn ensure_no_double_xl_labels_small_breaking() { Some(&custom_database()), ) .unwrap(); - let model = Model::none() + let model = FragmentationModel::none() + .clone() .b(PrimaryIonSeries::default()) .y(PrimaryIonSeries::default()) .allow_cross_link_cleavage(true); @@ -683,7 +696,8 @@ fn ensure_no_double_xl_labels_small_non_breaking() { Some(&custom_database()), ) .unwrap(); - let model = Model::none() + let model = FragmentationModel::none() + .clone() .b(PrimaryIonSeries::default()) .y(PrimaryIonSeries::default()) .allow_cross_link_cleavage(false); @@ -699,7 +713,7 @@ fn ensure_no_double_xl_labels_small_non_breaking() { fn test( theoretical_fragments: &[(f64, &str)], peptide: impl Into, - model: &Model, + model: &FragmentationModel, charge: usize, allow_left_over_generated: bool, allow_double_theoretical: bool, diff --git a/rustyms/src/glycan/glycan_structure.rs b/rustyms/src/glycan/glycan_structure.rs index c6be1d90..d12ad603 100644 --- a/rustyms/src/glycan/glycan_structure.rs +++ b/rustyms/src/glycan/glycan_structure.rs @@ -5,7 +5,10 @@ use std::{fmt::Display, hash::Hash}; use itertools::Itertools; use serde::{Deserialize, Serialize}; -use super::{glycan_parse_list, BaseSugar, MonoSaccharide, PositionedGlycanStructure}; +use super::{ + glycan_parse_list, BaseSugar, GlycanBranchIndex, GlycanBranchMassIndex, MonoSaccharide, + PositionedGlycanStructure, +}; use crate::{ error::{Context, CustomError}, formula::{Chemical, MolecularFormula}, @@ -115,30 +118,34 @@ impl GlycanStructure { fn internal_pos( self, inner_depth: usize, - branch: &[usize], + branch: &[(GlycanBranchIndex, GlycanBranchMassIndex)], ) -> (PositionedGlycanStructure, usize) { // Sort the branches on decreasing molecular weight - let mut branches = self.branches; - 
branches.sort_unstable_by(|a, b| { - b.formula() - .monoisotopic_mass() - .partial_cmp(&a.formula().monoisotopic_mass()) - .unwrap() - }); + let branches = self + .branches + .into_iter() + .enumerate() + .sorted_unstable_by(|(_, a), (_, b)| { + b.formula() + .monoisotopic_mass() + .partial_cmp(&a.formula().monoisotopic_mass()) + .unwrap() + }) + .collect_vec(); // Get the correct branch indices adding a new layer of indices when needed let branches: Vec<(PositionedGlycanStructure, usize)> = if branches.len() == 1 { branches .into_iter() - .map(|b| b.internal_pos(inner_depth + 1, branch)) + .map(|(_, b)| b.internal_pos(inner_depth + 1, branch)) .collect() } else { branches .into_iter() .enumerate() - .map(|(i, b)| { + .map(|(mass_index, (index, b))| { let mut new_branch = branch.to_vec(); - new_branch.push(i); + new_branch.push((index, mass_index)); b.internal_pos(inner_depth + 1, &new_branch) }) .collect() diff --git a/rustyms/src/glycan/mod.rs b/rustyms/src/glycan/mod.rs index 14f00d27..f4bfa520 100644 --- a/rustyms/src/glycan/mod.rs +++ b/rustyms/src/glycan/mod.rs @@ -3,7 +3,11 @@ mod glycan_structure; mod monosaccharide; mod positioned_structure; +#[cfg(feature = "glycan-render")] +mod render; pub use glycan_structure::*; pub use monosaccharide::*; pub use positioned_structure::*; +#[cfg(feature = "glycan-render")] +pub use render::{GlycanDirection, GlycanRoot, GlycanSelection, RenderedGlycan}; diff --git a/rustyms/src/glycan/monosaccharide.rs b/rustyms/src/glycan/monosaccharide.rs index c7c58e7d..6fe3a9e5 100644 --- a/rustyms/src/glycan/monosaccharide.rs +++ b/rustyms/src/glycan/monosaccharide.rs @@ -2,9 +2,10 @@ use crate::{ fragment::{DiagnosticPosition, Fragment, FragmentType}, + model::GlycanModel, molecular_charge::CachedCharge, system::usize::Charge, - AminoAcid, Model, Multi, NeutralLoss, + AminoAcid, FragmentationModel, Multi, NeutralLoss, }; include!("../shared/glycan.rs"); @@ -44,12 +45,12 @@ impl MonoSaccharide { /// Generate the theoretical 
fragments, if any monosaccharide is present a negative number of times no fragments are generated. pub(crate) fn theoretical_fragments( composition: &[(Self, isize)], - model: &Model, + model: &FragmentationModel, peptidoform_ion_index: usize, peptidoform_index: usize, charge_carriers: &mut CachedCharge, full_formula: &Multi, - attachment: Option<(AminoAcid, usize)>, + attachment: Option<(AminoAcid, SequencePosition)>, ) -> Vec { if composition.iter().any(|(_, a)| u16::try_from(*a).is_err()) { // u16: negative + also ensure it fits within the bounds of the molecular formula structure @@ -73,7 +74,7 @@ impl MonoSaccharide { Charge::default(), peptidoform_ion_index, peptidoform_index, - FragmentType::OxoniumComposition(composition.clone(), attachment), + FragmentType::BComposition(composition.clone(), attachment), ) .with_charge_range(charge_carriers, model.glycan.oxonium_charge_range) .flat_map(|o| o.with_neutral_losses(&model.glycan.neutral_losses)), @@ -100,6 +101,7 @@ impl MonoSaccharide { peptidoform_index, DiagnosticPosition::GlycanCompositional(sugar.clone(), attachment), false, + &model.glycan, ) .into_iter() .flat_map(|d| { @@ -114,7 +116,7 @@ impl MonoSaccharide { /// Get all unique combinations of monosaccharides within the given range of number of monosaccharides used /// # Panics /// If any if the composition options has more then [`isize::MAX`] sugars. - fn composition_options( + pub fn composition_options( composition: &[(Self, isize)], range: std::ops::RangeInclusive, ) -> Vec> { @@ -159,13 +161,13 @@ impl MonoSaccharide { } /// Generate all uncharged diagnostic ions for this monosaccharide. - /// According to: . 
pub(crate) fn diagnostic_ions( &self, peptidoform_ion_index: usize, peptidoform_index: usize, position: DiagnosticPosition, add_base: bool, + model: &GlycanModel, ) -> Vec { let base = Fragment::new( self.formula(), @@ -174,48 +176,14 @@ impl MonoSaccharide { peptidoform_index, FragmentType::Diagnostic(position), ); - let mut result = - if matches!(self.base_sugar, BaseSugar::Hexose(_)) && self.substituents.is_empty() { - vec![ - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(H 2 O 1))), - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(H 4 O 2))), - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(C 1 H 6 O 3))), - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(C 2 H 6 O 3))), - ] - } else if matches!(self.base_sugar, BaseSugar::Hexose(_)) - && self.substituents == [GlycanSubstituent::NAcetyl] - { - vec![ - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(H 2 O 1))), - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(H 4 O 2))), - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(C 2 H 4 O 2))), - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(C 1 H 6 O 3))), - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(C 2 H 6 O 3))), - base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(C 4 H 8 O 4))), - ] - } else if matches!(self.base_sugar, BaseSugar::Nonose) - && (self.substituents - == [ - GlycanSubstituent::Amino, - GlycanSubstituent::Acetyl, - GlycanSubstituent::Acid, - ] - || self.substituents - == [ - GlycanSubstituent::Amino, - GlycanSubstituent::Glycolyl, - GlycanSubstituent::Acid, - ]) - { - // Neu5Ac and Neu5Gc - vec![base.with_neutral_loss(&NeutralLoss::Loss(molecular_formula!(H 2 O 1)))] - } else { - return Vec::new(); // Do not add this full glycan as diagnostic ion - }; - if add_base { - result.push(base); - } - result + model + .specific_neutral_losses + .iter() + .filter(|(ms, precise, _)| ms.equivalent(self, *precise)) + 
.flat_map(|(_, _, losses)| losses) + .map(|loss| base.with_neutral_loss(loss)) + .chain(std::iter::repeat_n(base.clone(), usize::from(add_base))) + .collect() } } @@ -347,7 +315,9 @@ mod tests { ); assert_eq!( parse("D-Araf"), - MonoSaccharide::new(BaseSugar::Pentose(Some(PentoseIsomer::Arabinose)), &[]).furanose() + MonoSaccharide::new(BaseSugar::Pentose(Some(PentoseIsomer::Arabinose)), &[]) + .furanose() + .configuration(Configuration::D) ); assert_eq!( parse("Xyl-onic"), diff --git a/rustyms/src/glycan/positioned_structure.rs b/rustyms/src/glycan/positioned_structure.rs index 81741d0f..d946f852 100644 --- a/rustyms/src/glycan/positioned_structure.rs +++ b/rustyms/src/glycan/positioned_structure.rs @@ -1,5 +1,5 @@ //! Handle positioned glycan structures -use std::hash::Hash; +use std::{hash::Hash, ops::RangeInclusive}; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -8,13 +8,19 @@ use super::MonoSaccharide; use crate::{ formula::{Chemical, MolecularFormula}, fragment::{Fragment, FragmentType, GlycanBreakPos, GlycanPosition}, + model::GlycanModel, molecular_charge::CachedCharge, system::usize::Charge, - AminoAcid, Model, Multi, SequencePosition, + AminoAcid, FragmentationModel, Multi, SequencePosition, }; use crate::uom::num_traits::Zero; +/// The index in the branches as stored in the structure +pub type GlycanBranchIndex = usize; +/// The index in the branches when the branches are sorted on mass, this is used to properly render the names of the branches for human consumption +pub type GlycanBranchMassIndex = usize; + /// Rose tree representation of glycan structure #[derive(Debug, Eq, PartialEq, Clone, Hash, Serialize, Deserialize)] pub struct PositionedGlycanStructure { @@ -22,7 +28,15 @@ pub struct PositionedGlycanStructure { pub(super) branches: Vec, pub(super) inner_depth: usize, pub(super) outer_depth: usize, - pub(super) branch: Vec, + /// The branches taken to get to this location (from the root) as the index in the branches and the 
index in the branches when sorted by mass. + /// For a general glycan with a fucose on the first hexnac and a bisection after the core double + /// hexnac + hex, this variable will contain an empty list for the root hexnac. For the fucose + /// this variable will contain `[(0, 1)]` indicating it is the first branch in the structure but + /// the second branch if the branches are sorted by mass. For the monosaccharides in the left + /// bisection this variable will contain `[(1, 0), (0, 0)]`, indicating that it took the main + /// branch (and not the fucose) and that it took the left branch for the second bisection which + /// is heavier than the right branch. + pub(super) branch: Vec<(GlycanBranchIndex, GlycanBranchMassIndex)>, } impl Chemical for PositionedGlycanStructure { @@ -41,16 +55,133 @@ impl Chemical for PositionedGlycanStructure { } impl PositionedGlycanStructure { + /// All core options, with the Y breakage positions leading to this fragment + pub fn core_options( + &self, + range: RangeInclusive, + peptidoform_index: usize, + attachment: Option<(AminoAcid, SequencePosition)>, + ) -> Vec<(Vec, MolecularFormula)> { + self.internal_break_points(0, peptidoform_index, attachment) + .iter() + .filter(|(_, _, depth)| range.contains(depth)) + .map(|(f, pos, _)| { + ( + pos.iter() + .filter(|b| !matches!(b, GlycanBreakPos::End(_))) + .map(GlycanBreakPos::position) + .cloned() + .collect(), + f.clone(), + ) + }) + .collect() + } + + /// All possible bonds that can be broken and the molecular formula that would be held over if these bonds all broke and the broken off parts are lost. 
+ fn internal_break_points( + &self, + depth: u8, + peptidoform_index: usize, + attachment: Option<(AminoAcid, SequencePosition)>, + ) -> Vec<(MolecularFormula, Vec, u8)> { + // Find every internal fragment ending at this bond (in a B breakage) (all bonds found are Y breakages and endings) + // Walk through all branches and determine all possible breakages + if self.branches.is_empty() { + vec![ + ( + self.formula_inner(SequencePosition::default(), peptidoform_index), + vec![GlycanBreakPos::End(self.position(attachment))], + depth + u8::from(!self.sugar.is_fucose()), + ), + ( + MolecularFormula::default(), + vec![GlycanBreakPos::Y(self.position(attachment))], + depth, + ), + ] + } else { + self.branches + .iter() + .map(|b| { + b.internal_break_points( + depth + u8::from(!b.sugar.is_fucose()), + peptidoform_index, + attachment, + ) + }) // get all previous options + .fold(Vec::new(), |accumulator, branch_options| { + if accumulator.is_empty() { + branch_options + } else { + let mut new_accumulator = Vec::new(); + for base in &accumulator { + for option in &branch_options { + new_accumulator.push(( + &option.0 + &base.0, + [option.1.clone(), base.1.clone()].concat(), + option.2.max(base.2), + )); + } + } + new_accumulator + } + }) + .into_iter() + .map(|(m, b, d)| { + ( + m + self + .sugar + .formula_inner(SequencePosition::default(), peptidoform_index), + b, + d, + ) + }) + .chain(std::iter::once(( + // add the option of it breaking here + MolecularFormula::default(), + vec![GlycanBreakPos::Y(self.position(attachment))], + depth, + ))) + .collect() + } + } + + /// Get uncharged diagnostic ions from all positions + fn diagnostic_ions( + &self, + peptidoform_ion_index: usize, + peptidoform_index: usize, + attachment: Option<(AminoAcid, SequencePosition)>, + model: &GlycanModel, + ) -> Vec { + let mut output = self.sugar.diagnostic_ions( + peptidoform_ion_index, + peptidoform_index, + crate::fragment::DiagnosticPosition::Glycan( + self.position(attachment), + 
self.sugar.clone(), + ), + true, + model, + ); + output.extend(self.branches.iter().flat_map(|b| { + b.diagnostic_ions(peptidoform_ion_index, peptidoform_index, attachment, model) + })); + + output + } + /// Generate all theoretical fragments for this glycan /// * `full_formula` the total formula of the whole peptide + glycan pub fn generate_theoretical_fragments( &self, - model: &Model, + model: &FragmentationModel, peptidoform_ion_index: usize, peptidoform_index: usize, charge_carriers: &mut CachedCharge, full_formula: &Multi, - attachment: Option<(AminoAcid, usize)>, + attachment: Option<(AminoAcid, SequencePosition)>, ) -> Vec { model .glycan @@ -67,13 +198,13 @@ impl PositionedGlycanStructure { .collect_vec(); // Generate all Y fragments base_fragments.extend( - self.internal_break_points(peptidoform_index, attachment) + self.internal_break_points(0, peptidoform_index, attachment) .iter() - .filter(|(_, bonds)| { + .filter(|(_, bonds, _)| { bonds.iter().all(|b| !matches!(b, GlycanBreakPos::B(_))) && !bonds.iter().all(|b| matches!(b, GlycanBreakPos::End(_))) }) - .flat_map(move |(f, bonds)| { + .flat_map(move |(f, bonds, _)| { full_formula.iter().map(move |full| { Fragment::new( full - self.formula_inner( @@ -101,83 +232,59 @@ impl PositionedGlycanStructure { ); // Generate all diagnostic ions base_fragments.extend( - self.diagnostic_ions(peptidoform_ion_index, peptidoform_index, attachment) - .into_iter() - .flat_map(|f| { - f.with_charge_range(charge_carriers, model.glycan.oxonium_charge_range) - }), + self.diagnostic_ions( + peptidoform_ion_index, + peptidoform_index, + attachment, + &model.glycan, + ) + .into_iter() + .flat_map(|f| { + f.with_charge_range(charge_carriers, model.glycan.oxonium_charge_range) + }), ); base_fragments }) .unwrap_or_default() } - /// Get uncharged diagnostic ions from all positions - fn diagnostic_ions( - &self, - peptidoform_ion_index: usize, - peptidoform_index: usize, - attachment: Option<(AminoAcid, usize)>, - ) -> Vec { - 
let mut output = self.sugar.diagnostic_ions( - peptidoform_ion_index, - peptidoform_index, - crate::fragment::DiagnosticPosition::Glycan( - self.position(attachment), - self.sugar.clone(), - ), - true, - ); - output.extend( - self.branches.iter().flat_map(|b| { - b.diagnostic_ions(peptidoform_ion_index, peptidoform_index, attachment) - }), - ); - - output - } - /// Generate all fragments without charge and neutral loss options fn oxonium_fragments( &self, peptidoform_ion_index: usize, peptidoform_index: usize, - attachment: Option<(AminoAcid, usize)>, + attachment: Option<(AminoAcid, SequencePosition)>, ) -> Vec { - // Generate the basic single breakage B fragments - let mut base_fragments = vec![Fragment::new( - self.formula_inner(SequencePosition::default(), peptidoform_index), - Charge::zero(), - peptidoform_ion_index, - peptidoform_index, - FragmentType::B(self.position(attachment)), - )]; - // Extend with all internal fragments, meaning multiple breaking bonds - base_fragments.extend( - self.internal_break_points(peptidoform_index, attachment) - .into_iter() - .filter(|(_, breakages)| { - !breakages - .iter() - .all(|b| matches!(b, GlycanBreakPos::End(_))) - }) - .filter(|(m, _)| *m != MolecularFormula::default()) - .map(|(m, b)| { - ( - m, - [b, vec![GlycanBreakPos::B(self.position(attachment))]].concat(), - ) - }) - .map(|(formula, breakages)| { - Fragment::new( - formula, - Charge::zero(), - peptidoform_ion_index, - peptidoform_index, - FragmentType::Oxonium(breakages), - ) - }), - ); + // Find all B type fragments (with and without Y breakage) + let mut base_fragments = self + .internal_break_points(0, peptidoform_index, attachment) + .iter() + .filter(|(m, _, _)| *m != MolecularFormula::default()) + .map(|(formula, breakages, _)| { + Fragment::new( + formula.clone(), + Charge::zero(), + peptidoform_ion_index, + peptidoform_index, + FragmentType::B { + b: self.position(attachment), + y: breakages + .iter() + .filter(|b| matches!(b, GlycanBreakPos::Y(_))) + 
.map(GlycanBreakPos::position) + .cloned() + .collect(), + end: breakages + .iter() + .filter(|b| matches!(b, GlycanBreakPos::End(_))) + .map(GlycanBreakPos::position) + .cloned() + .collect(), + }, + ) + }) + .collect_vec(); + // Extend with the theoretical fragments for all branches of this position base_fragments.extend(self.branches.iter().flat_map(|b| { b.oxonium_fragments(peptidoform_ion_index, peptidoform_index, attachment) @@ -185,64 +292,7 @@ impl PositionedGlycanStructure { base_fragments } - /// All possible bonds that can be broken and the molecular formula that would be held over if these bonds all broke and the broken off parts are lost. - fn internal_break_points( - &self, - peptidoform_index: usize, - attachment: Option<(AminoAcid, usize)>, - ) -> Vec<(MolecularFormula, Vec)> { - // Find every internal fragment ending at this bond (in a B breakage) (all bonds found are Y breakages and endings) - // Walk through all branches and determine all possible breakages - if self.branches.is_empty() { - vec![ - ( - self.formula_inner(SequencePosition::default(), peptidoform_index), - vec![GlycanBreakPos::End(self.position(attachment))], - ), - ( - MolecularFormula::default(), - vec![GlycanBreakPos::Y(self.position(attachment))], - ), - ] - } else { - self.branches - .iter() - .map(|b| b.internal_break_points(peptidoform_index, attachment)) // get all previous options - .fold(Vec::new(), |accumulator, branch_options| { - if accumulator.is_empty() { - branch_options - } else { - let mut new_accumulator = Vec::new(); - for base in &accumulator { - for option in &branch_options { - new_accumulator.push(( - &option.0 + &base.0, - [option.1.clone(), base.1.clone()].concat(), - )); - } - } - new_accumulator - } - }) - .into_iter() - .map(|(m, b)| { - ( - m + self - .sugar - .formula_inner(SequencePosition::default(), peptidoform_index), - b, - ) - }) - .chain(std::iter::once(( - // add the option of it breaking here - MolecularFormula::default(), - 
vec![GlycanBreakPos::Y(self.position(attachment))], - ))) - .collect() - } - } - - fn position(&self, attachment: Option<(AminoAcid, usize)>) -> GlycanPosition { + fn position(&self, attachment: Option<(AminoAcid, SequencePosition)>) -> GlycanPosition { GlycanPosition { inner_depth: self.inner_depth, series_number: self.outer_depth + 1, diff --git a/rustyms/src/glycan/render/absolute.rs b/rustyms/src/glycan/render/absolute.rs new file mode 100644 index 00000000..bbd85d70 --- /dev/null +++ b/rustyms/src/glycan/render/absolute.rs @@ -0,0 +1,460 @@ +use itertools::Itertools; + +use crate::{ + fragment::GlycanPosition, + glycan::{ + render::{ + element::GlycanRoot, + shape::{Colour, Shape}, + }, + GlycanBranchIndex, GlycanBranchMassIndex, GlycanStructure, RenderedGlycan, + }, + Chemical, +}; + +use super::element::GlycanSelection; + +impl GlycanStructure { + /// Render this glycan to the internal representation. This can then be rendered to SVG or a bitmap. + /// * `basis`: the text or symbol to draw at the root of the tree. + /// * `column_size`: the size (in pixels) of one block in the glycan, the full size with the padding and sugar size included. + /// * `sugar_size`: the size (in pixels) of a monosaccharide. + /// * `stroke_size`: the size (in pixels) of the strokes in the graphic. + /// * `direction`: the direction to draw the image in. + /// * `selection`: the selection of the glycan to draw, used to render fragments. + /// * `foreground`: the colour to be used for the foreground, in RGB order. + /// * `background`: the colour to be used for the background, in RGB order, this is used to fill 'empty' sugars if the isomeric state is unknown. + /// * `footnotes`: used to gather modification texts that are too big to place in line. The caller will have to find their own way of displaying this to the user. + /// + /// # Errors + /// If the underlying buffer errors the error is returned.
Otherwise `Ok(false)` is returned if the given `root_break` is not valid, and `Ok(true)` is returned if the rendering was fully successful. + pub fn render<'a>( + &'a self, + basis: GlycanRoot, + column_size: f32, + sugar_size: f32, + stroke_size: f32, + direction: GlycanDirection, + selection: GlycanSelection<'a>, + foreground: [u8; 3], + background: [u8; 3], + footnotes: &'a mut Vec, + ) -> Option { + self.position_absolute(0, &[], footnotes).render( + basis, + column_size, + sugar_size, + stroke_size, + direction, + selection, + foreground, + background, + footnotes, + ) + } + + /// Build the rendered glycan. + fn position_absolute( + &self, + depth: usize, + path: &[(GlycanBranchIndex, GlycanBranchMassIndex)], + footnotes: &mut Vec, + ) -> AbsolutePositionedGlycan { + let (shape, colour, inner_modifications, outer_modifications) = self.sugar.get_shape(); + // Automatically make footnotes out of long outer modification texts + let outer_modifications = if outer_modifications.len() > 6 { + let index = footnotes.iter().position(|e| *e == outer_modifications); + index.map_or_else( + || { + let index = footnotes.len(); + footnotes.push(outer_modifications); + OuterModifications::Footnote(index) + }, + OuterModifications::Footnote, + ) + } else if !outer_modifications.is_empty() { + OuterModifications::Text(outer_modifications) + } else { + OuterModifications::Empty + }; + + if self.branches.is_empty() { + AbsolutePositionedGlycan { + y: 0, + x: 0.0, + mid_point: 0.5, + width: 1.0, + shape, + colour, + inner_modifications, + outer_modifications, + position: GlycanPosition { + inner_depth: depth, + series_number: depth, + branch: path.to_vec(), + attachment: None, + }, + title: self.sugar.to_string(), + branch_index: 0, + branches: Vec::new(), + sides: Vec::new(), + } + } else { + let mut y_depth = 0; + let mut branches = Vec::new(); + let mut sides = Vec::new(); + for (mass_index, (branch_index, branch)) in self + .branches + .iter() + .enumerate() + 
.sorted_unstable_by(|(_, a), (_, b)| { + b.formula() + .monoisotopic_mass() + .partial_cmp(&a.formula().monoisotopic_mass()) + .unwrap() + }) + .enumerate() + .sorted_unstable_by(|a, b| (a.1 .0.cmp(&b.1 .0))) + { + let mut new_path = path.to_vec(); + new_path.push((branch_index, mass_index)); + let mut rendered = branch.position_absolute( + depth + 1, + if self.branches.len() > 1 { + &new_path + } else { + path + }, + footnotes, + ); + rendered.branch_index = branch_index; + if rendered.is_sideways() && sides.len() < 2 { + if sides.is_empty() && rendered.shape == Shape::Triangle { + rendered.shape = Shape::LeftPointingTriangle; + } else if sides.len() == 1 && rendered.shape == Shape::Triangle { + rendered.shape = Shape::RightPointingTriangle; + } + sides.push(rendered); + } else { + y_depth = y_depth.max(rendered.y); + branches.push(rendered); + } + } + // Update all branch placements + let mut displacement = 0.0; + for branch in &mut branches { + branch.transpose(y_depth - branch.y, displacement); + displacement += branch.width; + } + if !branches.is_empty() { + y_depth += 1; + } + // Determine the center point for this sugar + let mut center = match branches.len() { + 0 => 0.5, + 1 => branches[0].mid_point, + n => { + // Find the median midpoint of the branches + (branches[n / 2 - (n + 1) % 2].x + + branches[n / 2 - (n + 1) % 2].mid_point + + branches[n / 2].x + + branches[n / 2].mid_point) + / 2.0 + } + }; + let mut width = branches.last().map_or(1.0, |b| b.x + b.width); + if !sides.is_empty() { + sides[0].transpose(y_depth, center + 0.5); + width = width.max(center + 0.5 + sides[0].width); + } + if sides.len() == 2 { + let mut x = center - 0.5 - sides[1].width; + if x < 0.0 { + let shift = -x; + center += shift; + for branch in &mut branches { + branch.transpose(0, shift); + } + sides[0].transpose(0, shift); + width += shift; + x = 0.0; + } + sides[1].transpose(y_depth, x); + } + AbsolutePositionedGlycan { + y: y_depth, + x: 0.0, + mid_point: center, + width, + 
shape, + colour, + inner_modifications, + outer_modifications, + position: GlycanPosition { + inner_depth: depth, + series_number: depth, + branch: path.to_vec(), + attachment: None, + }, + title: self.sugar.to_string(), + branch_index: 0, + branches, + sides, + } + } + } +} + +/// An absolute positioned glycan. +#[derive(Debug, Clone)] +pub(super) struct AbsolutePositionedGlycan { + /// The depth of this sugar along the main axis of the glycan, starting at 0 at the top (in the leaves) + pub(super) y: usize, + /// The sideways placement of this whole tree starting at 0 at the leftmost monosaccharide, 1.0 is the width of one monosaccharide + pub(super) x: f32, + /// The sideways placement of this sugar within this tree, for the absolute sideways placement of this sugar add this to `x` + pub(super) mid_point: f32, + /// The total width of the (sub)tree with all of its branches and sides + pub(super) width: f32, + /// The shape of the monosaccharide + pub(super) shape: Shape, + /// The colour of the monosaccharide + pub(super) colour: Colour, + /// Text to be shown inside the monosaccharide + pub(super) inner_modifications: String, + /// Text to be shown outside the monosaccharide + pub(super) outer_modifications: OuterModifications, + /// The position of this sugar + pub(super) position: GlycanPosition, + /// Full name of the glycan + pub(super) title: String, + /// The index into the branches of the parent monosaccharide + pub(super) branch_index: usize, + /// All branches that go up the tree + pub(super) branches: Vec, + /// All branches that go to the side (Fucoses) + pub(super) sides: Vec, +} + +#[derive(Debug, Clone)] +/// Modifications that are to be shown outside of the monosaccharide +pub(super) enum OuterModifications { + /// Too long of a text, or it did not fit, so show as a footnote + Footnote(usize), + /// Text + Text(String), + /// No modification + Empty, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +/// The direction of the 
rendered glycan +pub enum GlycanDirection { + /// A top down tree, with the root at the bottom + TopDown, + /// A left to right tree, with the root at the right + LeftToRight, +} + +/// A subtree of a rendered glycan, used to restrict the canvas for glycan fragments +#[derive(Debug, Clone)] +pub(super) struct SubTree<'a> { + /// The root for this sub tree + pub(super) tree: &'a AbsolutePositionedGlycan, + /// Total depth of the glycans with the breaks applied + pub(super) depth: usize, + /// The horizontal offset from the left + pub(super) left_offset: f32, + /// The horizontal offset from the right + pub(super) right_offset: f32, + /// If this fragment is topped by a breaking symbol, needed to calculate the correct height for the canvas + pub(super) break_top: bool, + /// If this fragment is bottomed by a breaking symbol, needed to calculate the correct height for the canvas + pub(super) break_bottom: bool, + /// All breaking branches, standardised to the linked root + pub(super) branch_breaks: Vec<(usize, Vec<(GlycanBranchIndex, GlycanBranchMassIndex)>)>, +} + +impl AbsolutePositionedGlycan { + /// Transpose this glycan and all of its branches + fn transpose(&mut self, y: usize, x: f32) { + self.y += y; + self.x += x; + for branch in &mut self.branches { + branch.transpose(y, x); + } + for side in &mut self.sides { + side.transpose(y, x); + } + } + + /// Check if this sugar should be rendered to the side of the parent sugar + fn is_sideways(&self) -> bool { + self.colour == Colour::Red + && self.shape == Shape::Triangle + && self.branches.is_empty() + && self.sides.is_empty() + } + + /// Get the subtree starting on the given position, return None if the starting position is not valid, it also indicates the depth of this subtree for the given branch breakages and if a break tops the structure + pub(super) fn get_subtree<'a>(&'a self, selection: GlycanSelection<'a>) -> Option> { + /// Calculate the maximal depth, break top, left and right offset + fn canvas_size( + 
tree: &AbsolutePositionedGlycan, + breakages: &[(usize, Vec<(GlycanBranchIndex, GlycanBranchMassIndex)>)], + ) -> (usize, bool, f32, f32) { + let lx = (tree.x + tree.mid_point - 0.5).max(0.0); + let rx = (tree.width - tree.mid_point - 0.5).max(0.0); + // The tree is cut here + if breakages.iter().any(|b| b.0 == 0) { + return (0, true, lx, rx); + }; + + let total_branches = tree.branches.len() + tree.sides.len(); + let (depth, break_top, left_offset, right_offset) = match total_branches { + 0 => (0, false, lx, rx), + 1 => tree.branches.first().map_or((0, false, lx, rx), |branch| { + canvas_size( + branch, + &breakages + .iter() + .map(|b| (b.0 - 1, b.1.clone())) + .collect_vec(), + ) + }), + _ => tree + .branches + .iter() + .enumerate() + .map(|(i, branch)| { + ( + i, + canvas_size( + branch, + &breakages + .iter() + .filter(|b| { + b.1.first().map(|b| b.0) == Some(branch.branch_index) + }) + .map(|b| (b.0 - 1, b.1[1..].to_vec())) + .collect_vec(), + ), + ) + }) + .fold((0, false, lx, rx), |acc, (i, v)| { + ( + acc.0.max(v.0), + if v.0 >= acc.0 { v.1 } else { acc.1 }, + if i == 0 { v.2 } else { acc.2 }, + if i == tree.branches.len() - 1 { + v.3 + } else { + acc.3 + }, + ) + }), + }; + ( + depth + 1, + break_top, + if tree.sides.len() == 2 { + left_offset.min(tree.x + tree.mid_point - 1.5).max(0.0) + } else { + left_offset + }, + if tree.sides.is_empty() { + right_offset + } else { + right_offset.min(tree.width - tree.mid_point - 1.5).max(0.0) + }, + ) + } + + let (tree, rules, break_bottom) = match selection { + GlycanSelection::Subtree(root, branch_breaks) => { + let start = root.unwrap_or(&self.position); + let mut tree = self; + let mut depth = 0; + let mut branch_choices = start.branch.clone(); + branch_choices.reverse(); + while depth < start.inner_depth { + depth += 1; + + let total_branches = tree.branches.len() + tree.sides.len(); + match total_branches { + 0 => return None, + 1 => tree = tree.branches.first().or_else(|| tree.sides.first())?, + _ => { + let 
index = branch_choices.pop()?; + tree = tree + .branches + .iter() + .chain(tree.sides.iter()) + .find(|b| b.branch_index == index.0)?; + } + } + } + + let rules = branch_breaks + .iter() + .filter(|b| { + b.inner_depth >= start.inner_depth && b.branch.starts_with(&start.branch) + }) + .map(|b| { + ( + b.inner_depth - start.inner_depth, + b.branch[start.branch.len()..].to_vec(), + ) + }) + .collect_vec(); + (tree, rules, root.is_some()) + } + GlycanSelection::SingleSugar(position) => { + let mut tree = self; + let mut depth = 0; + let mut branch_choices = position.branch.clone(); + branch_choices.reverse(); + while depth < position.inner_depth { + depth += 1; + + let total_branches = tree.branches.len() + tree.sides.len(); + match total_branches { + 0 => return None, + 1 => tree = tree.branches.first().or_else(|| tree.sides.first())?, + _ => { + let index = branch_choices.pop()?; + tree = tree + .branches + .iter() + .find(|b| b.branch_index == index.0) + .or_else(|| { + tree.sides.iter().find(|b| b.branch_index == index.0) + })?; + } + } + } + + let rules = tree + .branches + .iter() + .chain(tree.sides.iter()) + .map(|b| { + (1, vec![(b.branch_index, b.branch_index)]) + // TODO: the mass_index should be stored here, but currently that is unused so for now this does not introduce incorrect behaviour + }) + .collect_vec(); + (tree, rules, true) + } + }; + let (depth, break_top, left_offset, right_offset) = canvas_size(tree, &rules); + Some(SubTree { + tree, + depth, + left_offset, + right_offset, + break_top, + break_bottom, + branch_breaks: rules, + }) + } +} diff --git a/rustyms/src/glycan/render/bitmap.rs b/rustyms/src/glycan/render/bitmap.rs new file mode 100644 index 00000000..465bb47b --- /dev/null +++ b/rustyms/src/glycan/render/bitmap.rs @@ -0,0 +1,387 @@ +use itertools::Itertools; +use swash::{ + scale::{Render, ScaleContext, Source}, + FontRef, +}; +use zeno::{Fill, Format, Mask, PathBuilder, Point, Scratch, Stroke, Vector}; + +use 
crate::glycan::{render::element::Element, RenderedGlycan}; + +use super::element::{TextAnchor, TextBaseline}; + +impl RenderedGlycan { + /// Render this glycan as an RGBA bitmap. + /// * `format`: the used strategy for antialiasing. + /// * `font`: the font for rendering text. + /// * `context`: the context for caching rendering text. + /// # Panics + /// If the glyph renderer failed. See [`swash::scale::Render::render`]. + pub fn to_bitmap( + &self, + format: Format, + font: FontRef, + context: &mut ScaleContext, + ) -> (Vec, usize) { + let mask_factor = if format == Format::Alpha { 1 } else { 4 }; + let image_width = self.size.0.ceil() as usize; + let mut image = std::iter::repeat([ + self.background[0], + self.background[1], + self.background[2], + 0, + ]) + .take(image_width * self.size.1.ceil() as usize) + .flatten() + .collect_vec(); + + let mut scratch = Scratch::new(); + let mut stroke_mask = Vec::new(); + let mut fill_mask = Vec::new(); + for element in &self.elements { + // Draw into the mask + let (x, y, mask_width, fill, stroke) = match element { + Element::Line { + from, + to, + stroke, + stroke_size, + } => { + let xmin = (from.0.min(to.0) - stroke_size).floor(); + let xmax = (from.0.max(to.0) + stroke_size).ceil(); + let ymin = (from.1.min(to.1) - stroke_size).floor(); + let ymax = (from.1.max(to.1) + stroke_size).ceil(); + let width = (xmax - xmin) as usize; + let height = (ymax - ymin) as usize; + let commands = vec![ + zeno::Command::MoveTo(Vector::new( + from.0 - xmin + stroke_size / 2.0, + from.1 - ymin + stroke_size / 2.0, + )), + zeno::Command::LineTo(Vector::new( + to.0 - xmin + stroke_size / 2.0, + to.1 - ymin + stroke_size / 2.0, + )), + zeno::Command::Close, + ]; + stroke_mask.fill(0); + stroke_mask.resize(height * width * mask_factor, 0); + Mask::with_scratch(&commands, &mut scratch) + .format(format) + .style(Stroke::new(*stroke_size)) + .size(width as u32, height as u32) + .render_into(&mut stroke_mask, None); + (xmin as usize, ymin as 
usize, width, None, Some(*stroke)) + } + Element::Circle { + r, + center, + fill, + stroke, + stroke_size, + svg_header: _, + } => { + let width = (center.0.fract() + r * 2.0 + stroke_size).ceil() as usize; + let height = (center.1.fract() + r * 2.0 + stroke_size).ceil() as usize; + let mut commands = Vec::new(); + commands.add_circle( + ( + center.0.fract() + r + stroke_size / 2.0, + center.1.fract() + r + stroke_size / 2.0, + ), + *r, + ); + if fill.is_some() { + fill_mask.fill(0); + fill_mask.resize(height * width * mask_factor, 0); + Mask::with_scratch(&commands, &mut scratch) + .format(format) + .style(Fill::NonZero) + .size(width as u32, height as u32) + .render_into(&mut fill_mask, None); + } + stroke_mask.fill(0); + stroke_mask.resize(height * width * mask_factor, 0); + Mask::with_scratch(&commands, &mut scratch) + .format(format) + .style(Stroke::new(*stroke_size)) + .size(width as u32, height as u32) + .render_into(&mut stroke_mask, None); + ( + (center.0 - r) as usize, + (center.1 - r) as usize, + width, + *fill, + Some(*stroke), + ) + } + Element::Rectangle { + top, + w, + h, + fill, + stroke, + stroke_size, + svg_header: _, + } => { + let width = (top.0.fract() + w + stroke_size).ceil() as usize; + let height = (top.1.fract() + h + stroke_size).ceil() as usize; + let mut commands = Vec::new(); + commands.add_rect( + ( + top.0.fract() + stroke_size / 2.0, + top.1.fract() + stroke_size / 2.0, + ), + *w, + *h, + ); + fill_mask.fill(0); + fill_mask.resize(height * width * mask_factor, 0); + Mask::with_scratch(&commands, &mut scratch) + .format(format) + .style(Fill::NonZero) + .size(width as u32, height as u32) + .render_into(&mut fill_mask, None); + stroke_mask.fill(0); + stroke_mask.resize(height * width * mask_factor, 0); + Mask::with_scratch(&commands, &mut scratch) + .format(format) + .style(Stroke::new(*stroke_size)) + .size(width as u32, height as u32) + .render_into(&mut stroke_mask, None); + ( + (top.0 - stroke_size / 2.0) as usize, + (top.1 - 
stroke_size / 2.0) as usize, + width, + Some(*fill), + Some(*stroke), + ) + } + Element::Polygon { + points, + fill, + stroke, + stroke_size, + svg_header: _, + bevel, + } => { + let (xmin, xmax, ymin, ymax) = points + .iter() + .fold((f32::MAX, f32::MIN, f32::MAX, f32::MIN), |acc, (x, y)| { + (acc.0.min(*x), acc.1.max(*x), acc.2.min(*y), acc.3.max(*y)) + }); + let xmin = (xmin - stroke_size).floor(); + let xmax = (xmax + stroke_size).ceil(); + let ymin = (ymin - stroke_size).floor(); + let ymax = (ymax + stroke_size).ceil(); + let width = (xmax - xmin) as usize; + let height = (ymax - ymin) as usize; + let mut commands = Vec::with_capacity(points.len() + 2); + commands.push(zeno::Command::MoveTo(Point::new( + points[0].0 - xmin + stroke_size / 2.0, + points[0].1 - ymin + stroke_size / 2.0, + ))); + for point in points { + commands.push(zeno::Command::LineTo(Point::new( + point.0 - xmin + stroke_size / 2.0, + point.1 - ymin + stroke_size / 2.0, + ))); + } + commands.push(zeno::Command::Close); + fill_mask.fill(0); + fill_mask.resize(height * width * mask_factor, 0); + Mask::with_scratch(&commands, &mut scratch) + .format(format) + .style(Fill::NonZero) + .size(width as u32, height as u32) + .render_into(&mut fill_mask, None); + stroke_mask.fill(0); + stroke_mask.resize(height * width * mask_factor, 0); + Mask::with_scratch(&commands, &mut scratch) + .format(format) + .style(Stroke::new(*stroke_size).join(if *bevel { + zeno::Join::Bevel + } else { + zeno::Join::Miter + })) + .size(width as u32, height as u32) + .render_into(&mut stroke_mask, None); + ( + xmin as usize, + ymin as usize, + width, + Some(*fill), + Some(*stroke), + ) + } + Element::Text { + text, + position, + anchor, + baseline, + fill, + size, + italic: _, // Needs a separate font + } => { + let mut scaler = context.builder(font).size(*size).hint(true).build(); + let metrics = font.metrics(&[]); + let normalisation_factor = size / f32::from(metrics.units_per_em); + let y_offset = (match baseline { + 
TextBaseline::Hanging => metrics.ascent, + TextBaseline::Middle => metrics.ascent - metrics.x_height / 2.0, + TextBaseline::Ideographic => metrics.ascent + metrics.descent, + }) + .mul_add(-normalisation_factor, position.1); + let mut width = 0.0; + for c in text.chars() { + let id = font.charmap().map(c); + width += font.glyph_metrics(&[]).advance_width(id); + } + + let x_offset = (match anchor { + TextAnchor::Start => 0.0, + TextAnchor::Middle => width / 2.0, + TextAnchor::End => width, + }) + .mul_add(-normalisation_factor, position.0); + + let mut offset = 0.0; + for c in text.chars() { + let id = font.charmap().map(c); + let glyph_metrics = font.glyph_metrics(&[]); + let mask = Render::new(&[Source::Outline]) + .format(format) + .offset(Vector::new( + (x_offset + offset).fract(), + y_offset.fract() - 1.0, + )) + .render(&mut scaler, id) + .unwrap(); + draw_mask( + (&mut image, image_width), + (&mask.data, mask.placement.width as usize), + (x_offset + offset + mask.placement.left as f32) as usize, + (y_offset + mask.placement.top as f32) as usize, + *fill, + format, + ); + + offset += glyph_metrics.advance_width(id) * normalisation_factor; + } + (0, 0, 0, None, None) + } + Element::Curve { + start, + points, + stroke, + stroke_size, + } => { + let (xmin, xmax, ymin, ymax) = points.iter().fold( + (f32::MAX, f32::MIN, f32::MAX, f32::MIN), + |acc, (a, b, x, y)| { + ( + acc.0.min(*x).min(*a), + acc.1.max(*x).max(*a), + acc.2.min(*y).min(*b), + acc.3.max(*y).max(*b), + ) + }, + ); + let xmin = (xmin - stroke_size).floor(); + let xmax = (xmax + stroke_size).ceil(); + let ymin = (ymin - stroke_size).floor(); + let ymax = (ymax + stroke_size).ceil(); + let width = (xmax - xmin) as usize; + let height = (ymax - ymin) as usize; + let mut commands = Vec::with_capacity(points.len() + 1); + commands.push(zeno::Command::MoveTo(Point::new( + start.0 - xmin + stroke_size / 2.0, + start.1 - ymin + stroke_size / 2.0, + ))); + for point in points { + 
commands.push(zeno::Command::QuadTo( + Point::new( + point.0 - xmin + stroke_size / 2.0, + point.1 - ymin + stroke_size / 2.0, + ), + Point::new( + point.2 - xmin + stroke_size / 2.0, + point.3 - ymin + stroke_size / 2.0, + ), + )); + } + stroke_mask.fill(0); + stroke_mask.resize(height * width * mask_factor, 0); + Mask::with_scratch(&commands, &mut scratch) + .format(format) + .style(Stroke::new(*stroke_size)) + .size(width as u32, height as u32) + .render_into(&mut stroke_mask, None); + (xmin as usize, ymin as usize, width, None, Some(*stroke)) + } + }; + if let Some(fill) = fill { + draw_mask( + (&mut image, image_width), + (&fill_mask, mask_width), + x, + y, + fill, + format, + ); + } + if let Some(stroke) = stroke { + draw_mask( + (&mut image, image_width), + (&stroke_mask, mask_width), + x, + y, + stroke, + format, + ); + } + } + (image, image_width) + } +} + +/// Draw the specified mask onto the specified image +#[allow(clippy::identity_op, clippy::needless_pass_by_value)] // I like the + 0 in position calculations for symmetry reasons and image tuple looks makes no sense to pass by reference +fn draw_mask( + image: (&mut [u8], usize), + mask: (&[u8], usize), + x: usize, + y: usize, + colour: [u8; 3], + format: Format, +) { + let mask_factor = if format == Format::Alpha { 1 } else { 4 }; + let mask_height = mask.0.len() / mask_factor / mask.1; + for r in 0..mask_height { + for w in 0..mask.1 { + let image_pos = ((r + y) * image.1 + (w + x)) * 4; + let mask_pos = (r * mask.1 + w) * mask_factor; + + if image_pos >= image.0.len() || mask_pos >= mask.0.len() { + continue; + } + + if format == Format::Alpha { + image.0[image_pos + 0] = blend(mask.0[mask_pos], colour[0], image.0[image_pos + 0]); + image.0[image_pos + 1] = blend(mask.0[mask_pos], colour[1], image.0[image_pos + 1]); + image.0[image_pos + 2] = blend(mask.0[mask_pos], colour[2], image.0[image_pos + 2]); + } else { + image.0[image_pos + 0] = + blend(mask.0[mask_pos + 0], colour[0], image.0[image_pos + 
0]); + image.0[image_pos + 1] = + blend(mask.0[mask_pos + 1], colour[1], image.0[image_pos + 1]); + image.0[image_pos + 2] = + blend(mask.0[mask_pos + 2], colour[2], image.0[image_pos + 2]); + } + image.0[image_pos + 3] = 255; + } + } +} + +const fn blend(alpha: u8, foreground: u8, background: u8) -> u8 { + (((alpha as u16 * foreground as u16) + (255 - alpha) as u16 * background as u16) / 255) as u8 +} diff --git a/rustyms/src/glycan/render/element.rs b/rustyms/src/glycan/render/element.rs new file mode 100644 index 00000000..19b60732 --- /dev/null +++ b/rustyms/src/glycan/render/element.rs @@ -0,0 +1,1162 @@ +use std::f32::consts::PI; + +use itertools::Itertools; + +use crate::{ + fragment::GlycanPosition, + glycan::{ + render::{ + absolute::{AbsolutePositionedGlycan, OuterModifications}, + shape::{Colour, Shape}, + }, + GlycanBranchIndex, GlycanBranchMassIndex, GlycanDirection, + }, +}; + +/// A rendered glycan, contains all information needed to render this to svg or a bitmap. +pub struct RenderedGlycan { + /// The size of the canvas + pub(super) size: (f32, f32), + /// All elements to be rendered + pub(super) elements: Vec, + /// The background colour + pub(super) background: [u8; 3], + /// Midpoint in pixels from the right for a top down glycan or in pixels from the top for a left to right glycan + pub midpoint: f32, +} + +#[derive(Debug, Clone)] +pub(super) enum Element { + Line { + from: (f32, f32), + to: (f32, f32), + stroke: [u8; 3], + stroke_size: f32, + }, + Circle { + r: f32, + center: (f32, f32), + fill: Option<[u8; 3]>, + stroke: [u8; 3], + stroke_size: f32, + svg_header: String, + }, + Rectangle { + top: (f32, f32), + w: f32, + h: f32, + fill: [u8; 3], + stroke: [u8; 3], + stroke_size: f32, + svg_header: String, + }, + Polygon { + points: Vec<(f32, f32)>, + fill: [u8; 3], + stroke: [u8; 3], + stroke_size: f32, + svg_header: String, + bevel: bool, + }, + Curve { + start: (f32, f32), + points: Vec<(f32, f32, f32, f32)>, + stroke: [u8; 3], + 
stroke_size: f32, + }, + Text { + text: String, + position: (f32, f32), + anchor: TextAnchor, + baseline: TextBaseline, + fill: [u8; 3], + size: f32, + italic: bool, + }, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub(super) enum TextAnchor { + Start, + Middle, + End, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub(super) enum TextBaseline { + Hanging, + Middle, + Ideographic, +} + +/// The symbol or text to use at the base of a glycan. +/// +#[doc = include_str!("../../../images/glycan_root.svg")] +/// +/// _Glycan [G01670UQ](http://glytoucan.org/Structures/Glycans/G01670UQ) using the different root types: None, Line, Symbol, Text("pep"), Text("N"), Text("Arg")_ +/// +/// ```rust +/// # use rustyms::glycan::{GlycanStructure, GlycanDirection, GlycanRoot, GlycanSelection}; +/// const COLUMN_SIZE: f32 = 30.0; +/// const SUGAR_SIZE: f32 = 15.0; +/// const STROKE_SIZE: f32 = 1.5; +/// let mut output = String::new(); +/// let mut footnotes = Vec::new(); +/// let short_iupac = "Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[Gal(b1-4)GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc(?1-"; // Definition for G01670UQ +/// let structure = GlycanStructure::from_short_iupac(short_iupac, 0..short_iupac.len(), 0).unwrap(); +/// for root in [ +/// GlycanRoot::None, +/// GlycanRoot::Line, +/// GlycanRoot::Symbol, +/// GlycanRoot::Text("pep".to_string()), +/// GlycanRoot::Text("N".to_string()), +/// GlycanRoot::Text("Arg".to_string()), +/// ] { +/// let rendered = structure +/// .render( +/// root, +/// COLUMN_SIZE, +/// SUGAR_SIZE, +/// STROKE_SIZE, +/// GlycanDirection::TopDown, +/// GlycanSelection::FULL, +/// [0, 0, 0], +/// [255, 255, 255], +/// &mut footnotes, +/// ) +/// .unwrap(); +/// rendered.to_svg(&mut output).unwrap(); +/// } +/// ``` +/// This example shows how to generate SVGs for all the different root types as seen in the above picture. +/// Note that this writes all SVGs after each other to the variable `output`. 
Also note that this writes +/// all modifications that did not fit inside the image in the variable `footnotes` and this will need to +/// be dealt with by the caller, as indicated in [`GlycanStructure::render`](crate::glycan::GlycanStructure::render). +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] +pub enum GlycanRoot { + /// No symbol, this will also not draw a line from the root sugar + #[default] + None, + /// No symbol, but this will draw a line from the root sugar + Line, + /// A tilde ('~') like symbol to indicate the full peptidoform + Symbol, + /// A piece of text, take care to not make this too big as it will be cut off in the image. + /// Commonly used options are 'pep' to indicate the full peptidoform, or, to indicate the + /// attached amino acid, either 'Arg' or 'N'. + Text(String), +} + +/// The selected part of a glycan to render, [`Self::FULL`] is a shortcut to select the full glycan. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum GlycanSelection<'a> { + /// A subtree of the glycan, with potentially a break of the root of the subtree and breaks in the branches. + /// If no breaks are specified the full glycan is shown. The root is the first monosaccharide to be included + /// in the rendering. The fragment will not include the indicated glycan positions for the branch breaks. + Subtree(Option<&'a GlycanPosition>, &'a [GlycanPosition]), + /// A single sugar, all its branches will be shown as broken. + SingleSugar(&'a GlycanPosition), +} + +impl GlycanSelection<'static> { + /// A shorthand for a full glycan. + pub const FULL: Self = Self::Subtree(None, &[]); +} + +impl AbsolutePositionedGlycan { + /// Render this glycan to the internal rendering representation, returns None if the root break contains an invalid position. 
+ pub(super) fn render<'a>( + &'a self, + basis: GlycanRoot, + column_size: f32, + sugar_size: f32, + stroke_size: f32, + direction: GlycanDirection, + selection: GlycanSelection<'a>, + foreground: [u8; 3], + background: [u8; 3], + footnotes: &'a mut Vec, + ) -> Option { + fn render_element( + buffer: &mut Vec, + element: &AbsolutePositionedGlycan, + column_size: f32, + sugar_size: f32, + stroke_size: f32, + direction: GlycanDirection, + x_offset: f32, + y_offset: f32, + breaks: &[(usize, Vec<(GlycanBranchIndex, GlycanBranchMassIndex)>)], + foreground: [u8; 3], + background: [u8; 3], + incoming_stroke: (f32, f32, f32, f32), + footnotes: &mut Vec, + ) { + let raw_x = element.x - x_offset; + let raw_y = element.y as f32 - y_offset; + + let total_branches = element.branches.len() + element.sides.len(); + let mut strokes = vec![incoming_stroke]; + // First all lines to get good stacking behaviour + for (side, branch) in element + .branches + .iter() + .map(|b| (false, b)) + .chain(element.sides.iter().map(|b| (true, b))) + { + let origin_x = (raw_x + element.mid_point) * column_size; + let origin_y = (raw_y + 0.5) * column_size; + let base_x = (branch.x + branch.mid_point - x_offset) * column_size; + if (total_branches == 1 && breaks.iter().any(|b| b.0 == 1)) + || breaks + .iter() + .any(|b| b.0 == 1 && b.1.first().map(|b| b.0) == Some(branch.branch_index)) + { + let base_y = + (raw_y - 0.5 + f32::from(side)).mul_add(column_size, stroke_size * 0.5); + let angle = f32::atan2(base_y - origin_y, base_x - origin_x); + buffer.push(Element::Line { + from: pick_point((origin_x, origin_y), direction), + to: pick_point((base_x, base_y), direction), + stroke: foreground, + stroke_size, + }); + let x1 = (sugar_size / 2.0).mul_add(0.5f32.mul_add(PI, -angle).cos(), base_x); + let y1 = (sugar_size / 2.0).mul_add(-0.5f32.mul_add(PI, -angle).sin(), base_y); + let x2 = (sugar_size / 2.0).mul_add(-0.5f32.mul_add(PI, -angle).cos(), base_x); + let y2 = (sugar_size / 
2.0).mul_add(0.5f32.mul_add(PI, -angle).sin(), base_y); + buffer.push(Element::Line { + from: pick_point((x1, y1), direction), + to: pick_point((x2, y2), direction), + stroke: foreground, + stroke_size, + }); + let x3 = (stroke_size * 2.0).mul_add(-angle.cos(), x1); + let y3 = (stroke_size * 2.0).mul_add(-angle.sin(), y1); + buffer.push(Element::Line { + from: pick_point((x1, y1), direction), + to: pick_point((x3, y3), direction), + stroke: foreground, + stroke_size, + }); + let offset = 0.25f32.mul_add(column_size, stroke_size); + let r = 0.25f32 + .mul_add(column_size, -stroke_size) + .min(sugar_size * 0.25); + let adjusted_x = offset.mul_add(-angle.cos(), base_x); + let adjusted_y = offset.mul_add(-angle.sin(), base_y); + buffer.push(Element::Circle { + r, + center: pick_point((adjusted_x, adjusted_y), direction), + fill: None, + stroke: foreground, + stroke_size, + svg_header: String::new(), + }); + strokes.push(pick_box( + ( + origin_x.min(base_x), + base_y.min(origin_y), + origin_x.max(base_x), + base_y.max(origin_y), + ), + direction, + )); + } else { + let base_y = ((branch.y as f32) - y_offset + 0.5) * column_size; + buffer.push(Element::Line { + from: pick_point((origin_x, origin_y), direction), + to: pick_point((base_x, base_y), direction), + stroke: foreground, + stroke_size, + }); + strokes.push(pick_box( + ( + origin_x.min(base_x), + base_y.min(origin_y), + origin_x.max(base_x), + base_y.max(origin_y), + ), + direction, + )); + } + } + + // Render the sugar + let fill = if element.colour == Colour::Background { + background + } else { + element.colour.rgb() + }; + let title = format!( + " data-sugar=\"{}\" data-position=\"{}-{}\"", + element.title, + element.position.inner_depth, + element.position.branch.iter().map(|b| b.0).join(",") + ); + match element.shape { + Shape::Circle => buffer.push(Element::Circle { + r: sugar_size / 2.0, + center: pick_point( + ( + (raw_x + element.mid_point) * column_size, + (raw_y + 0.5) * column_size, + ), + direction, 
+ ), + fill: Some(fill), + stroke: foreground, + stroke_size, + svg_header: title, + }), + Shape::Square => buffer.push(Element::Rectangle { + top: pick_point( + ( + (raw_x + element.mid_point).mul_add(column_size, -sugar_size / 2.0), + (raw_y + 0.5).mul_add(column_size, -sugar_size / 2.0), + ), + direction, + ), + w: sugar_size, + h: sugar_size, + fill, + stroke: foreground, + stroke_size, + svg_header: title, + }), + Shape::Rectangle => { + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + buffer.push(Element::Rectangle { + top: (base_x - sugar_size / 2.0, base_y - sugar_size / 4.0), + w: sugar_size, + h: sugar_size / 2.0, + fill, + stroke: foreground, + stroke_size, + svg_header: title, + }); + } + Shape::Triangle => { + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + let x1 = base_x - sugar_size / 2.0; + let x2 = x1 + sugar_size / 2.0; + let x3 = x1 + sugar_size; + let y1 = base_y - sugar_size / 2.0; + let y2 = y1 + sugar_size; + + buffer.push(Element::Polygon { + points: vec![(x1, y2), (x2, y1), (x3, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::LeftPointingTriangle => { + let x1 = (raw_x + element.mid_point).mul_add(column_size, -sugar_size / 2.0); + let x2 = x1 + sugar_size; + let y1 = (raw_y + 0.5).mul_add(column_size, -sugar_size / 2.0); + let y2 = y1 + sugar_size / 2.0; + let y3 = y1 + sugar_size; + + buffer.push(Element::Polygon { + points: vec![ + pick_point((x1, y2), direction), + pick_point((x2, y1), direction), + pick_point((x2, y3), direction), + ], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::RightPointingTriangle => { + let x1 = (raw_x + element.mid_point).mul_add(column_size, -sugar_size / 2.0); + let x2 = x1 + sugar_size; + let y1 = (raw_y + 
0.5).mul_add(column_size, -sugar_size / 2.0); + let y2 = y1 + sugar_size / 2.0; + let y3 = y1 + sugar_size; + + buffer.push(Element::Polygon { + points: vec![ + pick_point((x1, y1), direction), + pick_point((x2, y2), direction), + pick_point((x1, y3), direction), + ], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::Diamond => { + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + let x1 = base_x - sugar_size / 2.0; + let x2 = x1 + sugar_size / 2.0; + let x3 = x1 + sugar_size; + let y1 = base_y - sugar_size / 2.0; + let y2 = y1 + sugar_size / 2.0; + let y3 = y1 + sugar_size; + + buffer.push(Element::Polygon { + points: vec![(x2, y1), (x3, y2), (x2, y3), (x1, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::FlatDiamond => { + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + let x1 = base_x - sugar_size / 2.0; + let x2 = x1 + sugar_size / 2.0; + let x3 = x1 + sugar_size; + let y1 = base_y - sugar_size / 4.0; + let y2 = y1 + sugar_size / 4.0; + let y3 = y1 + sugar_size / 2.0; + + buffer.push(Element::Polygon { + points: vec![(x2, y1), (x3, y2), (x2, y3), (x1, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::Hexagon => { + let a = sugar_size / 2.0 / 3.0_f32.sqrt(); + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + let x1 = base_x - sugar_size / 2.0; + let x2 = x1 + a; + let x3 = x1 + sugar_size - a; + let x4 = x1 + sugar_size; + let y1 = base_y - sugar_size / 4.0; + let y2 = y1 + sugar_size / 4.0; + let y3 = y1 + sugar_size / 2.0; + + buffer.push(Element::Polygon { + points: vec![(x1, y2), (x2, y1), (x3, y1), (x4, y2), (x3, y3), 
(x2, y3)], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::Pentagon => { + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + let a = (18.0 / 360.0 * 2.0 * PI).cos() * sugar_size / 2.0; + let b = (18.0 / 360.0 * 2.0 * PI).sin() * sugar_size / 2.0; + let c = (36.0 / 360.0 * 2.0 * PI).cos() * sugar_size / 2.0; + let d = (36.0 / 360.0 * 2.0 * PI).sin() * sugar_size / 2.0; + let x1 = base_x - a; + let x2 = base_x - d; + let x3 = base_x; + let x4 = base_x + d; + let x5 = base_x + a; + let y1 = base_y - sugar_size / 2.0; + let y2 = y1 + sugar_size / 2.0 - b; + let y3 = y1 + sugar_size / 2.0 + c; + + buffer.push(Element::Polygon { + points: vec![(x1, y2), (x3, y1), (x5, y2), (x4, y3), (x2, y3)], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::Star => { + // The Phi constant, the ratio for the "golden ratio" + const PHI: f32 = 1.618_034_f32; + // Calculate sizes of parts of the pentagram + let a = (18.0 / 360.0 * 2.0 * PI).cos() * sugar_size / 2.0; + let b = (18.0 / 360.0 * 2.0 * PI).sin() * sugar_size / 2.0; + let c = (36.0 / 360.0 * 2.0 * PI).cos() * sugar_size / 2.0; + let d = (36.0 / 360.0 * 2.0 * PI).sin() * sugar_size / 2.0; + let e = 2.0 * a / (54.0 / 360.0 * 2.0 * PI).sin() / (1.0 + 1.0 / PHI); + let f = (18.0 / 360.0 * 2.0 * PI) + .cos() + .mul_add(e, -(sugar_size / 2.0)); + let g = (18.0 / 360.0 * 2.0 * PI).sin() * e; + let h = (sugar_size / 2.0 - b) * (18.0 / 360.0 * 2.0 * PI).tan(); + let j = (18.0 / 360.0 * 2.0 * PI).tan() * g; + // Calculate the positions of the pentagram points + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + let x1 = base_x - a; + let x2 = base_x - d; + let x3 = base_x - g; + let x4 = base_x - h; + let x5 = base_x; + let x6 = base_x + h; + let x7 = 
base_x + g; + let x8 = base_x + d; + let x9 = base_x + a; + let y1 = base_y - sugar_size / 2.0; + let y2 = y1 + sugar_size / 2.0 - b; + let y3 = y1 + sugar_size / 2.0 + j; + let y4 = y1 + sugar_size / 2.0 + f; + let y5 = y1 + sugar_size / 2.0 + c; + + buffer.push(Element::Polygon { + points: vec![ + (x1, y2), + (x4, y2), + (x5, y1), + (x6, y2), + (x9, y2), + (x7, y3), + (x8, y5), + (x5, y4), + (x2, y5), + (x3, y3), + ], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::CrossedSquare => { + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + let x1 = base_x - sugar_size / 2.0; + let y1 = base_y - sugar_size / 2.0; + let x2 = x1 + sugar_size; + let y2 = y1 + sugar_size; + + buffer.push(Element::Polygon { + points: vec![(x1, y1), (x2, y1), (x2, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: String::new(), + bevel: true, + }); + buffer.push(Element::Polygon { + points: vec![(x1, y1), (x1, y2), (x2, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: String::new(), + bevel: true, + }); + buffer.push(Element::Polygon { + points: vec![(x1, y1), (x2, y1), (x2, y2), (x1, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::DividedDiamond => { + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + let x1 = base_x - sugar_size / 2.0; + let x2 = x1 + sugar_size / 2.0; + let x3 = x1 + sugar_size; + let y1 = base_y - sugar_size / 2.0; + let y2 = y1 + sugar_size / 2.0; + let y3 = y1 + sugar_size; + + buffer.push(Element::Polygon { + points: vec![(x1, y2), (x2, y1), (x3, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: String::new(), + bevel: true, + }); + buffer.push(Element::Polygon { + points: vec![(x1, y2), (x2, y3), (x3, y2)], + fill, + stroke: 
foreground, + stroke_size, + svg_header: String::new(), + bevel: true, + }); + buffer.push(Element::Polygon { + points: vec![(x1, y2), (x2, y1), (x3, y2), (x2, y3)], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + Shape::DividedTriangle => { + let (base_x, base_y) = pick_point( + ( + ((raw_x + element.mid_point) * column_size), + (raw_y + 0.5) * column_size, + ), + direction, + ); + let x1 = base_x - sugar_size / 2.0; + let x2 = x1 + sugar_size / 2.0; + let x3 = x1 + sugar_size; + let y1 = base_y - sugar_size / 2.0; + let y2 = y1 + sugar_size; + + buffer.push(Element::Polygon { + points: vec![(x2, y1), (x3, y2), (x2, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: String::new(), + bevel: true, + }); + buffer.push(Element::Polygon { + points: vec![(x2, y1), (x1, y2), (x2, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: String::new(), + bevel: true, + }); + buffer.push(Element::Polygon { + points: vec![(x2, y1), (x3, y2), (x1, y2)], + fill, + stroke: foreground, + stroke_size, + svg_header: title, + bevel: false, + }); + } + } + if !element.inner_modifications.is_empty() { + buffer.push(Element::Text { + text: element.inner_modifications.clone(), + position: pick_point( + ( + (raw_x + element.mid_point) * column_size, + (raw_y + 0.5) * column_size, + ), + direction, + ), + anchor: TextAnchor::Middle, + baseline: TextBaseline::Middle, + fill: foreground, + size: sugar_size / 2.0, + italic: true, + }); + } + if let Some((pos_x, pos_y, anchor, text)) = text_location( + &element.outer_modifications, + element.shape, + pick_point( + ( + (raw_x + element.mid_point - 0.5) * column_size, + raw_y * column_size, + ), + direction, + ), + column_size, + sugar_size, + stroke_size, + &strokes, + footnotes, + ) { + buffer.push(Element::Text { + text, + position: (pos_x, pos_y), + anchor, + baseline: TextBaseline::Hanging, + fill: foreground, + size: sugar_size / 2.0, + italic: false, + }); + } + // 
Render all connected sugars + for (index, branch) in element + .branches + .iter() + .chain(element.sides.iter()) + .enumerate() + { + if !((total_branches == 1 && breaks.iter().any(|b| b.0 == 1)) + || breaks + .iter() + .any(|b| b.0 == 1 && b.1.first().map(|b| b.0) == Some(branch.branch_index))) + { + render_element( + buffer, + branch, + column_size, + sugar_size, + stroke_size, + direction, + x_offset, + y_offset, + &breaks + .iter() + .filter(|b| { + (total_branches > 1 + && b.1.first().map(|b| b.0) == Some(branch.branch_index) + || total_branches == 1) + && b.0 > 0 + }) + .map(|b| (b.0 - 1, b.1[usize::from(total_branches > 1)..].to_vec())) + .collect_vec(), + foreground, + background, + strokes[index + 1], + footnotes, + ); + } + } + } + + let sub_tree = self.get_subtree(selection)?; + + let width = + (sub_tree.tree.x + sub_tree.tree.width - sub_tree.left_offset - sub_tree.right_offset) + * column_size; + let depth = sub_tree.depth as f32 + if sub_tree.break_top { 0.75 } else { 0.0 }; + let height = depth * column_size + + if sub_tree.break_bottom { + 3.5 * stroke_size + } else { + (match basis { + GlycanRoot::None => 0.0_f32, + GlycanRoot::Line | GlycanRoot::Symbol => 0.5, + GlycanRoot::Text(_) => 1.0, + }) * column_size + }; + + let size = pick_point((width, height), direction); + + let mut buffer = Vec::new(); + let stroke = if sub_tree.break_bottom { + let base_x = + (sub_tree.tree.x + sub_tree.tree.mid_point - sub_tree.left_offset) * column_size; + let base_y = depth.mul_add(column_size, stroke_size * 3.0); + buffer.push(Element::Line { + from: pick_point((base_x, (depth - 0.5) * column_size), direction), + to: pick_point((base_x, base_y), direction), + stroke: foreground, + stroke_size, + }); + buffer.push(Element::Line { + from: pick_point((base_x - sugar_size / 2.0, base_y), direction), + to: pick_point((base_x + sugar_size / 2.0, base_y), direction), + stroke: foreground, + stroke_size, + }); + let bs_x = base_x - sugar_size / 2.0; + 
buffer.push(Element::Line { + from: pick_point((bs_x, stroke_size.mul_add(-2.0, base_y)), direction), + to: pick_point((bs_x, base_y), direction), + stroke: foreground, + stroke_size, + }); + (base_x, (depth - 0.5) * column_size, base_x, base_y) + } else { + match basis { + GlycanRoot::None => ( + sub_tree.tree.x + sub_tree.tree.mid_point, + (depth + 0.5) * column_size, + sub_tree.tree.x + sub_tree.tree.mid_point, + (depth + 0.5) * column_size, + ), + GlycanRoot::Line => { + let base_x = (sub_tree.tree.x + sub_tree.tree.mid_point - sub_tree.left_offset) + * column_size; + let base_y = depth.mul_add(column_size, (column_size - sugar_size) / 2.0); + buffer.push(Element::Line { + from: pick_point((base_x, (depth - 0.5) * column_size), direction), + to: pick_point((base_x, base_y), direction), + stroke: foreground, + stroke_size, + }); + (base_x, (depth - 0.5) * column_size, base_x, base_y) + } + GlycanRoot::Symbol => { + let base_x = (sub_tree.tree.x + sub_tree.tree.mid_point - sub_tree.left_offset) + * column_size; + let base_y = depth.mul_add(column_size, (column_size - sugar_size) / 2.0); + buffer.push(Element::Line { + from: pick_point((base_x, (depth - 0.5) * column_size), direction), + to: pick_point((base_x, base_y), direction), + stroke: foreground, + stroke_size, + }); + buffer.push(Element::Curve { + start: pick_point( + (base_x - (sugar_size * 0.75).min(column_size * 0.5), base_y), + direction, + ), + points: vec![ + pick_double_point( + ( + sugar_size.mul_add(-0.5, base_x), + sugar_size.mul_add(0.5, base_y), + base_x, + base_y, + ), + direction, + ), + pick_double_point( + ( + sugar_size.mul_add(0.5, base_x), + sugar_size.mul_add(-0.5, base_y), + base_x + (sugar_size * 0.75).min(column_size * 0.5), + base_y, + ), + direction, + ), + ], + stroke: foreground, + stroke_size, + }); + (base_x, (depth - 0.5) * column_size, base_x, base_y) + } + GlycanRoot::Text(basis) => { + let base_x = (sub_tree.tree.x + sub_tree.tree.mid_point - sub_tree.left_offset) + * 
column_size; + let base_y = depth.mul_add(column_size, (column_size - sugar_size) / 2.0); + buffer.push(Element::Line { + from: pick_point((base_x, (depth - 0.5) * column_size), direction), + to: pick_point((base_x, base_y), direction), + stroke: foreground, + stroke_size, + }); + if direction == GlycanDirection::TopDown { + buffer.push(Element::Text { + text: basis, + position: (base_x, base_y + sugar_size), + anchor: TextAnchor::Middle, + baseline: TextBaseline::Ideographic, + fill: foreground, + size: sugar_size, + italic: false, + }); + } else { + buffer.push(Element::Text { + text: basis, + position: ((depth + 1.0) * column_size, base_x), + anchor: TextAnchor::End, + baseline: TextBaseline::Middle, + fill: foreground, + size: sugar_size, + italic: false, + }); + } + (base_x, (depth - 0.5) * column_size, base_x, base_y) + } + } + }; + + // If the full glycan has broken off immediately draw the break symbol + if sub_tree.branch_breaks.iter().any(|r| r.0 == 0) { + let origin_y = depth.mul_add(column_size, (column_size - sugar_size) / 2.0); + let base_x = + (sub_tree.tree.x + sub_tree.tree.mid_point - sub_tree.left_offset) * column_size; + let base_y = (depth - 0.5).mul_add(column_size, stroke_size * 0.5); + let angle = -PI / 2.0; // Always straight + buffer.push(Element::Line { + from: pick_point((base_x, origin_y), direction), + to: pick_point((base_x, base_y), direction), + stroke: foreground, + stroke_size, + }); + let x1 = (sugar_size / 2.0).mul_add(0.5f32.mul_add(PI, -angle).cos(), base_x); + let y1 = (sugar_size / 2.0).mul_add(-0.5f32.mul_add(PI, -angle).sin(), base_y); + let x2 = (sugar_size / 2.0).mul_add(-0.5f32.mul_add(PI, -angle).cos(), base_x); + let y2 = (sugar_size / 2.0).mul_add(0.5f32.mul_add(PI, -angle).sin(), base_y); + buffer.push(Element::Line { + from: pick_point((x1, y1), direction), + to: pick_point((x2, y2), direction), + stroke: foreground, + stroke_size, + }); + let x3 = (stroke_size * 2.0).mul_add(-angle.cos(), x1); + let y3 = 
(stroke_size * 2.0).mul_add(-angle.sin(), y1); + buffer.push(Element::Line { + from: pick_point((x1, y1), direction), + to: pick_point((x3, y3), direction), + stroke: foreground, + stroke_size, + }); + let offset = 0.25f32.mul_add(column_size, stroke_size); + let r = 0.25f32 + .mul_add(column_size, -stroke_size) + .min(sugar_size * 0.25); + let adjusted_x = offset.mul_add(-angle.cos(), base_x); + let adjusted_y = offset.mul_add(-angle.sin(), base_y); + buffer.push(Element::Circle { + r, + center: pick_point((adjusted_x, adjusted_y), direction), + fill: None, + stroke: foreground, + stroke_size, + svg_header: String::new(), + }); + } else { + render_element( + &mut buffer, + sub_tree.tree, + column_size, + sugar_size, + stroke_size, + direction, + sub_tree.left_offset, + sub_tree.tree.y as f32 - (depth - 1.0), + &sub_tree.branch_breaks, + foreground, + background, + pick_box(stroke, direction), + footnotes, + ); + } + + Some(RenderedGlycan { + size, + elements: buffer, + background, + midpoint: (sub_tree.tree.mid_point - sub_tree.left_offset) * column_size, + }) + } +} + +/// Determine the best location for text, returns the x, y, text-anchor, and the contents +fn text_location( + outer_modifications: &OuterModifications, + shape: Shape, + position: (f32, f32), + column_size: f32, + sugar_size: f32, + stroke_size: f32, + strokes: &[(f32, f32, f32, f32)], // x1, y1, x2, y2 + footnotes: &mut Vec, +) -> Option<(f32, f32, TextAnchor, String)> { + let text = match outer_modifications { + OuterModifications::Empty => return None, + OuterModifications::Footnote(index) => (index + 1).to_string(), // Human numbering + OuterModifications::Text(text) => text.clone(), + }; + + // Stay within box, dodge strokes, if not fitting fall back to adding to footnotes + let text_height = sugar_size / 2.0; + let text_width = (text.len() as f32) * text_height; // Rule of thumb, on average text is thinner than square, so this should be a good upper limit + let vertical_padding = 
shape.height().mul_add(-sugar_size, column_size) / 2.0; + + if vertical_padding >= text_height { + let mut options = vec![ + ( + ( + (column_size - text_width).mul_add(0.5, position.0), + position.1, + (column_size + text_width).mul_add(0.5, position.0), + position.1 + text_height, + ), + TextAnchor::Middle, + ), + ( + ( + (column_size - text_width).mul_add(0.5, position.0), + position.1 + column_size - text_height, + (column_size + text_width).mul_add(0.5, position.0), + position.1 + column_size, + ), + TextAnchor::Middle, + ), + ( + ( + position.0 + column_size.mul_add(0.5, -stroke_size) - text_width, + position.1, + position.0 + column_size.mul_add(0.5, -stroke_size), + position.1 + text_height, + ), + TextAnchor::End, + ), + ( + ( + stroke_size.mul_add(2.0, column_size.mul_add(0.5, position.0)), + position.1 + column_size - text_height, + stroke_size.mul_add(2.0, column_size.mul_add(0.5, position.0) + text_width), + position.1 + column_size, + ), + TextAnchor::Start, + ), + ( + ( + position.0, + position.1, + position.0 + text_width, + position.1 + text_height, + ), + TextAnchor::Start, + ), + ( + ( + position.0 + column_size - text_width, + position.1, + position.0 + column_size, + position.1 + text_height, + ), + TextAnchor::End, + ), + ( + ( + position.0, + position.1 + column_size - text_height, + position.0 + text_width, + position.1 + column_size, + ), + TextAnchor::Start, + ), + ( + ( + position.0 + column_size - text_width, + position.1 + column_size - text_height, + position.0 + column_size, + position.1 + column_size, + ), + TextAnchor::End, + ), + ]; + + // Remove any options that put text outside of the box + options.retain(|(option, _)| { + option.0 >= position.0 + && option.2 <= position.0 + column_size + && option.1 >= position.1 + && option.3 <= position.1 + column_size + }); + + for stroke in strokes { + options.retain(|(option, _)| !hitbox_test(*option, *stroke)); + } + if let Some((option, anchor)) = options.first() { + let (x, y) = match 
*anchor { + TextAnchor::Start => (option.0, option.1), + TextAnchor::Middle => ((option.0 + option.2) / 2.0, option.1), + TextAnchor::End => (option.2, option.1), + }; + return Some((x, y, *anchor, text)); + } + } + + if let OuterModifications::Text(text) = outer_modifications { + let index = footnotes + .iter() + .position(|p| *p == *text) + .unwrap_or_else(|| { + footnotes.push(text.clone()); + footnotes.len() - 1 + }); + + text_location( + &OuterModifications::Footnote(index), + shape, + position, + column_size, + sugar_size, + stroke_size, + strokes, + footnotes, + ) + } else { + Some(( + position.0 + column_size, + position.1 + column_size - text_height, + TextAnchor::End, + text, + )) + } +} + +/// Test if two boxes hit +fn hitbox_test(box1: (f32, f32, f32, f32), box2: (f32, f32, f32, f32)) -> bool { + debug_assert!(box1.0 <= box1.2, "Invalid boxes: {box1:?} {box2:?}"); + debug_assert!(box2.0 <= box2.2, "Invalid boxes: {box1:?} {box2:?}"); + debug_assert!(box1.1 <= box1.3, "Invalid boxes: {box1:?} {box2:?}"); + debug_assert!(box2.1 <= box2.3, "Invalid boxes: {box1:?} {box2:?}"); + box1.2 > box2.0 && box1.0 < box2.2 && box1.3 > box2.1 && box1.1 < box2.3 +} + +fn pick_point(a: (T, T), direction: GlycanDirection) -> (T, T) { + if direction == GlycanDirection::TopDown { + a + } else { + (a.1, a.0) + } +} + +fn pick_double_point(a: (T, T, T, T), direction: GlycanDirection) -> (T, T, T, T) { + if direction == GlycanDirection::TopDown { + a + } else { + (a.1, a.0, a.3, a.2) + } +} + +fn pick_box(a: (T, T, T, T), direction: GlycanDirection) -> (T, T, T, T) { + if direction == GlycanDirection::TopDown { + a + } else { + (a.1, a.0, a.3, a.2) + } +} diff --git a/rustyms/src/glycan/render/mod.rs b/rustyms/src/glycan/render/mod.rs new file mode 100644 index 00000000..157466ac --- /dev/null +++ b/rustyms/src/glycan/render/mod.rs @@ -0,0 +1,11 @@ +mod absolute; +#[cfg(feature = "glycan-render-bitmap")] +mod bitmap; +mod element; +mod shape; +mod svg; +#[cfg(all(test, 
impl MonoSaccharide {
    /// Get the shape, colour, inner modifications, and outer modifications for this monosaccharide.
    ///
    /// The substituents are first counted per kind. The base sugar, together with these
    /// counts, then selects one symbol (shape + colour). Every substituent that is implied
    /// by the selected symbol is subtracted from its counter, all remaining substituents
    /// are written out in the outer modifications text.
    ///
    /// The returned tuple contains:
    /// 1. the shape of the symbol,
    /// 2. the colour of the symbol,
    /// 3. the inner modifications: `f` for a furanose, followed by the configuration
    ///    letters, followed by `en`/`o` for every didehydro/alcohol substituent,
    /// 4. the outer modifications: the notation of every substituent not already implied
    ///    by the symbol.
    pub(super) fn get_shape(&self) -> (Shape, Colour, String, String) {
        // Common substitutions
        let mut nacetyl = 0;
        let mut acid = 0;
        let mut amino = 0;
        let mut deoxy = 0;
        // Additional needed substitutions
        let mut acetyl = 0;
        let mut glycolyl = 0;
        let mut nglycolyl = 0;
        let mut o_carboxy_ethyl = 0;
        // The inner text starts with the ring type, followed by the configuration
        let mut inner_modifications = if self.furanose {
            "f".to_string()
        } else {
            String::new()
        };
        if let Some(c) = &self.configuration {
            inner_modifications.push_str(match *c {
                Configuration::D => "D",
                Configuration::L => "L",
                Configuration::DD => "DD",
                Configuration::LL => "LL",
                Configuration::DL => "DL",
                Configuration::LD => "LD",
            });
        }
        // Count the substituents that can be part of a symbol, the others are directly
        // stored as inner or outer text
        let mut outer_modifications = String::new();
        for m in &self.substituents {
            match m {
                GlycanSubstituent::NAcetyl => nacetyl += 1,
                GlycanSubstituent::Acid => acid += 1,
                GlycanSubstituent::Amino => amino += 1,
                GlycanSubstituent::Deoxy => deoxy += 1,
                GlycanSubstituent::Acetyl => acetyl += 1,
                GlycanSubstituent::Glycolyl => glycolyl += 1,
                GlycanSubstituent::OCarboxyEthyl => o_carboxy_ethyl += 1,
                GlycanSubstituent::NGlycolyl => nglycolyl += 1,
                GlycanSubstituent::Didehydro => inner_modifications.push_str("en"),
                GlycanSubstituent::Alcohol => inner_modifications.push('o'), // Missing symbols: an for anhydro, on for lactone, am for lactam
                _ => outer_modifications.push_str(m.notation()),
            }
        }
        // Build the outer text from the counts that remain after selecting a symbol: each
        // notation repeated by its count, followed by the uncounted substituents. This
        // closure takes ownership of `outer_modifications`, so it can only be called once,
        // which is why every branch below calls it exactly once.
        let outer_mods = |nacetyl: usize,
                          acid: usize,
                          amino: usize,
                          deoxy: usize,
                          acetyl: usize,
                          glycolyl: usize,
                          nglycolyl: usize,
                          o_carboxy_ethyl: usize| {
            [
                GlycanSubstituent::NAcetyl.notation().repeat(nacetyl),
                GlycanSubstituent::Acid.notation().repeat(acid),
                GlycanSubstituent::Amino.notation().repeat(amino),
                GlycanSubstituent::Deoxy.notation().repeat(deoxy),
                GlycanSubstituent::Acetyl.notation().repeat(acetyl),
                GlycanSubstituent::Glycolyl.notation().repeat(glycolyl),
                GlycanSubstituent::NGlycolyl.notation().repeat(nglycolyl),
                GlycanSubstituent::OCarboxyEthyl
                    .notation()
                    .repeat(o_carboxy_ethyl),
                outer_modifications,
            ]
            .join("")
        };
        // Note on all subtractions below: the counters are unsigned, every subtraction is
        // guarded by the condition of the branch it is in, so it cannot underflow.
        match &self.base_sugar {
            // Pentoses are always a star, the isomer only selects the colour
            BaseSugar::Pentose(isomer) => (
                Shape::Star,
                match isomer {
                    None | Some(PentoseIsomer::Xylulose) => Colour::Background,
                    Some(PentoseIsomer::Arabinose) => Colour::Green,
                    Some(PentoseIsomer::Lyxose) => Colour::Yellow,
                    Some(PentoseIsomer::Xylose) => Colour::Orange,
                    Some(PentoseIsomer::Ribose) => Colour::Pink,
                },
                inner_modifications,
                outer_mods(
                    nacetyl,
                    acid,
                    amino,
                    deoxy,
                    acetyl,
                    glycolyl,
                    nglycolyl,
                    o_carboxy_ethyl,
                ),
            ),
            // Hexoses: the branches are ordered from most to least specific, the first
            // one that matches wins
            BaseSugar::Hexose(isomer) => {
                if o_carboxy_ethyl > 0 && nacetyl > 0 {
                    // O-carboxyethyl + N-acetyl: purple hexagon
                    (
                        Shape::Hexagon,
                        Colour::Purple,
                        inner_modifications,
                        outer_mods(
                            nacetyl - 1,
                            acid,
                            amino,
                            deoxy,
                            acetyl,
                            glycolyl,
                            nglycolyl,
                            o_carboxy_ethyl - 1,
                        ),
                    )
                } else if o_carboxy_ethyl > 0 && nglycolyl > 0 {
                    // O-carboxyethyl + N-glycolyl: light blue hexagon
                    (
                        Shape::Hexagon,
                        Colour::LightBlue,
                        inner_modifications,
                        outer_mods(
                            nacetyl,
                            acid,
                            amino,
                            deoxy,
                            acetyl,
                            glycolyl,
                            nglycolyl - 1,
                            o_carboxy_ethyl - 1,
                        ),
                    )
                } else if o_carboxy_ethyl > 0 && amino > 0 {
                    // O-carboxyethyl + amino: brown hexagon
                    (
                        Shape::Hexagon,
                        Colour::Brown,
                        inner_modifications,
                        outer_mods(
                            nacetyl,
                            acid,
                            amino - 1,
                            deoxy,
                            acetyl,
                            glycolyl,
                            nglycolyl,
                            o_carboxy_ethyl - 1,
                        ),
                    )
                } else if deoxy > 1 {
                    // Two deoxy: rectangle, coloured by isomer
                    let c = match isomer {
                        Some(HexoseIsomer::Glucose) => Colour::Blue,
                        Some(HexoseIsomer::Mannose) => Colour::Green,
                        Some(HexoseIsomer::Galactose) => Colour::Orange,
                        Some(HexoseIsomer::Altrose) => Colour::Pink,
                        Some(HexoseIsomer::Allose) => Colour::Purple,
                        Some(HexoseIsomer::Talose) => Colour::LightBlue,
                        Some(_) | None => Colour::Background,
                    };
                    (
                        Shape::Rectangle,
                        c,
                        inner_modifications,
                        outer_mods(
                            nacetyl,
                            acid,
                            amino,
                            deoxy - 2,
                            acetyl,
                            glycolyl,
                            nglycolyl,
                            o_carboxy_ethyl,
                        ),
                    )
                } else if amino > 1 && deoxy > 0 {
                    // Two amino + one deoxy: blue hexagon, independent of the isomer
                    (
                        Shape::Hexagon,
                        Colour::Blue,
                        inner_modifications,
                        outer_mods(
                            nacetyl,
                            acid,
                            amino - 2,
                            deoxy - 1,
                            acetyl,
                            glycolyl,
                            nglycolyl,
                            o_carboxy_ethyl,
                        ),
                    )
                } else if nacetyl > 0 && deoxy > 0 {
                    // N-acetyl + one deoxy: divided triangle, coloured by isomer
                    let c = match isomer {
                        Some(HexoseIsomer::Glucose) => Colour::Blue,
                        Some(HexoseIsomer::Mannose) => Colour::Green,
                        Some(HexoseIsomer::Galactose) => Colour::Red,
                        Some(HexoseIsomer::Altrose) => Colour::Pink,
                        Some(HexoseIsomer::Talose) => Colour::LightBlue,
                        Some(_) | None => Colour::Background,
                    };
                    (
                        Shape::DividedTriangle,
                        c,
                        inner_modifications,
                        outer_mods(
                            nacetyl - 1,
                            acid,
                            amino,
                            deoxy - 1,
                            acetyl,
                            glycolyl,
                            nglycolyl,
                            o_carboxy_ethyl,
                        ),
                    )
                } else if deoxy > 0 {
                    // One deoxy: triangle, coloured by isomer
                    let c = match isomer {
                        Some(HexoseIsomer::Glucose) => Colour::Blue,
                        Some(HexoseIsomer::Mannose) => Colour::Green,
                        Some(HexoseIsomer::Galactose) => Colour::Red,
                        Some(HexoseIsomer::Gulose) => Colour::Orange,
                        Some(HexoseIsomer::Altrose) => Colour::Pink,
                        Some(HexoseIsomer::Talose) => Colour::LightBlue,
                        Some(_) | None => Colour::Background,
                    };
                    (
                        Shape::Triangle,
                        c,
                        inner_modifications,
                        outer_mods(
                            nacetyl,
                            acid,
                            amino,
                            deoxy - 1,
                            acetyl,
                            glycolyl,
                            nglycolyl,
                            o_carboxy_ethyl,
                        ),
                    )
                } else if acid > 0 || amino > 0 || nacetyl > 0 {
                    // Acid, amino, or N-acetyl: the colour follows the isomer, the shape
                    // follows the first of these substituents that is present
                    let c = match isomer {
                        Some(HexoseIsomer::Glucose) => Colour::Blue,
                        Some(HexoseIsomer::Mannose) => Colour::Green,
                        Some(HexoseIsomer::Galactose) => Colour::Yellow,
                        Some(HexoseIsomer::Gulose) => Colour::Orange,
                        Some(HexoseIsomer::Altrose) => Colour::Pink,
                        Some(HexoseIsomer::Allose) => Colour::Purple,
                        Some(HexoseIsomer::Talose) => Colour::LightBlue,
                        Some(HexoseIsomer::Idose) => Colour::Brown,
                        Some(_) | None => Colour::Background,
                    };
                    let shape = if acid > 0 {
                        Shape::DividedDiamond
                    } else if amino > 0 {
                        Shape::CrossedSquare
                    } else {
                        Shape::Square
                    };
                    (
                        shape,
                        c,
                        inner_modifications,
                        // Only the one substituent that selected the shape is removed
                        outer_mods(
                            nacetyl - usize::from(shape == Shape::Square),
                            acid - usize::from(shape == Shape::DividedDiamond),
                            amino - usize::from(shape == Shape::CrossedSquare),
                            deoxy,
                            acetyl,
                            glycolyl,
                            nglycolyl,
                            o_carboxy_ethyl,
                        ),
                    )
                } else {
                    // No symbol defining substituents: circle or pentagon depending on
                    // the isomer, nothing is removed from the outer text
                    let (s, c) = match isomer {
                        None => (Shape::Circle, Colour::Background),
                        Some(HexoseIsomer::Glucose) => (Shape::Circle, Colour::Blue),
                        Some(HexoseIsomer::Mannose) => (Shape::Circle, Colour::Green),
                        Some(HexoseIsomer::Galactose) => (Shape::Circle, Colour::Yellow),
                        Some(HexoseIsomer::Gulose) => (Shape::Circle, Colour::Orange),
                        Some(HexoseIsomer::Altrose) => (Shape::Circle, Colour::Pink),
                        Some(HexoseIsomer::Allose) => (Shape::Circle, Colour::Purple),
                        Some(HexoseIsomer::Talose) => (Shape::Circle, Colour::LightBlue),
                        Some(HexoseIsomer::Idose) => (Shape::Circle, Colour::Brown),
                        Some(HexoseIsomer::Psicose) => (Shape::Pentagon, Colour::Pink),
                        Some(HexoseIsomer::Fructose) => (Shape::Pentagon, Colour::Green),
                        Some(HexoseIsomer::Sorbose) => (Shape::Pentagon, Colour::Orange),
                        Some(HexoseIsomer::Tagatose) => (Shape::Pentagon, Colour::Yellow),
                    };
                    (
                        s,
                        c,
                        inner_modifications,
                        outer_mods(
                            nacetyl,
                            acid,
                            amino,
                            deoxy,
                            acetyl,
                            glycolyl,
                            nglycolyl,
                            o_carboxy_ethyl,
                        ),
                    )
                }
            }
            // This specific heptose isomer: green hexagon, no substituents implied
            BaseSugar::Heptose(Some(HeptoseIsomer::GlyceroMannoHeptopyranose)) => (
                Shape::Hexagon,
                Colour::Green,
                inner_modifications,
                outer_mods(
                    nacetyl,
                    acid,
                    amino,
                    deoxy,
                    acetyl,
                    glycolyl,
                    nglycolyl,
                    o_carboxy_ethyl,
                ),
            ),
            // Unspecified heptose with two acid + one deoxy: orange hexagon
            BaseSugar::Heptose(None) if acid > 1 && deoxy > 0 => (
                Shape::Hexagon,
                Colour::Orange,
                inner_modifications,
                outer_mods(
                    nacetyl,
                    acid - 2,
                    amino,
                    deoxy - 1,
                    acetyl,
                    glycolyl,
                    nglycolyl,
                    o_carboxy_ethyl,
                ),
            ),
            // Octose with one acid + one deoxy: yellow hexagon
            BaseSugar::Octose if acid > 0 && deoxy > 0 => (
                Shape::Hexagon,
                Colour::Yellow,
                inner_modifications,
                outer_mods(
                    nacetyl,
                    acid - 1,
                    amino,
                    deoxy - 1,
                    acetyl,
                    glycolyl,
                    nglycolyl,
                    o_carboxy_ethyl,
                ),
            ),
            // Nonose with at least one acid + one amino: (flat) diamond
            BaseSugar::Nonose(isomer) if acid > 0 && amino > 0 => {
                if amino > 1 && deoxy > 1 {
                    // Two amino + two deoxy: flat diamond, coloured by isomer
                    (
                        Shape::FlatDiamond,
                        match isomer {
                            Some(NonoseIsomer::Pse) => Colour::Green,
                            Some(NonoseIsomer::Leg) => Colour::Yellow,
                            Some(NonoseIsomer::ELeg) => Colour::LightBlue,
                            Some(NonoseIsomer::Aci) => Colour::Pink,
                            _ => Colour::Background,
                        },
                        inner_modifications,
                        outer_mods(
                            nacetyl,
                            acid - 1,
                            amino - 2,
                            deoxy - 2,
                            acetyl,
                            glycolyl,
                            nglycolyl,
                            o_carboxy_ethyl,
                        ),
                    )
                } else {
                    // Diamond: the colour is selected by the first matching extra
                    // substituent (deoxy, then acetyl, then glycolyl), brown if none
                    let colour = if deoxy > 0 {
                        if *isomer == Some(NonoseIsomer::Kdn) {
                            Colour::Green
                        } else {
                            Colour::Red
                        }
                    } else if acetyl > 0 {
                        Colour::Purple
                    } else if glycolyl > 0 {
                        Colour::LightBlue
                    } else {
                        Colour::Brown
                    };
                    (
                        Shape::Diamond,
                        colour,
                        inner_modifications,
                        // Besides the acid and amino only the substituent that selected
                        // the colour is removed, the colour identifies which one that was
                        outer_mods(
                            nacetyl,
                            acid - 1,
                            amino - 1,
                            deoxy - usize::from(colour == Colour::Red || colour == Colour::Green),
                            acetyl - usize::from(colour == Colour::Purple),
                            glycolyl - usize::from(colour == Colour::LightBlue),
                            nglycolyl,
                            o_carboxy_ethyl,
                        ),
                    )
                }
            }
            // Anything else: uncoloured hexagon with all substituents as outer text
            _ => (
                Shape::Hexagon,
                Colour::Background,
                inner_modifications,
                outer_mods(
                    nacetyl,
                    acid,
                    amino,
                    deoxy,
                    acetyl,
                    glycolyl,
                    nglycolyl,
                    o_carboxy_ethyl,
                ),
            ),
        }
    }
}
LightBlue, + Brown, + Red, +} + +impl Colour { + /// Represented as bytes 0..=255 + pub(super) const fn rgb(self) -> [u8; 3] { + match self { + Self::Background => [255, 255, 255], + Self::Blue => [0, 144, 188], + Self::Green => [0, 166, 81], + Self::Yellow => [255, 212, 0], + Self::Orange => [244, 121, 32], + Self::Pink => [246, 158, 161], + Self::Purple => [165, 67, 153], + Self::LightBlue => [143, 204, 233], + Self::Brown => [161, 122, 77], + Self::Red => [237, 28, 36], + } + } +} + +/// All symbols from Symbol Nomenclature For Glycans (SNFG) +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub(super) enum Shape { + Circle, + Square, + CrossedSquare, + DividedDiamond, + Triangle, + LeftPointingTriangle, + RightPointingTriangle, + DividedTriangle, + Rectangle, + Star, + Diamond, + FlatDiamond, + Hexagon, + Pentagon, +} + +impl Shape { + /// The height of a symbol as ratio to the width + pub(super) const fn height(self) -> f32 { + match self { + Self::Rectangle | Self::FlatDiamond | Self::Hexagon => 0.5, + _ => 1.0, + } + } +} diff --git a/rustyms/src/glycan/render/svg.rs b/rustyms/src/glycan/render/svg.rs new file mode 100644 index 00000000..7869dc15 --- /dev/null +++ b/rustyms/src/glycan/render/svg.rs @@ -0,0 +1,110 @@ +use std::fmt::Write; + +use itertools::Itertools; + +use crate::glycan::{ + render::element::{Element, TextAnchor, TextBaseline}, + RenderedGlycan, +}; + +impl RenderedGlycan { + /// Render this glycan as SVG. The SVG will be appended to the given buffer. + /// * `output`: the buffer to append the SVG to. + /// + /// # Errors + /// If the underlying buffer errors the error is returned. 
+ pub fn to_svg(&self, mut output: impl Write) -> Result<(), std::fmt::Error> { + fn clr(clr: Option<&[u8; 3]>) -> String { + if let Some([r, g, b]) = clr { + format!("rgb({r},{g},{b})") + } else { + "transparent".to_string() + } + } + + write!( + output, + "", + self.size.0, self.size.1, self.midpoint + )?; + for element in &self.elements { + match element { + Element::Line { + from, + to, + stroke, + stroke_size, + } => write!(output, + "", + from.0, + from.1, + to.0, + to.1, + clr(Some(stroke)))?, + Element::Circle { + r, + center, + fill, + stroke, + stroke_size, + svg_header, + } => write!(output, + "", + center.0, + center.1, + clr(fill.as_ref()), + clr(Some(stroke)))?, + Element::Rectangle { + top, + w, + h, + fill, + stroke, + stroke_size, + svg_header, + } => write!(output, + "", + top.0, + top.1, + clr(Some(fill)), + clr(Some(stroke)))?, + Element::Polygon { + points, + fill, + stroke, + stroke_size, + svg_header, + bevel, + } => write!(output, + "", + points.iter().map(|(a, b)| format!("{a} {b}")).join(" "), + clr(Some(fill)), + clr(Some(stroke)), + if *bevel {" stroke-linejoin=\"bevel\""} else {""})?, + Element::Text { + text, + position, + anchor, + baseline, + fill, + size, + italic, + } => write!(output, + "{text}", + position.0, + position.1, + clr(Some(fill)), + match anchor {TextAnchor::Start => "start", TextAnchor::Middle => "middle", TextAnchor::End => "End"}, + match baseline {TextBaseline::Hanging => "hanging", TextBaseline::Middle => "middle", TextBaseline::Ideographic => "ideographic"}, + if *italic {" font-style=\"italic\""} else {""})?, + Element::Curve { start, points, stroke, stroke_size } => write!(output, + "", + start.0, start.1, + points.iter().map(|(a, b, c, d)| format!("Q {a} {b} {c} {d}")).join(" "), + clr(Some(stroke)))?, + } + } + write!(output, "")?; + Ok(()) + } +} diff --git a/rustyms/src/glycan/render/test.rs b/rustyms/src/glycan/render/test.rs new file mode 100644 index 00000000..d1294078 --- /dev/null +++ 
b/rustyms/src/glycan/render/test.rs @@ -0,0 +1,561 @@ +#![allow(clippy::missing_panics_doc)] +use base64::Engine; +use swash::{scale::ScaleContext, CacheKey, FontRef}; + +use crate::{ + fragment::GlycanPosition, + glycan::{render::element::GlycanSelection, GlycanDirection, GlycanRoot, GlycanStructure}, +}; +use std::{ + fmt::Write, + io::BufWriter, + path::{Path, PathBuf}, +}; + +pub struct Font { + // Full content of the font file + data: Vec, + // Offset to the table directory + offset: u32, + // Cache key + key: CacheKey, +} + +impl Font { + pub fn from_file(path: PathBuf, index: usize) -> Option { + // Read the full font file + let data = std::fs::read(path).ok()?; + // Create a temporary font reference for the first font in the file. + // This will do some basic validation, compute the necessary offset + // and generate a fresh cache key for us. + let font = FontRef::from_index(&data, index)?; + let (offset, key) = (font.offset, font.key); + // Return our struct with the original file data and copies of the + // offset and key from the font reference + Some(Self { data, offset, key }) + } + + // Create the transient font reference for accessing this crate's + // functionality. + pub fn as_ref(&self) -> FontRef { + // Note that you'll want to initialize the struct directly here as + // using any of the FontRef constructors will generate a new key which, + // while completely safe, will nullify the performance optimizations of + // the caching mechanisms used in this crate. 
+ FontRef { + data: &self.data, + offset: self.offset, + key: self.key, + } + } +} + +#[test] +fn test_rendering() { + const COLUMN_SIZE: f32 = 30.0; + const SUGAR_SIZE: f32 = 15.0; + const STROKE_SIZE: f32 = 1.5; + + let font = Font::from_file( + std::fs::read_dir( + directories::UserDirs::font_dir( + &directories::UserDirs::new().expect("Could not find user directories"), + ) + .unwrap_or_else(|| Path::new("C:/WINDOWS/Fonts")), // Font directory not defined for windows + ) + .expect("Could not open font directory") + .find(|p| { + p.as_ref() + .is_ok_and(|p| p.file_name().eq_ignore_ascii_case("times.ttf")) + }) + .expect("No font files") + .expect("Could not open font file") + .path(), + 0, + ) + .expect("Invalid font"); + + let mut html = String::new(); + let mut footnotes = Vec::new(); + write!(&mut html, "Glycan render test").unwrap(); + + let codes = [ + ("G01670UQ", "Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[Gal(b1-4)GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc(?1-"), + ("G13523IF", "Fuc(?1-?)Gal(?1-?)GalNAc(?1-"), + ("G00613DO", "GlcN(b1-4)GlcNAc(b1-4)GlcNAc(b1-4)GlcNAc6S(?1-"), + ("G00621IU", "Neu5Gc(a2-3/6)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Gal(a1-3)Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Neu5Ac(a2-8)Neu5Ac(a2-3/6)Gal(b1-4)GlcNAc(b1-3)Gal(b1-4)GlcNAc(b1-2)[Neu5Ac(a2-3/6)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc(?1-"), + ("G01464QV", "Rha2,3,4Ac3(a1-2)[Xyl(b1-3)]Ara(a1-"), + ("G04421VO", "Fruf(b2-1a)[Glc(a1-2)Glc(a1-2)Glc(a1-2)Glc(a1-2)Glc(a1-2)Glc(a1-2)]Glc"), + ("G04458LN", "Kdn(a2-3)Gal(b1-4)ManNAc(b1-2)[Kdn(a2-3)Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[GlcNAc(b1-4)][Kdn(a2-3)Gal(b1-4)GlcNAc(b1-2)[Neu5Gc(a2-3)Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc(b1-"), + ("G69524KC", "Xyl(?1-?)Ara(?1-?)[Gal(?1-?)]GlcA"), + ("G37707YH", 
"Fuc(a1-2)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)[Gal(a1-3)Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[GlcNAc(b1-4)][Neu5Gc(a2-3/6)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-3)Gal(b1-4)GlcNAc(b1-2)[Neu5Ac(a2-3/6)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc(?1-"), + ("G07370RP", "Rha(a1-3)Qui(b1-4)Rha(a1-2)Glc(b1-2)[Rha(a1-6)]Glc(b1-"), + ("G11504PZ", "Dig3CMe(b1-3)Oli(b1-3)Oli(b1-"), + ("G64699IM", "GlcA(b1-3)GalNAc(b1-4)4eLeg?5,7Ac2(a2-"), + ("G14402AU", "D-Araf(b1-5)Dha(?2-3)[GalA(a1-4)GalA(a1-4)]GalA(a1-4)GalA"), + ("G08395BZ", "Glc(b1-2a)[Ido(b1-3)]Psif"), + ("G49642ZT", "Man(?1-?)[Man(?1-?)]Man(?1-?)[Man(?1-?)]Man(?1-?)GlcNAc(?1-?)[Fuc(?1-?)][Fuc(?1-?)]GlcNAc(?1-"), + ("G59426OB", "Hex(?1-?)HexNAc(?1-?)HexA(?1-?)Gal(?1-?)GalNAc-ol"), + ("G75424NV", "Hex?(?1-?)Hex?NAc(?1-?)[Hex?NAc(?1-?)]Hex?(?1-?)[Hex?(?1-?)[Hex?(?1-?)]Hex?(?1-?)][Hex?NAc(?1-?)]Hex?(?1-?)Hex?NAc(?1-?)Hex?NAc(?1-"), + ("G36128WO", "Ido(b1-3)ManNAc(?1-3)[Ido(b1-3)L-AllNAc(b1-3)Ido(b1-4)AltNAc(b1-6)]Tal(b1-4)D-Ido(?1-"), + ("G83422GV", "L-6dTal(a1-3)[Fuc(a1-2)Gal(b1-4)GlcNAc(b1-3)Gal(b1-4)]GlcNAc(b1-3)Gal(b1-3)[Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-6)]GalNAc(a1-"), + ("G09073GJ","GalNAc(?1-?)GlcA2,3NAc2(?1-?)D-FucNAc"), + ("G00069DT","Neu(a2-3)Gal(b1-4)GlcNAc(b1-3)Gal(b1-4)GlcNAc(b1-3)Gal(b1-4)Glc(b1-"), + ("G00468KU","GlcNAc(b1-2)Man(a1-3)[GlcNAc(b1-4)][Man(a1-?)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc(?1-"), + ("G75079FY","Neu5Ac(?2-?)Gal(?1-?)GlcNAc(?1-?)Man(?1-?)[Neu5Ac(?2-?)Gal(?1-?)GlcNAc(?1-?)Man(?1-?)][GlcNAc(?1-?)]Man(?1-?)GlcNAc(?1-?)[Fuc(?1-?)]GlcNAc"), + ]; + + let mut context = ScaleContext::new(); + for (index, (_, iupac)) in codes.iter().enumerate() { + let structure = GlycanStructure::from_short_iupac(iupac, 0..iupac.len(), 0).unwrap(); + let rendered = structure + .render( + crate::glycan::render::GlycanRoot::Text("pep".to_string()), + COLUMN_SIZE, + SUGAR_SIZE, + STROKE_SIZE, + if index % 3 == 0 { + GlycanDirection::LeftToRight + } else { + GlycanDirection::TopDown + }, + 
GlycanSelection::FULL, + [66, 66, 66], + [255, 255, 255], + &mut footnotes, + ) + .unwrap(); + rendered.to_svg(&mut html).unwrap(); + let (bitmap, width) = rendered.to_bitmap( + if index % 2 == 0 { + zeno::Format::subpixel_bgra() + } else { + zeno::Format::Alpha + }, + font.as_ref(), + &mut context, + ); + let mut buffer = Vec::new(); + let mut w = BufWriter::new(&mut buffer); + let mut encoder = + png::Encoder::new(&mut w, width as u32, (bitmap.len() / 4 / width) as u32); + encoder.set_color(png::ColorType::Rgba); + encoder.set_depth(png::BitDepth::Eight); + let mut writer = encoder.write_header().unwrap(); + writer.write_image_data(&bitmap).unwrap(); + drop(writer); + drop(w); + + write!(&mut html, "").unwrap(); + } + + for (index, selection) in [ + ( + 0, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 2, + series_number: 2, + branch: Vec::new(), + attachment: None, + }), + &[], + ), + ), + ( + 0, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 2, + series_number: 2, + branch: Vec::new(), + attachment: None, + }), + &[GlycanPosition { + inner_depth: 4, + series_number: 4, + branch: vec![(1, 1)], + attachment: None, + }], + ), + ), + ( + 0, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 2, + series_number: 2, + branch: Vec::new(), + attachment: None, + }), + &[ + GlycanPosition { + inner_depth: 5, + series_number: 5, + branch: vec![(0, 0)], + attachment: None, + }, + GlycanPosition { + inner_depth: 3, + series_number: 3, + branch: vec![(1, 1)], + attachment: None, + }, + ], + ), + ), + ( + 0, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 4, + series_number: 4, + branch: vec![(1, 1)], + attachment: None, + }), + &[], + ), + ), + ( + 14, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 0, + series_number: 0, + branch: Vec::new(), + attachment: None, + }), + &[GlycanPosition { + inner_depth: 1, + series_number: 1, + branch: vec![(0, 0)], + attachment: None, + }], + 
), + ), + ( + 14, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 1, + series_number: 1, + branch: vec![(0, 0)], + attachment: None, + }), + &[GlycanPosition { + inner_depth: 2, + series_number: 2, + branch: vec![(0, 0)], + attachment: None, + }], + ), + ), + ( + 16, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 1, + series_number: 1, + branch: Vec::new(), + attachment: None, + }), + &[ + GlycanPosition { + inner_depth: 3, + series_number: 3, + branch: vec![(0, 0)], + attachment: None, + }, + GlycanPosition { + inner_depth: 4, + series_number: 4, + branch: vec![(1, 1), (1, 1)], + attachment: None, + }, + ], + ), + ), + ( + 18, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 1, + series_number: 1, + branch: vec![(0, 0)], + attachment: None, + }), + &[ + GlycanPosition { + inner_depth: 3, + series_number: 3, + branch: vec![(0, 0), (0, 0)], + attachment: None, + }, + GlycanPosition { + inner_depth: 6, + series_number: 6, + branch: vec![(0, 0), (1, 1)], + attachment: None, + }, + ], + ), + ), + ( + 1, + GlycanSelection::Subtree( + None, + &[GlycanPosition { + inner_depth: 1, + series_number: 1, + branch: Vec::new(), + attachment: None, + }], + ), + ), + ( + 1, + GlycanSelection::Subtree( + None, + &[GlycanPosition { + inner_depth: 2, + series_number: 2, + branch: Vec::new(), + attachment: None, + }], + ), + ), + ( + // Y5 + 21, + GlycanSelection::Subtree( + None, + &[GlycanPosition { + inner_depth: 0, + series_number: 5, + branch: Vec::new(), + attachment: None, + }], + ), + ), + ( + // B3Y1gY2bY2a + 21, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 2, + series_number: 3, + branch: vec![], + attachment: None, + }), + &[ + GlycanPosition { + inner_depth: 3, + series_number: 1, + branch: vec![(1, 2)], + attachment: None, + }, + GlycanPosition { + inner_depth: 3, + series_number: 2, + branch: vec![(2, 1)], + attachment: None, + }, + GlycanPosition { + inner_depth: 3, + series_number: 2, + 
branch: vec![(0, 0)], + attachment: None, + }, + ], + ), + ), + ( + // B2aY1a + 21, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 3, + series_number: 2, + branch: vec![(0, 0)], + attachment: None, + }), + &[GlycanPosition { + inner_depth: 4, + series_number: 1, + branch: vec![(0, 0)], + attachment: None, + }], + ), + ), + ( + // B2bY1b + 21, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 3, + series_number: 2, + branch: vec![(2, 1)], + attachment: None, + }), + &[GlycanPosition { + inner_depth: 4, + series_number: 1, + branch: vec![(2, 1)], + attachment: None, + }], + ), + ), + ( + // B1b + 21, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 4, + series_number: 1, + branch: vec![(2, 1)], + attachment: None, + }), + &[], + ), + ), + ( + // dHex3 + 21, + GlycanSelection::SingleSugar(&GlycanPosition { + inner_depth: 2, + series_number: 3, + branch: vec![], + attachment: None, + }), + ), + ( + // dHex2a + 21, + GlycanSelection::SingleSugar(&GlycanPosition { + inner_depth: 3, + series_number: 2, + branch: vec![(0, 0)], + attachment: None, + }), + ), + ( + // dHex2b + 21, + GlycanSelection::SingleSugar(&GlycanPosition { + inner_depth: 3, + series_number: 2, + branch: vec![(2, 1)], + attachment: None, + }), + ), + ( + // dHex1b + 21, + GlycanSelection::SingleSugar(&GlycanPosition { + inner_depth: 4, + series_number: 1, + branch: vec![(2, 1)], + attachment: None, + }), + ), + ( + // B4a' + 22, + GlycanSelection::Subtree( + Some(&GlycanPosition { + inner_depth: 3, + series_number: 4, + branch: vec![(0, 0), (1, 1)], + attachment: None, + }), + &[], + ), + ), + ] { + let structure = + GlycanStructure::from_short_iupac(codes[index].1, 0..codes[index].1.len(), 0).unwrap(); + if let Some(rendered) = structure.render( + crate::glycan::render::GlycanRoot::Symbol, + COLUMN_SIZE, + SUGAR_SIZE, + STROKE_SIZE, + GlycanDirection::TopDown, + selection, + [0, 0, 0], + [255, 255, 255], + &mut footnotes, + ) { + 
rendered.to_svg(&mut html).unwrap(); + let (bitmap, width) = rendered.to_bitmap( + if index % 2 == 0 { + zeno::Format::subpixel_bgra() + } else { + zeno::Format::Alpha + }, + font.as_ref(), + &mut context, + ); + let mut buffer = Vec::new(); + let mut w = BufWriter::new(&mut buffer); + let mut encoder = + png::Encoder::new(&mut w, width as u32, (bitmap.len() / 4 / width) as u32); + encoder.set_color(png::ColorType::Rgba); + encoder.set_depth(png::BitDepth::Eight); + let mut writer = encoder.write_header().unwrap(); + writer.write_image_data(&bitmap).unwrap(); + drop(writer); + drop(w); + + write!(&mut html, "").unwrap(); + } else { + write!(&mut html, "Render error: invalid root").unwrap(); + } + } + + let structure = GlycanStructure::from_short_iupac(codes[0].1, 0..codes[0].1.len(), 0).unwrap(); + for root in [ + GlycanRoot::None, + GlycanRoot::Line, + GlycanRoot::Symbol, + GlycanRoot::Text("pep".to_string()), + GlycanRoot::Text("N".to_string()), + GlycanRoot::Text("Arg".to_string()), + ] { + let rendered = structure + .render( + root, + COLUMN_SIZE, + SUGAR_SIZE, + STROKE_SIZE, + GlycanDirection::TopDown, + GlycanSelection::FULL, + [0, 0, 0], + [255, 255, 255], + &mut footnotes, + ) + .unwrap(); + rendered.to_svg(&mut html).unwrap(); + } + + write!(&mut html, "
").unwrap(); + if !footnotes.is_empty() { + write!(&mut html, "
    ").unwrap(); + for note in footnotes { + write!(&mut html, "
  1. {note}
  2. ").unwrap(); + } + write!(&mut html, "

").unwrap(); + } + for (code, _) in &codes { + write!( + &mut html, + "" + ) + .unwrap(); + } + write!(&mut html, "").unwrap(); + std::fs::write("../rendered_glycans.html", html).unwrap(); +} diff --git a/rustyms/src/helper_functions.rs b/rustyms/src/helper_functions.rs index f10dd7a0..9b3b05b0 100644 --- a/rustyms/src/helper_functions.rs +++ b/rustyms/src/helper_functions.rs @@ -1,6 +1,8 @@ #![allow(dead_code)] use std::{ + collections::HashMap, + hash::Hash, num::{IntErrorKind, ParseIntError}, ops::{Bound, Range, RangeBounds}, path::Path, @@ -418,6 +420,19 @@ pub fn f64_bits(value: f64) -> u64 { } } +pub fn merge_hashmap(one: HashMap, two: HashMap) -> HashMap +where + V: std::ops::MulAssign + Default, + K: Eq + Hash, +{ + let mut new = one; + for (key, value) in two { + let v = new.entry(key).or_default(); + *v *= value + } + new +} + /// Implement a binary operator for all ref cases after the implementation for the ref-ref case (assumes deref operator works) macro_rules! impl_binop_ref_cases { (impl $imp:ident, $method:ident for $t:ty, $u:ty, $o:ty) => { diff --git a/rustyms/src/identification/identified_peptide.rs b/rustyms/src/identification/identified_peptide.rs index 054ee678..cfa7beae 100644 --- a/rustyms/src/identification/identified_peptide.rs +++ b/rustyms/src/identification/identified_peptide.rs @@ -506,9 +506,9 @@ impl IdentifiedPeptide { precursor_mz: mz, .. }) | MetaData::MSFragger(MSFraggerData { mz, .. }) => Some(*mz), - MetaData::MZTab(MZTabData { mz, .. }) | MetaData::MaxQuant(MaxQuantData { mz, .. }) => { - *mz - } + MetaData::MZTab(MZTabData { mz, .. }) + | MetaData::MaxQuant(MaxQuantData { mz, .. }) + | MetaData::DeepNovoFamily(DeepNovoFamilyData { mz, .. }) => *mz, MetaData::Sage(SageData { mass, z, .. }) | MetaData::NovoB(NovoBData { mass, z, .. }) | MetaData::PLink(PLinkData { mass, z, .. 
}) => { @@ -516,8 +516,7 @@ impl IdentifiedPeptide { mass.value / (z.value as f64), )) } - MetaData::DeepNovoFamily(_) - | MetaData::Fasta(_) + MetaData::Fasta(_) | MetaData::SpectrumSequenceList(_) | MetaData::PowerNovo(_) | MetaData::PepNet(_) => None, diff --git a/rustyms/src/isobaric_sets.rs b/rustyms/src/isobaric_sets.rs index 64297d20..e4c27376 100644 --- a/rustyms/src/isobaric_sets.rs +++ b/rustyms/src/isobaric_sets.rs @@ -4,6 +4,7 @@ use itertools::Itertools; use crate::{ checked_aminoacid::CheckedAminoAcid, + model::GlycanModel, modification::{Modification, SimpleModification, SimpleModificationInner}, peptidoform::SimpleLinear, placement_rule::{PlacementRule, Position}, @@ -128,6 +129,7 @@ pub fn building_blocks( false, SequencePosition::default(), 0, + &GlycanModel::DISALLOW, ) .0 .iter() @@ -198,11 +200,19 @@ pub fn building_blocks( options }) .flat_map(|s| { - s.formulas_all(&[], &[], &mut Vec::new(), false, position, 0) - .0 - .iter() - .map(|f| (s.clone(), f.monoisotopic_mass())) - .collect_vec() + s.formulas_all( + &[], + &[], + &mut Vec::new(), + false, + position, + 0, + &GlycanModel::DISALLOW, + ) + .0 + .iter() + .map(|f| (s.clone(), f.monoisotopic_mass())) + .collect_vec() }) .collect(); options.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); diff --git a/rustyms/src/lib.rs b/rustyms/src/lib.rs index 96d410f9..18f9b2c3 100644 --- a/rustyms/src/lib.rs +++ b/rustyms/src/lib.rs @@ -34,8 +34,7 @@ mod formula; #[path = "shared/csv.rs"] pub mod csv; -pub mod aminoacid_properties; -mod aminoacids; +pub mod aminoacid; mod checked_aminoacid; mod element; pub mod error; @@ -69,26 +68,26 @@ pub mod spectrum; pub mod system; mod tolerance; -pub use crate::element::*; -pub use crate::formula::*; -pub use crate::formula_search::find_formulas; -pub use crate::isobaric_sets::{building_blocks, find_isobaric_sets}; -pub use crate::mass_mode::MassMode; -pub use crate::model::Model; -pub use crate::modification::{CrossLinkName, Modification}; -pub use 
crate::molecular_charge::MolecularCharge; -pub use crate::multi::*; -pub use crate::neutral_loss::*; -pub use crate::peptidoform::*; -pub use crate::protease::*; -pub use crate::sequence_element::SequenceElement; -pub use crate::sequence_position::*; -pub use crate::spectrum::{AnnotatableSpectrum, AnnotatedSpectrum, RawSpectrum}; -pub use crate::tolerance::*; -pub use aminoacids::AminoAcid; +pub use aminoacid::{AminoAcid, IsAminoAcid}; pub use checked_aminoacid::CheckedAminoAcid; +pub use element::*; +pub use formula::*; +pub use formula_search::find_formulas; pub use fragment::Fragment; +pub use isobaric_sets::{building_blocks, find_isobaric_sets}; +pub use mass_mode::MassMode; +pub use model::FragmentationModel; +pub use modification::{CrossLinkName, Modification}; +pub use molecular_charge::MolecularCharge; +pub use multi::*; +pub use neutral_loss::*; +pub use peptidoform::*; pub use peptidoform::{CompoundPeptidoformIon, Peptidoform, PeptidoformIon}; +pub use protease::*; +pub use sequence_element::SequenceElement; +pub use sequence_position::*; +pub use spectrum::{AnnotatableSpectrum, AnnotatedSpectrum, RawSpectrum}; +pub use tolerance::*; #[macro_use] extern crate uom; @@ -96,6 +95,8 @@ extern crate uom; #[cfg(test)] #[expect(clippy::missing_panics_doc)] mod test { + use crate::model::MatchingParameters; + use super::*; #[test] @@ -106,7 +107,7 @@ mod test { .unwrap(); let fragments = peptide.generate_theoretical_fragments( system::usize::Charge::new::(1), - &Model::all(), + &FragmentationModel::all(), ); println!("{}", fragments.len()); println!("{fragments:?}"); @@ -114,12 +115,14 @@ mod test { #[test] fn simple_matching() { - let model = Model::all(); + let model = FragmentationModel::all(); + let parameters = MatchingParameters::default(); let spectrum = rawfile::mgf::open("data/example.mgf").unwrap(); let peptide = CompoundPeptidoformIon::pro_forma("WFWF", None).unwrap(); let fragments = peptide 
.generate_theoretical_fragments(system::usize::Charge::new::(1), &model); - let annotated = spectrum[0].annotate(peptide, &fragments, &model, MassMode::Monoisotopic); + let annotated = + spectrum[0].annotate(peptide, &fragments, &parameters, MassMode::Monoisotopic); println!("{annotated:?}"); } } diff --git a/rustyms/src/model.rs b/rustyms/src/model.rs deleted file mode 100644 index 3838f741..00000000 --- a/rustyms/src/model.rs +++ /dev/null @@ -1,796 +0,0 @@ -//! Handle model instantiation. - -use std::ops::RangeInclusive; - -use serde::{Deserialize, Serialize}; - -use crate::{ - fragment::PeptidePosition, - system::{e, f64::MassOverCharge, isize::Charge, mz}, - NeutralLoss, Tolerance, -}; - -/// Control what charges are allowed for an ion series. Defined as an inclusive range. -/// Any charge above the precursor charge will result in the quotient time the precursor -/// charge carriers + all options for the remainder within the limits of the precursor -/// charge carriers. -#[non_exhaustive] -#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord, Serialize, Deserialize)] -pub struct ChargeRange { - /// Start point - start: ChargePoint, - /// End point (inclusive) - end: ChargePoint, -} - -impl ChargeRange { - /// Get all possible charges for the given precursor charge. - pub fn charges(&self, precursor: Charge) -> RangeInclusive { - Charge::new::(self.start.to_absolute(precursor).value.max(1)) - ..=self.end.to_absolute(precursor) - } - - /// Get all possible charges for the given precursor charge. 
- pub fn charges_iter( - &self, - precursor: Charge, - ) -> impl DoubleEndedIterator + Clone { - (self.start.to_absolute(precursor).value.max(1)..=self.end.to_absolute(precursor).value) - .map(Charge::new::) - } - - /// Solely single charged - pub const ONE: Self = Self { - start: ChargePoint::Absolute(1), - end: ChargePoint::Absolute(1), - }; - /// Only the exact precursor charge - pub const PRECURSOR: Self = Self { - start: ChargePoint::Relative(0), - end: ChargePoint::Relative(0), - }; - /// Range from 1 to the precursor - pub const ONE_TO_PRECURSOR: Self = Self { - start: ChargePoint::Absolute(1), - end: ChargePoint::Relative(0), - }; -} - -/// A reference point for charge range definition. -#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord, Serialize, Deserialize)] -pub enum ChargePoint { - /// Relative to the precursor, with the given offset. - Relative(isize), - /// Absolute charge. - Absolute(isize), -} - -impl ChargePoint { - /// Get the absolute charge of this charge point given a precursor charge - fn to_absolute(self, precursor: Charge) -> Charge { - match self { - Self::Absolute(a) => Charge::new::(a), - Self::Relative(r) => Charge::new::(precursor.value + r), - } - } -} -/// A model for the fragmentation, allowing control over what theoretical fragments to generate. 
-#[non_exhaustive] -#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] -pub struct Model { - /// a series ions - pub a: PrimaryIonSeries, - /// b series ions - pub b: PrimaryIonSeries, - /// c series ions - pub c: PrimaryIonSeries, - /// d series ions (side chain fragmentation from a) - pub d: PrimaryIonSeries, - /// v series ions (full side chain broken off) - pub v: PrimaryIonSeries, - /// w series ions (side chain fragmentation from z) - pub w: PrimaryIonSeries, - /// x series ions - pub x: PrimaryIonSeries, - /// y series ions - pub y: PrimaryIonSeries, - /// z series ions - pub z: PrimaryIonSeries, - /// precursor ions - pub precursor: (Vec, ChargeRange), - /// immonium ions - pub immonium: (bool, ChargeRange), - /// m ions, loss of the amino acid side chain from the precursor (follows precursor charge) - pub m: bool, - /// If the neutral losses specific for modifications should be generated - pub modification_specific_neutral_losses: bool, - /// If the diagnostic ions specific for modifications should be generated with the allowed charge range - pub modification_specific_diagnostic_ions: (bool, ChargeRange), - /// Glycan fragmentation - pub glycan: GlycanModel, - /// Allow any MS cleavable cross-link to be cleaved - pub allow_cross_link_cleavage: bool, - /// The matching tolerance - pub tolerance: Tolerance, - /// The range in which fragments fall, can be used to limit the theoretical fragments to a known window - pub mz_range: RangeInclusive, -} - -/// The settings for any primary ion series -#[derive(Clone, PartialEq, Eq, Hash, Debug, Serialize, Deserialize)] -pub struct PrimaryIonSeries { - /// Which locations are assumed to lead to fragmentation - pub location: Location, - /// The allowed neutral losses - pub neutral_losses: Vec, - /// The allowed charges - pub charge_range: ChargeRange, -} - -impl PrimaryIonSeries { - /// Replace the location - #[must_use] - pub fn location(self, location: Location) -> Self { - Self { location, ..self } - } - /// 
Replace the neutral losses - #[must_use] - pub fn neutral_losses(self, neutral_losses: Vec) -> Self { - Self { - neutral_losses, - ..self - } - } - /// Replace the charge range - #[must_use] - pub fn charge_range(self, charge_range: ChargeRange) -> Self { - Self { - charge_range, - ..self - } - } -} - -impl std::default::Default for PrimaryIonSeries { - fn default() -> Self { - Self { - location: Location::All, - neutral_losses: Vec::new(), - charge_range: ChargeRange::ONE_TO_PRECURSOR, - } - } -} - -/// The settings for glycan fragmentation -#[derive(Clone, PartialEq, Eq, Hash, Debug, Serialize, Deserialize)] -pub struct GlycanModel { - /// Allows fragments from glycans with defined structures (i.e. GNO modifications) - pub allow_structural: bool, - /// Allows fragments from glycans where only the composition is known (i.e. `Glycan:Hex1`). - /// This allows any fragment containing any number of monosaccharides within this range. - pub compositional_range: RangeInclusive, - /// The allowed neutral losses - pub neutral_losses: Vec, - /// The allowed charges for oxonium ions (B, internal fragments etc) - pub oxonium_charge_range: ChargeRange, - /// The allowed charges for other glycan fragments (Y) - pub other_charge_range: ChargeRange, -} - -impl GlycanModel { - /// Sets the status of glycan fragments from structural modifications - #[must_use] - pub fn allow_structural(self, allow_structural: bool) -> Self { - Self { - allow_structural, - ..self - } - } - /// Set the range of monosaccharides that can result in composition fragments, see [`Self::compositional_range`]. 
- #[must_use] - pub fn compositional_range(self, compositional_range: RangeInclusive) -> Self { - Self { - compositional_range, - ..self - } - } - /// Replace the neutral losses - #[must_use] - pub fn neutral_losses(self, neutral_losses: Vec) -> Self { - Self { - neutral_losses, - ..self - } - } - /// Replace the charge range for oxonium ions (B, internal fragments etc) - #[must_use] - pub fn oxonium_charge_range(self, oxonium_charge_range: ChargeRange) -> Self { - Self { - oxonium_charge_range, - ..self - } - } - /// Replace the charge range for other glycan ions (Y etc) - #[must_use] - pub fn other_charge_range(self, other_charge_range: ChargeRange) -> Self { - Self { - other_charge_range, - ..self - } - } - /// Default set for models that allow glycan fragmentation - pub const ALLOW: Self = Self { - allow_structural: true, - compositional_range: 1..=10, - neutral_losses: Vec::new(), - oxonium_charge_range: ChargeRange::ONE, - other_charge_range: ChargeRange::ONE_TO_PRECURSOR, - }; - /// Default set for models that disallow glycan fragmentation - pub const DISALLOW: Self = Self { - allow_structural: false, - compositional_range: 0..=0, - neutral_losses: Vec::new(), - oxonium_charge_range: ChargeRange::ONE, - other_charge_range: ChargeRange::ONE_TO_PRECURSOR, - }; -} - -/// A struct to handle all possible fragments that could be generated on a single location -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] -#[non_exhaustive] -pub struct PossibleIons<'a> { - /// a series ions - pub a: (bool, &'a [NeutralLoss], ChargeRange), - /// b series ions - pub b: (bool, &'a [NeutralLoss], ChargeRange), - /// c series ions - pub c: (bool, &'a [NeutralLoss], ChargeRange), - /// d series ions (side chain fragmentation from a) - pub d: (bool, &'a [NeutralLoss], ChargeRange), - /// v series ions (full side chain broken off) - pub v: (bool, &'a [NeutralLoss], ChargeRange), - /// w series ions (side chain fragmentation from z) - pub w: (bool, &'a [NeutralLoss], 
ChargeRange), - /// x series ions - pub x: (bool, &'a [NeutralLoss], ChargeRange), - /// y series ions - pub y: (bool, &'a [NeutralLoss], ChargeRange), - /// z series ions - pub z: (bool, &'a [NeutralLoss], ChargeRange), - /// precursor ions - pub precursor: (&'a [NeutralLoss], ChargeRange), - /// immonium - pub immonium: (bool, ChargeRange), -} - -impl PossibleIons<'_> { - /// Give an upper bound for the number of theoretical fragment for these possible ions - pub fn size_upper_bound(&self) -> usize { - usize::from(self.a.0) * (self.a.1.len() + 1) - + usize::from(self.b.0) * (self.b.1.len() + 1) - + usize::from(self.c.0) * (self.c.1.len() + 1) - + usize::from(self.d.0) * 2 * (self.d.1.len() + 1) - + usize::from(self.v.0) * (self.v.1.len() + 1) - + usize::from(self.w.0) * 2 * (self.w.1.len() + 1) - + usize::from(self.x.0) * (self.x.1.len() + 1) - + usize::from(self.y.0) * (self.y.1.len() + 1) - + usize::from(self.z.0) * 2 * (self.z.1.len() + 1) - + self.precursor.0.len() - + 1 - } -} - -/// Builder style methods -impl Model { - /// Set a - #[must_use] - pub fn a(self, a: PrimaryIonSeries) -> Self { - Self { a, ..self } - } - /// Set b - #[must_use] - pub fn b(self, b: PrimaryIonSeries) -> Self { - Self { b, ..self } - } - /// Set c - #[must_use] - pub fn c(self, c: PrimaryIonSeries) -> Self { - Self { c, ..self } - } - /// Set d - #[must_use] - pub fn d(self, d: PrimaryIonSeries) -> Self { - Self { d, ..self } - } - /// Set v - #[must_use] - pub fn v(self, v: PrimaryIonSeries) -> Self { - Self { v, ..self } - } - /// Set w - #[must_use] - pub fn w(self, w: PrimaryIonSeries) -> Self { - Self { w, ..self } - } - /// Set x - #[must_use] - pub fn x(self, x: PrimaryIonSeries) -> Self { - Self { x, ..self } - } - /// Set y - #[must_use] - pub fn y(self, y: PrimaryIonSeries) -> Self { - Self { y, ..self } - } - /// Set z - #[must_use] - pub fn z(self, z: PrimaryIonSeries) -> Self { - Self { z, ..self } - } - /// Set glycan - #[must_use] - pub fn glycan(self, glycan: 
GlycanModel) -> Self { - Self { glycan, ..self } - } - /// Overwrite the precursor neutral losses - #[must_use] - pub fn precursor(self, neutral_loss: Vec, charges: ChargeRange) -> Self { - Self { - precursor: (neutral_loss, charges), - ..self - } - } - /// Set immonium - #[must_use] - pub fn immonium(self, state: (bool, ChargeRange)) -> Self { - Self { - immonium: state, - ..self - } - } - /// Set m - #[must_use] - pub fn m(self, state: bool) -> Self { - Self { m: state, ..self } - } - /// Set modification specific neutral losses - #[must_use] - pub fn modification_specific_neutral_losses(self, state: bool) -> Self { - Self { - modification_specific_neutral_losses: state, - ..self - } - } - /// Set modification specific diagnostic ions - #[must_use] - pub fn modification_specific_diagnostic_ions(self, state: (bool, ChargeRange)) -> Self { - Self { - modification_specific_diagnostic_ions: state, - ..self - } - } - /// Set the tolerance - #[must_use] - pub fn allow_cross_link_cleavage(self, state: bool) -> Self { - Self { - allow_cross_link_cleavage: state, - ..self - } - } - /// Set the tolerance - #[must_use] - pub fn tolerance(self, tolerance: impl Into>) -> Self { - Self { - tolerance: tolerance.into(), - ..self - } - } - /// Set the mz range - #[must_use] - pub fn mz_range(self, mz_range: RangeInclusive) -> Self { - Self { mz_range, ..self } - } -} - -impl Model { - /// Give all possible ions for the given N position - pub fn ions(&self, position: PeptidePosition) -> PossibleIons { - let c_position = position.flip_terminal(); - PossibleIons { - a: ( - self.a.location.possible(position), - self.a.neutral_losses.as_slice(), - self.a.charge_range, - ), - b: ( - self.b.location.possible(position), - self.b.neutral_losses.as_slice(), - self.b.charge_range, - ), - c: ( - self.c.location.possible(position), - self.c.neutral_losses.as_slice(), - self.c.charge_range, - ), - d: ( - self.d.location.possible(position), - self.d.neutral_losses.as_slice(), - 
self.d.charge_range, - ), - v: ( - self.v.location.possible(c_position), - self.v.neutral_losses.as_slice(), - self.v.charge_range, - ), - w: ( - self.w.location.possible(c_position), - self.w.neutral_losses.as_slice(), - self.w.charge_range, - ), - x: ( - self.x.location.possible(c_position), - self.x.neutral_losses.as_slice(), - self.x.charge_range, - ), - y: ( - self.y.location.possible(c_position), - self.y.neutral_losses.as_slice(), - self.y.charge_range, - ), - z: ( - self.z.location.possible(c_position), - self.z.neutral_losses.as_slice(), - self.z.charge_range, - ), - precursor: (self.precursor.0.as_slice(), self.precursor.1), - immonium: self.immonium, - } - } - - /// Generate all possible fragments - pub fn all() -> Self { - Self { - a: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - b: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - c: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - d: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - v: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - w: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - x: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - y: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - z: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - precursor: ( - vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], - ChargeRange::PRECURSOR, - ), - immonium: (true, ChargeRange::ONE), - m: true, - modification_specific_neutral_losses: true, - modification_specific_diagnostic_ions: (true, ChargeRange::ONE), - glycan: GlycanModel::ALLOW 
- .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - allow_cross_link_cleavage: true, - tolerance: Tolerance::new_ppm(20.0), - mz_range: MassOverCharge::new::(0.0)..=MassOverCharge::new::(f64::MAX), - } - } - - /// Generate no fragments (except for precursor) - pub fn none() -> Self { - Self { - a: PrimaryIonSeries::default().location(Location::None), - b: PrimaryIonSeries::default().location(Location::None), - c: PrimaryIonSeries::default().location(Location::None), - d: PrimaryIonSeries::default().location(Location::None), - v: PrimaryIonSeries::default().location(Location::None), - w: PrimaryIonSeries::default().location(Location::None), - x: PrimaryIonSeries::default().location(Location::None), - y: PrimaryIonSeries::default().location(Location::None), - z: PrimaryIonSeries::default().location(Location::None), - precursor: (vec![], ChargeRange::PRECURSOR), - immonium: (false, ChargeRange::ONE), - m: false, - modification_specific_neutral_losses: false, - modification_specific_diagnostic_ions: (false, ChargeRange::ONE), - glycan: GlycanModel::DISALLOW, - allow_cross_link_cleavage: false, - tolerance: Tolerance::new_ppm(20.0), - mz_range: MassOverCharge::new::(0.0)..=MassOverCharge::new::(f64::MAX), - } - } - - /// electron-transfer/higher-energy collisional dissociation - pub fn ethcd() -> Self { - Self { - a: PrimaryIonSeries::default().location(Location::TakeN { skip: 0, take: 1 }), - b: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - c: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - d: PrimaryIonSeries::default().location(Location::TakeN { skip: 0, take: 1 }), - v: PrimaryIonSeries::default().location(Location::None), - w: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - x: PrimaryIonSeries::default().location(Location::None), - y: PrimaryIonSeries::default() - 
.neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - z: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - precursor: ( - vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], - ChargeRange::ONE_TO_PRECURSOR, - ), - immonium: (false, ChargeRange::ONE), - m: false, - modification_specific_neutral_losses: true, - modification_specific_diagnostic_ions: (true, ChargeRange::ONE), - glycan: GlycanModel::ALLOW - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - allow_cross_link_cleavage: true, - tolerance: Tolerance::new_ppm(20.0), - mz_range: MassOverCharge::new::(0.0)..=MassOverCharge::new::(f64::MAX), - } - } - - /// EAD - pub fn ead() -> Self { - Self { - a: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - b: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - c: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - d: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - v: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - w: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - x: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - y: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - z: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - precursor: ( - vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], - ChargeRange::ONE_TO_PRECURSOR, - ), - immonium: (true, ChargeRange::ONE), - m: false, - modification_specific_neutral_losses: true, - modification_specific_diagnostic_ions: (true, ChargeRange::ONE), - glycan: 
GlycanModel::ALLOW - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - allow_cross_link_cleavage: true, - tolerance: Tolerance::new_ppm(20.0), - mz_range: MassOverCharge::new::(0.0)..=MassOverCharge::new::(f64::MAX), - } - } - - /// hot EACID - pub fn hot_eacid() -> Self { - Self { - a: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - b: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - c: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - d: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - v: PrimaryIonSeries::default().location(Location::None), - w: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - x: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - y: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - z: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - precursor: ( - vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], - ChargeRange::ONE_TO_PRECURSOR, - ), - immonium: (false, ChargeRange::ONE), - m: false, - modification_specific_neutral_losses: true, - modification_specific_diagnostic_ions: (true, ChargeRange::ONE), - glycan: GlycanModel::ALLOW - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - allow_cross_link_cleavage: true, - tolerance: Tolerance::new_ppm(20.0), - mz_range: MassOverCharge::new::(0.0)..=MassOverCharge::new::(f64::MAX), - } - } - - /// CID Hcd - pub fn cid_hcd() -> Self { - Self { - a: PrimaryIonSeries::default() - .location(Location::TakeN { skip: 0, take: 1 }) - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - b: PrimaryIonSeries::default() - 
.neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - c: PrimaryIonSeries::default().location(Location::None), - d: PrimaryIonSeries::default() - .location(Location::TakeN { skip: 0, take: 1 }) - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - v: PrimaryIonSeries::default().location(Location::None), - w: PrimaryIonSeries::default().location(Location::None), - x: PrimaryIonSeries::default().location(Location::None), - y: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - z: PrimaryIonSeries::default().location(Location::None), - precursor: ( - vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], - ChargeRange::PRECURSOR, - ), - immonium: (false, ChargeRange::ONE), - m: false, - modification_specific_neutral_losses: true, - modification_specific_diagnostic_ions: (true, ChargeRange::ONE), - glycan: GlycanModel::DISALLOW, - allow_cross_link_cleavage: true, - tolerance: Tolerance::new_ppm(20.0), - mz_range: MassOverCharge::new::(0.0)..=MassOverCharge::new::(f64::MAX), - } - } - - /// ETD - pub fn etd() -> Self { - Self { - a: PrimaryIonSeries::default().location(Location::None), - b: PrimaryIonSeries::default().location(Location::None), - c: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - d: PrimaryIonSeries::default().location(Location::None), - v: PrimaryIonSeries::default().location(Location::None), - w: PrimaryIonSeries::default().location(Location::None), // TODO: Are w ions also formed here? 
- x: PrimaryIonSeries::default().location(Location::None), - y: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - z: PrimaryIonSeries::default() - .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), - precursor: ( - vec![ - NeutralLoss::Loss(molecular_formula!(H 2 O 1)), - NeutralLoss::Loss(molecular_formula!(H 1 O 1)), - NeutralLoss::Loss(molecular_formula!(H 3 N 1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 1 O 2)), - NeutralLoss::Loss(molecular_formula!(C 2 H 3 O 2)), - ], - ChargeRange { - start: ChargePoint::Relative(-2), - end: ChargePoint::Relative(0), - }, - ), - immonium: (false, ChargeRange::ONE), - m: false, - modification_specific_neutral_losses: true, - modification_specific_diagnostic_ions: (true, ChargeRange::ONE), - glycan: GlycanModel::DISALLOW, - allow_cross_link_cleavage: true, - tolerance: Tolerance::new_ppm(20.0), - mz_range: MassOverCharge::new::(0.0)..=MassOverCharge::new::(f64::MAX), - } - } - - /// Top Down ETD - pub fn td_etd() -> Self { - Self { - a: PrimaryIonSeries::default().location(Location::None), - b: PrimaryIonSeries::default().location(Location::None), - c: PrimaryIonSeries::default().neutral_losses(vec![ - NeutralLoss::Loss(molecular_formula!(H 2 O 1)), - NeutralLoss::Loss(molecular_formula!(H 3 N 1)), - NeutralLoss::Gain(molecular_formula!(H 1)), - NeutralLoss::Gain(molecular_formula!(H 2)), - NeutralLoss::Gain(molecular_formula!(H 3)), - ]), - d: PrimaryIonSeries::default().location(Location::None), - v: PrimaryIonSeries::default().location(Location::None), - w: PrimaryIonSeries::default().location(Location::None), - x: PrimaryIonSeries::default().location(Location::None), - y: PrimaryIonSeries::default().location(Location::None), - z: PrimaryIonSeries::default().neutral_losses(vec![ - NeutralLoss::Loss(molecular_formula!(H 2 O 1)), - NeutralLoss::Loss(molecular_formula!(H 3 N 1)), - NeutralLoss::Gain(molecular_formula!(H 1)), - 
NeutralLoss::Gain(molecular_formula!(H 2)), - NeutralLoss::Gain(molecular_formula!(H 3)), - ]), - precursor: ( - vec![ - NeutralLoss::Loss(molecular_formula!(H 2 O 1)), - NeutralLoss::Loss(molecular_formula!(H 1 O 1)), - NeutralLoss::Loss(molecular_formula!(H 3 N 1)), - NeutralLoss::Loss(molecular_formula!(C 1 H 1 O 2)), - NeutralLoss::Loss(molecular_formula!(C 2 H 3 O 2)), - NeutralLoss::Gain(molecular_formula!(H 1)), - NeutralLoss::Gain(molecular_formula!(H 2)), - NeutralLoss::Gain(molecular_formula!(H 3)), - ], - ChargeRange::PRECURSOR, - ), - immonium: (false, ChargeRange::ONE), - m: false, - modification_specific_neutral_losses: true, - modification_specific_diagnostic_ions: (true, ChargeRange::ONE), - glycan: GlycanModel::DISALLOW, - allow_cross_link_cleavage: true, - tolerance: Tolerance::new_ppm(20.0), - mz_range: MassOverCharge::new::(0.0)..=MassOverCharge::new::(f64::MAX), - } - } -} - -/// A location, or range of locations where an ion can be generated -#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Default, Debug, Serialize, Deserialize)] -pub enum Location { - /// Skip the given number from the N terminal side - SkipN(usize), - /// Skip the given number of aminoacids from the N terminal and C terminal side respectively, only using the positions between these two - SkipNC(usize, usize), - /// Skip a certain number and then take a certain number of aminoacids - TakeN { - /// Skip this number of aminoacids - skip: usize, - /// Take this number of aminoacids - take: usize, - }, - /// Skip a given number from the C terminal side - SkipC(usize), - /// Take a given number of aminoacids from the C terminal side - TakeC(usize), - /// All positions (including 0 and len-1) - All, - /// Do not allow it anywhere - #[default] - None, -} - -impl Location { - /// Determine if an ion is possible on this location - /// # Panics - /// If the peptide position is a terminal position - pub const fn possible(&self, position: PeptidePosition) -> bool { - let 
crate::SequencePosition::Index(sequence_index) = position.sequence_index else { - panic!("Not allowed to call possible with a terminal PeptidePosition") - }; - match self { - Self::SkipN(n) => sequence_index >= *n, - Self::SkipNC(n, c) => { - sequence_index >= *n && position.sequence_length - sequence_index > *c - } - Self::TakeN { skip, take } => sequence_index >= *skip && sequence_index < *skip + *take, - Self::SkipC(n) => position.sequence_length - sequence_index > *n, - Self::TakeC(n) => position.sequence_length - sequence_index <= *n, - Self::All => position.series_number != position.sequence_length, - Self::None => false, - } - } -} - -#[test] -#[expect(clippy::missing_panics_doc, clippy::similar_names)] -fn location_all() { - let all = Model::all(); - let ions_n0 = all.ions(PeptidePosition::n(crate::SequencePosition::default(), 2)); - let ions_c0 = all.ions(PeptidePosition::c(crate::SequencePosition::default(), 2)); - assert!(ions_n0.a.0); - assert!(!ions_n0.x.0); - assert!(!ions_c0.a.0); - assert!(ions_c0.x.0); -} diff --git a/rustyms/src/model/built_in.rs b/rustyms/src/model/built_in.rs new file mode 100644 index 00000000..d4796a15 --- /dev/null +++ b/rustyms/src/model/built_in.rs @@ -0,0 +1,607 @@ +use std::sync::LazyLock; + +use crate::{ + fragment::FragmentKind, + glycan::{BaseSugar, GlycanSubstituent, MonoSaccharide}, + model::{ + ChargePoint, ChargeRange, FragmentationModel, GlycanModel, Location, PrimaryIonSeries, + SatelliteIonSeries, SatelliteLocation, + }, + AminoAcid, NeutralLoss, +}; + +use super::GlycanPeptideFragment; + +static MODEL_ALL: LazyLock = LazyLock::new(|| FragmentationModel { + a: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + b: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + c: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + d: SatelliteIonSeries::base() + 
.neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + v: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + w: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + x: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + y: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + z: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + precursor: ( + vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], + Vec::new(), + (1, None), + ChargeRange::PRECURSOR, + ), + immonium: Some((ChargeRange::ONE, immonium_losses().clone())), + modification_specific_neutral_losses: true, + modification_specific_diagnostic_ions: Some(ChargeRange::ONE), + glycan: GlycanModel::default_allow() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + allow_cross_link_cleavage: true, +}); + +static MODEL_NONE: LazyLock = LazyLock::new(|| FragmentationModel { + a: PrimaryIonSeries::none(), + b: PrimaryIonSeries::none(), + c: PrimaryIonSeries::none(), + d: SatelliteIonSeries::default(), + v: SatelliteIonSeries::default(), + w: SatelliteIonSeries::default(), + x: PrimaryIonSeries::none(), + y: PrimaryIonSeries::none(), + z: PrimaryIonSeries::none(), + precursor: (Vec::new(), Vec::new(), (0, None), ChargeRange::PRECURSOR), + immonium: None, + modification_specific_neutral_losses: false, + modification_specific_diagnostic_ions: None, + glycan: GlycanModel::DISALLOW, + allow_cross_link_cleavage: false, +}); + +static MODEL_UVPD: LazyLock = LazyLock::new(|| FragmentationModel { + a: PrimaryIonSeries::default().variants(vec![0, 1, 2]), + b: PrimaryIonSeries::default().variants(vec![0, 2]), + c: PrimaryIonSeries::default(), + d: SatelliteIonSeries::base(), + v: SatelliteIonSeries::base(), + w: SatelliteIonSeries::base(), + 
x: PrimaryIonSeries::default().variants(vec![-1, 0, 1, 2]), + y: PrimaryIonSeries::default().variants(vec![-2, -1, 0]), + z: PrimaryIonSeries::default(), + precursor: ( + vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], + Vec::new(), + (0, None), + ChargeRange::PRECURSOR, + ), + immonium: Some((ChargeRange::ONE, immonium_losses().clone())), + modification_specific_neutral_losses: true, + modification_specific_diagnostic_ions: Some(ChargeRange::ONE), + glycan: GlycanModel::DISALLOW, + allow_cross_link_cleavage: false, +}); + +static MODEL_ETHCD: LazyLock = LazyLock::new(|| FragmentationModel { + a: PrimaryIonSeries::default().location(Location::TakeN { skip: 0, take: 1 }), + b: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + c: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]) + .variants(vec![0, 1]), + d: SatelliteIonSeries::base(), + v: SatelliteIonSeries::default(), + w: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + x: PrimaryIonSeries::none(), + y: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + z: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]) + .variants(vec![0, 1]), + precursor: ( + vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], + Vec::new(), + (0, None), + ChargeRange::ONE_TO_PRECURSOR, + ), + immonium: None, + modification_specific_neutral_losses: true, + modification_specific_diagnostic_ions: Some(ChargeRange::ONE), + glycan: GlycanModel::default_allow() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]) + .default_peptide_fragment(GlycanPeptideFragment::FULL) + .peptide_fragment_rules(vec![ + ( + vec![AminoAcid::Asparagine, AminoAcid::Tryptophan], + vec![FragmentKind::c, FragmentKind::z, FragmentKind::w], + GlycanPeptideFragment::FULL, + ), + ( + 
vec![AminoAcid::Asparagine, AminoAcid::Tryptophan], + vec![FragmentKind::b, FragmentKind::y, FragmentKind::v], + GlycanPeptideFragment::CORE, + ), + ( + vec![AminoAcid::Serine, AminoAcid::Threonine], + vec![FragmentKind::c, FragmentKind::z, FragmentKind::w], + GlycanPeptideFragment::FULL, + ), + ( + vec![AminoAcid::Serine, AminoAcid::Threonine], + vec![FragmentKind::b, FragmentKind::y, FragmentKind::v], + GlycanPeptideFragment::FREE, + ), + ]), + allow_cross_link_cleavage: true, +}); + +static MODEL_EAD: LazyLock = LazyLock::new(|| FragmentationModel { + a: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + b: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + c: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + d: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + v: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + w: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + x: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + y: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + z: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + precursor: ( + vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], + Vec::new(), + (0, None), + ChargeRange::ONE_TO_PRECURSOR, + ), + immonium: Some((ChargeRange::ONE, immonium_losses().clone())), + modification_specific_neutral_losses: true, + modification_specific_diagnostic_ions: Some(ChargeRange::ONE), + glycan: GlycanModel::default_allow() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + allow_cross_link_cleavage: true, +}); + +static MODEL_EACID: 
LazyLock = LazyLock::new(|| FragmentationModel { + a: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + b: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + c: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + d: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + v: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + w: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + x: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + y: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + z: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + precursor: ( + vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], + Vec::new(), + (0, None), + ChargeRange::ONE_TO_PRECURSOR, + ), + immonium: None, + modification_specific_neutral_losses: true, + modification_specific_diagnostic_ions: Some(ChargeRange::ONE), + glycan: GlycanModel::default_allow() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + allow_cross_link_cleavage: true, +}); + +static MODEL_CID_HCD: LazyLock = LazyLock::new(|| FragmentationModel { + a: PrimaryIonSeries::default() + .location(Location::TakeN { skip: 0, take: 1 }) + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + b: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + c: PrimaryIonSeries::none(), + d: SatelliteIonSeries::base() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + v: SatelliteIonSeries::default(), + w: SatelliteIonSeries::default(), + x: PrimaryIonSeries::none(), + y: 
PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + z: PrimaryIonSeries::none(), + precursor: ( + vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))], + Vec::new(), + (0, None), + ChargeRange::PRECURSOR, + ), + immonium: None, + modification_specific_neutral_losses: true, + modification_specific_diagnostic_ions: Some(ChargeRange::ONE), + glycan: GlycanModel::default_allow().default_peptide_fragment(GlycanPeptideFragment::CORE), + allow_cross_link_cleavage: true, +}); + +static MODEL_ETD: LazyLock = LazyLock::new(|| FragmentationModel { + a: PrimaryIonSeries::none(), + b: PrimaryIonSeries::none(), + c: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]) + .variants(vec![-1, 0, 2]), + d: SatelliteIonSeries::default(), + v: SatelliteIonSeries::base(), + w: SatelliteIonSeries::default() + .location(SatelliteLocation { + rules: vec![ + (vec![AminoAcid::Methionine], 5), + (vec![AminoAcid::Leucine, AminoAcid::GlutamicAcid], 2), + (vec![AminoAcid::Isoleucine, AminoAcid::AsparticAcid], 1), + ( + vec![ + AminoAcid::Valine, + AminoAcid::Asparagine, + AminoAcid::Threonine, + AminoAcid::Serine, + AminoAcid::Tryptophan, + AminoAcid::Histidine, + AminoAcid::Phenylalanine, + AminoAcid::Tyrosine, + ], + 0, + ), + ], + base: None, + }) + .variants(vec![-1, 0, 1]), + x: PrimaryIonSeries::none(), + y: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + z: PrimaryIonSeries::default() + .neutral_losses(vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]) + .variants(vec![-1, 0, 1, 2]), + precursor: ( + vec![ + NeutralLoss::Loss(molecular_formula!(H 2 O 1)), + NeutralLoss::Loss(molecular_formula!(H 1 O 1)), + NeutralLoss::Loss(molecular_formula!(H 3 N 1)), + ], + vec![ + ( + vec![AminoAcid::AsparticAcid], + vec![NeutralLoss::Loss(molecular_formula!(C 1 H 1 O 2))], + ), + ( + vec![AminoAcid::GlutamicAcid], + 
vec![NeutralLoss::Loss(molecular_formula!(C 2 H 3 O 2))], + ), + ], + (2, None), + ChargeRange { + start: ChargePoint::Relative(-2), + end: ChargePoint::Relative(0), + }, + ), + immonium: None, + modification_specific_neutral_losses: true, + modification_specific_diagnostic_ions: Some(ChargeRange::ONE), + glycan: GlycanModel::default_allow().default_peptide_fragment(GlycanPeptideFragment::FREE), + allow_cross_link_cleavage: true, +}); + +static MODEL_TD_ETD: LazyLock = LazyLock::new(|| FragmentationModel { + a: PrimaryIonSeries::none(), + b: PrimaryIonSeries::none(), + c: PrimaryIonSeries::default() + .neutral_losses(vec![ + NeutralLoss::Loss(molecular_formula!(H 2 O 1)), + NeutralLoss::Loss(molecular_formula!(H 3 N 1)), + ]) + .variants(vec![0, 1, 2]), + d: SatelliteIonSeries::default(), + v: SatelliteIonSeries::default(), + w: SatelliteIonSeries::default(), + x: PrimaryIonSeries::none(), + y: PrimaryIonSeries::none(), + z: PrimaryIonSeries::default() + .neutral_losses(vec![ + NeutralLoss::Loss(molecular_formula!(H 2 O 1)), + NeutralLoss::Loss(molecular_formula!(H 3 N 1)), + ]) + .variants(vec![-1, 0, 1, 2]), + precursor: ( + vec![ + NeutralLoss::Loss(molecular_formula!(H 2 O 1)), + NeutralLoss::Loss(molecular_formula!(H 1 O 1)), + NeutralLoss::Loss(molecular_formula!(H 3 N 1)), + NeutralLoss::Gain(molecular_formula!(H 1)), + NeutralLoss::Gain(molecular_formula!(H 2)), + NeutralLoss::Gain(molecular_formula!(H 3)), + ], + vec![ + ( + vec![AminoAcid::AsparticAcid], + vec![NeutralLoss::Loss(molecular_formula!(C 1 H 1 O 2))], + ), + ( + vec![AminoAcid::GlutamicAcid], + vec![NeutralLoss::Loss(molecular_formula!(C 2 H 3 O 2))], + ), + ], + (0, None), + ChargeRange::PRECURSOR, + ), + immonium: None, + modification_specific_neutral_losses: true, + modification_specific_diagnostic_ions: Some(ChargeRange::ONE), + glycan: GlycanModel::DISALLOW, + allow_cross_link_cleavage: true, +}); + +impl FragmentationModel { + /// Generate all possible fragments + pub fn all() -> 
&'static Self { + LazyLock::force(&MODEL_ALL) + } + + /// Generate no fragments (except for precursor) + pub fn none() -> &'static Self { + LazyLock::force(&MODEL_NONE) + } + + /// UVPD + /// 10.1021/acs.chemrev.9b00440 and 10.1021/jacs.6b05147 + pub fn uvpd() -> &'static Self { + LazyLock::force(&MODEL_UVPD) + } + + /// electron-transfer/higher-energy collisional dissociation + pub fn ethcd() -> &'static Self { + LazyLock::force(&MODEL_ETHCD) + } + + /// EAD + pub fn ead() -> &'static Self { + LazyLock::force(&MODEL_EAD) + } + + /// EAciD + pub fn eacid() -> &'static Self { + LazyLock::force(&MODEL_EACID) + } + + /// CID Hcd + pub fn cid_hcd() -> &'static Self { + LazyLock::force(&MODEL_CID_HCD) + } + + /// ETD 10.1002/jms.3919 + pub fn etd() -> &'static Self { + LazyLock::force(&MODEL_ETD) + } + + /// Top Down ETD + pub fn td_etd() -> &'static Self { + LazyLock::force(&MODEL_TD_ETD) + } +} + +/// All losses from the base immonium ions. Compiled from the sources below. +/// +/// | AA | [Wikipedia](https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Amino_acid_fragment_ions.png/400px-Amino_acid_fragment_ions.png) | 0.1016/1044-0305(93)87006-X | [ionsource](https://www.ionsource.com/Card/immon/immon.htm) | [10.1002/chin.199624319](http://dx.doi.org/10.1002/chin.199624319) | [Prospector (MS-Comp)](https://prospector.ucsf.edu/prospector/cgi-bin/msform.cgi?form=mscomp) | [10.1186/1477-5956-9-2](http://dx.doi.org/10.1186/1477-5956-9-2) | 10.1016/j.ymeth.2004.08.013 | 10.1385/1597452750 (table 5) | 10.1021/ac902712f | [Prospector (MS-Product)](https://prospector.ucsf.edu/prospector/cgi-bin/msform.cgi?form=msproduct) | [ThermoFisher](https://tools.thermofisher.com/content/sfs/brochures/cms_040030.pdf) | 10.1074/mcp.O113.035915 | 10.1074/mcp.O113.035915 | 10.1021/ac902712f | [Prospector (MS-Product)](https://prospector.ucsf.edu/prospector/cgi-bin/msform.cgi?form=msproduct) | | 10.1385/1597452750 (table 5) | Sources | Best mass | Best formula | Loss | Loss formula | 
Interpreted loss | Interpreted formula | Final | +/// |-------|---------------------------------------------------------------------------------------------------------------------------|-----------------------------|------------------------------------------------|------------------------------------------|-----------------------------------------------------------------------|-----------------------------------------|-----------------------------|------------------------------|-------------------|--------------------------------------------------------------------------|---------------------------------------------------------------------|-------------------------|-------------------------|-------------------|--------------------------------------------------------------------------|------------------------------|---------|----------:|--------------|----------|--------------|------------------|-------------------------|------------| +/// | A | 44 | 44 | | 44 | | 44 | | 44.05 | | | 44.0500 | | | | | | 6 | 44.0500 | | | | | | | +/// | R | 129 | 129 | | 129 | | 129 | | 129.11 | | | 129.1140 | 129.1135 | C5H13N4+ | | | | 8 | 129.1138 | C5H13N4+ | | | | | | +/// | | | | 185 | | 185 | | | | | | | | | | | | 2 | 185 | | -55.8862 | | C-2O-2 | | C-2O-2 | +/// | | | | | | | | | 115.09 | | | | | | | | | 1 | 115.09 | | 14.0238 | | C1H2 | | C1H2 | +/// | | 112 | 112 | 112 | 112 | 112 | 112 | 112.09 | 112.09 | | 112.0869 | 112.0875 | | | | C5H10N3+ | C5H10N3+ | 12 | 112.0872 | C5H10N3+ | 17.0266 | H3N1 | | | H3N1 | +/// | | 100 | 100 | 100 | 100 | 100 | 100 | 100.09 | | | 100.0869 | 100.0875 | | | | C4H10N3+ | | 10 | 100.0872 | C4H10N3+ | 29.0266 | C1H3N1 | | | C1H3N1 | +/// | | 87 | 87 | 87 | 87 | 87 | 87 | 87.09 | 87.09 | 87.0922 | 87.0917 | | | | C4H11N2+ | C4H11N2+ | | 12 | 87.0920 | C4H11N2+ | 42.0218 | C2H2N2 | | | C2H2N2 | +/// | | 73 | 73 | 73 | | 73 | 72 | 73.00 | | 73.0640 | | | | | C2H7N3+ | | | 8 | 73.0640 | C2H7N3+ | 56.0498 | C3H6N1 | | | C3H6N1 | +/// | | 70 | 70 
| 70 | 70 | 70 | 70 | 70.07 | 70.07 | 70.0657 | 70.0651 | 70.0657 | | | C4H8N1+ | C4H8N1+ | | 13 | 70.0655 | C4H8N1+ | 59.0483 | C1H5N3 | | | C1H5N3 | +/// | | | | | | | | | 60.06 | | | | | | | | | 1 | 60.06 | | 69.0538 | | C3H4N2O-1 | C2H6N1O1+ | C3H4N2O-1 | +/// | | | 59 | | | | 59 | | | 59.0483 | | | | | CH5N3+ | | | 4 | 59.0483 | CH5N3+ | 70.0655 | C4H8N1 | | | C4H8N1 | +/// | | | | | | | | | | 43.0296 | | | | | C1H3N2+ | | | 2 | 43.0296 | C1H3N2+ | 86.0842 | C4H10N2 | | | C4H10N2 | +/// | | 29 | | | | | | | | | | | | | | | | 1 | 29 | | 100.1138 | | | H1N2/C1H1O1/C1H3N1/C2H5 | | +/// | N | 87 | 87 | 87 | 87 | 87 | 87 | 87.09 | 87.06 | | 87.0553 | 87.0558 | | | | C3H7N2O1+ | | 11 | 87.0556 | C3H7N2O1+ | | | | | | +/// | | 70 | 70 | 70 | 70 | | 70 | | 70.03 | | | 70.0293 | | | | | C3H4N1O1+ | 8 | 70.0293 | C3H4N1O1+ | 17.0263 | H3N1 | | | H3N1 | +/// | D | 88 | 88 | 88 | 88 | 88 | 88 | 88.04 | 88.04 | 88.0399 | 88.0393 | 88.0399 | | | C3H6N1O2+ | C3H6N1O2+ | | 13 | 88.0397 | C3H6N1O2+ | | | | | | +/// | | 70 | | 70 | 70 | | 70 | | 70.03 | | | 70.0293 | | | | | C3H4N1O1+ | 7 | 70.0293 | C3H4N1O1+ | 18.0104 | H2O1 | | | H2O1 | +/// | C | 76 | | | 76 | | 76 | | | | | 76.0221 | | | | | | 4 | 76.0221 | | | | | | | +/// | E | 102 | 102 | | 102 | 102 | 102 | 102.06 | 102.05 | 102.0555 | 102.0550 | 102.0555 | 102.0550 | C4H8N1O2+ | C4H8N1O2+ | C4H8N1O2+ | | 14 | 102.0553 | C4H8N1O2+ | | | | | | +/// | | | | | 91 | | | | | | | | | | | | | 1 | 91 | | 11.0553 | | | | | +/// | | | | | 84 | | | | 84.04 | | | 84.0449 | | | | | C4H6N1O1+ | 4 | 84.0449 | C4H6N1O1+ | 18.0104 | H2O1 | | | H2O1 | +/// | Q | 101 | 101 | 101 | 101 | 101 | 101 | 101.11 | 101.11 | | 101.0709 | 101.0715 | 101.0709 | C4H9N2O1+ | | C4H9N2O1+ | | 13 | 101.0711 | C4H9N2O1+ | | | | | | +/// | | 129 | 129 | 129 | 129 | 129 | 129 | 129.1 | 129.11 | | 129.0659 | 129.1028 | | | | C5H9N2O2+ | | 11 | 129.0844 | C5H9N2O2+ | -28.0133 | C-1O-1 | | | C-1O-1 | +/// | | 84 | 84 | 84 | 84 | 84 | 84 | 84.08 | 84.04 | 
84.0813 | 84.0444 | 84.0449 | | | C5H10N1+ | C4H6N1O1+ | C4H6N1O1+ | 14 | 84.0569 | C5H10N1+ | 17.0142 | H3N1 | | | H3N1 | +/// | | 56 | | | 56 | | 56 | | 56.05 | | | 56.0500 | | | | | | 5 | 56.0500 | | 45.0211 | | C1H3N1O1 | | C1H3N1O1 | +/// | G | 30 | 30 | | 30 | | 30 | | 30.03 | 30.0344 | | 30.0344 | | | C1H4N1+ | | | 8 | 30.0344 | C1H4N1+ | | | | | | +/// | H | 110 | 110 | 110 | 110 | 110 | 110 | 110.07 | 110.07 | 110.0718 | 110.0713 | 110.0718 | 110.0713 | C5H8N3+ | C5H8N3+ | C5H8N3+ | | 15 | 110.0716 | C5H8N3+ | | | | | | +/// | | 166 | 166 | | | | 166 | | | | | | | | | | | 3 | 166 | | -55.9284 | | C-2O-2 | | C-2O-2 | +/// | | 138 | 138 | | | | 138 | 138.07 | | | 138.0662 | | | | | C6H8N3O1+ | | 6 | 138.0662 | C6H8N3O1+ | -27.9946 | C-1O-1 | | | C-1O-1 | +/// | | 123 | 123 | | | | 123 | | | | | | | | | | | 3 | 123 | | -12.9284 | | H3O-1 | | H3O-1 | +/// | | 121 | 121 | | | | 121 | | | | | | | | | | | 3 | 121 | | -10.9284 | | H5O-1 | | H5O-1 | +/// | | 82 | 82 | | | | 82 | | | 82.0531 | | | | | C4H6N2+ | | | 5 | 82.0531 | C4H6N2+ | 28.0185 | C1H2N1 | | | C1H2N1 | +/// | I/L/J | 86 | 86 | 86 | 86 | 86 | 86 | 86.1 | 86.10 | 86.0970 | 86.0964 | 86.0970 | | | C5H12N+ | C5H12N1+ | | 13 | 86.0968 | C5H12N+ | | | | | | +/// | | 72 | 72 | | 72 | | 72 | | | | | 72.0449 | | | | | | 5 | 72.0449 | | 14.0519 | | C1H2 | | C1H2 | +/// | | 44 | | | 44 | | 44 | | | | | 44.0500 | | | | | | 4 | 44.0500 | | 42.0468 | | C3H6 | | C3H6 | +/// | K | 101 | 101 | 101 | 101 | 101 | 101 | 101.11 | 101.11 | | 101.1073 | 101.1079 | | | | C5H13N2+ | | 11 | 101.1076 | C5H13N2+ | | | | | | +/// | | 129 | 129 | 129 | 129 | 129 | 129 | 129.1 | 129.11 | | 129.1022 | | 129.1022 | C6H13N2O1+ | | C6H13N2O1+ | | 12 | 129.1022 | C6H13N2O1+ | -27.9946 | C-1O-1 | | | C-1O-1 | +/// | | | | | | | | | | | 126.0913 | | | | | C7H12N1O1+ | | 2 | 126.0913 | C7H12N1O1+ | -24.9837 | C-2H1N1O-1 | | | C-2H1N1O-1 | +/// | | 112 | 112 | | 112 | | 112 | | | | | | | | | | | 4 | 112 | | -10.8924 | | H5O-1 | | H5O-1 | 
+/// | | 84 | 84 | 84 | 84 | 84 | 84 | 84.08 | 84.08 | 84.0813 | 84.0808 | 84.0813 | | | C5H10N1+ | C5H10N1+ | C5H10N1+ | 14 | 84.0811 | C5H10N1+ | 17.0265 | H3N1 | | | H3N1 | +/// | | 70 | 70 | | | | 70 | | | | | | | | | | | 3 | 70 | | 31.1076 | | C1H5N1 | | C1H5N1 | +/// | | | | | | | | | 56.05 | | | 56.0500 | | | | | | 2 | 56.0500 | | 45.0576 | | C2H7N1 | | C2H7N1 | +/// | M | 104 | 104 | 104 | 104 | 104 | 104 | 104.05 | 104.06 | | 104.0528 | 104.0534 | | | | C4H10N1S1+ | | 11 | 104.0531 | C4H10N1S1+ | | | | | | +/// | | | | | | | 70 | | | | | | | | | | | | 70 | | 34.0531 | | H2S1 | | H2S1 | +/// | | 61 | 61 | | | | 61 | | | 61.0112 | | | | | C2H5S1+ | | | 5 | 61.0112 | C2H5S1+ | 43.0419 | C2H3N1 | | | C2H3N1 | +/// | | | | | | | | | | | | | | | | | C3H6N1+ | 1 | ?? | C3H6N1+ | ?? | C1H4S1 | | | C1H4S1 | +/// | F | 120 | 120 | 120 | 120 | 120 | 120 | 120.08 | 120.08 | 120.0813 | 120.0808 | 120.0813 | 120.0808 | C8H10N+ | C8H10N1+ | C8H10N1+ | | 15 | 120.0811 | C8H10N+ | | | | | | +/// | | 91 | 91 | | 91 | | 91 | | | | | 91.0548 | | | | | | 5 | 91.0548 | | 29.0263 | | C1H3N1 | C7H7+ | C1H3N1 | +/// | P | 70 | 70 | 70 | 70 | 70 | 70 | 70.07 | 70.07 | 70.0657 | 70.0651 | 70.0657 | | | C4H8N1+ | C4H8N1+ | | 13 | 70.0655 | C4H8N1+ | | | | | | +/// | | | | 126 | | 126 | | 126.06 | | | 126.055 | | | | | C6H8N1O2+ | | 5 | 126.0550 | C6H8N1O2+ | -55.9895 | C-2O-2 | | | C-2O-2 | +/// | S | 60 | 60 | 60 | 60 | 60 | 60 | 60.04 | 60.04 | | 60.0444 | 60.0449 | | | | C2H6N1O1+ | | 11 | 60.0447 | C2H6N1O1+ | | | | | | +/// | | | | | | | | | | | | | | | | | C2H4N1+ | 1 | ?? | C2H4N1+ | ?? | H2O1 | | | H2O1 | +/// | T | 74 | 74 | 74 | 74 | 74 | 74 | | 74.06 | 74.0606 | 74.0600 | 74.0606 | | | C3H8N1O1+ | C3H8N1O1+ | | 12 | 74.0604 | C3H8N1O1+ | | | | | | +/// | | | | | | | | | | | | | | | | | C3H6N1O1+ | 1 | ?? | C3H6N1O1+ | ?? 
| H2N1 | | | H2N1 | +/// | W | 159 | 159 | 159 | 159 | 159 | 159 | 159.09 | 159.09 | | 159.0917 | 159.0922 | 159.0917 | C10H11N2+ | | C10H11N2+ | | 13 | 159.0919 | C10H11N2+ | | | | | | +/// | | | 171 | | | | 171 | | | | | | | | | | | 2 | 171 | | -11.9081 | | H4O-1 | | H4O-1 | +/// | | 170 | 170 | 170 | | 170 | 170 | | | | | | | | | | | 5 | 170 | | -10.9081 | | H5O-1 | | H5O-1 | +/// | | 132 | | | 132 | | 132 | | 132.08 | | | 132.0813 | | | | | | 5 | 132.0813 | | 27.0106 | | C1H1N1 | | C1H1N1 | +/// | | 130 | 130 | 130 | 130 | 130 | 130 | | 130.07 | | | 130.0657 | | | | | | 8 | 130.0657 | | 29.0262 | | C1H3N1 | | C1H3N1 | +/// | | 117 | 117 | 117 | 117 | 117 | 117 | | | | | 117.0578 | | | | | | 7 | 117.0578 | | 42.0341 | | C2H4N1 | | C2H4N1 | +/// | | 100 | | | | | | | | | | | | | | | | 1 | 100 | | 59.0919 | | C3H9N1/C2H7N2 | | | +/// | | | | | 77 | | 77 | | | | | 77.0391 | | | | | | 3 | 77.0391 | | 82.0528 | | C4H6N2 | C6H5 | C4H6N2 | +/// | | 11 | | | | | | | | | | | | | | | | 1 | 11 | | 148.0919 | | | | | +/// | Y | 136 | 136 | 136 | 136 | 136 | 136 | 136.08 | 136.08 | 136.0762 | 136.0757 | 136.0762 | 136.0757 | C8H10N1O1+ | C8H10N1O1+ | C8H10N1O1+ | | 15 | 136.0760 | C8H10N1O1+ | | | | | | +/// | | 107 | 107 | | 107 | | 107 | | | | | 107.0497 | | | | | | 5 | 107.0497 | | 29.0263 | | C1H3N1 | | C1H3N1 | +/// | | 91 | 91 | | 91 | | 91 | | | | | 91.0548 | | | | | | 5 | 91.0548 | | 45.0212 | | C1H3N1O1 | | C1H3N1O1 | +/// | | | | | | | | | | 55.0184 | | | | | C3H3O1+ | | | 2 | 55.0184 | C3H3O1+ | 81.0576 | C5H7N1 | | | C5H7N1 | +/// | V | 72 | 72 | 72 | 72 | 72 | 72 | 72.08 | 72.08 | 72.0813 | 72.0808 | 72.0813 | | | C4H10N1+ | C4H10N1+ | | 13 | 72.0811 | C4H10N1+ | | | | | | +/// | | 69 | | | 69 | | 69 | | | | | 69.0704 | | | | | | 4 | 69.0704 | | 3.0107 | | C1H1O-1 | | C1H1O-1 | +/// | | 55 | | | 55 | | 55 | | | | | 55.0548 | | | | | | 4 | 55.0548 | | 17.0263 | | H3N1 | | H3N1 | +/// | | 44 | | | | | | | | | | | | | | | | 1 | 44 | | 28.0811 | | C1H2N1 | | C1H2N1 
| +/// | | | | | 41 | | 41 | | | | | 41.0391 | | | | | | 3 | 41.0391 | | 31.0420 | | C1H5N1 | | C1H5N1 | +static IMMONIUM_LOSSES: LazyLock, Vec)>> = LazyLock::new(|| + // TODO: For B/Z there are common immonium ions, but the mass is the same (meaning the loss is different), find a way of representing that + vec![( + vec![AminoAcid::Arginine], vec![ + NeutralLoss::Gain(molecular_formula!(C 2 O 2)), + NeutralLoss::Loss(molecular_formula!(C 1 H 2)), + NeutralLoss::Loss(molecular_formula!(H 3 N 1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1)), + NeutralLoss::Loss(molecular_formula!(C 2 H 2 N 2)), + NeutralLoss::Loss(molecular_formula!(C 3 H 6 N 2)), + NeutralLoss::Loss(molecular_formula!(C 1 H 5 N 3)), + NeutralLoss::Loss(molecular_formula!(C 3 H 4 N 2 O -1)), + NeutralLoss::Loss(molecular_formula!(C 4 H 8 N 1)), + NeutralLoss::Loss(molecular_formula!(C 4 H 10 N 2)), + ]), + (vec![AminoAcid::Asparagine], vec![NeutralLoss::Loss(molecular_formula!(H 3 N 1))]), + (vec![AminoAcid::AsparticAcid, AminoAcid::GlutamicAcid, AminoAcid::Serine], vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))]), + (vec![AminoAcid::Glutamine], vec![ + NeutralLoss::Gain(molecular_formula!(C 1 O 1)), + NeutralLoss::Loss(molecular_formula!(H 3 N 1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1 O 1)), + ]), + (vec![AminoAcid::Histidine], vec![ + NeutralLoss::Gain(molecular_formula!(C 2 O 2)), + NeutralLoss::Gain(molecular_formula!(C 1 O 1)), + NeutralLoss::Loss(molecular_formula!(H 3 O -1)), + NeutralLoss::Loss(molecular_formula!(H 5 O -1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 2 N 1)), + ]), + (vec![AminoAcid::Leucine, AminoAcid::Isoleucine, AminoAcid::AmbiguousLeucine], vec![ + NeutralLoss::Loss(molecular_formula!(C 1 H 2)), + NeutralLoss::Loss(molecular_formula!(C 3 H 6)), + ]), + (vec![AminoAcid::Lysine], vec![ + NeutralLoss::Gain(molecular_formula!(C 1 O 1)), + NeutralLoss::Loss(molecular_formula!(C -2 H 1 N 1 O -1)), + NeutralLoss::Loss(molecular_formula!(H 5 O -1)), 
+ NeutralLoss::Loss(molecular_formula!(H 3 N 1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 5 N 1)), + NeutralLoss::Loss(molecular_formula!(C 2 H 7 N 1)), + ]), + (vec![AminoAcid::Methionine], vec![ + NeutralLoss::Loss(molecular_formula!(H 2 S 1)), + NeutralLoss::Loss(molecular_formula!(C 2 H 3 N 1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 4 S 1)), + ]), + (vec![AminoAcid::Phenylalanine], vec![NeutralLoss::Gain(molecular_formula!(C 2 O 2))]), + (vec![AminoAcid::Threonine], vec![NeutralLoss::Loss(molecular_formula!(H 2 N 1))]), + (vec![AminoAcid::Tryptophan], vec![ + NeutralLoss::Loss(molecular_formula!(H 4 O -1)), + NeutralLoss::Loss(molecular_formula!(H 5 O -1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 1 N 1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1)), + NeutralLoss::Loss(molecular_formula!(C 2 H 4 N 1)), + NeutralLoss::Loss(molecular_formula!(C 4 H 6 N 2)), + ]), + (vec![AminoAcid::Tyrosine], vec![ + NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 3 N 1 O 1)), + NeutralLoss::Loss(molecular_formula!(C 5 H 7 N 1)), + ]), + (vec![AminoAcid::Valine], vec![ + NeutralLoss::Loss(molecular_formula!(C 1 H 1 O -1)), + NeutralLoss::Loss(molecular_formula!(H 3 N 1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 2 N 1)), + NeutralLoss::Loss(molecular_formula!(C 1 H 5 N 1)), + ]), + ]); + +pub fn immonium_losses() -> &'static Vec<(Vec, Vec)> { + LazyLock::force(&IMMONIUM_LOSSES) +} + +/// Generate all uncharged diagnostic ions for this monosaccharide. +/// According to: . 
+static GLYCAN_LOSSES: LazyLock)>> =
+    LazyLock::new(|| {
+        vec![
+            // Unsubstituted hexose
+            (
+                MonoSaccharide::new(BaseSugar::Hexose(None), &[]),
+                false,
+                vec![
+                    NeutralLoss::Loss(molecular_formula!(H 2 O 1)),
+                    NeutralLoss::Loss(molecular_formula!(H 4 O 2)),
+                    NeutralLoss::Loss(molecular_formula!(C 1 H 6 O 3)),
+                    NeutralLoss::Loss(molecular_formula!(C 2 H 6 O 3)),
+                ],
+            ),
+            // Hexose with an N-acetyl substituent
+            (
+                MonoSaccharide::new(BaseSugar::Hexose(None), &[GlycanSubstituent::NAcetyl]),
+                false,
+                vec![
+                    NeutralLoss::Loss(molecular_formula!(H 2 O 1)),
+                    NeutralLoss::Loss(molecular_formula!(H 4 O 2)),
+                    NeutralLoss::Loss(molecular_formula!(C 2 H 4 O 2)),
+                    NeutralLoss::Loss(molecular_formula!(C 1 H 6 O 3)),
+                    NeutralLoss::Loss(molecular_formula!(C 2 H 6 O 3)),
+                    NeutralLoss::Loss(molecular_formula!(C 4 H 8 O 4)),
+                ],
+            ),
+            // NeuAc: only a water loss
+            (
+                MonoSaccharide::new(
+                    BaseSugar::Nonose(None),
+                    &[
+                        GlycanSubstituent::Amino,
+                        GlycanSubstituent::Acetyl,
+                        GlycanSubstituent::Acid,
+                    ],
+                )
+                .with_name("NeuAc"),
+                false,
+                vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))],
+            ),
+            // NeuGc: only a water loss
+            (
+                MonoSaccharide::new(
+                    BaseSugar::Nonose(None),
+                    &[
+                        GlycanSubstituent::Amino,
+                        GlycanSubstituent::Glycolyl,
+                        GlycanSubstituent::Acid,
+                    ],
+                )
+                .with_name("NeuGc"),
+                false,
+                vec![NeutralLoss::Loss(molecular_formula!(H 2 O 1))],
+            ),
+        ]
+    });
+
+/// Get the built-in table of monosaccharide specific neutral losses, building it on first use.
+/// Every entry holds the monosaccharide, a boolean flag (`false` for all built-in entries), and
+/// the neutral losses for that monosaccharide.
+pub fn glycan_losses() -> &'static Vec<(MonoSaccharide, bool, Vec)> {
+    LazyLock::force(&GLYCAN_LOSSES)
+}
diff --git a/rustyms/src/model/charge.rs b/rustyms/src/model/charge.rs
new file mode 100644
index 00000000..0ed155f1
--- /dev/null
+++ b/rustyms/src/model/charge.rs
@@ -0,0 +1,76 @@
+use std::ops::RangeInclusive;
+
+use serde::{Deserialize, Serialize};
+
+use crate::system::{e, isize::Charge};
+
+/// Control what charges are allowed for an ion series. Defined as an inclusive range.
+/// Any charge above the precursor charge will result in the quotient times the precursor
+/// charge carriers + all options for the remainder within the limits of the precursor
+/// charge.
+#[non_exhaustive]
+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct ChargeRange {
+    /// Start point (inclusive), resolved charges below 1 are raised to 1
+    pub start: ChargePoint,
+    /// End point (inclusive)
+    pub end: ChargePoint,
+}
+
+impl ChargeRange {
+    /// Get the number of possible charges for the given precursor charge.
+    pub fn len(&self, precursor: Charge) -> usize {
+        // Count the same inclusive range that `charges_iter` yields, so `1..=1` is 1 and a
+        // range that ends before it starts is 0.
+        self.charges_iter(precursor).count()
+    }
+
+    /// Get all possible charges for the given precursor charge as an inclusive range.
+    pub fn charges(&self, precursor: Charge) -> RangeInclusive {
+        Charge::new::(self.start.to_absolute(precursor).value.max(1))
+            ..=self.end.to_absolute(precursor)
+    }
+
+    /// Get all possible charges for the given precursor charge as an iterator.
+    pub fn charges_iter(
+        &self,
+        precursor: Charge,
+    ) -> impl DoubleEndedIterator + Clone {
+        (self.start.to_absolute(precursor).value.max(1)..=self.end.to_absolute(precursor).value)
+            .map(Charge::new::)
+    }
+
+    /// Solely single charged
+    pub const ONE: Self = Self {
+        start: ChargePoint::Absolute(1),
+        end: ChargePoint::Absolute(1),
+    };
+    /// Only the exact precursor charge
+    pub const PRECURSOR: Self = Self {
+        start: ChargePoint::Relative(0),
+        end: ChargePoint::Relative(0),
+    };
+    /// Range from 1 to the precursor
+    pub const ONE_TO_PRECURSOR: Self = Self {
+        start: ChargePoint::Absolute(1),
+        end: ChargePoint::Relative(0),
+    };
+}
+
+/// A reference point for charge range definition.
+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord, Serialize, Deserialize)]
+pub enum ChargePoint {
+    /// Relative to the precursor, with the given offset.
+    Relative(isize),
+    /// Absolute charge.
+    Absolute(isize),
+}
+
+impl ChargePoint {
+    /// Get the absolute charge of this charge point given a precursor charge.
+    /// Relative points are the precursor charge plus the stored offset.
+    fn to_absolute(self, precursor: Charge) -> Charge {
+        match self {
+            Self::Absolute(a) => Charge::new::(a),
+            Self::Relative(r) => Charge::new::(precursor.value + r),
+        }
+    }
+}
diff --git a/rustyms/src/model/fragmentation.rs b/rustyms/src/model/fragmentation.rs
new file mode 100644
index 00000000..c46d096b
--- /dev/null
+++ b/rustyms/src/model/fragmentation.rs
@@ -0,0 +1,539 @@
+//! Handle model instantiation.
+
+use itertools::Itertools;
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    fragment::PeptidePosition,
+    model::{ChargeRange, GlycanModel},
+    AminoAcid, MultiChemical, NeutralLoss, Peptidoform, SequenceElement,
+};
+
+/// A model for the fragmentation, allowing control over what theoretical fragments to generate.
+#[non_exhaustive]
+#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
+pub struct FragmentationModel {
+    /// a series ions
+    pub a: PrimaryIonSeries,
+    /// b series ions
+    pub b: PrimaryIonSeries,
+    /// c series ions
+    pub c: PrimaryIonSeries,
+    /// d series ions (side chain fragmentation from a)
+    pub d: SatelliteIonSeries,
+    /// v series ions (full side chain broken off from y)
+    pub v: SatelliteIonSeries,
+    /// w series ions (side chain fragmentation from z)
+    pub w: SatelliteIonSeries,
+    /// x series ions
+    pub x: PrimaryIonSeries,
+    /// y series ions
+    pub y: PrimaryIonSeries,
+    /// z series ions
+    pub z: PrimaryIonSeries,
+    /// precursor ions: standard neutral losses, amino acid specific neutral losses, amino acid side chain losses, and charge range
+    pub precursor: (
+        Vec,
+        Vec<(Vec, Vec)>,
+        (u8, Option>),
+        ChargeRange,
+    ),
+    /// immonium ions, with the allowed charge range and the amino acid specific neutral losses
+    pub immonium: Option<(ChargeRange, Vec<(Vec, Vec)>)>,
+    /// If the neutral losses specific for modifications should be generated
+    pub modification_specific_neutral_losses: bool,
+    /// If the diagnostic ions specific for modifications should be generated with the allowed charge range
+    pub modification_specific_diagnostic_ions: Option,
+    /// Glycan fragmentation
+    pub glycan: GlycanModel,
+    /// Allow any MS cleavable cross-link to be cleaved
+    pub allow_cross_link_cleavage: bool,
+}
+
+/// The settings for any satellite ion series
+#[derive(Clone, PartialEq, Eq, Hash, Debug, Serialize, Deserialize)]
+pub struct SatelliteIonSeries {
+    /// Which locations are assumed to lead to fragmentation
+    pub location: SatelliteLocation,
+    /// The allowed neutral losses
+    pub neutral_losses: Vec,
+    /// The allowed amino acid specific neutral losses
+    pub amino_acid_neutral_losses: Vec<(Vec, Vec)>,
+    /// The maximal number of side chain lost and the amino acids from which the side chains can be lost (or all if no selection is given)
+    pub amino_acid_side_chain_losses: (u8, Option>),
+    /// The allowed charges
+    pub charge_range: ChargeRange,
+    /// The allowed ion variants (e.g. w vs w+1 vs w-1)
+    pub allowed_variants: Vec,
+}
+
+impl SatelliteIonSeries {
+    /// A default value where the base is `Some(0)` and no amino acid specific rules are set
+    #[must_use]
+    pub fn base() -> Self {
+        Self {
+            location: SatelliteLocation {
+                rules: Vec::new(),
+                base: Some(0),
+            },
+            ..Self::default()
+        }
+    }
+
+    /// Replace the location
+    #[must_use]
+    pub fn location(self, location: SatelliteLocation) -> Self {
+        Self { location, ..self }
+    }
+    /// Replace the neutral losses
+    #[must_use]
+    pub fn neutral_losses(self, neutral_losses: Vec) -> Self {
+        Self {
+            neutral_losses,
+            ..self
+        }
+    }
+    /// Replace the amino acid specific neutral losses
+    #[must_use]
+    pub fn amino_acid_neutral_losses(
+        self,
+        amino_acid_neutral_losses: Vec<(Vec, Vec)>,
+    ) -> Self {
+        Self {
+            amino_acid_neutral_losses,
+            ..self
+        }
+    }
+    /// Replace the amino acid side chain losses
+    #[must_use]
+    pub fn amino_acid_side_chain_losses(
+        self,
+        amino_acid_side_chain_losses: (u8, Option>),
+    ) -> Self {
+        Self {
+            amino_acid_side_chain_losses,
+            ..self
+        }
+    }
+    /// Replace the charge range
+    #[must_use]
+    pub fn charge_range(self, charge_range: ChargeRange) -> Self {
+        Self {
+            charge_range,
+            ..self
+        }
+    }
+    /// Replace the allowed variants, e.g. w vs w+1 vs w-1
+    #[must_use]
+    pub fn variants(self, allowed_variants: Vec) -> Self {
+        Self {
+            allowed_variants,
+            ..self
+        }
+    }
+}
+
+impl std::default::Default for SatelliteIonSeries {
+    /// Default location, no losses, charges from 1 to the precursor, and only variant `0`
+    fn default() -> Self {
+        Self {
+            location: SatelliteLocation::default(),
+            neutral_losses: Vec::new(),
+            amino_acid_neutral_losses: Vec::new(),
+            amino_acid_side_chain_losses: (0, None),
+            charge_range: ChargeRange::ONE_TO_PRECURSOR,
+            allowed_variants: vec![0],
+        }
+    }
+}
+
+/// The settings for any primary ion series
+#[derive(Clone, PartialEq, Eq, Hash, Debug, Serialize, Deserialize)]
+pub struct PrimaryIonSeries {
+    /// Which locations are assumed to lead to fragmentation
+    pub location: Location,
+    /// The allowed neutral losses
+    pub neutral_losses: Vec,
+    /// The allowed amino acid specific neutral losses
+    pub amino_acid_neutral_losses: Vec<(Vec, Vec)>,
+    /// The maximal number of side chain lost and the amino acids from which the side chains can be lost (or all if no selection is given)
+    pub amino_acid_side_chain_losses: (u8, Option>),
+    /// The allowed charges
+    pub charge_range: ChargeRange,
+    /// The allowed ion variants (e.g. a vs a+1 vs a+2)
+    pub allowed_variants: Vec,
+}
+
+impl PrimaryIonSeries {
+    /// Generate a new empty series, identical to the default except that no location is allowed
+    pub fn none() -> Self {
+        Self {
+            location: Location::None,
+            ..Default::default()
+        }
+    }
+
+    /// Replace the location
+    #[must_use]
+    pub fn location(self, location: Location) -> Self {
+        Self { location, ..self }
+    }
+    /// Replace the neutral losses
+    #[must_use]
+    pub fn neutral_losses(self, neutral_losses: Vec) -> Self {
+        Self {
+            neutral_losses,
+            ..self
+        }
+    }
+    /// Replace the amino acid specific neutral losses
+    #[must_use]
+    pub fn amino_acid_neutral_losses(
+        self,
+        amino_acid_neutral_losses: Vec<(Vec, Vec)>,
+    ) -> Self {
+        Self {
+            amino_acid_neutral_losses,
+            ..self
+        }
+    }
+    /// Replace the amino acid side chain losses
+    #[must_use]
+    pub fn amino_acid_side_chain_losses(
+        self,
+        amino_acid_side_chain_losses: (u8, Option>),
+    ) -> Self {
+        Self {
+            amino_acid_side_chain_losses,
+            ..self
+        }
+    }
+    /// Replace the charge range
+    #[must_use]
+    pub fn charge_range(self, charge_range: ChargeRange) -> Self {
+        Self {
+            charge_range,
+            ..self
+        }
+    }
+    /// Replace the allowed variants, e.g. a vs a+1 vs a+2
+    #[must_use]
+    pub fn variants(self, allowed_variants: Vec) -> Self {
+        Self {
+            allowed_variants,
+            ..self
+        }
+    }
+}
+
+impl std::default::Default for PrimaryIonSeries {
+    /// All locations, no losses, charges from 1 to the precursor, and only variant `0`
+    fn default() -> Self {
+        Self {
+            location: Location::All,
+            neutral_losses: Vec::new(),
+            amino_acid_neutral_losses: Vec::new(),
+            amino_acid_side_chain_losses: (0, None),
+            charge_range: ChargeRange::ONE_TO_PRECURSOR,
+            allowed_variants: vec![0],
+        }
+    }
+}
+
+/// Builder style methods
+impl FragmentationModel {
+    /// Set a
+    #[must_use]
+    pub fn a(self, a: PrimaryIonSeries) -> Self {
+        Self { a, ..self }
+    }
+    /// Set b
+    #[must_use]
+    pub fn b(self, b: PrimaryIonSeries) -> Self {
+        Self { b, ..self }
+    }
+    /// Set c
+    #[must_use]
+    pub fn c(self, c: PrimaryIonSeries) -> Self {
+        Self { c, ..self }
+    }
+    /// Set d
+    #[must_use]
+    pub fn d(self, d: SatelliteIonSeries) -> Self {
+        Self { d, ..self }
+    }
+    /// Set v
+    #[must_use]
+    pub fn v(self, v: SatelliteIonSeries) -> Self {
+        Self { v, ..self }
+    }
+    /// Set w
+    #[must_use]
+    pub fn w(self, w: SatelliteIonSeries) -> Self {
+        Self { w, ..self }
+    }
+    /// Set x
+    #[must_use]
+    pub fn x(self, x: PrimaryIonSeries) -> Self {
+        Self { x, ..self }
+    }
+    /// Set y
+    #[must_use]
+    pub fn y(self, y: PrimaryIonSeries) -> Self {
+        Self { y, ..self }
+    }
+    /// Set z
+    #[must_use]
+    pub fn z(self, z: PrimaryIonSeries) -> Self {
+        Self { z, ..self }
+    }
+    /// Set glycan
+    #[must_use]
+    pub fn glycan(self, glycan: GlycanModel) -> Self {
+        Self { glycan, ..self }
+    }
+    /// Overwrite the precursor settings: the neutral losses, amino acid specific neutral losses,
+    /// amino acid side chain losses, and charge range
+    #[must_use]
+    pub fn precursor(
+        self,
+        neutral_loss: Vec,
+        amino_acid_specific_neutral_losses: Vec<(Vec, Vec)>,
+        amino_acid_side_chain_losses: (u8, Option>),
+        charges: ChargeRange,
+    ) -> Self {
+        Self {
+            precursor: (
+                neutral_loss,
+                amino_acid_specific_neutral_losses,
+                amino_acid_side_chain_losses,
+                charges,
+            ),
+            ..self
+        }
+    }
+    /// Set immonium, `None` stores no immonium settings
+    #[must_use]
+    pub fn immonium(
+        self,
+        state: Option<(ChargeRange, Vec<(Vec, Vec)>)>,
+    ) -> Self {
+        Self {
+            immonium: state,
+            ..self
+        }
+    }
+    /// Set modification specific neutral losses
+    #[must_use]
+    pub fn modification_specific_neutral_losses(self, state: bool) -> Self {
+        Self {
+            modification_specific_neutral_losses: state,
+            ..self
+        }
+    }
+    /// Set modification specific diagnostic ions
+    #[must_use]
+    pub fn modification_specific_diagnostic_ions(self, state: Option) -> Self {
+        Self {
+            modification_specific_diagnostic_ions: state,
+            ..self
+        }
+    }
+    /// Set whether MS cleavable cross-links are allowed to be cleaved
+    #[must_use]
+    pub fn allow_cross_link_cleavage(self, state: bool) -> Self {
+        Self {
+            allow_cross_link_cleavage: state,
+            ..self
+        }
+    }
+}
+
+/// A location, or range of locations where an ion can be generated
+#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Default, Debug, Serialize, Deserialize)]
+pub enum Location {
+    /// Skip the given number from the N terminal side
+    SkipN(usize),
+    /// Skip the given number of aminoacids from the N terminal and C terminal side respectively, only using the positions between these two
+    SkipNC(usize, usize),
+    /// Skip a certain number and then take a certain number of aminoacids
+    TakeN {
+        /// Skip this number of aminoacids
+        skip: usize,
+        /// Take this number of aminoacids
+        take: usize,
+    },
+    /// Skip a given number from the C terminal side
+    SkipC(usize),
+    /// Take a given number of aminoacids from the C terminal side
+    TakeC(usize),
+    /// All positions (including 0 and len-1)
+    All,
+    /// Do not allow it anywhere
+    #[default]
+    None,
+}
+
+impl Location {
+    /// Determine if an ion is possible on this location
+    /// # Panics
+    /// If the peptide position is a terminal position
+    pub const fn possible(&self, position: PeptidePosition) -> bool {
+        let crate::SequencePosition::Index(sequence_index) = position.sequence_index else {
+            panic!("Not allowed to call possible with a terminal PeptidePosition")
+        };
+        match self {
+            Self::SkipN(n) => sequence_index >= *n,
+            Self::SkipNC(n, c) => {
+                sequence_index >= *n && position.sequence_length - sequence_index > *c
+            }
+            Self::TakeN { skip, take } => sequence_index >= *skip && sequence_index < *skip + *take,
+            Self::SkipC(n) => position.sequence_length - sequence_index > *n,
+            Self::TakeC(n) => position.sequence_length - sequence_index <= *n,
+            // Only the position where the series number equals the full length is rejected
+            Self::All => position.series_number != position.sequence_length,
+            Self::None => false,
+        }
+    }
+}
+
+/// The locations for a satellite ion. These are defined to form for any location where the parent
+/// ion forms. And the maximal distance from the original backbone cleavage can be defined.
+#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Default, Debug, Serialize, Deserialize)]
+pub struct SatelliteLocation {
+    /// A set of rules each indicate a set of aminoacids and the (0 based) distance they can occur
+    /// away from the original cleavage.
+    pub rules: Vec<(Vec, u8)>,
+    /// The base distance that this satellite ion can occur for any amino acid undefined in the
+    /// specific rules.
+    pub base: Option,
+}
+
+impl SatelliteLocation {
+    /// Determine if a satellite ion is possible on this location.
+    /// Returns every amino acid in reach of the position together with its distance, the first
+    /// rule that lists an amino acid decides for it, all other amino acids fall back to the base.
+    /// # Panics
+    /// If the peptide position is a terminal position
+    pub fn possible(
+        &self,
+        position: PeptidePosition,
+        peptidoform: &Peptidoform,
+        c_terminal: bool,
+    ) -> Vec<(AminoAcid, u8)> {
+        let crate::SequencePosition::Index(sequence_index) = position.sequence_index else {
+            panic!("Not allowed to call possible with a terminal PeptidePosition")
+        };
+        let mut output = Vec::new();
+        // The furthest distance any rule (or the base) allows, plus one, no settings means no ions
+        let max_distance = match (self.base, self.rules.iter().map(|r| r.1).max()) {
+            (Some(b), Some(r)) => b.max(r) + 1,
+            (Some(b), None) => b + 1,
+            (None, Some(r)) => r + 1,
+            (None, None) => return Vec::new(),
+        };
+        // NOTE(review): `peptidoform.len() - 1` assumes a non empty peptidoform and keeps the
+        // last residue out of the C terminal window, confirm both are intended
+        let range = if c_terminal {
+            sequence_index
+                ..sequence_index
+                    .saturating_add(max_distance as usize)
+                    .min(peptidoform.len() - 1)
+        } else {
+            sequence_index.saturating_sub(max_distance as usize)..sequence_index
+        };
+        for (index, seq) in peptidoform[range].iter().enumerate() {
+            // NOTE(review): on the N terminal side the distance is derived from `max_distance`,
+            // which looks too large when `saturating_sub` above shortened the window, confirm
+            if let Ok(distance) = u8::try_from(if c_terminal {
+                index
+            } else {
+                max_distance as usize - index
+            }) {
+                let mut allowed = None;
+                for rule in &self.rules {
+                    if rule.0.contains(&seq.aminoacid.aminoacid()) {
+                        allowed = Some(distance <= rule.1);
+                        break;
+                    }
+                }
+                if allowed
+                    .or_else(|| self.base.map(|b| distance <= b))
+                    .is_some_and(|a| a)
+                {
+                    output.push((seq.aminoacid.aminoacid(), distance));
+                }
+            }
+        }
+        output
+    }
+}
+
+/// Get all possible side chain losses for a given stretch of amino acids. The number indicates
+/// the maximum total side chains lost, the selection restricts the set of amino acids that can
+/// lose their side chain, if this is None all amino acids can lose their side chain.
+///
+/// This might generate non sensible options for satellite ions (e.g. double side chain loss for v)
+pub(crate) fn get_all_sidechain_losses(
+    slice: &[SequenceElement],
+    settings: &(u8, Option>),
+) -> Vec> {
+    if settings.0 == 0 {
+        Vec::new()
+    } else {
+        // One loss per unique allowed amino acid (and per formula of that amino acid)
+        let options: Vec = slice
+            .iter()
+            .filter_map(|seq| {
+                settings
+                    .1
+                    .as_ref()
+                    .is_none_or(|aa| aa.contains(&seq.aminoacid.aminoacid()))
+                    .then_some(seq.aminoacid.aminoacid())
+            })
+            .unique()
+            .flat_map(|aa| {
+                aa.formulas()
+                    .iter()
+                    .map(|f| {
+                        // Presumably C2H3NO is the backbone shared by all residues, leaving only the side chain
+                        NeutralLoss::SideChainLoss(f - molecular_formula!(H 3 C 2 N 1 O 1), aa)
+                    })
+                    .filter(|l| !l.is_empty())
+                    .collect::>()
+            })
+            .collect();
+        // All combinations of 1 up to the maximal number of simultaneous losses
+        (1..=settings.0)
+            .flat_map(|k| options.iter().combinations(k as usize))
+            .map(|o| o.into_iter().cloned().collect_vec())
+            .collect()
+    }
+}
+
+#[test]
+#[allow(clippy::missing_panics_doc)]
+fn side_chain_losses() {
+    let peptide = Peptidoform::pro_forma("FGGGTKLELKR", None)
+        .unwrap()
+        .into_simple_linear()
+        .unwrap();
+    assert_eq!(
+        0,
+        get_all_sidechain_losses(peptide.sequence(), &(0, None)).len()
+    );
+    assert_eq!(
+        1,
+        get_all_sidechain_losses(
+            peptide.sequence(),
+            &(1, Some(vec![AminoAcid::Phenylalanine]))
+        )
+        .len()
+    );
+    assert_eq!(
+        0,
+        get_all_sidechain_losses(peptide.sequence(), &(1, Some(vec![AminoAcid::Glycine]))).len()
+    );
+    assert_eq!(
+        1,
+        get_all_sidechain_losses(peptide.sequence(), &(1, Some(vec![AminoAcid::Leucine]))).len()
+    );
+    assert_eq!(
+        6,
+        get_all_sidechain_losses(peptide.sequence(), &(1, None)).len()
+    );
+    assert_eq!(
+        3,
+        dbg!(get_all_sidechain_losses(
+            peptide.sequence(),
+            &(2, Some(vec![AminoAcid::Phenylalanine, AminoAcid::Leucine]))
+        ))
+        .len()
+    );
+}
diff --git a/rustyms/src/model/glycan.rs b/rustyms/src/model/glycan.rs
new file mode 100644
index 00000000..5bb9f5f7
--- /dev/null
+++ b/rustyms/src/model/glycan.rs
@@ -0,0 +1,232 @@
+use std::{
+    collections::{BTreeMap, HashMap},
+    hash::Hash,
+    ops::RangeInclusive,
+};
+
+use itertools::Itertools;
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    fragment::FragmentKind, glycan::MonoSaccharide, model::ChargeRange, AminoAcid, NeutralLoss,
+};
+
+use super::built_in::glycan_losses;
+
+/// The settings for glycan fragmentation
+#[derive(Clone, PartialEq, Eq, Hash, Debug, Serialize, Deserialize)]
+pub struct GlycanModel {
+    /// Allows fragments from glycans with defined structures (i.e. GNO modifications)
+    pub allow_structural: bool,
+    /// Allows fragments from glycans where only the composition is known (i.e. `Glycan:Hex1`).
+    /// This allows any fragment containing any number of monosaccharides within this range.
+    pub compositional_range: RangeInclusive,
+    /// The allowed neutral losses
+    pub neutral_losses: Vec,
+    /// Allowed neutral losses on diagnostic ions based on monosaccharides with a flag to indicate precise isomeric state matching
+    pub specific_neutral_losses: Vec<(MonoSaccharide, bool, Vec)>,
+    /// Glycan fragmentation on peptide fragments when no rules apply
+    pub default_peptide_fragment: GlycanPeptideFragment,
+    /// Peptide fragment rules, each rule lists the attachment amino acids, the fragment kinds it
+    /// applies to (empty to replace the default for these amino acids), and the fragmentation
+    pub peptide_fragment_rules: Vec<(Vec, Vec, GlycanPeptideFragment)>,
+    /// The allowed charges for oxonium ions (B, internal fragments etc)
+    pub oxonium_charge_range: ChargeRange,
+    /// The allowed charges for other glycan fragments (Y)
+    pub other_charge_range: ChargeRange,
+}
+
+impl GlycanModel {
+    /// Sets the status of glycan fragments from structural modifications
+    #[must_use]
+    pub fn allow_structural(self, allow_structural: bool) -> Self {
+        Self {
+            allow_structural,
+            ..self
+        }
+    }
+    /// Set the range of monosaccharides that can result in composition fragments, see [`Self::compositional_range`].
+    #[must_use]
+    pub fn compositional_range(self, compositional_range: RangeInclusive) -> Self {
+        Self {
+            compositional_range,
+            ..self
+        }
+    }
+    /// Replace the neutral losses
+    #[must_use]
+    pub fn neutral_losses(self, neutral_losses: Vec) -> Self {
+        Self {
+            neutral_losses,
+            ..self
+        }
+    }
+    /// Replace the charge range for oxonium ions (B, internal fragments etc)
+    #[must_use]
+    pub fn oxonium_charge_range(self, oxonium_charge_range: ChargeRange) -> Self {
+        Self {
+            oxonium_charge_range,
+            ..self
+        }
+    }
+    /// Replace the charge range for other glycan ions (Y etc)
+    #[must_use]
+    pub fn other_charge_range(self, other_charge_range: ChargeRange) -> Self {
+        Self {
+            other_charge_range,
+            ..self
+        }
+    }
+    /// Replace the default rules for glycans on peptide fragments
+    #[must_use]
+    pub fn default_peptide_fragment(self, default_peptide_fragment: GlycanPeptideFragment) -> Self {
+        Self {
+            default_peptide_fragment,
+            ..self
+        }
+    }
+    /// Replace the specific rules for glycans on peptide fragments
+    #[must_use]
+    pub fn peptide_fragment_rules(
+        self,
+        peptide_fragment_rules: Vec<(Vec, Vec, GlycanPeptideFragment)>,
+    ) -> Self {
+        Self {
+            peptide_fragment_rules,
+            ..self
+        }
+    }
+    /// Default set for models that allow glycan fragmentation: structural fragments,
+    /// compositional fragments of 1 to 10 monosaccharides, the built-in monosaccharide specific
+    /// losses, and core and free glycan fragments on peptide fragments
+    pub fn default_allow() -> Self {
+        Self {
+            allow_structural: true,
+            compositional_range: 1..=10,
+            neutral_losses: Vec::new(),
+            specific_neutral_losses: glycan_losses().clone(),
+            default_peptide_fragment: GlycanPeptideFragment::CORE_AND_FREE,
+            peptide_fragment_rules: Vec::new(),
+            oxonium_charge_range: ChargeRange::ONE,
+            other_charge_range: ChargeRange::ONE_TO_PRECURSOR,
+        }
+    }
+    /// Default set for models that disallow glycan fragmentation
+    pub const DISALLOW: Self = Self {
+        allow_structural: false,
+        compositional_range: 0..=0,
+        neutral_losses: Vec::new(),
+        specific_neutral_losses: Vec::new(),
+        default_peptide_fragment: GlycanPeptideFragment::FULL,
+        peptide_fragment_rules: Vec::new(),
+        oxonium_charge_range: ChargeRange::ONE,
+        other_charge_range: ChargeRange::ONE_TO_PRECURSOR,
+    };
+
+    /// Get the possible glycan peptide fragments based on this attachment location.
+    /// This simplifies the rules somewhat to mostly contain unique rules in the fragment specific
+    /// rules. But it is not guaranteed to be fully unique.
+    ///
+    /// The first element is the fallback: the first rule for this attachment without any fragment
+    /// kinds, or the model default. The second element maps every fragment kind named by a rule
+    /// for this attachment to the combination of all rules naming that kind.
+    pub fn get_peptide_fragments(
+        &self,
+        attachment: Option,
+    ) -> (
+        GlycanPeptideFragment,
+        HashMap,
+    ) {
+        let base = self
+            .peptide_fragment_rules
+            .iter()
+            .find(|rule| attachment.is_some_and(|a| rule.0.contains(&a)) && rule.1.is_empty())
+            .map_or(self.default_peptide_fragment, |rule| rule.2);
+        (
+            base,
+            self.peptide_fragment_rules
+                .iter()
+                .filter(|rule| attachment.is_some_and(|a| rule.0.contains(&a)))
+                .flat_map(|rule| rule.1.iter().map(|f| (f, rule.2)))
+                .into_group_map()
+                .into_iter()
+                // Every group holds at least one setting, so indexing the first one cannot fail
+                .map(|(f, settings)| (*f, settings.iter().fold(settings[0], |acc, v| acc + v)))
+                .collect(),
+        )
+    }
+}
+
+/// Rules to determine the glycan fragmentation for glycans on other fragments.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Serialize, Deserialize)]
+pub struct GlycanPeptideFragment {
+    /// The full glycan stays attached
+    full: bool,
+    /// The glycan fragments and any number of monosaccharides within the range (min, max, inclusive) stay attached (any fucoses on these fragments are always included and do not count towards the limit)
+    core: Option<(u8, u8)>,
+}
+
+/// Combine two rules: full is allowed if either allows it, and the core range spans both core
+/// ranges (smallest minimum to largest maximum), or is the only one present.
+impl std::ops::Add<&Self> for GlycanPeptideFragment {
+    type Output = Self;
+    fn add(self, rhs: &Self) -> Self::Output {
+        Self {
+            full: self.full || rhs.full,
+            core: self
+                .core
+                .and_then(|c| rhs.core.map(|r| (c, r)))
+                .map(|(c, r)| (c.0.min(r.0), c.1.max(r.1)))
+                .or(self.core)
+                .or(rhs.core),
+        }
+    }
+}
+
+impl GlycanPeptideFragment {
+    /// A default model that only allows full fragments
+    pub const FULL: Self = Self {
+        full: true,
+        core: None,
+    };
+    /// A default model that only allows core and free (0..=1)
+    pub const CORE_AND_FREE: Self = Self {
+        full: false,
+        core: Some((0, 1)),
+    };
+    /// A default model that only allows core (1..=1)
+    pub const CORE: Self = Self {
+        full: false,
+        core: Some((1, 1)),
+    };
+    /// A default model that only allows free (0..=0)
+    pub const FREE: Self = Self {
+        full: false,
+        core: Some((0, 0)),
+    };
+
+    /// Get if this model allows full glycans on peptide fragments
+    pub const fn full(self) -> bool {
+        self.full
+    }
+
+    /// Get if this model allows core fragments on peptide fragments and the depth range of those fragments
+    pub fn core(self) -> Option> {
+        self.core.map(|(min, max)| min..=max)
+    }
+}
+
+// impl GlycanPeptideFragment {
+//     /// Check if this fragment contains anything not contained in the other fragment. None if this
+//     /// fragment is a subset of other, Some with the exclusive options if this fragment is disjoint
+//     /// or not a true subset of other. If the range of other fully fits inside the self range (so
+//     /// subtraction would result in two disjoint ranges for self) the full range of self is returned.
+//     fn simplify(self, other: Self) -> Option {
+//         match (self, other) {
+//             (Self::Full, Self::Full) => None,
+//             (Self::Full, Self::Core(_, _)) => Some(self),
+//             (Self::Core(_, _), Self::Full) => Some(self),
+//             (Self::Core(mins, maxs), Self::Core(mino, maxo)) => {
+//                 if mins >= mino || maxs <= maxo {
+//                     None
+//                 } else if mins < mino && maxs <= maxo {
+//                     Some(Self::Core(mins, mino))
+//                 } else if mins >= mino && maxs > maxo {
+//                     Some(Self::Core(maxo, maxs))
+//                 } else {
+//                     Some(self)
+//                 }
+//             }
+//         }
+//     }
+// }
diff --git a/rustyms/src/model/mod.rs b/rustyms/src/model/mod.rs
new file mode 100644
index 00000000..22aeadca
--- /dev/null
+++ b/rustyms/src/model/mod.rs
@@ -0,0 +1,14 @@
+//! Handle parameters for fragmentation and matching
+
+mod built_in;
+mod charge;
+mod fragmentation;
+mod glycan;
+mod parameters;
+mod possible_ions;
+
+pub use charge::*;
+pub use fragmentation::*;
+pub use glycan::*;
+pub use parameters::*;
+pub use possible_ions::*;
diff --git a/rustyms/src/model/parameters.rs b/rustyms/src/model/parameters.rs
new file mode 100644
index 00000000..0b7634c7
--- /dev/null
+++ b/rustyms/src/model/parameters.rs
@@ -0,0 +1,43 @@
+use std::ops::RangeInclusive;
+
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    system::{mz, MassOverCharge},
+    Tolerance,
+};
+
+/// Parameters for the matching, allowing control over when a match is allowed.
+#[non_exhaustive]
+#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
+pub struct MatchingParameters {
+    /// The matching tolerance
+    pub tolerance: Tolerance,
+    /// The range in which fragments fall, can be used to limit the theoretical fragments to a known window
+    pub mz_range: RangeInclusive,
+}
+
+impl MatchingParameters {
+    /// Set the tolerance, anything that converts into a tolerance is accepted
+    #[must_use]
+    pub fn tolerance(self, tolerance: impl Into>) -> Self {
+        Self {
+            tolerance: tolerance.into(),
+            ..self
+        }
+    }
+    /// Set the mz range
+    #[must_use]
+    pub fn mz_range(self, mz_range: RangeInclusive) -> Self {
+        Self { mz_range, ..self }
+    }
+}
+
+impl Default for MatchingParameters {
+    /// A tolerance of 20 ppm and an mz range from 0 up to `f64::MAX`
+    fn default() -> Self {
+        Self {
+            tolerance: Tolerance::new_ppm(20.0),
+            mz_range: MassOverCharge::new::(0.0)..=MassOverCharge::new::(f64::MAX),
+        }
+    }
+}
diff --git a/rustyms/src/model/possible_ions.rs b/rustyms/src/model/possible_ions.rs
new file mode 100644
index 00000000..13f338a9
--- /dev/null
+++ b/rustyms/src/model/possible_ions.rs
@@ -0,0 +1,225 @@
+use crate::{
+    fragment::PeptidePosition,
+    model::{get_all_sidechain_losses, ChargeRange, FragmentationModel},
+    AminoAcid, NeutralLoss, Peptidoform,
+};
+
+/// The possibilities for primary ions, a list of all allowed neutral losses, all charge options, and all allowed variant ions
+pub type PossiblePrimaryIons<'a> = (Vec>, ChargeRange, &'a [i8]);
+/// The possibilities for satellite ions, a list of all satellite ions with their amino acid and
+/// distance from the parent backbone cleavage, as well as all ion settings as for primary series.
+pub type PossibleSatelliteIons<'a> = (Vec<(AminoAcid, u8)>, PossiblePrimaryIons<'a>);
+
+/// A struct to handle all possible fragments that could be generated on a single location
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
+#[non_exhaustive]
+pub struct PossibleIons<'a> {
+    /// a series ions
+    pub a: Option>,
+    /// b series ions
+    pub b: Option>,
+    /// c series ions
+    pub c: Option>,
+    /// d series ions (side chain fragmentation from a)
+    pub d: PossibleSatelliteIons<'a>,
+    /// v series ions (full side chain broken off from y)
+    pub v: PossibleSatelliteIons<'a>,
+    /// w series ions (side chain fragmentation from z)
+    pub w: PossibleSatelliteIons<'a>,
+    /// x series ions
+    pub x: Option>,
+    /// y series ions
+    pub y: Option>,
+    /// z series ions
+    pub z: Option>,
+    /// immonium
+    pub immonium: Option<(ChargeRange, &'a [(Vec, Vec)])>,
+}
+
+impl PossibleIons<'_> {
+    /// Give an upper bound for the number of theoretical fragment for these possible ions.
+    /// Every series counts (neutral loss options + 1) times the number of variants, satellite
+    /// series are multiplied by the number of satellite locations (d and w count double).
+    pub fn size_upper_bound(&self) -> usize {
+        self.a
+            .as_ref()
+            .map(|o| (o.0.len() + 1) * o.2.len())
+            .unwrap_or_default()
+            + self
+                .b
+                .as_ref()
+                .map(|o| (o.0.len() + 1) * o.2.len())
+                .unwrap_or_default()
+            + self
+                .c
+                .as_ref()
+                .map(|o| (o.0.len() + 1) * o.2.len())
+                .unwrap_or_default()
+            + self.d.0.len() * 2 * (self.d.1 .0.len() + 1) * self.d.1 .2.len()
+            + self.v.0.len() * (self.v.1 .0.len() + 1) * self.v.1 .2.len()
+            + self.w.0.len() * 2 * (self.w.1 .0.len() + 1) * self.w.1 .2.len()
+            + self
+                .x
+                .as_ref()
+                .map(|o| (o.0.len() + 1) * o.2.len())
+                .unwrap_or_default()
+            + self
+                .y
+                .as_ref()
+                .map(|o| (o.0.len() + 1) * o.2.len())
+                .unwrap_or_default()
+            + self
+                .z
+                .as_ref()
+                .map(|o| (o.0.len() + 1) * o.2.len())
+                .unwrap_or_default()
+            + usize::from(self.immonium.is_some())
+    }
+}
+
+impl FragmentationModel {
+    /// Give all possible ions for the given position.
+    ///
+    /// The amino acid specific neutral losses and side chain losses are collected from the
+    /// residues on the N terminal side of the position for a, b, c, and d, and from the residues
+    /// on the C terminal side of the position for v, w, x, y, and z.
+    /// # Panics
+    /// If the position is a terminal position.
+    pub fn ions(
+        &self,
+        position: PeptidePosition,
+        peptidoform: &Peptidoform,
+    ) -> PossibleIons {
+        let crate::SequencePosition::Index(sequence_index) = position.sequence_index else {
+            panic!("Not allowed to call possible with a terminal PeptidePosition")
+        };
+
+        // Collect the general losses, the amino acid specific losses for every residue in the
+        // fragment, and all side chain loss combinations. `c_terminal` selects the residues from
+        // the position up to the end instead of the residues before the position.
+        let get_neutral_losses = |neutral_losses: &Vec,
+                                  amino_acid_specific: &Vec<(Vec, Vec)>,
+                                  amino_acid_side_chains: &(u8, Option>),
+                                  c_terminal| {
+            let peptide_slice = &peptidoform[if c_terminal {
+                sequence_index..peptidoform.len()
+            } else {
+                0..sequence_index
+            }];
+            neutral_losses
+                .iter()
+                .chain(
+                    peptide_slice
+                        .iter()
+                        .flat_map(|seq| {
+                            amino_acid_specific.iter().filter_map(|(rule, loss)| {
+                                rule.contains(&seq.aminoacid.aminoacid()).then_some(loss)
+                            })
+                        })
+                        .flatten(),
+                )
+                .map(|l| vec![l.clone()])
+                .chain(get_all_sidechain_losses(
+                    peptide_slice,
+                    amino_acid_side_chains,
+                ))
+                .collect()
+        };
+
+        let c_position = position.flip_terminal();
+
+        PossibleIons {
+            a: self.a.location.possible(position).then_some((
+                get_neutral_losses(
+                    &self.a.neutral_losses,
+                    &self.a.amino_acid_neutral_losses,
+                    &self.a.amino_acid_side_chain_losses,
+                    false,
+                ),
+                self.a.charge_range,
+                self.a.allowed_variants.as_slice(),
+            )),
+            b: self.b.location.possible(position).then_some((
+                get_neutral_losses(
+                    &self.b.neutral_losses,
+                    &self.b.amino_acid_neutral_losses,
+                    &self.b.amino_acid_side_chain_losses,
+                    false,
+                ),
+                self.b.charge_range,
+                self.b.allowed_variants.as_slice(),
+            )),
+            c: self.c.location.possible(position).then_some((
+                get_neutral_losses(
+                    &self.c.neutral_losses,
+                    &self.c.amino_acid_neutral_losses,
+                    &self.c.amino_acid_side_chain_losses,
+                    false,
+                ),
+                self.c.charge_range,
+                self.c.allowed_variants.as_slice(),
+            )),
+            d: (
+                self.d.location.possible(position, peptidoform, false),
+                (
+                    get_neutral_losses(
+                        &self.d.neutral_losses,
+                        &self.d.amino_acid_neutral_losses,
+                        &self.d.amino_acid_side_chain_losses,
+                        false,
+                    ),
+                    self.d.charge_range,
+                    self.d.allowed_variants.as_slice(),
+                ),
+            ),
+            // v, w, x, y, and z contain the C terminal part of the peptidoform, so their losses
+            // have to come from the C terminal residues
+            v: (
+                self.v.location.possible(c_position, peptidoform, true),
+                (
+                    get_neutral_losses(
+                        &self.v.neutral_losses,
+                        &self.v.amino_acid_neutral_losses,
+                        &self.v.amino_acid_side_chain_losses,
+                        true,
+                    ),
+                    self.v.charge_range,
+                    self.v.allowed_variants.as_slice(),
+                ),
+            ),
+            w: (
+                self.w.location.possible(c_position, peptidoform, true),
+                (
+                    get_neutral_losses(
+                        &self.w.neutral_losses,
+                        &self.w.amino_acid_neutral_losses,
+                        &self.w.amino_acid_side_chain_losses,
+                        true,
+                    ),
+                    self.w.charge_range,
+                    self.w.allowed_variants.as_slice(),
+                ),
+            ),
+            x: self.x.location.possible(c_position).then_some((
+                get_neutral_losses(
+                    &self.x.neutral_losses,
+                    &self.x.amino_acid_neutral_losses,
+                    &self.x.amino_acid_side_chain_losses,
+                    true,
+                ),
+                self.x.charge_range,
+                self.x.allowed_variants.as_slice(),
+            )),
+            y: self.y.location.possible(c_position).then_some((
+                get_neutral_losses(
+                    &self.y.neutral_losses,
+                    &self.y.amino_acid_neutral_losses,
+                    &self.y.amino_acid_side_chain_losses,
+                    true,
+                ),
+                self.y.charge_range,
+                self.y.allowed_variants.as_slice(),
+            )),
+            z: self.z.location.possible(c_position).then_some((
+                get_neutral_losses(
+                    &self.z.neutral_losses,
+                    &self.z.amino_acid_neutral_losses,
+                    &self.z.amino_acid_side_chain_losses,
+                    true,
+                ),
+                self.z.charge_range,
+                self.z.allowed_variants.as_slice(),
+            )),
+            immonium: self.immonium.as_ref().map(|(c, l)| (*c, l.as_slice())),
+        }
+    }
+}
diff --git a/rustyms/src/modification.rs b/rustyms/src/modification.rs index d0024055..a4ee4322 100644 --- a/rustyms/src/modification.rs +++ b/rustyms/src/modification.rs @@ -6,19 +6,22 @@ use serde::{Deserialize, Serialize}; use std::{ cmp::Ordering, - collections::{BTreeSet, HashSet}, + collections::{BTreeSet, HashMap, HashSet}, fmt::{Display, Write}, sync::Arc, }; use crate::{ + fragment::FragmentKind, glycan::{GlycanStructure, MonoSaccharide}, + helper_functions::merge_hashmap, + model::{GlycanModel, GlycanPeptideFragment},
molecular_charge::CachedCharge, peptidoform::Linked, placement_rule::{PlacementRule, Position}, system::OrderedMass, - AmbiguousLabel, AminoAcid, Chemical, DiagnosticIon, Fragment, Model, MolecularFormula, Multi, - NeutralLoss, Peptidoform, SequenceElement, SequencePosition, + AmbiguousLabel, AminoAcid, Chemical, DiagnosticIon, Fragment, FragmentationModel, + MolecularFormula, Multi, NeutralLoss, Peptidoform, SequenceElement, SequencePosition, }; include!("shared/modification.rs"); @@ -37,7 +40,7 @@ impl ModificationId { )), Ontology::Gnome => Some(format!( "http://glytoucan.org/Structures/Glycans/{}", - self.name + self.name.to_ascii_uppercase() )), Ontology::Resid => Some(format!( "https://proteininformationresource.org/cgi-bin/resid?id=AA{:04}", @@ -48,6 +51,21 @@ impl ModificationId { } } +impl Display for ModificationId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.ontology == Ontology::Gnome { + write!( + f, + "{}:{}", + self.ontology.char(), + self.name.to_ascii_uppercase() + ) + } else { + write!(f, "{}:{}", self.ontology.char(), self.name) + } + } +} + /// The result of checking if a modification can be placed somewhere. #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] pub enum RulePossible { @@ -104,30 +122,15 @@ impl Chemical for SimpleModificationInner { position: SequencePosition, peptidoform_index: usize, ) -> MolecularFormula { - match self { - Self::Mass(m) - | Self::Gno { - composition: GnoComposition::Weight(m), - .. - } => MolecularFormula::with_additional_mass(m.value), - Self::Gno { - composition: GnoComposition::Composition(monosaccharides), - .. - } - | Self::Glycan(monosaccharides) => monosaccharides - .iter() - .fold(MolecularFormula::default(), |acc, i| { - acc + i.0.formula_inner(position, peptidoform_index) * i.1 as i32 - }), - Self::GlycanStructure(glycan) - | Self::Gno { - composition: GnoComposition::Topology(glycan), - .. 
- } => glycan.formula_inner(position, peptidoform_index), - Self::Formula(formula) - | Self::Database { formula, .. } - | Self::Linker { formula, .. } => formula.clone(), - } + self.formula_inner( + position, + peptidoform_index, + GlycanPeptideFragment::FULL, + None, + ) + .to_vec() + .pop() + .unwrap() } } @@ -145,30 +148,82 @@ impl SimpleModificationInner { &self, sequence_index: SequencePosition, peptidoform_index: usize, - ) -> MolecularFormula { + glycan_fragmentation: GlycanPeptideFragment, + attachment: Option, + ) -> Multi { match self { Self::Mass(m) | Self::Gno { composition: GnoComposition::Weight(m), .. - } => MolecularFormula::with_additional_mass(m.value), + } => MolecularFormula::with_additional_mass(m.value).into(), Self::Gno { composition: GnoComposition::Composition(monosaccharides), .. } - | Self::Glycan(monosaccharides) => monosaccharides - .iter() - .fold(MolecularFormula::default(), |acc, i| { - acc + i.0.formula_inner(sequence_index, peptidoform_index) * i.1 as i32 - }), + | Self::Glycan(monosaccharides) => { + let mut options = Vec::new(); + + if let Some(range) = glycan_fragmentation.core() { + for option in MonoSaccharide::composition_options( + monosaccharides, + *range.start() as usize..=*range.end() as usize, + ) { + options.push( + option + .iter() + .fold(MolecularFormula::default(), |acc, i| { + acc + i.0.formula_inner(sequence_index, peptidoform_index) + * i.1 as i32 + }) + .with_label(AmbiguousLabel::GlycanFragmentComposition(option)), + ); + } + } + if glycan_fragmentation.full() { + options.push(monosaccharides.iter().fold( + MolecularFormula::default(), + |acc, i| { + acc + i.0.formula_inner(sequence_index, peptidoform_index) * i.1 as i32 + }, + )); + } + if options.is_empty() { + options.push(MolecularFormula::default()); + } + options.into() + } Self::GlycanStructure(glycan) | Self::Gno { composition: GnoComposition::Topology(glycan), .. 
- } => glycan.formula_inner(sequence_index, peptidoform_index), + } => { + let mut options = Vec::new(); + + if let Some(range) = glycan_fragmentation.core() { + for option in glycan.clone().determine_positions().core_options( + range, + peptidoform_index, + attachment.map(|a| (a, sequence_index)), + ) { + options.push( + option + .1 + .with_label(AmbiguousLabel::GlycanFragment(option.0)), + ); + } + } + if glycan_fragmentation.full() { + options.push(glycan.formula_inner(sequence_index, peptidoform_index)); + } + if options.is_empty() { + options.push(MolecularFormula::default()); + } + options.into() + } Self::Formula(formula) | Self::Database { formula, .. } - | Self::Linker { formula, .. } => formula.clone(), + | Self::Linker { formula, .. } => formula.into(), } } @@ -321,19 +376,8 @@ impl SimpleModificationInner { } if specification_compliant => { write!(f, "Formula:{formula}|INFO:Custom:{name}")?; } - Self::Database { - id: - ModificationId { - name, - ontology: Ontology::Custom, - .. - }, - .. - } if specification_compliant => { - write!(f, "C:{name}")?; - } Self::Database { id, .. } | Self::Gno { id, .. } | Self::Linker { id, .. } => { - write!(f, "{}:{}", id.ontology.char(), id.name)?; + write!(f, "{id}")?; } } Ok(()) @@ -499,19 +543,49 @@ impl Modification { allow_ms_cleavable: bool, sequence_index: SequencePosition, peptidoform_index: usize, - ) -> (Multi, HashSet) { + glycan_model: &GlycanModel, + attachment: Option, + ) -> ( + Multi, + HashMap>, + HashSet, + ) { match self { Self::Simple(modification) | Self::Ambiguous { modification, .. } => { match &**modification { // A linker that is not cross-linked is hydrolysed SimpleModificationInner::Linker { formula, .. 
} => ( (formula.clone() + molecular_formula!(H 2 O 1)).into(), + HashMap::new(), HashSet::new(), ), - s => ( - s.formula_inner(sequence_index, peptidoform_index).into(), - HashSet::new(), - ), + + s => { + let (default_rules, specific_rules) = + glycan_model.get_peptide_fragments(attachment); + + let f = s.formula_inner( + sequence_index, + peptidoform_index, + default_rules, + attachment, + ); + let specific = specific_rules + .into_iter() + .map(|(k, settings)| { + ( + k, + s.formula_inner( + sequence_index, + peptidoform_index, + settings, + attachment, + ), + ) + }) + .collect(); + (f, specific, HashSet::new()) + } } } Self::CrossLink { @@ -522,19 +596,29 @@ impl Modification { .. } => { if applied_cross_links.contains(name) { - (Multi::default(), HashSet::default()) + (Multi::default(), HashMap::new(), HashSet::default()) } else if visited_peptides.contains(other_peptide) { applied_cross_links.push(name.clone()); ( linker - .formula_inner(sequence_index, peptidoform_index) - .with_label(AmbiguousLabel::CrossLinkBound(name.clone())) - .into(), + .formula_inner( + sequence_index, + peptidoform_index, + glycan_model.default_peptide_fragment, + attachment, + ) + .with_label(&AmbiguousLabel::CrossLinkBound(name.clone())), + HashMap::new(), HashSet::from([name.clone()]), ) } else { applied_cross_links.push(name.clone()); - let link = linker.formula_inner(sequence_index, peptidoform_index); + let link = linker.formula_inner( + sequence_index, + peptidoform_index, + glycan_model.default_peptide_fragment, + attachment, + ); let (_, stubs, _) = side.allowed_rules(linker); if allow_ms_cleavable && !stubs.is_empty() { @@ -549,33 +633,39 @@ impl Modification { .unique() .collect(); let mut seen_peptides = HashSet::from([name.clone()]); + let mut specific = HashMap::new(); options.extend_from_slice(&{ - let (f, seen) = all_peptides[*other_peptide].formulas_inner( - *other_peptide, - all_peptides, - visited_peptides, - applied_cross_links, - false, - ); + let (f, f_specific, 
seen) = all_peptides[*other_peptide] + .formulas_inner( + *other_peptide, + all_peptides, + visited_peptides, + applied_cross_links, + false, + glycan_model, + ); seen_peptides.extend(seen); - (f + link) + specific = merge_hashmap(specific, f_specific); + (f * link) .with_label(&AmbiguousLabel::CrossLinkBound(name.clone())) .to_vec() }); - (options.into(), seen_peptides) + (options.into(), specific, seen_peptides) } else { - let (f, mut seen) = all_peptides[*other_peptide].formulas_inner( + let (f, specific, mut seen) = all_peptides[*other_peptide].formulas_inner( *other_peptide, all_peptides, visited_peptides, applied_cross_links, false, + glycan_model, ); seen.insert(name.clone()); ( - (f + link).with_label(&AmbiguousLabel::CrossLinkBound(name.clone())), + (f * link).with_label(&AmbiguousLabel::CrossLinkBound(name.clone())), + specific, seen, ) } @@ -636,12 +726,12 @@ impl Modification { /// Generate theoretical fragments for side chains (glycans) pub(crate) fn generate_theoretical_fragments( &self, - model: &Model, + model: &FragmentationModel, peptidoform_ion_index: usize, peptidoform_index: usize, charge_carriers: &mut CachedCharge, full_formula: &Multi, - attachment: Option<(AminoAcid, usize)>, + attachment: Option<(AminoAcid, SequencePosition)>, ) -> Vec { match self { Self::Simple(modification) | Self::Ambiguous { modification, .. 
} => modification @@ -662,12 +752,12 @@ impl SimpleModificationInner { /// Generate theoretical fragments for side chains (glycans) pub(crate) fn generate_theoretical_fragments( &self, - model: &Model, + model: &FragmentationModel, peptidoform_ion_index: usize, peptidoform_index: usize, charge_carriers: &mut CachedCharge, full_formula: &Multi, - attachment: Option<(AminoAcid, usize)>, + attachment: Option<(AminoAcid, SequencePosition)>, ) -> Vec { match self { Self::GlycanStructure(glycan) diff --git a/rustyms/src/neutral_loss.rs b/rustyms/src/neutral_loss.rs index b5d2ee2d..85b1608a 100644 --- a/rustyms/src/neutral_loss.rs +++ b/rustyms/src/neutral_loss.rs @@ -1,4 +1,8 @@ -use std::{fmt::Display, ops::Add, str::FromStr}; +use std::{ + fmt::Display, + ops::{Add, AddAssign}, + str::FromStr, +}; use serde::{Deserialize, Serialize}; @@ -17,7 +21,7 @@ impl NeutralLoss { /// Check if this neutral loss if empty (has an empty molecular formula) pub fn is_empty(&self) -> bool { match self { - Self::Loss(f) | Self::Gain(f) => f.is_empty(), + Self::Loss(f) | Self::Gain(f) | Self::SideChainLoss(f, _) => f.is_empty(), } } @@ -25,6 +29,7 @@ impl NeutralLoss { pub fn hill_notation_html(&self) -> String { match self { Self::Loss(c) => format!("-{}", c.hill_notation_html().trim_start_matches('+')), + Self::SideChainLoss(_, aa) => format!("-sidechain_{aa}"), Self::Gain(c) => format!("+{}", c.hill_notation_html().trim_start_matches('+')), } } @@ -33,6 +38,7 @@ impl NeutralLoss { pub fn hill_notation_fancy(&self) -> String { match self { Self::Loss(c) => format!("-{}", c.hill_notation_fancy().trim_start_matches('+')), + Self::SideChainLoss(_, aa) => format!("-sidechain_{aa}"), Self::Gain(c) => format!("+{}", c.hill_notation_fancy().trim_start_matches('+')), } } @@ -41,6 +47,7 @@ impl NeutralLoss { pub fn hill_notation(&self) -> String { match self { Self::Loss(c) => format!("-{}", c.hill_notation().trim_start_matches('+')), + Self::SideChainLoss(_, aa) => format!("-sidechain_{aa}"), 
Self::Gain(c) => format!("+{}", c.hill_notation().trim_start_matches('+')), } } @@ -98,6 +105,7 @@ impl Display for NeutralLoss { "{}", match self { Self::Loss(c) => format!("-{c}"), + Self::SideChainLoss(_, aa) => format!("-sidechain_{aa}"), Self::Gain(c) => format!("+{c}"), } ) @@ -109,20 +117,55 @@ impl std::ops::Add<&NeutralLoss> for &MolecularFormula { fn add(self, rhs: &NeutralLoss) -> Self::Output { match rhs { NeutralLoss::Gain(mol) => self + mol, - NeutralLoss::Loss(mol) => self - mol, + NeutralLoss::Loss(mol) | NeutralLoss::SideChainLoss(mol, _) => self - mol, } } } +impl std::ops::AddAssign<&NeutralLoss> for MolecularFormula { + fn add_assign(&mut self, rhs: &NeutralLoss) { + match rhs { + NeutralLoss::Gain(mol) => *self += mol, + NeutralLoss::Loss(mol) | NeutralLoss::SideChainLoss(mol, _) => *self -= mol, + } + } +} + +impl AddAssign for MolecularFormula { + fn add_assign(&mut self, rhs: NeutralLoss) { + *self += &rhs; + } +} + impl std::ops::Add<&NeutralLoss> for &Multi { type Output = Multi; fn add(self, rhs: &NeutralLoss) -> Self::Output { match rhs { NeutralLoss::Gain(mol) => self + mol, - NeutralLoss::Loss(mol) => self - mol, + NeutralLoss::Loss(mol) | NeutralLoss::SideChainLoss(mol, _) => self - mol, } } } impl_binop_ref_cases!(impl Add, add for MolecularFormula, NeutralLoss, MolecularFormula); impl_binop_ref_cases!(impl Add, add for Multi, NeutralLoss, Multi); + +impl<'a> std::iter::Sum<&'a NeutralLoss> for MolecularFormula { + fn sum>(iter: I) -> Self { + let mut output = Self::default(); + for value in iter { + output += value; + } + output + } +} + +impl std::iter::Sum for MolecularFormula { + fn sum>(iter: I) -> Self { + let mut output = Self::default(); + for value in iter { + output += value; + } + output + } +} diff --git a/rustyms/src/peptidoform/compound_peptidoform_ion.rs b/rustyms/src/peptidoform/compound_peptidoform_ion.rs index 87bd364c..b48780be 100644 --- a/rustyms/src/peptidoform/compound_peptidoform_ion.rs +++ 
b/rustyms/src/peptidoform/compound_peptidoform_ion.rs @@ -4,8 +4,8 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; use crate::{ - peptidoform::Linked, system::usize::Charge, Fragment, Model, MolecularFormula, Multi, - Peptidoform, PeptidoformIon, + peptidoform::Linked, system::usize::Charge, Fragment, FragmentationModel, MolecularFormula, + Multi, Peptidoform, PeptidoformIon, }; /// A single full ProForma entry. This entry can contain multiple sets of cross-linked peptides. @@ -63,7 +63,7 @@ impl CompoundPeptidoformIon { pub fn generate_theoretical_fragments( &self, max_charge: Charge, - model: &Model, + model: &FragmentationModel, ) -> Vec { let mut base = Vec::new(); for (index, peptidoform) in self.peptidoform_ions().iter().enumerate() { diff --git a/rustyms/src/peptidoform/find_modifications.rs b/rustyms/src/peptidoform/find_modifications.rs index 991172cc..2cce20ed 100644 --- a/rustyms/src/peptidoform/find_modifications.rs +++ b/rustyms/src/peptidoform/find_modifications.rs @@ -431,9 +431,8 @@ impl PeptideModificationSearch { .iter() .flat_map(|o| o.lookup(custom_database)) .filter(|modification| { - aminoacid.map_or(true, |aa| { - modification.2.is_possible_aa(aa, position).any_possible() - }) + aminoacid + .is_none_or(|aa| modification.2.is_possible_aa(aa, position).any_possible()) }) .filter(|modification| check_matches(in_place, &modification.2)) .map(|(_, _, modification)| modification) @@ -442,9 +441,8 @@ impl PeptideModificationSearch { modifications .iter() .filter(|modification| { - aminoacid.map_or(true, |aa| { - modification.is_possible_aa(aa, position).any_possible() - }) + aminoacid + .is_none_or(|aa| modification.is_possible_aa(aa, position).any_possible()) }) .filter(|modification| check_matches(in_place, modification)) .collect() diff --git a/rustyms/src/peptidoform/peptidoform.rs b/rustyms/src/peptidoform/peptidoform.rs index baf9acff..c23f0460 100644 --- a/rustyms/src/peptidoform/peptidoform.rs +++ 
b/rustyms/src/peptidoform/peptidoform.rs @@ -2,25 +2,26 @@ use crate::{ checked_aminoacid::CheckedAminoAcid, - fragment::{DiagnosticPosition, Fragment, FragmentType, PeptidePosition}, + fragment::{DiagnosticPosition, Fragment, FragmentKind, FragmentType, PeptidePosition}, glycan::MonoSaccharide, - helper_functions::{peptide_range_contains, RangeExtension}, + helper_functions::{self, merge_hashmap, peptide_range_contains, RangeExtension}, + model::{get_all_sidechain_losses, GlycanModel}, modification::{ - CrossLinkName, GnoComposition, LinkerSpecificity, Modification, SimpleModification, + self, CrossLinkName, GnoComposition, LinkerSpecificity, Modification, SimpleModification, SimpleModificationInner, }, molecular_charge::{CachedCharge, MolecularCharge}, peptidoform::*, placement_rule::PlacementRule, system::usize::Charge, - AmbiguousLabel, DiagnosticIon, Element, Model, MolecularFormula, Multi, MultiChemical, + AmbiguousLabel, DiagnosticIon, Element, FragmentationModel, MolecularFormula, Multi, NeutralLoss, Protease, SequenceElement, SequencePosition, }; use itertools::Itertools; use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; use std::{ - collections::HashSet, + collections::{HashMap, HashSet}, fmt::{Display, Write}, marker::PhantomData, num::NonZeroU16, @@ -410,7 +411,7 @@ impl Peptidoform { self.charge_carriers = charge_carriers; } - /// The mass of the N terminal modifications. The global isotope modifications are NOT applied. + /// The mass of the N terminal placed modifications. The global isotope modifications are NOT applied. fn get_n_term_mass( &self, all_peptides: &[Peptidoform], @@ -418,23 +419,41 @@ impl Peptidoform { applied_cross_links: &mut Vec, allow_ms_cleavable: bool, peptidoform_index: usize, - ) -> Multi { - self.n_term.iter().fold(Multi::default(), |acc, f| { - if let Modification::Ambiguous { .. 
} = f { - acc - } else { - acc * f - .formula_inner( - all_peptides, - visited_peptides, - applied_cross_links, - allow_ms_cleavable, - SequencePosition::NTerm, - peptidoform_index, - ) - .0 - } - }) + molecular_formula!(H 1) + glycan_model: &GlycanModel, + ) -> ( + Multi, + HashMap>, + ) { + let (base, mut specific) = + self.n_term + .iter() + .fold((Multi::default(), HashMap::new()), |acc, f| { + if let Modification::Ambiguous { .. } = f { + acc + } else { + let attachment = all_peptides[peptidoform_index] + .sequence + .first() + .map(|s| s.aminoacid.aminoacid()); + let (formula, specific, _seen) = f.formula_inner( + all_peptides, + visited_peptides, + applied_cross_links, + allow_ms_cleavable, + SequencePosition::NTerm, + peptidoform_index, + glycan_model, + attachment, + ); + ( + acc.0 * formula, + crate::helper_functions::merge_hashmap(acc.1, specific), + ) + } + }); + let terminus = molecular_formula!(H 1); + specific.values_mut().for_each(|v| *v += terminus.clone()); + (base + terminus, specific) } /// The mass of the C terminal modifications. The global isotope modifications are NOT applied. @@ -445,23 +464,41 @@ impl Peptidoform { applied_cross_links: &mut Vec, allow_ms_cleavable: bool, peptidoform_index: usize, - ) -> Multi { - self.c_term.iter().fold(Multi::default(), |acc, f| { - if let Modification::Ambiguous { .. } = f { - acc - } else { - acc * f - .formula_inner( - all_peptides, - visited_peptides, - applied_cross_links, - allow_ms_cleavable, - SequencePosition::CTerm, - peptidoform_index, - ) - .0 - } - }) + molecular_formula!(H 1 O 1) + glycan_model: &GlycanModel, + ) -> ( + Multi, + HashMap>, + ) { + let (base, mut specific) = + self.c_term + .iter() + .fold((Multi::default(), HashMap::new()), |acc, f| { + if let Modification::Ambiguous { .. 
} = f { + acc + } else { + let attachment = all_peptides[peptidoform_index] + .sequence + .first() + .map(|s| s.aminoacid.aminoacid()); + let (formula, specific, _seen) = f.formula_inner( + all_peptides, + visited_peptides, + applied_cross_links, + allow_ms_cleavable, + SequencePosition::NTerm, + peptidoform_index, + glycan_model, + attachment, + ); + ( + acc.0 * formula, + crate::helper_functions::merge_hashmap(acc.1, specific), + ) + } + }); + let terminus = molecular_formula!(H 1 O 1); + specific.values_mut().for_each(|v| *v += terminus.clone()); + (base + terminus, specific) } /// Find all neutral losses in the given stretch of peptide (loss, peptide index, sequence index) @@ -477,53 +514,53 @@ impl Peptidoform { let own_losses = self .iter(range) .flat_map(|(pos, aa)| { - aa.modifications - .iter() - .filter_map(|modification| match modification { - Modification::Simple(modification) - | Modification::Ambiguous { modification, .. } => match &**modification { - SimpleModificationInner::Database { specificities, .. } => Some( - specificities - .iter() - .filter_map(move |(rules, rule_losses, _)| { - if PlacementRule::any_possible( - rules, - aa, - pos.sequence_index, - ) { - Some(rule_losses) - } else { - None - } - }) - .flatten() - .map(move |loss| { - (loss.clone(), peptidoform_index, pos.sequence_index) - }) - .collect_vec(), - ), - _ => None, // TODO: potentially hydrolysed cross-linkers could also have neutral losses - }, - Modification::CrossLink { - linker, - peptide, - side, - .. 
- } => { - if !ignore_peptides.contains(peptide) { - found_peptides.push(*peptide); - }; - let (neutral, _, _) = side.allowed_rules(linker); - Some( - neutral - .into_iter() - .map(|n| (n, peptidoform_index, pos.sequence_index)) - .collect_vec(), - ) - } - }) - .flatten() - .collect_vec() + match pos.sequence_index { + SequencePosition::NTerm => self.n_term.as_slice(), + SequencePosition::Index(_) => aa.modifications.as_slice(), + SequencePosition::CTerm => self.c_term.as_slice(), + } + .iter() + .filter_map(|modification| match modification { + Modification::Simple(modification) + | Modification::Ambiguous { modification, .. } => match &**modification { + SimpleModificationInner::Database { specificities, .. } => Some( + specificities + .iter() + .filter_map(move |(rules, rule_losses, _)| { + if PlacementRule::any_possible(rules, aa, pos.sequence_index) { + Some(rule_losses) + } else { + None + } + }) + .flatten() + .map(move |loss| { + (loss.clone(), peptidoform_index, pos.sequence_index) + }) + .collect_vec(), + ), + _ => None, // TODO: potentially hydrolysed cross-linkers could also have neutral losses + }, + Modification::CrossLink { + linker, + peptide, + side, + .. + } => { + if !ignore_peptides.contains(peptide) { + found_peptides.push(*peptide); + }; + let (neutral, _, _) = side.allowed_rules(linker); + Some( + neutral + .into_iter() + .map(|n| (n, peptidoform_index, pos.sequence_index)) + .collect_vec(), + ) + } + }) + .flatten() + .collect_vec() }) .collect_vec(); own_losses @@ -538,7 +575,7 @@ impl Peptidoform { fn diagnostic_ions(&self) -> Vec<(DiagnosticIon, DiagnosticPosition)> { self.iter(..) 
.flat_map(|(pos, aa)| { - aa.diagnostic_ions(pos.sequence_index) + aa.diagnostic_ions(pos.sequence_index, &self.n_term, &self.c_term) .into_iter() .map(move |diagnostic| { ( @@ -620,34 +657,44 @@ impl Peptidoform { &self, range: impl RangeBounds, aa_range: impl RangeBounds + Clone, - base: &Multi, + base: &( + Multi, + HashMap>, + ), all_peptides: &[Peptidoform], visited_peptides: &[usize], applied_cross_links: &mut Vec, allow_ms_cleavable: bool, peptidoform_index: usize, - ) -> (Multi, HashSet) { + glycan_model: &GlycanModel, + ) -> ( + Multi, + HashMap>, + HashSet, + ) { // Calculate all formulas for the selected AA range without any ambiguous modifications - let (formulas, seen) = self.sequence[( + let (formulas, specific, seen) = self.sequence[( aa_range.start_bound().cloned(), aa_range.end_bound().cloned(), )] .iter() .enumerate() .fold( - (base.clone(), HashSet::new()), + (base.0.clone(), base.1.clone(), HashSet::new()), |previous_aa_formulas, (index, aa)| { - let (f, s) = aa.formulas_base( + let (f, specific, s) = aa.formulas_base( all_peptides, visited_peptides, applied_cross_links, allow_ms_cleavable, SequencePosition::Index(index), peptidoform_index, + glycan_model, ); ( previous_aa_formulas.0 * f, - previous_aa_formulas.1.union(&s).cloned().collect(), + crate::helper_functions::merge_hashmap(previous_aa_formulas.1, specific), + previous_aa_formulas.2.union(&s).cloned().collect(), ) }, ); @@ -698,9 +745,9 @@ impl Peptidoform { }); // Determine the formula for all selected ambiguous modifications and create the labels - let all_ambiguous_options = previous_combinations + let (all_ambiguous_options, all_ambiguous_specific) = previous_combinations .into_iter() - .map(|current_selected_ambiguous| { + .flat_map(|current_selected_ambiguous| { current_selected_ambiguous .iter() .copied() @@ -720,24 +767,65 @@ impl Peptidoform { .. 
} = m { + let aa = self[*pos].aminoacid.aminoacid(); + let (default, specific) = + glycan_model.get_peptide_fragments(Some(aa)); (*mid == id).then(|| { - modification - .formula_inner(*pos, peptidoform_index) - .with_label(AmbiguousLabel::Modification { - id, - sequence_index: *pos, - peptidoform_index, - }) + ( + modification + .formula_inner( + *pos, + peptidoform_index, + default, + Some(aa), + ) + .with_label(&AmbiguousLabel::Modification { + id, + sequence_index: *pos, + peptidoform_index, + }), + specific + .into_iter() + .map(|(f, setting)| { + ( + f, + modification + .formula_inner( + *pos, + peptidoform_index, + setting, + Some(aa), + ) + .with_label( + &AmbiguousLabel::Modification { + id, + sequence_index: *pos, + peptidoform_index, + }, + ), + ) + }) + .collect(), + ) }) } else { None } }) }) - .sum::() + .collect_vec() }) - .collect::>(); - (formulas * all_ambiguous_options, seen) + .fold((Multi::default(), HashMap::new()), |acc, v| { + ( + acc.0 * v.0, + crate::helper_functions::merge_hashmap(acc.1, v.1), + ) + }); + ( + formulas * all_ambiguous_options, + crate::helper_functions::merge_hashmap(specific, all_ambiguous_specific), + seen, + ) } /// Generate the theoretical fragments for this peptide, with the given maximal charge of the fragments, and the given model. 
@@ -747,7 +835,7 @@ impl Peptidoform { pub(crate) fn generate_theoretical_fragments_inner( &self, max_charge: Charge, - model: &Model, + model: &FragmentationModel, peptidoform_ion_index: usize, peptidoform_index: usize, all_peptides: &[Peptidoform], @@ -767,7 +855,7 @@ impl Peptidoform { let position = PeptidePosition::n(SequencePosition::Index(sequence_index), self.len()); let mut cross_links = Vec::new(); let visited_peptides = vec![peptidoform_index]; - let (n_term, n_term_seen) = self.all_masses( + let (n_term, n_term_specific, n_term_seen) = self.all_masses( ..=sequence_index, ..sequence_index, &self.get_n_term_mass( @@ -776,6 +864,7 @@ impl Peptidoform { &mut cross_links, model.allow_cross_link_cleavage, peptidoform_index, + &model.glycan, ), model.modification_specific_neutral_losses, all_peptides, @@ -783,8 +872,9 @@ impl Peptidoform { &mut cross_links, model.allow_cross_link_cleavage, peptidoform_index, + &model.glycan, ); - let (c_term, c_term_seen) = self.all_masses( + let (c_term, c_term_specific, c_term_seen) = self.all_masses( sequence_index.., sequence_index + 1.., &self.get_c_term_mass( @@ -793,6 +883,7 @@ impl Peptidoform { &mut cross_links, model.allow_cross_link_cleavage, peptidoform_index, + &model.glycan, ), model.modification_specific_neutral_losses, all_peptides, @@ -800,37 +891,45 @@ impl Peptidoform { &mut cross_links, model.allow_cross_link_cleavage, peptidoform_index, + &model.glycan, ); if !n_term_seen.is_disjoint(&c_term_seen) { continue; // There is a link reachable from both sides so there is a loop } - let (modifications_total, modifications_cross_links) = self.sequence[sequence_index] - .modifications - .iter() - .fold((Multi::default(), HashSet::new()), |acc, m| { - let (f, s) = m.formula_inner( - all_peptides, - &[peptidoform_index], - &mut cross_links, - model.allow_cross_link_cleavage, - SequencePosition::Index(sequence_index), - peptidoform_index, - ); - (acc.0 * f, acc.1.union(&s).cloned().collect()) - }); + let 
(modifications_total, modifications_specific, modifications_cross_links) = + self.sequence[sequence_index].modifications.iter().fold( + (Multi::default(), HashMap::new(), HashSet::new()), + |acc, m| { + let (f, specific, s) = m.formula_inner( + all_peptides, + &[peptidoform_index], + &mut cross_links, + model.allow_cross_link_cleavage, + SequencePosition::Index(sequence_index), + peptidoform_index, + &model.glycan, + Some(self.sequence[sequence_index].aminoacid.aminoacid()), + ); + ( + acc.0 * f, + merge_hashmap(acc.1, specific), + acc.2.union(&s).cloned().collect(), + ) + }, + ); output.append( &mut self.sequence[sequence_index] .aminoacid .aminoacid() .fragments( - &n_term, - &c_term, - &modifications_total, + &(n_term, n_term_specific), + &(c_term, c_term_specific), + &(modifications_total, modifications_specific), &mut charge_carriers, SequencePosition::Index(sequence_index), self.sequence.len(), - &model.ions(position), + &model.ions(position, self), peptidoform_ion_index, peptidoform_index, ( @@ -840,47 +939,6 @@ impl Peptidoform { ), ), ); - - if model.m { - // p - sX fragment: precursor amino acid side chain losses - output.extend( - self.formulas_inner( - peptidoform_index, - all_peptides, - &[], - &mut Vec::new(), - model.allow_cross_link_cleavage, - ) - .0 - .iter() - .flat_map(|m| { - self.sequence[sequence_index] - .aminoacid - .formulas_inner( - SequencePosition::Index(sequence_index), - peptidoform_index, - ) - .iter() - .flat_map(|aa| { - Fragment::generate_all( - &((-modifications_total.clone()) + m.clone() - aa.clone() - + molecular_formula!(C 2 H 2 N 1 O 1)), - peptidoform_ion_index, - peptidoform_index, - &FragmentType::PrecursorSideChainLoss( - position, - self.sequence[sequence_index].aminoacid.aminoacid(), - ), - &Multi::default(), - &[], - &mut charge_carriers, - model.precursor.1, - ) - }) - .collect_vec() - }), - ); - } } for fragment in &mut output { fragment.formula = fragment.formula.as_ref().map(|f| { @@ -890,23 +948,46 @@ impl 
Peptidoform { } // Generate precursor peak - let (full_precursor, _all_cross_links) = self.formulas_inner( + let (full_precursor, _precursor_specific, _all_cross_links) = self.formulas_inner( peptidoform_index, all_peptides, &[], &mut Vec::new(), model.allow_cross_link_cleavage, + &model.glycan, ); // Allow neutral losses from modifications for the precursor let mut precursor_neutral_losses = if model.modification_specific_neutral_losses { self.potential_neutral_losses(.., all_peptides, peptidoform_index, &mut Vec::new()) .into_iter() - .map(|(n, _, _)| n) + .map(|(n, _, _)| vec![n]) .collect_vec() } else { Vec::new() }; - precursor_neutral_losses.extend_from_slice(&model.precursor.0); + // Add amino acid specific neutral losses + precursor_neutral_losses.extend( + model + .precursor + .1 + .iter() + .filter_map(|(rule, losses)| { + rule.iter() + .any(|aa| { + self.sequence + .iter() + .any(|seq| seq.aminoacid.aminoacid() == *aa) + }) + .then_some(losses) + }) + .flatten() + .map(|l| vec![l.clone()]), + ); + // Add amino acid side chain losses + precursor_neutral_losses + .extend(get_all_sidechain_losses(&self.sequence, &model.precursor.2)); + // Add all normal neutral losses + precursor_neutral_losses.extend(model.precursor.0.iter().map(|l| vec![l.clone()])); output.extend(Fragment::generate_all( &full_precursor, @@ -916,12 +997,11 @@ impl Peptidoform { &Multi::default(), &precursor_neutral_losses, &mut charge_carriers, - model.precursor.1, + model.precursor.3, )); // Add glycan fragmentation to all peptide fragments - // Assuming that only one glycan can ever fragment at the same time, - // and that no peptide fragmentation occurs during glycan fragmentation + // Assuming that only one glycan can ever fragment at the same time. 
let full_formula = self .formulas_inner( peptidoform_index, @@ -929,10 +1009,14 @@ impl Peptidoform { &[], &mut Vec::new(), model.allow_cross_link_cleavage, + &model.glycan, ) .0; for (sequence_index, position) in self.sequence.iter().enumerate() { - let attachment = (position.aminoacid.aminoacid(), sequence_index); + let attachment = ( + position.aminoacid.aminoacid(), + SequencePosition::Index(sequence_index), + ); for modification in &position.modifications { output.extend(modification.generate_theoretical_fragments( model, @@ -945,7 +1029,7 @@ impl Peptidoform { } } - if model.modification_specific_diagnostic_ions.0 { + if let Some(charge) = model.modification_specific_diagnostic_ions { // Add all modification diagnostic ions for (dia, pos) in self.diagnostic_ions() { output.extend( @@ -960,10 +1044,7 @@ impl Peptidoform { confidence: None, auxiliary: false, } - .with_charge_range( - &mut charge_carriers, - model.modification_specific_diagnostic_ions.1, - ), + .with_charge_range(&mut charge_carriers, charge), ); } } @@ -1016,15 +1097,23 @@ impl Peptidoform { &self, range: impl RangeBounds + Clone, aa_range: impl RangeBounds + Clone, - base: &Multi, + base: &( + Multi, + HashMap>, + ), apply_neutral_losses: bool, all_peptides: &[Peptidoform], visited_peptides: &[usize], applied_cross_links: &mut Vec, allow_ms_cleavable: bool, peptidoform_index: usize, - ) -> (Multi, HashSet) { - let (ambiguous_mods_masses, seen) = self.ambiguous_patterns( + glycan_model: &GlycanModel, + ) -> ( + Multi, + HashMap>, + HashSet, + ) { + let (ambiguous_mods_masses, mut specific, seen) = self.ambiguous_patterns( range.clone(), aa_range, base, @@ -1033,6 +1122,7 @@ impl Peptidoform { applied_cross_links, allow_ms_cleavable, peptidoform_index, + glycan_model, ); if apply_neutral_losses { let neutral_losses = self.potential_neutral_losses( @@ -1046,10 +1136,13 @@ impl Peptidoform { all_masses.extend(ambiguous_mods_masses.iter().cloned()); for loss in &neutral_losses { 
all_masses.extend((ambiguous_mods_masses.clone() + loss.0.clone()).to_vec()); + for option in specific.values_mut() { + *option = option.clone().with_neutral_loss(&loss.0); + } } - (all_masses.into(), seen) + (all_masses.into(), specific, seen) } else { - (ambiguous_mods_masses, seen) + (ambiguous_mods_masses, specific, seen) } } @@ -1059,6 +1152,7 @@ impl Peptidoform { } /// Gives all the formulas for the whole peptide with no C and N terminal modifications. With the global isotope modifications applied. + /// Ignores any potential glycan fragmentation. #[expect(clippy::missing_panics_doc)] // Global isotope mods are guaranteed to be correct fn bare_formulas_inner( &self, @@ -1080,6 +1174,7 @@ impl Peptidoform { allow_ms_cleavable, SequencePosition::Index(index), peptidoform_index, + &GlycanModel::DISALLOW, ) .0; } @@ -1103,30 +1198,40 @@ impl Peptidoform { visited_peptides: &[usize], applied_cross_links: &mut Vec, allow_ms_cleavable: bool, - ) -> (Multi, HashSet) { + glycan_model: &GlycanModel, + ) -> ( + Multi, + HashMap>, + HashSet, + ) { debug_assert!( !visited_peptides.contains(&peptidoform_index), "Cannot get the formula for a peptide that is already visited" ); let mut new_visited_peptides = vec![peptidoform_index]; new_visited_peptides.extend_from_slice(visited_peptides); - let mut formulas: Multi = self.get_n_term_mass( + let (n, n_specific) = self.get_n_term_mass( all_peptides, visited_peptides, applied_cross_links, allow_ms_cleavable, peptidoform_index, - ) * self.get_c_term_mass( + glycan_model, + ); + let (c, c_specific) = self.get_c_term_mass( all_peptides, visited_peptides, applied_cross_links, allow_ms_cleavable, peptidoform_index, + glycan_model, ); + let mut formulas: Multi = n * c; + let mut formulas_specific = helper_functions::merge_hashmap(n_specific, c_specific); let mut placed = vec![false; self.modifications_of_unknown_position.len()]; let mut seen = HashSet::new(); for (index, pos) in self.sequence.iter().enumerate() { - let (pos_f, 
pos_seen) = pos.formulas_greedy( + let (pos_f, specific, pos_seen) = pos.formulas_greedy( &mut placed, all_peptides, &new_visited_peptides, @@ -1134,15 +1239,23 @@ impl Peptidoform { allow_ms_cleavable, SequencePosition::Index(index), peptidoform_index, + glycan_model, ); formulas *= pos_f; + formulas_specific = merge_hashmap(formulas_specific, specific); seen.extend(pos_seen); } - (formulas - .iter() - .map(|f| f.with_global_isotope_modifications(&self.global).expect("Global isotope modification invalid in determination of all formulas for a peptide")) - .collect(), seen) + let formulas = formulas + .iter() + .map(|f| f.with_global_isotope_modifications(&self.global).expect("Global isotope modification invalid in determination of all formulas for a peptide")) + .collect(); + let formulas_specific = formulas_specific + .into_iter() + .map(|(k, f)| (k, f.iter().map(|f| f.with_global_isotope_modifications(&self.global).expect("Global isotope modification invalid in determination of all formulas for a peptide")).collect())) + .collect(); + + (formulas, formulas_specific, seen) } /// Display this peptide. @@ -1440,17 +1553,21 @@ impl> Peptidoform { pub fn generate_theoretical_fragments( &self, max_charge: Charge, - model: &Model, + model: &FragmentationModel, ) -> Vec { self.generate_theoretical_fragments_inner(max_charge, model, 0, 0, &[]) } /// Gives the formulas for the whole peptide. With the global isotope modifications applied. (Any B/Z will result in multiple possible formulas.) + /// Ignores any potential glycan fragmentation, assumes the glycan is always fully present. 
#[expect(clippy::missing_panics_doc)] // Can not panic (unless state is already corrupted) pub fn formulas(&self) -> Multi { - let mut formulas: Multi = - self.get_n_term_mass(&[], &[], &mut Vec::new(), false, 0) - * self.get_c_term_mass(&[], &[], &mut Vec::new(), false, 0); + let mut formulas: Multi = self + .get_n_term_mass(&[], &[], &mut Vec::new(), false, 0, &GlycanModel::DISALLOW) + .0 + * self + .get_c_term_mass(&[], &[], &mut Vec::new(), false, 0, &GlycanModel::DISALLOW) + .0; let mut placed = vec![false; self.modifications_of_unknown_position.len()]; for (index, pos) in self.sequence.iter().enumerate() { formulas *= pos @@ -1462,6 +1579,7 @@ impl> Peptidoform { false, SequencePosition::Index(index), 0, + &GlycanModel::DISALLOW, ) .0; } @@ -1480,10 +1598,11 @@ impl> Peptidoform { impl Peptidoform { /// Gives the formula for the whole peptide. With the global isotope modifications applied. + /// Ignores any potential glycan fragmentation, assumes the glycan is always fully present. 
#[expect(clippy::missing_panics_doc)] // Can not panic (unless state is already corrupted) pub fn formula(&self) -> MolecularFormula { let mut options = self - .formulas_inner(0, &[], &[], &mut Vec::new(), false) + .formulas_inner(0, &[], &[], &mut Vec::new(), false, &GlycanModel::DISALLOW) .0 .to_vec(); assert_eq!(options.len(), 1); diff --git a/rustyms/src/peptidoform/peptidoform_ion.rs b/rustyms/src/peptidoform/peptidoform_ion.rs index 578181a6..0877ccf8 100644 --- a/rustyms/src/peptidoform/peptidoform_ion.rs +++ b/rustyms/src/peptidoform/peptidoform_ion.rs @@ -4,12 +4,14 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; use crate::{ + model::GlycanModel, modification::{ CrossLinkName, CrossLinkSide, RulePossible, SimpleModification, SimpleModificationInner, }, peptidoform::Linked, system::usize::Charge, - Fragment, Model, MolecularCharge, MolecularFormula, Multi, Peptidoform, SequencePosition, + Fragment, FragmentationModel, MolecularCharge, MolecularFormula, Multi, Peptidoform, + SequencePosition, }; /// A single peptidoform ion, can contain multiple peptidoforms #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Serialize, Deserialize, Hash)] @@ -39,12 +41,23 @@ impl PeptidoformIon { } /// Gives all possible formulas for this peptidoform (including breakage of cross-links that can break). + /// Includes the full glycan, if there are any glycans. /// Assumes all peptides in this peptidoform are connected. /// If there are no peptides in this peptidoform it returns [`Multi::default`]. 
pub fn formulas(&self) -> Multi { self.0 .first() - .map(|p| p.formulas_inner(0, &self.0, &[], &mut Vec::new(), true).0) + .map(|p| { + p.formulas_inner( + 0, + &self.0, + &[], + &mut Vec::new(), + true, + &GlycanModel::DISALLOW, + ) + .0 + }) .unwrap_or_default() } @@ -52,7 +65,7 @@ impl PeptidoformIon { pub fn generate_theoretical_fragments( &self, max_charge: Charge, - model: &Model, + model: &FragmentationModel, ) -> Vec { self.generate_theoretical_fragments_inner(max_charge, model, 0) } @@ -61,7 +74,7 @@ impl PeptidoformIon { pub(super) fn generate_theoretical_fragments_inner( &self, max_charge: Charge, - model: &Model, + model: &FragmentationModel, peptidoform_ion_index: usize, ) -> Vec { let mut base = Vec::new(); diff --git a/rustyms/src/peptidoform/tests/parse.rs b/rustyms/src/peptidoform/tests/parse.rs index df1d7676..8c871fd8 100644 --- a/rustyms/src/peptidoform/tests/parse.rs +++ b/rustyms/src/peptidoform/tests/parse.rs @@ -9,8 +9,8 @@ use crate::{ }, placement_rule::{self, PlacementRule, Position}, system::{da, usize::Charge}, - AminoAcid, CompoundPeptidoformIon, Element, Model, MolecularCharge, MultiChemical, Peptidoform, - PeptidoformIon, + AminoAcid, CompoundPeptidoformIon, Element, FragmentationModel, MolecularCharge, MultiChemical, + Peptidoform, PeptidoformIon, }; #[test] @@ -419,9 +419,9 @@ fn parse_xl_inter() { //dbg!(&singular.sequence[0].modifications); assert_eq!( peptide.formulas().to_vec()[0], - (AminoAcid::Alanine.single_formula().unwrap() * 2 + (AminoAcid::Alanine.single_formula().unwrap() * 2_i32 + molecular_formula!(C 8 H 10 O 2) - + molecular_formula!(H 2 O 1) * 2) + + molecular_formula!(H 2 O 1) * 2_i32) .with_label(crate::AmbiguousLabel::CrossLinkBound( crate::CrossLinkName::Name("test".to_string()) )) @@ -431,7 +431,9 @@ fn parse_xl_inter() { #[test] fn dimeric_peptide() { // Only generate a single series, easier to reason about - let test_model = Model::none().a(PrimaryIonSeries::default()); + let test_model = 
FragmentationModel::none() + .clone() + .a(PrimaryIonSeries::default()); // With two different sequences let dimeric = CompoundPeptidoformIon::pro_forma("AA+CC", None).unwrap(); diff --git a/rustyms/src/rand.rs b/rustyms/src/rand.rs index 79d68d6e..bae36ddd 100644 --- a/rustyms/src/rand.rs +++ b/rustyms/src/rand.rs @@ -74,50 +74,42 @@ impl Distribution for StandardUniform { impl Distribution for StandardUniform { fn sample(&self, rng: &mut R) -> GlycanSubstituent { - match rng.random_range(1..=44) { + match rng.random_range(1..=36) { 1 => GlycanSubstituent::Acetimidoyl, 2 => GlycanSubstituent::Acetyl, - 3 => GlycanSubstituent::AcetylAlanyl, - 4 => GlycanSubstituent::AcetylGlutaminyl, - 5 => GlycanSubstituent::Acid, - 6 => GlycanSubstituent::Alanyl, - 7 => GlycanSubstituent::Alcohol, - 8 => GlycanSubstituent::Amino, - 9 => GlycanSubstituent::Aric, - 10 => GlycanSubstituent::CargoxyEthylidene, - 11 => GlycanSubstituent::Deoxy, - 12 => GlycanSubstituent::Didehydro, - 13 => GlycanSubstituent::DiHydroxyButyryl, - 14 => GlycanSubstituent::DiMethyl, - 15 => GlycanSubstituent::DiMethylAcetimidoyl, - 16 => GlycanSubstituent::DiMethylGlyceryl, - 17 => GlycanSubstituent::Element(rng.random()), - 18 => GlycanSubstituent::Ethanolamine, - 19 => GlycanSubstituent::EtOH, - 20 => GlycanSubstituent::Formyl, - 21 => GlycanSubstituent::Glyceryl, - 22 => GlycanSubstituent::Glycolyl, - 23 => GlycanSubstituent::Glycyl, - 24 => GlycanSubstituent::HydroxyButyryl, - 25 => GlycanSubstituent::HydroxyMethyl, - 26 => GlycanSubstituent::Lac, - 27 => GlycanSubstituent::Lactyl, - 28 => GlycanSubstituent::Methyl, - 29 => GlycanSubstituent::MethylAcetimidoyl, - 30 => GlycanSubstituent::MethylGlutamyl, - 31 => GlycanSubstituent::NAcetyl, - 32 => GlycanSubstituent::NDiMe, - 33 => GlycanSubstituent::NFo, - 34 => GlycanSubstituent::NGlycolyl, - 35 => GlycanSubstituent::OCarboxyEthyl, - 36 => GlycanSubstituent::PCholine, - 37 => GlycanSubstituent::Phosphate, - 38 => GlycanSubstituent::Pyruvyl, - 39 => 
GlycanSubstituent::Suc, - 40 => GlycanSubstituent::Sulfate, - 41 => GlycanSubstituent::Tauryl, - 42 => GlycanSubstituent::Ulo, - 43 => GlycanSubstituent::Ulof, + 3 => GlycanSubstituent::Acid, + 4 => GlycanSubstituent::Alanyl, + 5 => GlycanSubstituent::Alcohol, + 6 => GlycanSubstituent::Amino, + 7 => GlycanSubstituent::Aric, + 8 => GlycanSubstituent::CargoxyEthylidene, + 9 => GlycanSubstituent::Deoxy, + 10 => GlycanSubstituent::Didehydro, + 11 => GlycanSubstituent::DiMethyl, + 12 => GlycanSubstituent::Element(rng.random()), + 13 => GlycanSubstituent::Ethanolamine, + 14 => GlycanSubstituent::EtOH, + 15 => GlycanSubstituent::Formyl, + 16 => GlycanSubstituent::Glyceryl, + 17 => GlycanSubstituent::Glycolyl, + 18 => GlycanSubstituent::Glycyl, + 19 => GlycanSubstituent::HydroxyButyryl, + 20 => GlycanSubstituent::Lac, + 21 => GlycanSubstituent::Lactyl, + 22 => GlycanSubstituent::Methyl, + 23 => GlycanSubstituent::NAcetyl, + 24 => GlycanSubstituent::NDiMe, + 25 => GlycanSubstituent::NFo, + 26 => GlycanSubstituent::NGlycolyl, + 27 => GlycanSubstituent::OCarboxyEthyl, + 28 => GlycanSubstituent::PCholine, + 29 => GlycanSubstituent::Phosphate, + 30 => GlycanSubstituent::Pyruvyl, + 31 => GlycanSubstituent::Suc, + 32 => GlycanSubstituent::Sulfate, + 33 => GlycanSubstituent::Tauryl, + 34 => GlycanSubstituent::Ulo, + 35 => GlycanSubstituent::Ulof, _ => GlycanSubstituent::Water, } } @@ -134,7 +126,7 @@ impl Distribution for StandardUniform { 5 => BaseSugar::Hexose(None), 6 => BaseSugar::Heptose(None), 7 => BaseSugar::Octose, - 8 => BaseSugar::Nonose, + 8 => BaseSugar::Nonose(None), _ => BaseSugar::Decose, } } diff --git a/rustyms/src/sequence_element.rs b/rustyms/src/sequence_element.rs index e53ce32a..595334a3 100644 --- a/rustyms/src/sequence_element.rs +++ b/rustyms/src/sequence_element.rs @@ -1,9 +1,16 @@ #![warn(dead_code)] -use std::{collections::HashSet, fmt::Write, marker::PhantomData, num::NonZeroU32}; +use std::{ + collections::{HashMap, HashSet}, + fmt::Write, + 
marker::PhantomData, + num::NonZeroU32, +}; use crate::{ error::{Context, CustomError}, + fragment::FragmentKind, + model::GlycanModel, modification::{ CrossLinkName, LinkerSpecificity, Modification, RulePossible, SimpleModification, SimpleModificationInner, @@ -117,7 +124,7 @@ impl SequenceElement { if self.ambiguous.is_some() && last_ambiguous != self.ambiguous { write!(f, "(?")?; } - write!(f, "{}", self.aminoacid.char())?; + write!(f, "{}", self.aminoacid)?; for m in &self.modifications { let mut display_ambiguous = false; if let Modification::Ambiguous { id, .. } = m { @@ -139,38 +146,27 @@ impl SequenceElement { /// Get the molecular formulas for this position without any the ambiguous modifications pub(crate) fn formulas_base( &self, - all_peptides: &[Peptidoform], - visited_peptides: &[usize], + all_peptidoforms: &[Peptidoform], + visited_peptidoforms: &[usize], applied_cross_links: &mut Vec, allow_ms_cleavable: bool, sequence_index: SequencePosition, peptidoform_index: usize, - ) -> (Multi, HashSet) { - let (formula, seen) = self - .modifications - .iter() - .filter_map(|m| { - if let Modification::Ambiguous { .. 
} = m { - None - } else { - Some(m.formula_inner( - all_peptides, - visited_peptides, - applied_cross_links, - allow_ms_cleavable, - sequence_index, - peptidoform_index, - )) - } - }) - .fold((Multi::default(), HashSet::new()), |(am, av), (m, v)| { - (am * m, av.union(&v).cloned().collect()) - }); - ( - self.aminoacid - .formulas_inner(sequence_index, peptidoform_index) - * formula, - seen, + glycan_model: &GlycanModel, + ) -> ( + Multi, + HashMap>, + HashSet, + ) { + self.formulas_generic( + &mut |_| false, + all_peptidoforms, + visited_peptidoforms, + applied_cross_links, + allow_ms_cleavable, + sequence_index, + peptidoform_index, + glycan_model, ) } @@ -179,14 +175,77 @@ impl SequenceElement { pub(crate) fn formulas_greedy( &self, placed: &mut [bool], - all_peptides: &[Peptidoform], - visited_peptides: &[usize], + all_peptidoforms: &[Peptidoform], + visited_peptidoforms: &[usize], + applied_cross_links: &mut Vec, + allow_ms_cleavable: bool, + sequence_index: SequencePosition, + peptidoform_index: usize, + glycan_model: &GlycanModel, + ) -> ( + Multi, + HashMap>, + HashSet, + ) { + self.formulas_generic( + &mut |id| (!placed[id]).then(|| placed[id] = true).is_some(), + all_peptidoforms, + visited_peptidoforms, + applied_cross_links, + allow_ms_cleavable, + sequence_index, + peptidoform_index, + glycan_model, + ) + } + + /// Get the molecular formulas for this position with all ambiguous modifications, without any global isotype modifications + pub(crate) fn formulas_all( + &self, + all_peptidoforms: &[Peptidoform], + visited_peptidoforms: &[usize], + applied_cross_links: &mut Vec, + allow_ms_cleavable: bool, + sequence_index: SequencePosition, + peptidoform_index: usize, + glycan_model: &GlycanModel, + ) -> ( + Multi, + HashMap>, + HashSet, + ) { + let mut placed = + vec![false; all_peptidoforms[peptidoform_index].number_of_ambiguous_modifications()]; + self.formulas_greedy( + &mut placed, + all_peptidoforms, + visited_peptidoforms, + applied_cross_links, + 
allow_ms_cleavable, + sequence_index, + peptidoform_index, + glycan_model, + ) + } + + /// Get the molecular formulas for this position with the ambiguous modifications placed on the very first placed (and updating this in `placed`), without any global isotype modifications + #[expect(clippy::too_many_arguments)] + pub(crate) fn formulas_generic( + &self, + place_ambiguous: &mut impl FnMut(usize) -> bool, + all_peptidoforms: &[Peptidoform], + visited_peptidoforms: &[usize], applied_cross_links: &mut Vec, allow_ms_cleavable: bool, sequence_index: SequencePosition, peptidoform_index: usize, - ) -> (Multi, HashSet) { - let (formula, seen) = self + glycan_model: &GlycanModel, + ) -> ( + Multi, + HashMap>, + HashSet, + ) { + let (formula, specific, seen) = self .modifications .iter() .filter_map(|m| { @@ -194,67 +253,63 @@ impl SequenceElement { id, modification, .. } = m { - (!placed[*id]).then(|| { - placed[*id] = true; + place_ambiguous(*id).then(|| { + let (default, specific) = + glycan_model.get_peptide_fragments(Some(self.aminoacid.aminoacid())); ( - modification - .formula_inner(sequence_index, peptidoform_index) - .into(), + modification.formula_inner( + sequence_index, + peptidoform_index, + default, + Some(self.aminoacid.aminoacid()), + ), + specific + .into_iter() + .map(|(k, setting)| { + ( + k, + modification.formula_inner( + sequence_index, + peptidoform_index, + setting, + Some(self.aminoacid.aminoacid()), + ), + ) + }) + .collect(), HashSet::default(), ) }) } else { - Some(m.formula_inner( - all_peptides, - visited_peptides, + let (formula, specific, seen) = m.formula_inner( + all_peptidoforms, + visited_peptidoforms, applied_cross_links, allow_ms_cleavable, sequence_index, peptidoform_index, - )) + glycan_model, + Some(self.aminoacid.aminoacid()), + ); + Some((formula, specific, seen)) } }) - .fold((Multi::default(), HashSet::new()), |(am, av), (m, v)| { - (am * m, av.union(&v).cloned().collect()) - }); - ( - self.aminoacid - 
.formulas_inner(sequence_index, peptidoform_index) - * formula, - seen, - ) - } - - /// Get the molecular formulas for this position with all ambiguous modifications, without any global isotype modifications - pub(crate) fn formulas_all( - &self, - all_peptides: &[Peptidoform], - visited_peptides: &[usize], - applied_cross_links: &mut Vec, - allow_ms_cleavable: bool, - sequence_index: SequencePosition, - peptidoform_index: usize, - ) -> (Multi, HashSet) { - let (formula, seen) = self - .modifications - .iter() - .map(|m| { - m.formula_inner( - all_peptides, - visited_peptides, - applied_cross_links, - allow_ms_cleavable, - sequence_index, - peptidoform_index, - ) - }) - .fold((Multi::default(), HashSet::new()), |(am, av), (m, v)| { - (am * m, av.union(&v).cloned().collect()) - }); + .fold( + (Multi::default(), HashMap::new(), HashSet::new()), + |(am, asp, av), (m, sp, v)| { + ( + am * m, + crate::helper_functions::merge_hashmap(asp, sp), + av.union(&v).cloned().collect(), + ) + }, + ); + let own = self + .aminoacid + .formulas_inner(sequence_index, peptidoform_index); ( - self.aminoacid - .formulas_inner(sequence_index, peptidoform_index) - * formula, + formula * &own, + specific.into_iter().map(|(k, v)| (k, v * &own)).collect(), seen, ) } @@ -298,9 +353,19 @@ impl SequenceElement { } /// Get all possible diagnostic ions - pub(crate) fn diagnostic_ions(&self, position: SequencePosition) -> Vec { + pub(crate) fn diagnostic_ions( + &self, + position: SequencePosition, + n_term: &[Modification], + c_term: &[Modification], + ) -> Vec { let mut diagnostic_ions = Vec::new(); - for modification in &self.modifications { + let modifications = match position { + SequencePosition::NTerm => n_term, + SequencePosition::Index(_) => &self.modifications, + SequencePosition::CTerm => c_term, + }; + for modification in modifications { match modification { Modification::CrossLink { linker, side, .. 
} => { diagnostic_ions.extend_from_slice(&side.allowed_rules(linker).2); diff --git a/rustyms/src/shared/element.rs b/rustyms/src/shared/element.rs index fb564a60..1be81b52 100644 --- a/rustyms/src/shared/element.rs +++ b/rustyms/src/shared/element.rs @@ -505,135 +505,137 @@ impl TryFrom for Element { } } +impl Element { + /// Get the symbol for this element + pub const fn symbol(self) -> &'static str { + match self { + Self::H => "H", + Self::He => "He", + Self::Li => "Li", + Self::Be => "Be", + Self::B => "B", + Self::C => "C", + Self::N => "N", + Self::O => "O", + Self::F => "F", + Self::Ne => "Ne", + Self::Na => "Na", + Self::Mg => "Mg", + Self::Al => "Al", + Self::Si => "Si", + Self::P => "P", + Self::S => "S", + Self::Cl => "Cl", + Self::Ar => "Ar", + Self::K => "K", + Self::Ca => "Ca", + Self::Sc => "Sc", + Self::Ti => "Ti", + Self::V => "V", + Self::Cr => "Cr", + Self::Mn => "Mn", + Self::Fe => "Fe", + Self::Co => "Co", + Self::Ni => "Ni", + Self::Cu => "Cu", + Self::Zn => "Zn", + Self::Ga => "Ga", + Self::Ge => "Ge", + Self::As => "As", + Self::Se => "Se", + Self::Br => "Br", + Self::Kr => "Kr", + Self::Rb => "Rb", + Self::Sr => "Sr", + Self::Y => "Y", + Self::Zr => "Zr", + Self::Nb => "Nb", + Self::Mo => "Mo", + Self::Tc => "Tc", + Self::Ru => "Ru", + Self::Rh => "Rh", + Self::Pd => "Pd", + Self::Ag => "Ag", + Self::Cd => "Cd", + Self::In => "In", + Self::Sn => "Sn", + Self::Sb => "Sb", + Self::Te => "Te", + Self::I => "I", + Self::Xe => "Xe", + Self::Cs => "Cs", + Self::Ba => "Ba", + Self::La => "La", + Self::Ce => "Ce", + Self::Pr => "Pr", + Self::Nd => "Nd", + Self::Pm => "Pm", + Self::Sm => "Sm", + Self::Eu => "Eu", + Self::Gd => "Gd", + Self::Tb => "Tb", + Self::Dy => "Dy", + Self::Ho => "Ho", + Self::Er => "Er", + Self::Tm => "Tm", + Self::Yb => "Yb", + Self::Lu => "Lu", + Self::Hf => "Hf", + Self::Ta => "Ta", + Self::W => "W", + Self::Re => "Re", + Self::Os => "Os", + Self::Ir => "Ir", + Self::Pt => "Pt", + Self::Au => "Au", + Self::Hg => "Hg", + 
Self::Tl => "Tl", + Self::Pb => "Pb", + Self::Bi => "Bi", + Self::Po => "Po", + Self::At => "At", + Self::Rn => "Rn", + Self::Fr => "Fr", + Self::Ra => "Ra", + Self::Ac => "Ac", + Self::Th => "Th", + Self::Pa => "Pa", + Self::U => "U", + Self::Np => "Np", + Self::Pu => "Pu", + Self::Am => "Am", + Self::Cm => "Cm", + Self::Bk => "Bk", + Self::Cf => "Cf", + Self::Es => "Es", + Self::Fm => "Fm", + Self::Md => "Md", + Self::No => "No", + Self::Lr => "Lr", + Self::Rf => "Rf", + Self::Db => "Db", + Self::Sg => "Sg", + Self::Bh => "Bh", + Self::Hs => "Hs", + Self::Mt => "Mt", + Self::Ds => "Ds", + Self::Rg => "Rg", + Self::Cn => "Cn", + Self::Nh => "Nh", + Self::Fl => "Fl", + Self::Mc => "Mc", + Self::Lv => "Lv", + Self::Ts => "Ts", + Self::Og => "Og", + // Self::Proton => "Proton", + Self::Electron => "e", + } + } +} + impl std::fmt::Display for Element { - #[expect(clippy::too_many_lines)] fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}", - match self { - Self::H => "H", - Self::He => "He", - Self::Li => "Li", - Self::Be => "Be", - Self::B => "B", - Self::C => "C", - Self::N => "N", - Self::O => "O", - Self::F => "F", - Self::Ne => "Ne", - Self::Na => "Na", - Self::Mg => "Mg", - Self::Al => "Al", - Self::Si => "Si", - Self::P => "P", - Self::S => "S", - Self::Cl => "Cl", - Self::Ar => "Ar", - Self::K => "K", - Self::Ca => "Ca", - Self::Sc => "Sc", - Self::Ti => "Ti", - Self::V => "V", - Self::Cr => "Cr", - Self::Mn => "Mn", - Self::Fe => "Fe", - Self::Co => "Co", - Self::Ni => "Ni", - Self::Cu => "Cu", - Self::Zn => "Zn", - Self::Ga => "Ga", - Self::Ge => "Ge", - Self::As => "As", - Self::Se => "Se", - Self::Br => "Br", - Self::Kr => "Kr", - Self::Rb => "Rb", - Self::Sr => "Sr", - Self::Y => "Y", - Self::Zr => "Zr", - Self::Nb => "Nb", - Self::Mo => "Mo", - Self::Tc => "Tc", - Self::Ru => "Ru", - Self::Rh => "Rh", - Self::Pd => "Pd", - Self::Ag => "Ag", - Self::Cd => "Cd", - Self::In => "In", - Self::Sn => "Sn", - Self::Sb => 
"Sb", - Self::Te => "Te", - Self::I => "I", - Self::Xe => "Xe", - Self::Cs => "Cs", - Self::Ba => "Ba", - Self::La => "La", - Self::Ce => "Ce", - Self::Pr => "Pr", - Self::Nd => "Nd", - Self::Pm => "Pm", - Self::Sm => "Sm", - Self::Eu => "Eu", - Self::Gd => "Gd", - Self::Tb => "Tb", - Self::Dy => "Dy", - Self::Ho => "Ho", - Self::Er => "Er", - Self::Tm => "Tm", - Self::Yb => "Yb", - Self::Lu => "Lu", - Self::Hf => "Hf", - Self::Ta => "Ta", - Self::W => "W", - Self::Re => "Re", - Self::Os => "Os", - Self::Ir => "Ir", - Self::Pt => "Pt", - Self::Au => "Au", - Self::Hg => "Hg", - Self::Tl => "Tl", - Self::Pb => "Pb", - Self::Bi => "Bi", - Self::Po => "Po", - Self::At => "At", - Self::Rn => "Rn", - Self::Fr => "Fr", - Self::Ra => "Ra", - Self::Ac => "Ac", - Self::Th => "Th", - Self::Pa => "Pa", - Self::U => "U", - Self::Np => "Np", - Self::Pu => "Pu", - Self::Am => "Am", - Self::Cm => "Cm", - Self::Bk => "Bk", - Self::Cf => "Cf", - Self::Es => "Es", - Self::Fm => "Fm", - Self::Md => "Md", - Self::No => "No", - Self::Lr => "Lr", - Self::Rf => "Rf", - Self::Db => "Db", - Self::Sg => "Sg", - Self::Bh => "Bh", - Self::Hs => "Hs", - Self::Mt => "Mt", - Self::Ds => "Ds", - Self::Rg => "Rg", - Self::Cn => "Cn", - Self::Nh => "Nh", - Self::Fl => "Fl", - Self::Mc => "Mc", - Self::Lv => "Lv", - Self::Ts => "Ts", - Self::Og => "Og", - // Self::Proton => "Proton", - Self::Electron => "e", - } - ) + write!(f, "{}", self.symbol()) } } diff --git a/rustyms/src/shared/formula/structure.rs b/rustyms/src/shared/formula/structure.rs index 6b12204c..90870490 100644 --- a/rustyms/src/shared/formula/structure.rs +++ b/rustyms/src/shared/formula/structure.rs @@ -1,12 +1,15 @@ use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; -use crate::{AminoAcid, CrossLinkName, Element, Multi, SequencePosition}; +use crate::{ + fragment::GlycanPosition, glycan::MonoSaccharide, AminoAcid, CrossLinkName, Element, Multi, + SequencePosition, +}; use std::{ fmt::Write, hash::Hash, 
num::NonZeroU16, - ops::{Add, AddAssign, Mul, Neg, Sub}, + ops::{Add, AddAssign, Mul, Neg, Sub, SubAssign}, }; use thin_vec::ThinVec; @@ -27,7 +30,7 @@ pub struct MolecularFormula { /// Keep track of what ambiguous option is used #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Serialize, Deserialize)] pub enum AmbiguousLabel { - /// A ambiguous amino acid, with the actual amino acid used tracked + /// An ambiguous amino acid, with the actual amino acid used tracked AminoAcid { /// Which amino acid is used option: AminoAcid, @@ -36,7 +39,7 @@ pub enum AmbiguousLabel { /// Peptide index peptidoform_index: usize, }, - /// A ambiguous modification, with the actual position + /// An ambiguous modification, with the actual position Modification { /// Which ambiguous modification id: usize, @@ -51,6 +54,10 @@ pub enum AmbiguousLabel { CrossLinkBound(CrossLinkName), /// A broken cross-link, having the name and the stub that was left in its place CrossLinkBroken(CrossLinkName, MolecularFormula), + /// A glycan fragment on a peptide fragment, with the Y breakages that lead to that fragment + GlycanFragment(Vec), + /// A glycan fragment on a peptide fragment, with the monosaccharides that make up the fragment + GlycanFragmentComposition(Vec<(MonoSaccharide, isize)>), } /// Any item that has a clearly defined single molecular formula @@ -453,9 +460,26 @@ impl Mul<&i32> for &MolecularFormula { } } +impl Mul<&i8> for &MolecularFormula { + type Output = MolecularFormula; + fn mul(self, rhs: &i8) -> Self::Output { + MolecularFormula { + additional_mass: self.additional_mass * f64::from(*rhs), + elements: self + .elements + .iter() + .copied() + .map(|part| (part.0, part.1, part.2 * i32::from(*rhs))) + .collect(), + labels: self.labels.clone(), + } + } +} + impl_binop_ref_cases!(impl Add, add for MolecularFormula, MolecularFormula, MolecularFormula); impl_binop_ref_cases!(impl Sub, sub for MolecularFormula, MolecularFormula, MolecularFormula); 
impl_binop_ref_cases!(impl Mul, mul for MolecularFormula, i32, MolecularFormula); +impl_binop_ref_cases!(impl Mul, mul for MolecularFormula, i8, MolecularFormula); impl AddAssign<&Self> for MolecularFormula { fn add_assign(&mut self, rhs: &Self) { @@ -484,12 +508,46 @@ impl AddAssign<&Self> for MolecularFormula { } } +impl SubAssign<&Self> for MolecularFormula { + fn sub_assign(&mut self, rhs: &Self) { + self.labels.extend_from_slice(&rhs.labels); + let mut index_result = 0; + let mut index_rhs = 0; + self.additional_mass -= rhs.additional_mass; + while index_rhs < rhs.elements.len() { + let (el, i, n) = rhs.elements[index_rhs]; + if index_result < self.elements.len() { + let (re, ri, _) = self.elements[index_result]; + if el > re || (el == re && i > ri) { + index_result += 1; + } else if el == re && i == ri { + self.elements[index_result].2 -= n; + index_rhs += 1; + } else { + self.elements.insert(index_result, (el, i, -n)); + index_rhs += 1; + } + } else { + self.elements.push((el, i, -n)); + index_rhs += 1; + } + } + self.elements.retain(|el| el.2 != 0); + } +} + impl AddAssign for MolecularFormula { fn add_assign(&mut self, rhs: Self) { *self += &rhs; } } +impl SubAssign for MolecularFormula { + fn sub_assign(&mut self, rhs: Self) { + *self -= &rhs; + } +} + impl std::iter::Sum for MolecularFormula { fn sum>(iter: I) -> Self { let mut res = Self::default(); diff --git a/rustyms/src/shared/glycan.rs b/rustyms/src/shared/glycan.rs index 96e1dec4..294a1ead 100644 --- a/rustyms/src/shared/glycan.rs +++ b/rustyms/src/shared/glycan.rs @@ -9,22 +9,73 @@ use crate::{ Element, SequencePosition, ELEMENT_PARSE_LIST, }; -/// A monosaccharide with all its complexity +/// Glycan absolute configuration #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub enum Configuration { + /// D configuration + D, + /// L configuration + L, + /// Double configuration D and D + DD, + /// Double configuration L and L + LL, + /// Double configuration D 
and L + DL, + /// Double configuration L and D + LD, +} + +/// A monosaccharide with all its complexity +#[derive(Clone, Ord, PartialOrd, Debug, Serialize, Deserialize)] pub struct MonoSaccharide { pub(super) base_sugar: BaseSugar, pub(super) substituents: Vec, pub(super) furanose: bool, + pub(super) configuration: Option, pub(super) proforma_name: Option, } impl MonoSaccharide { + fn equivalent(&self, other: &Self, precise: bool) -> bool { + self.base_sugar.equivalent(&other.base_sugar, precise) + && self.substituents == other.substituents + && (!precise + || (self.furanose == other.furanose && self.configuration == other.configuration)) + } +} + +impl std::cmp::PartialEq for MonoSaccharide { + fn eq(&self, other: &Self) -> bool { + self.equivalent(other, true) + } +} + +impl std::cmp::Eq for MonoSaccharide {} + +impl std::hash::Hash for MonoSaccharide { + fn hash(&self, hasher: &mut H) { + self.base_sugar.hash(hasher); + self.substituents.hash(hasher); + self.furanose.hash(hasher); + self.configuration.hash(hasher); + } +} + +impl MonoSaccharide { + /// Check if this is a fucose + pub fn is_fucose(&self) -> bool { + self.base_sugar == BaseSugar::Hexose(Some(HexoseIsomer::Galactose)) + && self.substituents.contains(&GlycanSubstituent::Deoxy) + } + /// Create a new monosaccharide pub fn new(sugar: BaseSugar, substituents: &[GlycanSubstituent]) -> Self { Self { base_sugar: sugar, substituents: substituents.to_owned(), furanose: false, + configuration: None, proforma_name: None, } } @@ -49,6 +100,16 @@ impl MonoSaccharide { } } + /// Set this saccharide up to be a certain configuration + #[must_use] + #[allow(dead_code)] + pub fn configuration(self, configuration: Configuration) -> Self { + Self { + configuration: Some(configuration), + ..self + } + } + /// Simplify a glycan composition to be sorted and deduplicated. /// Returns None if overflow occurred, meaning that there where more than `isize::MAX` or less then `isize::MIN` monosaccharides for one species. 
pub(crate) fn simplify_composition( @@ -116,9 +177,21 @@ impl MonoSaccharide { let line = original_line.to_ascii_lowercase(); let bytes = line.as_bytes(); let mut substituents = Vec::new(); + let mut configuration = None; + let mut epi = false; // ignore stuff - index += line[index..].ignore(&["keto-", "d-", "l-", "?-"]); + index += line[index..].ignore(&["keto-"]); + if line[index..].starts_with("d-") { + configuration = Some(Configuration::D); + index += 2; + } else if line[index..].starts_with("l-") { + configuration = Some(Configuration::L); + index += 2; + } else if line[index..].starts_with("?-") { + configuration = None; + index += 2; + } // Prefix mods let mut amount = 1; if bytes[index].is_ascii_digit() { @@ -160,19 +233,31 @@ impl MonoSaccharide { } index += line[index..].ignore(&["-"]); } - // Detect & ignore epi state - index += line[index..].ignore(&["e"]); + // Detect epi state + if line[index..].starts_with('e') { + epi = true; + index += 1; + } // Get the prefix mods if !line[index..].starts_with("dig") && !line[index..].starts_with("dha") { if let Some(o) = line[index..].take_any(PREFIX_SUBSTITUENTS, |e| { - substituents.extend(std::iter::repeat(e.clone()).take(amount)); + substituents.extend(std::iter::repeat(*e).take(amount)); }) { index += o; } index += line[index..].ignore(&["-"]); } // Another optional isomeric state - index += line[index..].ignore(&["d-", "l-", "?-"]); + if line[index..].starts_with("d-") { + configuration = Some(Configuration::D); + index += 2; + } else if line[index..].starts_with("l-") { + configuration = Some(Configuration::L); + index += 2; + } else if line[index..].starts_with("?-") { + configuration = None; + index += 2; + } // Base sugar let mut sugar = None; for sug in BASE_SUGARS { @@ -185,12 +270,18 @@ impl MonoSaccharide { let mut sugar = sugar .map(|(b, s)| { let mut alo = Self { - base_sugar: b, + base_sugar: match b { + BaseSugar::Nonose(Some(NonoseIsomer::Leg)) if epi => { + 
BaseSugar::Nonose(Some(NonoseIsomer::ELeg)) + } + other => other, + }, substituents, furanose: false, + configuration, proforma_name: None, }; - alo.substituents.extend(s.iter().cloned()); + alo.substituents.extend(s.iter().copied()); alo }) .ok_or_else(|| { @@ -243,14 +334,14 @@ impl MonoSaccharide { sugar.substituents.extend( e.iter() .flat_map(|s| std::iter::repeat(s).take(double_amount)) - .cloned(), + .copied(), ); if single_amount > 0 { sugar.substituents.extend( e.iter() .filter(|s| **s != GlycanSubstituent::Water) .flat_map(|s| std::iter::repeat(s).take(single_amount)) - .cloned(), + .copied(), ); } }) { @@ -272,7 +363,7 @@ impl MonoSaccharide { if let Some(o) = line[index..].take_any(POSTFIX_SUBSTITUENTS, |e| { sugar .substituents - .extend(std::iter::repeat(e.clone()).take(amount)); + .extend(std::iter::repeat(*e).take(amount)); }) { index += o; } else if let Some(o) = line[index..].take_any(ELEMENT_PARSE_LIST, |e| { @@ -455,11 +546,29 @@ pub enum BaseSugar { /// 8 carbon base sugar Octose, /// 9 carbon base sugar - Nonose, + Nonose(Option), /// 10 carbon base sugar Decose, } +impl BaseSugar { + fn equivalent(&self, other: &Self, precise: bool) -> bool { + match (self, other) { + (Self::None, Self::None) + | (Self::Sugar, Self::Sugar) + | (Self::Octose, Self::Octose) + | (Self::Decose, Self::Decose) + | (Self::Triose, Self::Triose) => true, + (Self::Tetrose(a), Self::Tetrose(b)) => !precise || a == b, + (Self::Pentose(a), Self::Pentose(b)) => !precise || a == b, + (Self::Hexose(a), Self::Hexose(b)) => !precise || a == b, + (Self::Heptose(a), Self::Heptose(b)) => !precise || a == b, + (Self::Nonose(a), Self::Nonose(b)) => !precise || a == b, + _ => false, + } + } +} + impl Display for BaseSugar { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -474,7 +583,7 @@ impl Display for BaseSugar { Self::Hexose(_) => "Hex", Self::Heptose(_) => "Hep", Self::Octose => "Oct", - Self::Nonose => "Non", + Self::Nonose(_) => "Non", 
Self::Decose => "Dec", } ) @@ -496,7 +605,7 @@ impl Chemical for BaseSugar { Self::Hexose(_) => molecular_formula!(H 10 C 6 O 5), Self::Heptose(_) => molecular_formula!(H 12 C 7 O 6), Self::Octose => molecular_formula!(H 14 C 8 O 7), - Self::Nonose => molecular_formula!(H 16 C 9 O 8), + Self::Nonose(_) => molecular_formula!(H 16 C 9 O 8), Self::Decose => molecular_formula!(H 18 C 10 O 9), } } @@ -568,19 +677,34 @@ pub enum HeptoseIsomer { Sedoheptulose, } +/// Any 9 carbon glycan, these isomers are modification specific (need the correct substituents +/// applied to be meaningful). These are to be used only to store isomeric state that was inferred +/// from other sources that cannot be tracked in other ways in the current structure. Any isomer +/// used that does not have the correct monosaccharide substituents applied is meaningless. +#[allow(dead_code)] +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub enum NonoseIsomer { + /// 3-Deoxy-D-glycero-D-galacto-non-2-ulopyranosonic acid + Kdn, + /// 5,7-Diamino-3,5,7,9-tetradeoxy-L-glycero-L-manno-non-2-ulopyranosonic acid + Pse, + /// 5,7-Diamino-3,5,7,9-tetradeoxy-D-glycero-D-galacto-non-2-ulopyranosonic acid + Leg, + /// 4 or 8 eLeg + ELeg, + /// 5,7-Diamino-3,5,7,9-tetradeoxy-L-glycero-L-altro-non-2-ulopyranosonic acid + Aci, +} + /// Any substituent on a monosaccharide. /// Source: table 3. 
#[allow(dead_code)] -#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +#[derive(Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] pub enum GlycanSubstituent { ///`Am` N-acetimidoyl Acetimidoyl, ///`Ac` acetyl Acetyl, - ///`Ala2Ac` N-acetyl-D-alanyl - AcetylAlanyl, - ///`Gln2Ac` N-acetyl-glutaminyl - AcetylGlutaminyl, ///`A` acid Acid, ///`Ala` D-alanyl @@ -595,14 +719,8 @@ pub enum GlycanSubstituent { CargoxyEthylidene, ///`d` Deoxy Deoxy, - ///`3,4Hb` 3,4-dihydroxybutyryl - DiHydroxyButyryl, ///`DiMe` two methyl DiMethyl, - ///`AmMe2` N-(N,N-dimethyl-acetimidoyl) - DiMethylAcetimidoyl, - ///`Gr2,3Me2` 2,3-di-O-methyl-glyceryl - DiMethylGlyceryl, ///`en` didehydro an addition of a double bond Didehydro, ///`An` element that replaces a side chain @@ -629,10 +747,6 @@ pub enum GlycanSubstituent { Lactyl, ///`Me` methyl Methyl, - ///`AmMe` N-(N-methyl-acetimidoyl) - MethylAcetimidoyl, - ///5Glu2Me N-methyl-5-glutamyl - MethylGlutamyl, ///`NAc` N-acetyl NAcetyl, ///`N2DiMe` N linked double methyl @@ -663,58 +777,54 @@ pub enum GlycanSubstituent { Water, } +impl GlycanSubstituent { + /// Get the symbol used to denote this substituent + pub const fn notation(self) -> &'static str { + match self { + Self::Acetimidoyl => "Am", + Self::Acetyl => "Ac", + Self::Acid => "A", + Self::Alanyl => "Ala", + Self::Alcohol => "ol", + Self::Amino => "N", + Self::Aric => "aric", + Self::CargoxyEthylidene => "Pyr", + Self::Deoxy => "d", + Self::Didehydro => "en", + Self::DiMethyl => "Me2", + Self::Ethanolamine => "Etn", + Self::Element(el) => el.symbol(), + Self::EtOH => "EtOH", + Self::Formyl => "Fo", + Self::Glyceryl => "Gr", + Self::Glycolyl => "Gc", + Self::Glycyl => "Gly", + Self::HydroxyButyryl => "Hb", + Self::HydroxyMethyl => "HMe", + Self::Lac => "Lac", + Self::Lactyl => "Lt", + Self::Methyl => "Me", + Self::NAcetyl => "NAc", + Self::NDiMe => "NDiMe", + Self::NFo => "NFo", + Self::NGlycolyl => "NGc", + 
Self::OCarboxyEthyl => "carboxyethyl", + Self::PCholine => "PCho", + Self::Phosphate => "P", + Self::Pyruvyl => "Py", + Self::Suc => "Suc", + Self::Sulfate => "S", + Self::Tauryl => "Tau", + Self::Ulo => "ulo", + Self::Ulof => "ulof", + Self::Water => "water_loss", + } + } +} + impl Display for GlycanSubstituent { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}", - match self { - Self::Acetimidoyl => "Am".to_string(), - Self::Acetyl => "Ac".to_string(), - Self::AcetylAlanyl => "Ala2Ac".to_string(), - Self::AcetylGlutaminyl => "Gln2Ac".to_string(), - Self::Acid => "A".to_string(), - Self::Alanyl => "Ala".to_string(), - Self::Alcohol => "ol".to_string(), - Self::Amino => "N".to_string(), - Self::Aric => "aric".to_string(), - Self::CargoxyEthylidene => "Pyr".to_string(), - Self::Deoxy => "d".to_string(), - Self::Didehydro => "en".to_string(), - Self::DiHydroxyButyryl => "3,4Hb".to_string(), - Self::DiMethyl => "DiMe".to_string(), - Self::DiMethylAcetimidoyl => "AmMe2".to_string(), - Self::DiMethylGlyceryl => "Gr2,3Me2".to_string(), - Self::Ethanolamine => "Etn".to_string(), - Self::Element(el) => el.to_string(), - Self::EtOH => "EtOH".to_string(), - Self::Formyl => "Fo".to_string(), - Self::Glyceryl => "Gr".to_string(), - Self::Glycolyl => "Gc".to_string(), - Self::Glycyl => "Gly".to_string(), - Self::HydroxyButyryl => "Hb".to_string(), - Self::HydroxyMethyl => "HMe".to_string(), - Self::Lac => "Lac".to_string(), - Self::Lactyl => "Lt".to_string(), - Self::Methyl => "Me".to_string(), - Self::MethylAcetimidoyl => "AmMe".to_string(), - Self::MethylGlutamyl => "5Glu2Me".to_string(), - Self::NAcetyl => "NAc".to_string(), - Self::NDiMe => "NDiMe".to_string(), - Self::NFo => "NFo".to_string(), - Self::NGlycolyl => "NGc".to_string(), - Self::OCarboxyEthyl => "carboxyethyl".to_string(), - Self::PCholine => "PCho".to_string(), - Self::Phosphate => "P".to_string(), - Self::Pyruvyl => "Py".to_string(), - Self::Suc => "Suc".to_string(), - 
Self::Sulfate => "S".to_string(), - Self::Tauryl => "Tau".to_string(), - Self::Ulo => "ulo".to_string(), - Self::Ulof => "ulof".to_string(), - Self::Water => "water_loss".to_string(), - } - ) + write!(f, "{}", self.notation()) } } @@ -727,8 +837,6 @@ impl Chemical for GlycanSubstituent { let side = match self { Self::Acetimidoyl => molecular_formula!(H 5 C 2 N 1), Self::Acetyl => molecular_formula!(H 3 C 2 O 1), - Self::AcetylAlanyl => molecular_formula!(H 8 C 5 N 1 O 2), - Self::AcetylGlutaminyl => molecular_formula!(H 11 C 7 N 2 O 3), Self::Acid => molecular_formula!(H -1 O 2), // Together with the replacement below this is H-2 O+1 Self::Alanyl => molecular_formula!(H 6 C 3 N 1 O 1), Self::Alcohol => molecular_formula!(H 3 O 1), // Together with the replacement below this is H+2 @@ -737,10 +845,7 @@ impl Chemical for GlycanSubstituent { Self::CargoxyEthylidene => molecular_formula!(H 3 C 3 O 3), // double substituent, calculated to work with the additional side chain deletion Self::Deoxy => molecular_formula!(H 1), // Together with the replacement below this is O-1 Self::Didehydro => molecular_formula!(H -1 O 1), // Together with the replacement below this is H-2 - Self::DiHydroxyButyryl => molecular_formula!(H 7 C 4 O 3), Self::DiMethyl => molecular_formula!(H 5 C 2), // assumed to replace the both the OH and H on a single carbon - Self::DiMethylAcetimidoyl => molecular_formula!(H 9 C 4 N 1), - Self::DiMethylGlyceryl => molecular_formula!(H 9 C 5 O 3), Self::Ethanolamine => molecular_formula!(H 6 C 2 N 1 O 1), Self::EtOH => molecular_formula!(H 5 C 2 O 2), Self::Element(el) => MolecularFormula::new(&[(*el, None, 1)], &[]).unwrap(), @@ -752,8 +857,6 @@ impl Chemical for GlycanSubstituent { Self::HydroxyMethyl | Self::Ulo => molecular_formula!(H 3 C 1 O 2), // Ulo: replaces H, together with replacement below this is H2C1O1 Self::Lactyl => molecular_formula!(H 5 C 3 O 2), Self::Methyl => molecular_formula!(H 3 C 1), - Self::MethylAcetimidoyl => molecular_formula!(H 
7 C 3 N 1), - Self::MethylGlutamyl => molecular_formula!(H 10 C 6 N 1 O 3), Self::NDiMe => molecular_formula!(H 6 C 2 N 1), Self::NFo => molecular_formula!(H 2 C 1 N 1 O 1), Self::NGlycolyl => molecular_formula!(H 4 C 2 N 1 O 2), diff --git a/rustyms/src/shared/glycan_lists.rs b/rustyms/src/shared/glycan_lists.rs index cbb5dd6b..7c70183f 100644 --- a/rustyms/src/shared/glycan_lists.rs +++ b/rustyms/src/shared/glycan_lists.rs @@ -34,12 +34,12 @@ const BASE_SUGARS: &[(&str, BaseSugar, &[GlycanSubstituent])] = &[ ), ( "neu", - BaseSugar::Nonose, + BaseSugar::Nonose(None), &[GlycanSubstituent::Amino, GlycanSubstituent::Acid], ), ( "sia", - BaseSugar::Nonose, + BaseSugar::Nonose(None), &[ GlycanSubstituent::Amino, GlycanSubstituent::Deoxy, @@ -48,7 +48,7 @@ const BASE_SUGARS: &[(&str, BaseSugar, &[GlycanSubstituent])] = &[ ), ( "kdn", - BaseSugar::Nonose, + BaseSugar::Nonose(Some(NonoseIsomer::Kdn)), &[ GlycanSubstituent::Amino, GlycanSubstituent::Deoxy, @@ -167,7 +167,7 @@ const BASE_SUGARS: &[(&str, BaseSugar, &[GlycanSubstituent])] = &[ ), ( "pse", - BaseSugar::Nonose, + BaseSugar::Nonose(Some(NonoseIsomer::Pse)), &[ GlycanSubstituent::Amino, GlycanSubstituent::Deoxy, @@ -178,7 +178,7 @@ const BASE_SUGARS: &[(&str, BaseSugar, &[GlycanSubstituent])] = &[ ), ( "leg", - BaseSugar::Nonose, + BaseSugar::Nonose(Some(NonoseIsomer::Leg)), &[ GlycanSubstituent::Acid, GlycanSubstituent::Amino, @@ -189,7 +189,7 @@ const BASE_SUGARS: &[(&str, BaseSugar, &[GlycanSubstituent])] = &[ ), ( "aci", - BaseSugar::Nonose, + BaseSugar::Nonose(Some(NonoseIsomer::Aci)), &[ GlycanSubstituent::Acid, GlycanSubstituent::Amino, @@ -205,23 +205,16 @@ const BASE_SUGARS: &[(&str, BaseSugar, &[GlycanSubstituent])] = &[ // * Add an additional level which defines the leaving group, to make the chemical formula difference easier const POSTFIX_SUBSTITUENTS: &[(&str, GlycanSubstituent)] = &[ ("ac", GlycanSubstituent::Acetyl), - ("ala2ac", GlycanSubstituent::AcetylAlanyl), ("ala", 
GlycanSubstituent::Alanyl), - ("amme2", GlycanSubstituent::DiMethylAcetimidoyl), - ("amme", GlycanSubstituent::MethylAcetimidoyl), ("am", GlycanSubstituent::Acetimidoyl), ("en", GlycanSubstituent::Didehydro), ("fo", GlycanSubstituent::Formyl), ("gc", GlycanSubstituent::Glycolyl), - ("gln2ac", GlycanSubstituent::AcetylGlutaminyl), - ("5glu2me", GlycanSubstituent::MethylGlutamyl), ("gly", GlycanSubstituent::Glycyl), ("gr", GlycanSubstituent::Glyceryl), - ("gr2,3Me2", GlycanSubstituent::DiMethylGlyceryl), ("4hb", GlycanSubstituent::HydroxyButyryl), ("3rhb", GlycanSubstituent::HydroxyButyryl), ("3shb", GlycanSubstituent::HydroxyButyryl), - ("3,4Hb", GlycanSubstituent::DiHydroxyButyryl), ("lt", GlycanSubstituent::Lactyl), ("lac", GlycanSubstituent::Lac), ("me", GlycanSubstituent::Methyl), @@ -277,6 +270,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Phosphate], proforma_name: Some("phosphate".to_string()), furanose: false, + configuration: None, }, ), ( @@ -286,6 +280,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Sulfate], proforma_name: Some("sulfate".to_string()), furanose: false, + configuration: None, }, ), ( @@ -295,6 +290,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Sug".to_string()), furanose: false, + configuration: None, }, ), ( @@ -304,6 +300,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Tri".to_string()), furanose: false, + configuration: None, }, ), ( @@ -313,6 +310,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Tet".to_string()), furanose: false, + configuration: None, }, ), ( @@ -322,6 +320,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: 
Some("Pen".to_string()), furanose: false, + configuration: None, }, ), ( @@ -331,6 +330,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Pen".to_string()), furanose: false, + configuration: None, }, ), ( @@ -340,6 +340,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Pen".to_string()), furanose: false, + configuration: None, }, ), ( @@ -349,6 +350,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Pen".to_string()), furanose: false, + configuration: None, }, ), ( @@ -358,6 +360,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Pen".to_string()), furanose: false, + configuration: None, }, ), ( @@ -367,6 +370,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Pen".to_string()), furanose: false, + configuration: None, }, ), ( @@ -376,6 +380,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Acid], proforma_name: Some("a-Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -389,6 +394,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { ], proforma_name: Some("en,a-Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -398,6 +404,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("d-Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -407,6 +414,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("d-Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -416,6 +424,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { 
substituents: vec![GlycanSubstituent::NAcetyl, GlycanSubstituent::Sulfate], proforma_name: Some("HexNAc(S)".to_string()), furanose: false, + configuration: None, }, ), ( @@ -425,6 +434,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::NAcetyl], proforma_name: Some("HexNAc".to_string()), furanose: false, + configuration: None, }, ), ( @@ -434,6 +444,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::NAcetyl], proforma_name: Some("HexNAc".to_string()), furanose: false, + configuration: None, }, ), ( @@ -443,6 +454,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::NAcetyl], proforma_name: Some("HexNAc".to_string()), furanose: false, + configuration: None, }, ), ( @@ -452,6 +464,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::NAcetyl], proforma_name: Some("HexNAc".to_string()), furanose: false, + configuration: None, }, ), ( @@ -461,6 +474,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::NAcetyl], proforma_name: Some("HexNAc".to_string()), furanose: false, + configuration: None, }, ), ( @@ -470,6 +484,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::NAcetyl], proforma_name: Some("HexNAc".to_string()), furanose: false, + configuration: None, }, ), ( @@ -479,6 +494,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::NAcetyl], proforma_name: Some("HexNAc".to_string()), furanose: false, + configuration: None, }, ), ( @@ -488,6 +504,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::NAcetyl], proforma_name: Some("HexNAc".to_string()), furanose: false, + configuration: 
None, }, ), ( @@ -497,6 +514,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Amino, GlycanSubstituent::Sulfate], proforma_name: Some("HexNS".to_string()), furanose: false, + configuration: None, }, ), ( @@ -506,6 +524,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Amino], proforma_name: Some("HexN".to_string()), furanose: false, + configuration: None, }, ), ( @@ -515,6 +534,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Sulfate], proforma_name: Some("HexS".to_string()), furanose: false, + configuration: None, }, ), ( @@ -524,6 +544,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Phosphate], proforma_name: Some("HexP".to_string()), furanose: false, + configuration: None, }, ), ( @@ -533,6 +554,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -542,6 +564,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -551,6 +574,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -560,6 +584,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -569,6 +594,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -578,6 +604,7 @@ pub fn 
glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -587,6 +614,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -596,6 +624,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -605,6 +634,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -614,6 +644,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Hep".to_string()), furanose: false, + configuration: None, }, ), ( @@ -623,6 +654,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Oct".to_string()), furanose: false, + configuration: None, }, ), ( @@ -632,21 +664,23 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Deoxy, GlycanSubstituent::Acid], proforma_name: Some("Oct".to_string()), furanose: false, + configuration: None, }, ), ( "non".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![], proforma_name: Some("Non".to_string()), furanose: false, + configuration: None, }, ), ( "kdn".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(Some(NonoseIsomer::Kdn)), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Deoxy, @@ -654,12 +688,13 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { ], proforma_name: Some("Non".to_string()), furanose: 
false, + configuration: None, }, ), ( "sia".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Deoxy, @@ -667,6 +702,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { ], proforma_name: Some("Non".to_string()), furanose: false, + configuration: None, }, ), ( @@ -676,64 +712,69 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Dec".to_string()), furanose: false, + configuration: None, }, ), ( "neu5ac".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Acetyl, GlycanSubstituent::Acid, ], - proforma_name: Some("Neu5Ac".to_string()), + proforma_name: Some("NeuAc".to_string()), furanose: false, + configuration: None, }, ), ( "neuac".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Acetyl, GlycanSubstituent::Acid, ], - proforma_name: Some("Neu5Ac".to_string()), + proforma_name: Some("NeuAc".to_string()), furanose: false, + configuration: None, }, ), ( "neu5gc".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Glycolyl, GlycanSubstituent::Acid, ], - proforma_name: Some("Neu5Gc".to_string()), + proforma_name: Some("NeuGc".to_string()), furanose: false, + configuration: None, }, ), ( "neugc".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Glycolyl, GlycanSubstituent::Acid, ], - proforma_name: Some("Neu5Gc".to_string()), + proforma_name: Some("NeuGc".to_string()), furanose: false, + configuration: None, }, ), 
( "neu".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Deoxy, @@ -741,6 +782,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { ], proforma_name: Some("Neu".to_string()), furanose: false, + configuration: None, }, ), ( @@ -750,6 +792,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Deoxy], proforma_name: Some("Fuc".to_string()), furanose: false, + configuration: None, }, ), ( @@ -759,6 +802,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![], proforma_name: Some("Xxx".to_string()), furanose: false, + configuration: None, }, ), ( @@ -768,6 +812,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Alcohol], proforma_name: None, furanose: false, + configuration: None, }, ), ( @@ -777,6 +822,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Methyl], proforma_name: None, furanose: false, + configuration: None, }, ), // Single letter codes, by defining them like this they will be read but exported to the standard ProForma codes @@ -787,6 +833,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Acetyl], proforma_name: None, furanose: false, + configuration: None, }, ), ( @@ -796,6 +843,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Phosphate], proforma_name: Some("Hexphosphate".to_string()), // TODO: technically maybe not working when multiple are in there, think it through, should be two different elements, both getting counts after them furanose: false, + configuration: None, }, ), ( @@ -805,6 +853,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> 
{ substituents: vec![], proforma_name: Some("Hex".to_string()), furanose: false, + configuration: None, }, ), ( @@ -814,6 +863,7 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::NAcetyl], proforma_name: Some("HexNAc".to_string()), furanose: false, + configuration: None, }, ), ( @@ -823,45 +873,49 @@ pub fn glycan_parse_list() -> &'static Vec<(String, MonoSaccharide)> { substituents: vec![GlycanSubstituent::Deoxy], proforma_name: Some("Fuc".to_string()), furanose: false, + configuration: None, }, ), ( "s".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Acetyl, GlycanSubstituent::Acid, ], - proforma_name: Some("Neu5Ac".to_string()), + proforma_name: Some("NeuAc".to_string()), furanose: false, + configuration: None, }, ), ( "a".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Acetyl, GlycanSubstituent::Acid, ], - proforma_name: Some("Neu5Ac".to_string()), + proforma_name: Some("NeuAc".to_string()), furanose: false, + configuration: None, }, ), ( "g".to_string(), MonoSaccharide { - base_sugar: BaseSugar::Nonose, + base_sugar: BaseSugar::Nonose(None), substituents: vec![ GlycanSubstituent::Amino, GlycanSubstituent::Glycolyl, GlycanSubstituent::Acid, ], - proforma_name: Some("Neu5Gc".to_string()), + proforma_name: Some("NeuGc".to_string()), furanose: false, + configuration: None, }, ), ] diff --git a/rustyms/src/shared/glycan_position.rs b/rustyms/src/shared/glycan_position.rs new file mode 100644 index 00000000..26602bc7 --- /dev/null +++ b/rustyms/src/shared/glycan_position.rs @@ -0,0 +1,12 @@ +/// The definition of the position of an ion inside a glycan +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub struct GlycanPosition { + /// 
The depth starting at the amino acid + pub inner_depth: usize, + /// The series number (from the ion series terminal) + pub series_number: usize, + /// The branch naming + pub branch: Vec<(GlycanBranchIndex, GlycanBranchMassIndex)>, + /// The aminoacid index where this glycan is attached + pub attachment: Option<(AminoAcid, SequencePosition)>, +} diff --git a/rustyms/src/shared/glycan_structure.rs b/rustyms/src/shared/glycan_structure.rs index 3159e567..eded6ecd 100644 --- a/rustyms/src/shared/glycan_structure.rs +++ b/rustyms/src/shared/glycan_structure.rs @@ -1,5 +1,5 @@ use crate::helper_functions::{end_of_enclosure, next_char}; -use std::ops::Range; +use std::ops::{Range, RangeInclusive}; /// Rose tree representation of glycan structure #[allow(dead_code)] diff --git a/rustyms/src/shared/multi.rs b/rustyms/src/shared/multi.rs index 6e9381c9..c7a6f352 100644 --- a/rustyms/src/shared/multi.rs +++ b/rustyms/src/shared/multi.rs @@ -1,12 +1,12 @@ use std::{ - ops::{Add, Deref, Mul, MulAssign, Neg, Sub}, + ops::{Add, AddAssign, Deref, Mul, MulAssign, Neg, Sub}, rc::Rc, }; use itertools::{Itertools, MinMaxResult}; use serde::{Deserialize, Serialize}; -use crate::system::OrderedMass; +use crate::{system::OrderedMass, MolecularFormula, NeutralLoss}; /// A collection of potentially multiple of the generic type, it is used be able to easily /// combine multiple of this multi struct into all possible combinations. 
@@ -110,6 +110,36 @@ where } } +impl AddAssign for Multi +where + M: Add + Clone, +{ + /// Adds this formula to all formulas in the multi formula + fn add_assign(&mut self, rhs: M) { + *self = Multi(self.0.iter().cloned().map(|m| m + rhs.clone()).collect()); + } +} + +impl AddAssign for &mut Multi +where + M: Add + Clone, +{ + /// Adds this formula to all formulas in the multi formula + fn add_assign(&mut self, rhs: M) { + **self = Multi(self.0.iter().cloned().map(|m| m + rhs.clone()).collect()); + } +} + +impl<'a, M> AddAssign<&'a M> for &mut Multi +where + M: Add<&'a M, Output = M> + Clone, +{ + /// Adds this formula to all formulas in the multi formula + fn add_assign(&mut self, rhs: &'a M) { + **self = Multi(self.0.iter().cloned().map(|m| m + rhs).collect()); + } +} + impl<'a, M> Sub<&'a M> for &'a Multi where &'a M: Sub + 'a, @@ -335,4 +365,16 @@ impl crate::Multi { .collect(), ) } + + pub(crate) fn with_neutral_loss(self, loss: &NeutralLoss) -> Self { + let mut new_options = Vec::with_capacity(self.0.len() * 2); + for option in self.0.iter() { + new_options.push(match loss { + NeutralLoss::Gain(m) => option + m, + NeutralLoss::Loss(m) | NeutralLoss::SideChainLoss(m, _) => option - m, + }) + } + new_options.extend_from_slice(&self.0); + Self(new_options.into()) + } } diff --git a/rustyms/src/shared/neutral_loss.rs b/rustyms/src/shared/neutral_loss.rs index c525cdc7..23c24190 100644 --- a/rustyms/src/shared/neutral_loss.rs +++ b/rustyms/src/shared/neutral_loss.rs @@ -5,6 +5,8 @@ pub enum NeutralLoss { Gain(MolecularFormula), /// Loss of a specific formula Loss(MolecularFormula), + /// Loss of a side chain of an amino acid + SideChainLoss(MolecularFormula, crate::AminoAcid), } /// A diagnostic ion, defined in M (not MH+) chemical formula diff --git a/rustyms/src/shared/sequence_position.rs b/rustyms/src/shared/sequence_position.rs index 446bbb48..5500a77d 100644 --- a/rustyms/src/shared/sequence_position.rs +++ b/rustyms/src/shared/sequence_position.rs @@ -11,6 
+11,28 @@ pub enum SequencePosition { CTerm, } +/// Add to the index, the onus of making sure the index is still valid for the peptide is on the caller. +impl std::ops::Add for SequencePosition { + type Output = Self; + fn add(self, rhs: u8) -> Self::Output { + match self { + Self::Index(i) => Self::Index(i.saturating_add(rhs as usize)), + n => n, + } + } +} + +/// Subtract from the index, the onus of making sure the index is still valid for the peptide is on the caller. +impl std::ops::Sub for SequencePosition { + type Output = Self; + fn sub(self, rhs: u8) -> Self::Output { + match self { + Self::Index(i) => Self::Index(i.saturating_sub(rhs as usize)), + n => n, + } + } +} + impl Default for SequencePosition { fn default() -> Self { Self::Index(0) diff --git a/rustyms/src/spectrum/fdr.rs b/rustyms/src/spectrum/fdr.rs index 68744dca..239feee4 100644 --- a/rustyms/src/spectrum/fdr.rs +++ b/rustyms/src/spectrum/fdr.rs @@ -2,8 +2,9 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; use crate::{ + model::MatchingParameters, system::{MassOverCharge, Ratio}, - AnnotatedSpectrum, Fragment, MassMode, Model, WithinTolerance, + AnnotatedSpectrum, Fragment, MassMode, WithinTolerance, }; impl AnnotatedSpectrum { @@ -20,7 +21,7 @@ impl AnnotatedSpectrum { pub fn fdr( &self, fragments: &[Fragment], - model: &Model, + parameters: &MatchingParameters, mass_mode: MassMode, ) -> (Fdr, Vec>) { let mzs = fragments @@ -29,7 +30,7 @@ impl AnnotatedSpectrum { f.mz(mass_mode) .map(|mz| (mz, f.peptidoform_ion_index, f.peptidoform_index)) }) - .filter(|(mz, _, _)| model.mz_range.contains(mz)) + .filter(|(mz, _, _)| parameters.mz_range.contains(mz)) .collect_vec(); let individual_peptides = self @@ -52,7 +53,7 @@ impl AnnotatedSpectrum { }) .collect_vec() .as_slice(), - model, + parameters, ) }) .collect() @@ -61,13 +62,13 @@ impl AnnotatedSpectrum { ( self.internal_fdr( mzs.iter().map(|(mz, _, _)| *mz).collect_vec().as_slice(), - model, + parameters, ), individual_peptides, 
) } - fn internal_fdr(&self, mzs: &[MassOverCharge], model: &Model) -> Fdr { + fn internal_fdr(&self, mzs: &[MassOverCharge], parameters: &MatchingParameters) -> Fdr { let mut results = Vec::with_capacity(51); let total_intensity = self.spectrum.iter().map(|s| s.intensity.0).sum::(); @@ -103,7 +104,7 @@ impl AnnotatedSpectrum { } } - if model + if parameters .tolerance .within(&self.spectrum[closest.0].experimental_mz, mass) && !peak_annotated[closest.0] diff --git a/rustyms/src/spectrum/fragmentation.rs b/rustyms/src/spectrum/fragmentation.rs index 2f5c052c..315745ad 100644 --- a/rustyms/src/spectrum/fragmentation.rs +++ b/rustyms/src/spectrum/fragmentation.rs @@ -1,4 +1,6 @@ -use crate::{system::MassOverCharge, CompoundPeptidoformIon, Fragment, MassMode, Model}; +use crate::{ + model::MatchingParameters, system::MassOverCharge, CompoundPeptidoformIon, Fragment, MassMode, +}; use super::AnnotatedSpectrum; @@ -26,22 +28,27 @@ pub trait AnnotatableSpectrum { &self, peptide: CompoundPeptidoformIon, theoretical_fragments: &[Fragment], - model: &Model, + parameters: &MatchingParameters, mode: MassMode, ) -> AnnotatedSpectrum { - let tolerance = model.tolerance.into(); + let tolerance = parameters.tolerance.into(); let mut annotated = Self::empty_annotated(self, peptide); for fragment in theoretical_fragments { // Determine fragment mz and see if it is within the model range. 
if let Some(mz) = fragment.mz(mode) { - if !model.mz_range.contains(&mz) { + if !parameters.mz_range.contains(&mz) { continue; } // Get the index of the element closest to this value if let Some(index) = Self::search(self, mz, tolerance) { - annotated.spectrum[index].annotation.push(fragment.clone()); + // Keep the theoretical fragments sorted to have the highest theoretical likelihood on top + match annotated.spectrum[index].annotation.binary_search(fragment) { + Ok(ai) | Err(ai) => annotated.spectrum[index] + .annotation + .insert(ai, fragment.clone()), + } } } } diff --git a/rustyms/src/spectrum/scores.rs b/rustyms/src/spectrum/scores.rs index f43253f9..a59c0e5f 100644 --- a/rustyms/src/spectrum/scores.rs +++ b/rustyms/src/spectrum/scores.rs @@ -5,8 +5,9 @@ use serde::{Deserialize, Serialize}; use crate::{ fragment::{Fragment, FragmentKind}, + model::MatchingParameters, peptidoform::UnAmbiguous, - AnnotatedSpectrum, MassMode, Model, Peptidoform, + AnnotatedSpectrum, MassMode, Peptidoform, }; impl AnnotatedSpectrum { @@ -16,14 +17,14 @@ impl AnnotatedSpectrum { pub fn scores( &self, fragments: &[Fragment], - model: &Model, + parameters: &MatchingParameters, mass_mode: MassMode, ) -> (Scores, Vec>) { let fragments = fragments .iter() .filter(|f| { f.mz(mass_mode) - .is_some_and(|mz| model.mz_range.contains(&mz)) + .is_some_and(|mz| parameters.mz_range.contains(&mz)) }) .collect_vec(); let total_intensity: f64 = self.spectrum.iter().map(|p| *p.intensity).sum(); @@ -107,9 +108,9 @@ impl AnnotatedSpectrum { .annotation .iter() .filter(|a| { - peptidoform_ion_index.map_or(true, |i| a.peptidoform_ion_index == Some(i)) - && peptidoform_index.map_or(true, |i| a.peptidoform_index == Some(i)) - && ion.map_or(true, |kind| a.ion.kind() == kind) + peptidoform_ion_index.is_none_or(|i| a.peptidoform_ion_index == Some(i)) + && peptidoform_index.is_none_or(|i| a.peptidoform_index == Some(i)) + && ion.is_none_or(|kind| a.ion.kind() == kind) }) .count() as u32; if number == 0 { @@ 
-124,9 +125,9 @@ impl AnnotatedSpectrum { let total_fragments = fragments .iter() .filter(|f| { - peptidoform_ion_index.map_or(true, |i| f.peptidoform_ion_index == Some(i)) - && peptidoform_index.map_or(true, |i| f.peptidoform_index == Some(i)) - && ion.map_or(true, |kind| f.ion.kind() == kind) + peptidoform_ion_index.is_none_or(|i| f.peptidoform_ion_index == Some(i)) + && peptidoform_index.is_none_or(|i| f.peptidoform_index == Some(i)) + && ion.is_none_or(|kind| f.ion.kind() == kind) }) .count() as u32; ( @@ -153,7 +154,7 @@ impl AnnotatedSpectrum { .filter(|a| { a.peptidoform_ion_index == Some(peptidoform_ion_index) && a.peptidoform_index == Some(peptidoform_index) - && ion.map_or(true, |kind| a.ion.kind() == kind) + && ion.is_none_or(|kind| a.ion.kind() == kind) }) .filter_map(|a| a.ion.position()) }) @@ -165,7 +166,7 @@ impl AnnotatedSpectrum { .filter(|f| { f.peptidoform_ion_index == Some(peptidoform_ion_index) && f.peptidoform_index == Some(peptidoform_index) - && ion.map_or(true, |i| i == f.ion.kind()) + && ion.is_none_or(|i| i == f.ion.kind()) }) .filter_map(|f| f.ion.position().map(|p| p.sequence_index)) .unique() @@ -185,8 +186,8 @@ impl AnnotatedSpectrum { .iter() .flat_map(|p| { p.annotation.iter().filter(|a| { - peptidoform_index.map_or(true, |i| a.peptidoform_index == Some(i)) - && ion.map_or(true, |kind| a.ion.kind() == kind) + peptidoform_index.is_none_or(|i| a.peptidoform_index == Some(i)) + && ion.is_none_or(|kind| a.ion.kind() == kind) }) }) .map(|f| f.formula.clone()) @@ -195,8 +196,8 @@ impl AnnotatedSpectrum { let total_fragments = fragments .iter() .filter(|f| { - peptidoform_index.map_or(true, |i| f.peptidoform_index == Some(i)) - && ion.map_or(true, |kind| f.ion.kind() == kind) + peptidoform_index.is_none_or(|i| f.peptidoform_index == Some(i)) + && ion.is_none_or(|kind| f.ion.kind() == kind) }) .map(|f| f.formula.clone()) .unique() @@ -270,7 +271,7 @@ impl AnnotatedSpectrum { .chain( [ FragmentKind::Y, - FragmentKind::Oxonium, + 
FragmentKind::B, FragmentKind::immonium, FragmentKind::precursor_side_chain_loss, FragmentKind::diagnostic,