diff --git a/pyproject.toml b/pyproject.toml index 248afd6..44ca9e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -version = "0.0.4" +version = "0.0.5" [tool.maturin] module-name = "ruranges" diff --git a/src/complement.rs b/src/complement.rs index dd74c3c..d393fd7 100644 --- a/src/complement.rs +++ b/src/complement.rs @@ -71,5 +71,6 @@ pub fn sweep_line_non_overlaps( } } + radsort::sort(&mut no_overlaps); no_overlaps } diff --git a/src/group_cumsum.rs b/src/group_cumsum.rs index f2d1b33..48b4f4e 100644 --- a/src/group_cumsum.rs +++ b/src/group_cumsum.rs @@ -1,6 +1,6 @@ use radsort::sort_by_key; -use crate::{ruranges_structs::{GroupType, PositionType}, sorts::build_subsequence_intervals}; +use crate::{ruranges_structs::{GroupType, MinInterval, PositionType}, sorts::build_subsequence_intervals}; pub fn sweep_line_cumsum( @@ -17,12 +17,10 @@ where sort_by_key(&mut ivals, |iv| (iv.chr, iv.start)); - let mut idx_out = Vec::with_capacity(chrs.len()); - let mut cumsum_start = Vec::with_capacity(chrs.len()); - let mut cumsum_end = Vec::with_capacity(chrs.len()); + let mut results= Vec::with_capacity(chrs.len()); if ivals.is_empty() { - return (idx_out, cumsum_start, cumsum_end); + return (Vec::with_capacity(chrs.len()),Vec::with_capacity(chrs.len()), Vec::with_capacity(chrs.len())); } let mut current_chr = ivals[0].chr; @@ -39,11 +37,21 @@ where let s = running_total; let e = running_total + len; - idx_out.push(iv.idx); - cumsum_start.push(s); - cumsum_end.push(e); + results.push(MinInterval {idx: iv.idx, start: s, end: e}); running_total = e; } - (idx_out, cumsum_start, cumsum_end) + sort_by_key(&mut results, |i| i.idx); + + let mut out_idxs = Vec::with_capacity(results.len()); + let mut out_starts = Vec::with_capacity(results.len()); + let mut out_ends = Vec::with_capacity(results.len()); + + for rec in results { + out_idxs.push(rec.idx); + out_starts.push(rec.start); + out_ends.push(rec.end); + } + + (out_idxs, out_starts, out_ends) } diff --git a/src/map_to_global.rs b/src/map_to_global.rs index 60384ca..ef24f19 100644 --- a/src/map_to_global.rs +++ b/src/map_to_global.rs @@ -1,8 +1,9 @@ use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; use pyo3::prelude::*; +use radsort::sort_by_key; -use crate::ruranges_structs::{GroupType, PositionType}; +use crate::ruranges_structs::{GroupType, PositionType, StrandInterval}; #[allow(clippy::too_many_arguments)] @@ -37,10 +38,7 @@ pub fn map_to_global( debug_assert_eq!(q_tx.len(), q_fwd.len()); // ------------------- output buffers ----------------------------------- - let mut out_idx: Vec = Vec::new(); - let mut out_start = Vec::new(); - let mut out_end = Vec::new(); - let mut out_fwd = Vec::new(); + let mut results = Vec::new(); // ------------------- two-pointer sweep --------------------------------- let mut ei = 0usize; // exon pointer @@ -110,10 +108,7 @@ pub fn map_to_global( }; // push result - out_idx .push(idx); - out_start.push(g_start); - out_end .push(g_end); - out_fwd .push(local_f == ex_fwd[ek]); + results.push(StrandInterval {start: g_start, end: g_end, idx: idx, fwd: local_f == ex_fwd[ek]}); // advance inside query l = seg_end_local; @@ -132,5 +127,19 @@ pub fn map_to_global( } } - (out_idx, out_start, out_end, out_fwd) + sort_by_key(&mut results, |i| i.idx); + + let mut out_idxs = Vec::with_capacity(results.len()); + let mut out_starts = Vec::with_capacity(results.len()); + let mut out_ends = Vec::with_capacity(results.len()); + let mut out_strands = Vec::with_capacity(results.len()); + + for rec in results { + out_idxs.push(rec.idx); + out_starts.push(rec.start); + out_ends.push(rec.end); + out_strands.push(rec.fwd); + } + + (out_idxs, out_starts, out_ends, out_strands) } \ No newline at end of file diff --git a/src/nearest.rs b/src/nearest.rs index 09eb980..7321787 100644 --- a/src/nearest.rs +++ b/src/nearest.rs @@ -1,5 +1,7 @@ use std::{str::FromStr, time::Instant}; +use radsort::sort_by_key; + use crate::{ overlaps::{self, sweep_line_overlaps, sweep_line_overlaps_overlap_pair}, ruranges_structs::{GroupType, MinEvent, Nearest, OverlapPair, PositionType}, @@ -265,9 +267,7 @@ pub fn merge_three_way_by_index_distance( ) -> (Vec, Vec, Vec) { // We'll return tuples: (idx, idx2, distance). // You can adapt if you want a custom struct instead. - let mut idxs1 = Vec::new(); - let mut idxs2 = Vec::new(); - let mut distance = Vec::new(); + let mut results = Vec::new(); // Pointers over each input let (mut i, mut j, mut r) = (0_usize, 0_usize, 0_usize); @@ -376,9 +376,7 @@ pub fn merge_three_way_by_index_distance( } // Add to result let OverlapPair { idx, idx2 } = overlaps_slice[oi]; - idxs1.push(idx); - idxs2.push(idx2); - distance.push(dcur); + results.push(Nearest { idx: idx, idx2: idx2, distance: T::zero() }); oi += 1; } else { break; @@ -399,10 +397,7 @@ pub fn merge_three_way_by_index_distance( } used_distances.insert(dcur); } - let item = &left_slice[lj]; - idxs1.push(item.idx); - idxs2.push(item.idx2); - distance.push(dcur); + results.push(left_slice[lj]); lj += 1; } else { break; @@ -423,10 +418,7 @@ pub fn merge_three_way_by_index_distance( } used_distances.insert(dcur); } - let item = &right_slice[rr]; - idxs1.push(item.idx); - idxs2.push(item.idx2); - distance.push(dcur); + results.push(right_slice[rr]); rr += 1; } else { break; @@ -439,5 +431,17 @@ pub fn merge_three_way_by_index_distance( // done collecting up to k distinct distances for this index } - (idxs1, idxs2, distance) + sort_by_key(&mut results, |n| (n.idx, n.distance, n.idx2)); + + let mut out_idxs = Vec::with_capacity(results.len()); + let mut out_idxs2 = Vec::with_capacity(results.len()); + let mut out_distances = Vec::with_capacity(results.len()); + + for rec in results { + out_idxs.push(rec.idx); + out_idxs2.push(rec.idx2); + out_distances.push(rec.distance); + } + + (out_idxs, out_idxs2, out_distances) } diff --git a/src/overlaps.rs b/src/overlaps.rs index 0d9aacb..a8c87ac 100644 --- a/src/overlaps.rs +++ b/src/overlaps.rs @@ -1,12 +1,13 @@ use std::str::FromStr; +use std::time::{Duration, Instant}; +use radsort::sort_by_key; use rustc_hash::{FxHashMap, FxHashSet}; use crate::helpers::keep_first_by_idx; use crate::ruranges_structs::{GroupType, MaxEvent, MinEvent, OverlapPair, OverlapType, PositionType}; use crate::sorts::{ - self, build_sorted_events_single_collection_separate_outputs, - build_sorted_maxevents_with_starts_ends, + self, build_sorted_events_single_collection_separate_outputs, build_sorted_maxevents_with_starts_ends }; /// Perform a four-way merge sweep to find cross overlaps. @@ -43,7 +44,7 @@ pub fn overlaps( let overlap_type = OverlapType::from_str(overlap_type).unwrap(); let invert = overlap_type == OverlapType::Last; - if overlap_type == OverlapType::All && !contained { + let mut result_pairs = if overlap_type == OverlapType::All && !contained { // The common, super-optimized case sweep_line_overlaps( chrs, @@ -73,7 +74,7 @@ pub fn overlaps( &sorted_ends2, ); keep_first_by_idx(&mut pairs); - pairs.into_iter().map(|pair| (pair.idx, pair.idx2)).unzip() + pairs } else { let maxevents = compute_sorted_maxevents( chrs, @@ -87,13 +88,15 @@ pub fn overlaps( ); let mut pairs = sweep_line_overlaps_containment(maxevents); if overlap_type == OverlapType::All { - pairs.into_iter().map(|pair| (pair.idx, pair.idx2)).unzip() + pairs } else { keep_first_by_idx(&mut pairs); - pairs.into_iter().map(|pair| (pair.idx, pair.idx2)).unzip() + pairs } } - } + }; + sort_by_key(&mut result_pairs, |p| p.idx); + result_pairs.into_iter().map(|pair| (pair.idx, pair.idx2)).unzip() } pub fn sweep_line_overlaps_set1( @@ -479,6 +482,7 @@ pub fn compute_sorted_maxevents( } } + pub fn sweep_line_overlaps( chrs: &[C], starts: &[T], @@ -487,19 +491,19 @@ pub fn sweep_line_overlaps( starts2: &[T], ends2: &[T], slack: T, -) -> (Vec, Vec) { +) -> (Vec) { // We'll collect all cross overlaps here let mut overlaps = Vec::new(); - let mut overlaps2 = Vec::new(); - if chrs.is_empty() | chrs2.is_empty() { - return (overlaps, overlaps2); + let events = sorts::build_sorted_events(chrs, starts, ends, chrs2, starts2, ends2, slack); + + if events.is_empty() { + return overlaps; }; - let events = sorts::build_sorted_events(chrs, starts, ends, chrs2, starts2, ends2, slack); // Active sets let mut active1 = FxHashSet::default(); - let mut active2 = FxHashSet::default(); + let mut active2 =FxHashSet::default(); let mut current_chr = events.first().unwrap().chr; @@ -516,18 +520,21 @@ pub fn sweep_line_overlaps( if e.first_set { // Overlaps with all currently active intervals in set2 for &idx2 in active2.iter() { - overlaps.push(e.idx); - overlaps2.push(idx2); + overlaps.push(OverlapPair { + idx: e.idx, + idx2: idx2, + }); } // Now add it to active1 active1.insert(e.idx); } else { // Overlaps with all currently active intervals in set1 - for &idx1 in active1.iter() { - overlaps.push(idx1); - overlaps2.push(e.idx); - } - // Now add it to active2 + for &idx in active1.iter() { + overlaps.push(OverlapPair { + idx: idx, + idx2: e.idx, + }); + }; active2.insert(e.idx); } } else { @@ -540,5 +547,5 @@ pub fn sweep_line_overlaps( } } - (overlaps, overlaps2) + overlaps } \ No newline at end of file diff --git a/src/ruranges_structs.rs b/src/ruranges_structs.rs index f5060fa..e85fd1c 100644 --- a/src/ruranges_structs.rs +++ b/src/ruranges_structs.rs @@ -14,6 +14,21 @@ pub struct GenomicData { pub strands: Option>, } +#[derive(Debug, Clone)] +pub struct MinInterval { + pub start: T, + pub end: T, + pub idx: u32, +} + +#[derive(Debug, Clone)] +pub struct StrandInterval { + pub start: T, + pub end: T, + pub idx: u32, + pub fwd: bool, +} + #[derive(Debug, Clone)] pub struct Interval { pub group: C, @@ -68,7 +83,7 @@ pub struct OverlapPair { pub idx2: u32, } -#[derive(Debug, Clone, Hash)] +#[derive(Debug, Clone, Hash, Copy)] pub struct Nearest { pub distance: T, pub idx: u32, @@ -133,4 +148,12 @@ impl FromStr for OverlapType { _ => Err("Invalid direction string"), } } +} + + +pub struct SplicedRecord { + pub idx: u32, + pub start: T, + pub end: T, + pub strand: bool, } \ No newline at end of file diff --git a/src/spliced_subsequence.rs b/src/spliced_subsequence.rs index a7e451c..9d068db 100644 --- a/src/spliced_subsequence.rs +++ b/src/spliced_subsequence.rs @@ -1,3 +1,5 @@ +use radsort::sort_by_key; + use crate::{ ruranges_structs::{GroupType, PositionType, SplicedSubsequenceInterval}, sorts::build_sorted_subsequence_intervals, @@ -18,15 +20,20 @@ pub fn spliced_subseq( return (Vec::new(), Vec::new(), Vec::new(), Vec::new()); } + // ────────────── helper struct ────────────── + struct OutRec { + idx: u32, + start: T, + end: T, + strand: bool, + } + // pre-sorted by (chr, start, end) in caller code let intervals = build_sorted_subsequence_intervals(chrs, starts, ends, strand_flags); - let mut out_idxs = Vec::with_capacity(intervals.len()); - let mut out_starts = Vec::with_capacity(intervals.len()); - let mut out_ends = Vec::with_capacity(intervals.len()); - let mut out_strands = Vec::with_capacity(intervals.len()); + let mut out_recs: Vec> = Vec::with_capacity(intervals.len()); - let mut group_buf: Vec> = Vec::new(); + let mut group_buf: Vec> = Vec::new(); let mut current_chr = intervals[0].chr; let mut running_sum = T::zero(); @@ -37,8 +44,8 @@ pub fn spliced_subseq( } // total spliced length of this transcript - let total_len = group.last().unwrap().temp_cumsum; - let end_val = end.unwrap_or(total_len); + let total_len = group.last().unwrap().temp_cumsum; + let end_val = end.unwrap_or(total_len); // handle negative coordinates from 3′ end let global_start = if start < T::zero() { total_len + start } else { start }; @@ -70,19 +77,20 @@ pub fn spliced_subseq( } if st < en { - out_idxs .push(iv.idx); - out_starts .push(st); - out_ends .push(en); - // (+)*(+) or (-)*(-) ➜ '+', else '-' - out_strands.push(iv.forward_strand == processed_forward); + out_recs.push(OutRec { + idx: iv.idx, + start: st, + end: en, + strand: iv.forward_strand == processed_forward, // (+)*(+) or (-)*(-) ➜ '+' + }); } }; // iterate 5′→3′ in transcript space if group_forward { - for iv in group.iter_mut() { process_iv(iv); } + for iv in group.iter_mut() { process_iv(iv); } } else { - for iv in group.iter_mut().rev() { process_iv(iv); } + for iv in group.iter_mut().rev() { process_iv(iv); } } }; // ---------------------------------------------------------------------- @@ -108,9 +116,25 @@ pub fn spliced_subseq( } finalize_group(&mut group_buf); + sort_by_key(&mut out_recs, |r| r.idx); + + // ------------------ explode OutRec list into four parallel arrays ------------------ + let mut out_idxs = Vec::with_capacity(out_recs.len()); + let mut out_starts = Vec::with_capacity(out_recs.len()); + let mut out_ends = Vec::with_capacity(out_recs.len()); + let mut out_strands = Vec::with_capacity(out_recs.len()); + + for rec in out_recs { + out_idxs.push(rec.idx); + out_starts.push(rec.start); + out_ends.push(rec.end); + out_strands.push(rec.strand); + } + (out_idxs, out_starts, out_ends, out_strands) } + // ──────────────────────────────────────────────────────────────────────────── // multi-row variant: one pass over the exons, many slices per transcript // ──────────────────────────────────────────────────────────────────────────── diff --git a/src/subtract.rs b/src/subtract.rs index 6b9ad6f..ebdb71f 100644 --- a/src/subtract.rs +++ b/src/subtract.rs @@ -1,8 +1,9 @@ use num_traits::{PrimInt, Signed, Zero}; +use radsort::sort_by_key; use rustc_hash::FxHashMap; use std::hash::Hash; -use crate::{ruranges_structs::{GroupType, PositionType}, sorts}; +use crate::{ruranges_structs::{GroupType, Interval, MinEvent, MinInterval, PositionType}, sorts}; pub fn sweep_line_subtract( chrs1: &[G], @@ -25,10 +26,8 @@ pub fn sweep_line_subtract( let events = sorts::build_sorted_events_idxs(chrs1, starts1, ends1, chrs2, starts2, ends2, T::zero()); - // Output buffers - let mut result_idxs = Vec::new(); - let mut result_starts = Vec::new(); - let mut result_ends = Vec::new(); + let mut out_events = Vec::new(); + // Track how many set2 intervals are active let mut active2_count: i64 = 0; @@ -86,9 +85,7 @@ pub fn sweep_line_subtract( if let Some(start_pos) = active1.get(&e.idx).cloned().unwrap_or(None) { // We are capturing. End the sub-interval at e.pos if start_pos < pos { - result_idxs.push(e.idx); - result_starts.push(start_pos); - result_ends.push(pos); + out_events.push(MinInterval {start: start_pos, end: pos, idx: e.idx}); } } // Remove it from active1 @@ -108,9 +105,7 @@ pub fn sweep_line_subtract( if let Some(start_pos) = maybe_start { // Close at current event pos (exclusive or inclusive depends on your semantics) if start_pos < pos { - result_idxs.push(idx1); - result_starts.push(start_pos); - result_ends.push(pos); + out_events.push(MinInterval {start: start_pos, end: pos, idx: idx1}); } } } @@ -143,8 +138,18 @@ pub fn sweep_line_subtract( // 3. Move on to the next event } + sort_by_key(&mut out_events, |i| i.idx); // No final cleanup is strictly necessary if every set1 interval has a corresponding end event. + let mut out_idxs = Vec::with_capacity(out_events.len()); + let mut out_starts = Vec::with_capacity(out_events.len()); + let mut out_ends = Vec::with_capacity(out_events.len()); + + for rec in out_events { + out_idxs.push(rec.idx); + out_starts.push(rec.start); + out_ends.push(rec.end); + } - (result_idxs, result_starts, result_ends) + (out_idxs, out_starts, out_ends) }