From c1c4ca3943d7d94fce83d8f5478fe81828a983f3 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 20 Mar 2026 12:56:06 -0400 Subject: [PATCH 1/3] feat: Add coordinate system support for output positions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a `CoordSystem` enum (`OneClosed` / `ZeroHalfOpen`) that controls how start positions are represented in output Arrow batches. Each format defaults to its native coordinate convention: 1-based for SAM/BAM/CRAM, VCF/BCF, and GFF/GTF; 0-based for BED, BigBed, and BigWig — but callers can request either system explicitly. Rust changes: * `CoordSystem` enum in `oxbow::lib` with `Display/FromStr` and `start_offset_from()` for computing the adjustment between systems. * Every Model now carries a `coord_system` field (alignment, variant, gxf, bed, bbi base, bbi zoom, sequence). * Offset applied at the FieldBuilder level (alignment, variant, gxf, bed) or BatchBuilder level (bbi base, bbi zoom) during `push()`. * All scanner constructors accept an explicit `CoordSystem` parameter. Also added `Default` trait impl on alignment and gxf Models. Python changes: * All pyo3 scanner classes accept a `coords` keyword argument ("01" or "11") and serialize it through `__getnewargs_ex__`. * All DataSource classes and from_* factory functions expose coords: `Literal["01", "11"]` with format-appropriate defaults. * BBI zoom scanners inherit `coord_system` from their base scanner. * All read_* functions pass the format-native CoordSystem. R changes: * All read_*_impl functions pass the format-native CoordSystem. 
--- oxbow/src/alignment/model.rs | 151 ++++++++-- oxbow/src/alignment/model/batch.rs | 15 +- oxbow/src/alignment/model/field.rs | 54 ++-- oxbow/src/alignment/scanner/bam.rs | 13 +- oxbow/src/alignment/scanner/cram.rs | 9 +- oxbow/src/alignment/scanner/sam.rs | 9 +- oxbow/src/bbi/model/base.rs | 30 +- oxbow/src/bbi/model/base/batch.rs | 29 +- oxbow/src/bbi/model/zoom.rs | 39 ++- oxbow/src/bbi/model/zoom/batch.rs | 27 +- oxbow/src/bbi/scanner/bbizoom.rs | 19 +- oxbow/src/bbi/scanner/bigbed.rs | 8 +- oxbow/src/bbi/scanner/bigwig.rs | 13 +- oxbow/src/bed/model.rs | 63 ++-- oxbow/src/bed/model/batch.rs | 90 +++--- oxbow/src/bed/model/field.rs | 21 +- oxbow/src/bed/scanner/bed.rs | 13 +- oxbow/src/gxf/model.rs | 73 +++-- oxbow/src/gxf/model/batch.rs | 18 +- oxbow/src/gxf/model/field.rs | 30 +- oxbow/src/gxf/scanner/gff.rs | 8 +- oxbow/src/gxf/scanner/gtf.rs | 8 +- oxbow/src/lib.rs | 58 ++++ oxbow/src/sequence/model.rs | 31 +- oxbow/src/sequence/model/batch.rs | 4 +- oxbow/src/sequence/scanner/fasta.rs | 26 +- oxbow/src/variant/model.rs | 40 ++- oxbow/src/variant/model/batch.rs | 11 +- oxbow/src/variant/model/field.rs | 24 +- oxbow/src/variant/scanner/bcf.rs | 7 +- oxbow/src/variant/scanner/vcf.rs | 7 +- py-oxbow/oxbow/_core/alignment.py | 11 +- py-oxbow/oxbow/_core/bbi.py | 10 +- py-oxbow/oxbow/_core/bed.py | 4 + py-oxbow/oxbow/_core/gxf.py | 9 +- py-oxbow/oxbow/_core/sequence.py | 4 + py-oxbow/oxbow/_core/variant.py | 6 + py-oxbow/src/alignment.rs | 55 +++- py-oxbow/src/bbi.rs | 88 ++++-- py-oxbow/src/bed.rs | 21 +- py-oxbow/src/gxf.rs | 36 ++- py-oxbow/src/sequence.rs | 16 +- py-oxbow/src/util.rs | 12 +- py-oxbow/src/variant.rs | 18 +- ...ment.TestCramFile.test_init_callstack.yaml | 6 +- .../test_bed.TestBedFile.test_batches.yaml | 264 ++++++++-------- ...t_scanners.TestPyBedScanner.test_scan.yaml | 285 +++--------------- py-oxbow/tests/test_coords.py | 153 ++++++++++ r-oxbow/src/rust/src/lib.rs | 155 ++++++++-- 49 files changed, 1376 insertions(+), 725 deletions(-) 
create mode 100644 py-oxbow/tests/test_coords.py diff --git a/oxbow/src/alignment/model.rs b/oxbow/src/alignment/model.rs index 64b66ca..539904e 100644 --- a/oxbow/src/alignment/model.rs +++ b/oxbow/src/alignment/model.rs @@ -10,21 +10,24 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field as ArrowField, Schema, SchemaRef}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; use field::{Field, DEFAULT_FIELD_NAMES}; use tag::TagDef; /// A data model for alignment records (SAM/BAM/CRAM). /// /// Encapsulates the schema-defining parameters for an alignment projection: -/// which standard fields to include and which auxiliary tags (with their -/// types) to materialize. +/// which standard fields to include, which auxiliary tags (with their +/// types) to materialize, and the output coordinate system. /// /// - `fields` selects which standard SAM fields become Arrow columns. /// `None` → all 12 standard fields. /// - `tag_defs` controls the tags struct column independently. /// `None` → no tags column. `Some(vec![])` → empty struct column. /// `Some(vec![...])` → struct column with the specified sub-fields. +/// - `coord_system` controls the coordinate system of position columns +/// (`pos`, `pnext`). `None` → 1-based closed (`"11"`), the SAM convention. +/// End coordinates are not affected. /// /// The model can produce an Arrow schema independently of any file header. /// @@ -32,17 +35,19 @@ use tag::TagDef; /// /// ``` /// use oxbow::alignment::model::Model; -/// use oxbow::Select; +/// use oxbow::{CoordSystem, Select}; /// -/// // Default: all 12 standard fields, no tags column. -/// let model = Model::new(Select::All, None).unwrap(); +/// // Default: all 12 standard fields, no tags column, 1-based coordinates. 
+/// let model = Model::new(Select::All, None, CoordSystem::OneClosed).unwrap(); /// assert_eq!(model.field_names().len(), 12); /// assert!(!model.has_tags()); +/// assert_eq!(model.coord_system(), CoordSystem::OneClosed); /// -/// // Custom: selected fields with tags. +/// // Custom: selected fields with tags, 0-based coordinates. /// let model = Model::new( /// Select::Some(vec!["qname".into(), "pos".into()]), /// Some(vec![("NM".into(), "i".into()), ("MD".into(), "Z".into())]), +/// CoordSystem::ZeroHalfOpen, /// ).unwrap(); /// assert_eq!(model.field_names(), vec!["qname", "pos"]); /// assert!(model.has_tags()); @@ -54,6 +59,7 @@ use tag::TagDef; pub struct Model { fields: Vec, tag_defs: Option>, + coord_system: CoordSystem, schema: SchemaRef, } @@ -64,9 +70,13 @@ impl Model { /// fields. `Select(vec)` → specific fields. `Omit` → no fields. /// - `tag_defs`: tag definitions as `(name, type_code)` pairs. `None` → /// no tags column. `Some(vec![])` → tags column with empty struct. + /// - `coord_system`: output coordinate system for position columns + /// (`pos`, `pnext`). `None` defaults to [`CoordSystem::OneClosed`] + /// (1-based, matching the SAM convention). pub fn new( fields: Select, tag_defs: Option>, + coord_system: CoordSystem, ) -> crate::Result { let field_names = match fields { Select::All => DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect(), @@ -89,20 +99,15 @@ impl Model { .collect::>>() }) .transpose()?; - let schema = Self::build_schema(&parsed_fields, tag_defs.as_deref()); Ok(Self { fields: parsed_fields, tag_defs, + coord_system, schema, }) } - /// Create a model with all 12 default standard fields and no tags. 
- pub fn default_fields() -> Self { - Self::new(Select::All, None).expect("default fields are always valid") - } - fn build_schema(fields: &[Field], tag_defs: Option<&[TagDef]>) -> SchemaRef { let mut arrow_fields: Vec = fields.iter().map(|f| f.get_arrow_field()).collect(); @@ -139,6 +144,11 @@ impl Model { self.tag_defs.is_some() } + /// The output coordinate system for position columns. + pub fn coord_system(&self) -> CoordSystem { + self.coord_system + } + /// The Arrow schema for this model. pub fn schema(&self) -> &SchemaRef { &self.schema @@ -192,13 +202,22 @@ impl Model { None }; - Self::new(Select::Some(projected_fields), tag_defs) + Self::new(Select::Some(projected_fields), tag_defs, self.coord_system) + } +} + +impl Default for Model { + fn default() -> Self { + Self::new(Select::All, None, CoordSystem::OneClosed) + .expect("default fields are always valid") } } impl PartialEq for Model { fn eq(&self, other: &Self) -> bool { - self.fields == other.fields && self.tag_defs == other.tag_defs + self.fields == other.fields + && self.tag_defs == other.tag_defs + && self.coord_system == other.coord_system } } @@ -225,6 +244,10 @@ impl fmt::Display for Model { } } + if self.coord_system != CoordSystem::OneClosed { + write!(f, ";coords={}", self.coord_system)?; + } + Ok(()) } } @@ -235,6 +258,7 @@ impl FromStr for Model { fn from_str(s: &str) -> Result { let mut fields: Option> = None; let mut tag_defs: Option> = None; + let mut coord_system: Option = None; for part in s.split(';') { let part = part.trim(); @@ -267,6 +291,8 @@ impl FromStr for Model { }) .collect(); tag_defs = Some(defs?); + } else if let Some(value) = part.strip_prefix("coords=") { + coord_system = Some(value.parse()?); } else { return Err(OxbowError::invalid_input(format!( "Invalid Model segment: '{}'", @@ -279,7 +305,11 @@ impl FromStr for Model { Some(names) => Select::Some(names), None => Select::All, }; - Self::new(fields, tag_defs) + Self::new( + fields, + tag_defs, + 
coord_system.unwrap_or(CoordSystem::OneClosed), + ) } } @@ -287,19 +317,22 @@ impl FromStr for Model { mod tests { use super::*; + const CS: CoordSystem = CoordSystem::OneClosed; + #[test] fn test_default_model() { - let model = Model::new(Select::All, None).unwrap(); + let model = Model::new(Select::All, None, CS).unwrap(); assert_eq!(model.field_names().len(), 12); assert!(!model.has_tags()); assert!(model.tag_defs().is_none()); assert_eq!(model.schema().fields().len(), 12); + assert_eq!(model.coord_system(), CoordSystem::OneClosed); } #[test] fn test_default_fields_constructor() { - let model = Model::default_fields(); - assert_eq!(model, Model::new(Select::All, None).unwrap()); + let model = Model::default(); + assert_eq!(model, Model::new(Select::All, None, CS).unwrap()); } #[test] @@ -307,6 +340,7 @@ mod tests { let model = Model::new( Select::Some(vec!["qname".into(), "flag".into(), "pos".into()]), None, + CS, ) .unwrap(); assert_eq!(model.field_names(), vec!["qname", "flag", "pos"]); @@ -319,6 +353,7 @@ mod tests { let model = Model::new( Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into()), ("MD".into(), "Z".into())]), + CS, ) .unwrap(); assert_eq!(model.field_names(), vec!["qname", "pos"]); @@ -331,7 +366,7 @@ mod tests { #[test] fn test_tags_empty_defs_is_empty_struct() { - let model = Model::new(Select::Some(vec!["qname".into()]), Some(vec![])).unwrap(); + let model = Model::new(Select::Some(vec!["qname".into()]), Some(vec![]), CS).unwrap(); assert!(model.has_tags()); assert!(model.tag_defs().unwrap().is_empty()); assert_eq!(model.schema().fields().len(), 2); @@ -344,7 +379,7 @@ mod tests { #[test] fn test_no_tags_when_tag_defs_none() { - let model = Model::new(Select::Some(vec!["qname".into(), "pos".into()]), None).unwrap(); + let model = Model::new(Select::Some(vec!["qname".into(), "pos".into()]), None, CS).unwrap(); assert!(!model.has_tags()); assert!(model.tag_defs().is_none()); 
assert_eq!(model.schema().fields().len(), 2); @@ -352,19 +387,19 @@ mod tests { #[test] fn test_invalid_field() { - let result = Model::new(Select::Some(vec!["invalid".into()]), None); + let result = Model::new(Select::Some(vec!["invalid".into()]), None, CS); assert!(result.is_err()); } #[test] fn test_invalid_tag_name() { - let result = Model::new(Select::All, Some(vec![("X".into(), "i".into())])); + let result = Model::new(Select::All, Some(vec![("X".into(), "i".into())]), CS); assert!(result.is_err()); } #[test] fn test_invalid_tag_type() { - let result = Model::new(Select::All, Some(vec![("NM".into(), "Q".into())])); + let result = Model::new(Select::All, Some(vec![("NM".into(), "Q".into())]), CS); assert!(result.is_err()); } @@ -373,6 +408,7 @@ mod tests { let model = Model::new( Select::Some(vec!["qname".into(), "flag".into(), "pos".into()]), Some(vec![("NM".into(), "i".into())]), + CS, ) .unwrap(); @@ -386,6 +422,7 @@ mod tests { let model = Model::new( Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into())]), + CS, ) .unwrap(); @@ -397,14 +434,26 @@ mod tests { #[test] fn test_project_unknown_column() { - let model = Model::default_fields(); + let model = Model::default(); let result = model.project(&["nonexistent".into()]); assert!(result.is_err()); } + #[test] + fn test_project_propagates_coord_system() { + let model = Model::new( + Select::Some(vec!["qname".into(), "pos".into()]), + None, + CoordSystem::ZeroHalfOpen, + ) + .unwrap(); + let projected = model.project(&["pos".into()]).unwrap(); + assert_eq!(projected.coord_system(), CoordSystem::ZeroHalfOpen); + } + #[test] fn test_display_defaults() { - let model = Model::default_fields(); + let model = Model::default(); assert_eq!(model.to_string(), "fields=*"); } @@ -413,21 +462,45 @@ mod tests { let model = Model::new( Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into()), ("MD".into(), "Z".into())]), + CS, ) .unwrap(); 
assert_eq!(model.to_string(), "fields=qname,pos;tags=NM:i,MD:Z"); } + #[test] + fn test_display_zero_half_open() { + let model = Model::new( + Select::Some(vec!["qname".into(), "pos".into()]), + None, + CoordSystem::ZeroHalfOpen, + ) + .unwrap(); + assert_eq!(model.to_string(), "fields=qname,pos;coords=01"); + } + + #[test] + fn test_display_one_closed_omitted() { + // OneClosed is the default; should not be emitted. + let model = Model::new( + Select::Some(vec!["pos".into()]), + None, + CoordSystem::OneClosed, + ) + .unwrap(); + assert_eq!(model.to_string(), "fields=pos"); + } + #[test] fn test_display_tags_no_defs() { - let model = Model::new(Select::Some(vec!["qname".into()]), Some(vec![])).unwrap(); + let model = Model::new(Select::Some(vec!["qname".into()]), Some(vec![]), CS).unwrap(); assert_eq!(model.to_string(), "fields=qname;tags"); } #[test] fn test_from_str_defaults() { let model: Model = "fields=*".parse().unwrap(); - assert_eq!(model, Model::default_fields()); + assert_eq!(model, Model::default()); } #[test] @@ -435,6 +508,7 @@ mod tests { let model = Model::new( Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into()), ("MD".into(), "Z".into())]), + CS, ) .unwrap(); let s = model.to_string(); @@ -444,7 +518,7 @@ mod tests { #[test] fn test_from_str_roundtrip_defaults() { - let model = Model::default_fields(); + let model = Model::default(); let s = model.to_string(); let parsed: Model = s.parse().unwrap(); assert_eq!(model, parsed); @@ -452,10 +526,24 @@ mod tests { #[test] fn test_from_str_roundtrip_empty_tags() { - let model = Model::new(Select::Some(vec!["qname".into()]), Some(vec![])).unwrap(); + let model = Model::new(Select::Some(vec!["qname".into()]), Some(vec![]), CS).unwrap(); + let s = model.to_string(); + let parsed: Model = s.parse().unwrap(); + assert_eq!(model, parsed); + } + + #[test] + fn test_from_str_roundtrip_coord_system() { + let model = Model::new( + Select::Some(vec!["qname".into(), "pos".into()]), + 
None, + CoordSystem::ZeroHalfOpen, + ) + .unwrap(); let s = model.to_string(); let parsed: Model = s.parse().unwrap(); assert_eq!(model, parsed); + assert_eq!(parsed.coord_system(), CoordSystem::ZeroHalfOpen); } #[test] @@ -463,6 +551,7 @@ mod tests { let model = Model::new( Select::Some(vec!["qname".into()]), Some(vec![("NM".into(), "i".into())]), + CS, ) .unwrap(); let cloned = model.clone(); @@ -475,11 +564,13 @@ mod tests { let m1 = Model::new( Select::Some(vec!["qname".into(), "rname".into(), "pos".into()]), None, + CS, ) .unwrap(); let m2 = Model::new( Select::Some(vec!["qname".into(), "rname".into(), "pos".into()]), None, + CS, ) .unwrap(); assert_eq!(m1.schema().as_ref(), m2.schema().as_ref()); diff --git a/oxbow/src/alignment/model/batch.rs b/oxbow/src/alignment/model/batch.rs index 676d7e5..33bf6a5 100644 --- a/oxbow/src/alignment/model/batch.rs +++ b/oxbow/src/alignment/model/batch.rs @@ -8,7 +8,12 @@ use indexmap::IndexMap; use noodles::sam::alignment::record::data::field::Tag; use crate::batch::{Push, RecordBatchBuilder}; -use crate::Select; +use crate::{CoordSystem, Select}; + +/// The coordinate system in which noodles returns alignment start positions. +/// `noodles::core::Position::get()` is always 1-based regardless of the underlying +/// format, so all alignment batch builders share this source coordinate system. 
+const SOURCE_CS: CoordSystem = CoordSystem::OneClosed; use super::field::Push as _; use super::field::{Field, FieldBuilder}; @@ -36,7 +41,7 @@ impl BatchBuilder { tag_defs: Option>, capacity: usize, ) -> crate::Result { - let model = Model::new(fields, tag_defs)?; + let model = Model::new(fields, tag_defs, CoordSystem::OneClosed)?; Self::from_model(&model, header, capacity) } @@ -52,6 +57,8 @@ impl BatchBuilder { .map(|(name, _)| name.to_string()) .collect(); + let coord_offset = model.coord_system().start_offset_from(SOURCE_CS); + let mut field_builders = IndexMap::new(); for field in model.fields() { let builder = match field { @@ -59,6 +66,9 @@ impl BatchBuilder { FieldBuilder::with_refs(field.clone(), capacity, &ref_names) .map_err(|e| crate::OxbowError::invalid_data(e.to_string()))? } + Field::Pos | Field::Pnext => { + FieldBuilder::new(field.clone(), capacity).with_coord_offset(coord_offset) + } _ => FieldBuilder::new(field.clone(), capacity), }; field_builders.insert(field.clone(), builder); @@ -301,6 +311,7 @@ mod tests { let model = Model::new( Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into())]), + CoordSystem::OneClosed, ) .unwrap(); let header = noodles::sam::Header::default(); diff --git a/oxbow/src/alignment/model/field.rs b/oxbow/src/alignment/model/field.rs index c78c413..b86a0ff 100644 --- a/oxbow/src/alignment/model/field.rs +++ b/oxbow/src/alignment/model/field.rs @@ -116,11 +116,11 @@ pub enum FieldBuilder { Qname(GenericStringBuilder), Flag(UInt16Builder), Rname(StringDictionaryBuilder), - Pos(Int32Builder), + Pos(Int32Builder, i32), Mapq(UInt8Builder), Cigar(GenericStringBuilder), Rnext(StringDictionaryBuilder), - Pnext(Int32Builder), + Pnext(Int32Builder, i32), Tlen(Int32Builder), Seq(GenericStringBuilder), Qual(GenericStringBuilder), @@ -138,11 +138,11 @@ impl FieldBuilder { Field::Qname => Self::Qname(GenericStringBuilder::::with_capacity(capacity, 1024)), Field::Flag => 
Self::Flag(UInt16Builder::with_capacity(capacity)), Field::Rname => Self::Rname(StringDictionaryBuilder::::new()), - Field::Pos => Self::Pos(Int32Builder::with_capacity(capacity)), + Field::Pos => Self::Pos(Int32Builder::with_capacity(capacity), 0), Field::Mapq => Self::Mapq(UInt8Builder::with_capacity(capacity)), Field::Cigar => Self::Cigar(GenericStringBuilder::::with_capacity(capacity, 1024)), Field::Rnext => Self::Rnext(StringDictionaryBuilder::::new()), - Field::Pnext => Self::Pnext(Int32Builder::with_capacity(capacity)), + Field::Pnext => Self::Pnext(Int32Builder::with_capacity(capacity), 0), Field::Tlen => Self::Tlen(Int32Builder::with_capacity(capacity)), Field::Seq => Self::Seq(GenericStringBuilder::::with_capacity(capacity, 1024)), Field::Qual => Self::Qual(GenericStringBuilder::::with_capacity(capacity, 1024)), @@ -150,6 +150,20 @@ impl FieldBuilder { } } + /// Sets the coordinate offset for start position fields (`pos`, `pnext`). + /// + /// The offset is added to start coordinates when appending records, converting + /// from the source coordinate system to the output coordinate system. Use + /// [`CoordSystem::start_offset_from`][crate::CoordSystem::start_offset_from] to + /// compute this value. Has no effect on other field variants. 
+ pub fn with_coord_offset(self, offset: i32) -> Self { + match self { + Self::Pos(b, _) => Self::Pos(b, offset), + Self::Pnext(b, _) => Self::Pnext(b, offset), + other => other, + } + } + pub fn with_refs( field: Field, capacity: usize, @@ -186,14 +200,14 @@ impl FieldBuilder { let array = reset_dictarray_builder(builder); Arc::new(array) } - Self::Pos(builder) => Arc::new(builder.finish()), + Self::Pos(builder, _) => Arc::new(builder.finish()), Self::Mapq(builder) => Arc::new(builder.finish()), Self::Cigar(builder) => Arc::new(builder.finish()), Self::Rnext(builder) => { let array = reset_dictarray_builder(builder); Arc::new(array) } - Self::Pnext(builder) => Arc::new(builder.finish()), + Self::Pnext(builder, _) => Arc::new(builder.finish()), Self::Tlen(builder) => Arc::new(builder.finish()), Self::Seq(builder) => Arc::new(builder.finish()), Self::Qual(builder) => Arc::new(builder.finish()), @@ -226,10 +240,10 @@ impl Push<&noodles::sam::Record> for FieldBuilder { .and_then(|result| result.ok().map(|(name, _)| name.to_string())); builder.append_option(rname); } - Self::Pos(builder) => { + Self::Pos(builder, offset) => { let start = record .alignment_start() - .and_then(|result| result.ok().map(|pos| pos.get() as i32)); + .and_then(|result| result.ok().map(|pos| pos.get() as i32 + *offset)); builder.append_option(start); } Self::Mapq(builder) => { @@ -247,10 +261,10 @@ impl Push<&noodles::sam::Record> for FieldBuilder { .and_then(|result| result.ok().map(|(name, _)| name.to_string())); builder.append_option(rnext); } - Self::Pnext(builder) => { + Self::Pnext(builder, offset) => { let start = record .mate_alignment_start() - .and_then(|result| result.ok().map(|pos| pos.get() as i32)); + .and_then(|result| result.ok().map(|pos| pos.get() as i32 + *offset)); builder.append_option(start); } Self::Tlen(builder) => { @@ -294,10 +308,10 @@ impl Push<&noodles::bam::Record> for FieldBuilder { .and_then(|result| result.ok().map(|(name, _)| name.to_string())); 
builder.append_option(rname); } - Self::Pos(builder) => { + Self::Pos(builder, offset) => { let start = record .alignment_start() - .and_then(|result| result.ok().map(|pos| pos.get() as i32)); + .and_then(|result| result.ok().map(|pos| pos.get() as i32 + *offset)); builder.append_option(start); } Self::Mapq(builder) => { @@ -312,10 +326,10 @@ impl Push<&noodles::bam::Record> for FieldBuilder { .and_then(|result| result.ok().map(|(name, _)| name.to_string())); builder.append_option(rnext); } - Self::Pnext(builder) => { + Self::Pnext(builder, offset) => { let start = record .mate_alignment_start() - .and_then(|result| result.ok().map(|pos| pos.get() as i32)); + .and_then(|result| result.ok().map(|pos| pos.get() as i32 + *offset)); builder.append_option(start); } Self::Tlen(builder) => { @@ -359,8 +373,10 @@ impl Push<&noodles::sam::alignment::RecordBuf> for FieldBuilder { .and_then(|result| result.ok().map(|(name, _)| String::from_utf8_lossy(name))); builder.append_option(rname); } - Self::Pos(builder) => { - let start = record.alignment_start().map(|pos| pos.get() as i32); + Self::Pos(builder, offset) => { + let start = record + .alignment_start() + .map(|pos| pos.get() as i32 + *offset); builder.append_option(start); } Self::Mapq(builder) => { @@ -375,8 +391,10 @@ impl Push<&noodles::sam::alignment::RecordBuf> for FieldBuilder { .and_then(|result| result.ok().map(|(name, _)| String::from_utf8_lossy(name))); builder.append_option(rnext); } - Self::Pnext(builder) => { - let start = record.mate_alignment_start().map(|pos| pos.get() as i32); + Self::Pnext(builder, offset) => { + let start = record + .mate_alignment_start() + .map(|pos| pos.get() as i32 + *offset); builder.append_option(start); } Self::Tlen(builder) => { diff --git a/oxbow/src/alignment/scanner/bam.rs b/oxbow/src/alignment/scanner/bam.rs index 5f85218..09f2eba 100644 --- a/oxbow/src/alignment/scanner/bam.rs +++ b/oxbow/src/alignment/scanner/bam.rs @@ -10,7 +10,7 @@ use 
crate::alignment::model::BatchBuilder; use crate::alignment::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::alignment::AlignmentModel; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::Select; +use crate::{CoordSystem, Select}; /// A BAM scanner. /// @@ -30,7 +30,8 @@ use crate::Select; /// let header = fmt_reader.read_header().unwrap(); /// /// let tag_defs = Scanner::tag_defs(&mut fmt_reader, Some(1000)).unwrap(); -/// let scanner = Scanner::new(header, Select::All, Some(tag_defs)).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new(header, Select::All, Some(tag_defs), CoordSystem::OneClosed).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -43,12 +44,14 @@ impl Scanner { /// /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. + /// - `coord_system`: output coordinate system. `None` → 1-based closed. 
pub fn new( header: noodles::sam::Header, fields: Select, tag_defs: Option>, + coord_system: CoordSystem, ) -> crate::Result { - let model = AlignmentModel::new(fields, tag_defs)?; + let model = AlignmentModel::new(fields, tag_defs, coord_system)?; Ok(Self { header, model }) } @@ -297,7 +300,7 @@ mod tests { #[test] fn test_scan_with_multithreaded_reader() { let (header, fmt_reader) = mt_reader(); - let scanner = Scanner::new(header, Select::All, None).unwrap(); + let scanner = Scanner::new(header, Select::All, None, CoordSystem::OneClosed).unwrap(); let mut batches = scanner.scan(fmt_reader, None, None, Some(10)).unwrap(); let batch = batches.next().unwrap().unwrap(); @@ -308,7 +311,7 @@ mod tests { #[test] fn test_scan_query_with_multithreaded_reader() { let (header, fmt_reader) = mt_reader(); - let scanner = Scanner::new(header, Select::All, None).unwrap(); + let scanner = Scanner::new(header, Select::All, None, CoordSystem::OneClosed).unwrap(); let index = noodles::bam::bai::fs::read("../fixtures/sample.bam.bai").unwrap(); diff --git a/oxbow/src/alignment/scanner/cram.rs b/oxbow/src/alignment/scanner/cram.rs index a117c7f..0c1516d 100644 --- a/oxbow/src/alignment/scanner/cram.rs +++ b/oxbow/src/alignment/scanner/cram.rs @@ -10,7 +10,7 @@ use crate::alignment::model::tag::TagScanner; use crate::alignment::model::BatchBuilder; use crate::alignment::AlignmentModel; use crate::batch::{Push, RecordBatchBuilder as _}; -use crate::Select; +use crate::{CoordSystem, Select}; /// A CRAM scanner. 
/// @@ -35,7 +35,8 @@ use crate::Select; /// let header = fmt_reader.read_header().unwrap(); /// /// let tag_defs = Scanner::tag_defs(&mut fmt_reader, &header, Some(1000)).unwrap(); -/// let scanner = Scanner::new(header, Select::All, Some(tag_defs), repository).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new(header, Select::All, Some(tag_defs), repository, CoordSystem::OneClosed).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -49,6 +50,7 @@ impl Scanner { /// /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. + /// - `coord_system`: output coordinate system. `None` → 1-based closed. /// /// The FASTA repository is stored and used by scan methods for decoding. pub fn new( @@ -56,8 +58,9 @@ impl Scanner { fields: Select, tag_defs: Option>, repo: noodles::fasta::Repository, + coord_system: CoordSystem, ) -> crate::Result { - let model = AlignmentModel::new(fields, tag_defs)?; + let model = AlignmentModel::new(fields, tag_defs, coord_system)?; Ok(Self { header, model, diff --git a/oxbow/src/alignment/scanner/sam.rs b/oxbow/src/alignment/scanner/sam.rs index ea0daeb..75b51bb 100644 --- a/oxbow/src/alignment/scanner/sam.rs +++ b/oxbow/src/alignment/scanner/sam.rs @@ -10,7 +10,7 @@ use crate::alignment::model::BatchBuilder; use crate::alignment::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::alignment::AlignmentModel; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::Select; +use crate::{CoordSystem, Select}; /// A SAM scanner. 
/// @@ -30,7 +30,8 @@ use crate::Select; /// let header = fmt_reader.read_header().unwrap(); /// /// let tag_defs = Scanner::tag_defs(&mut fmt_reader, Some(1000)).unwrap(); -/// let scanner = Scanner::new(header, Select::All, Some(tag_defs)).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new(header, Select::All, Some(tag_defs), CoordSystem::OneClosed).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -43,12 +44,14 @@ impl Scanner { /// /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. + /// - `coord_system`: coordinate system used for output start positions. pub fn new( header: noodles::sam::Header, fields: Select, tag_defs: Option>, + coord_system: CoordSystem, ) -> crate::Result { - let model = AlignmentModel::new(fields, tag_defs)?; + let model = AlignmentModel::new(fields, tag_defs, coord_system)?; Ok(Self { header, model }) } diff --git a/oxbow/src/bbi/model/base.rs b/oxbow/src/bbi/model/base.rs index ee29bbf..39baf9d 100644 --- a/oxbow/src/bbi/model/base.rs +++ b/oxbow/src/bbi/model/base.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use arrow::datatypes::{Field as ArrowField, Schema, SchemaRef}; pub use crate::bed::model::schema::BedSchema; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; pub use batch::BatchBuilder; use field::{bed_standard_fields, FieldDef}; @@ -65,6 +65,7 @@ fn bed_schema_field_defs(bed_schema: &BedSchema) -> Vec { pub struct Model { bed_schema: BedSchema, fields: Vec, + coord_system: CoordSystem, schema: SchemaRef, } @@ -73,7 +74,11 @@ impl Model { /// /// - `bed_schema`: the parsing interpretation. /// - `fields`: column names to project. `None` → all fields from the schema. 
- pub fn new(bed_schema: BedSchema, fields: Select) -> crate::Result { + pub fn new( + bed_schema: BedSchema, + fields: Select, + coord_system: CoordSystem, + ) -> crate::Result { let all_defs = bed_schema_field_defs(&bed_schema); let projected = match fields { Select::All => all_defs, @@ -103,6 +108,7 @@ impl Model { Ok(Self { bed_schema, fields: projected, + coord_system, schema, }) } @@ -123,6 +129,10 @@ impl Model { self.fields.iter().map(|d| d.name.clone()).collect() } + pub fn coord_system(&self) -> CoordSystem { + self.coord_system + } + pub fn schema(&self) -> &SchemaRef { &self.schema } @@ -156,7 +166,11 @@ impl Model { .map(|d| d.name.clone()) .collect(); - Self::new(self.bed_schema.clone(), Select::Some(projected)) + Self::new( + self.bed_schema.clone(), + Select::Some(projected), + self.coord_system, + ) } } @@ -169,7 +183,7 @@ mod tests { #[test] fn test_bedgraph_model() { let bed_schema = BedSchema::new_bedgraph().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end", "value"]); // BBI uses UInt32 for positions (AutoSql types) assert_eq!(model.schema().field(1).data_type(), &DataType::UInt32); @@ -179,7 +193,7 @@ mod tests { #[test] fn test_bed6_model() { let bed_schema: BedSchema = "bed6".parse().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); assert_eq!(model.field_names().len(), 6); assert_eq!(model.schema().field(1).data_type(), &DataType::UInt32); } @@ -187,7 +201,7 @@ mod tests { #[test] fn test_bed6_projection() { let bed_schema: BedSchema = "bed6".parse().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let projected = model.project(&["chrom".into(), 
"strand".into()]).unwrap(); assert_eq!(projected.field_names(), vec!["chrom", "strand"]); } @@ -196,7 +210,7 @@ mod tests { fn test_custom_fields() { let defs = vec![FieldDef::new("extra".into(), FieldType::Float)]; let bed_schema = BedSchema::new(3, Some(defs)).unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end", "extra"]); assert_eq!(model.schema().field(3).data_type(), &DataType::Float32); } @@ -209,7 +223,7 @@ mod tests { FieldDef::new("c".into(), FieldType::String), ]; let bed_schema = BedSchema::new(3, Some(defs)).unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let projected = model.project(&["chrom".into(), "c".into()]).unwrap(); assert_eq!(projected.field_names(), vec!["chrom", "c"]); } diff --git a/oxbow/src/bbi/model/base/batch.rs b/oxbow/src/bbi/model/base/batch.rs index a70c61c..d23f141 100644 --- a/oxbow/src/bbi/model/base/batch.rs +++ b/oxbow/src/bbi/model/base/batch.rs @@ -7,7 +7,11 @@ use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use indexmap::IndexMap; use crate::batch::{Push, RecordBatchBuilder}; -use crate::Select; +use crate::{CoordSystem, Select}; + +/// The coordinate system in which bigtools returns BBI positions. +/// bigtools provides raw 0-based coordinates from the file. 
+const SOURCE_CS: CoordSystem = CoordSystem::ZeroHalfOpen; use super::field::Push as _; pub use super::field::{FieldBuilder, FieldDef, FieldType}; @@ -17,6 +21,7 @@ pub use super::{BedSchema, BigBedRecord, BigWigRecord, Model}; pub struct BatchBuilder { schema: SchemaRef, row_count: usize, + coord_offset: i32, bed_schema: BedSchema, bed_schema_field_defs: Vec, builders: IndexMap, @@ -29,7 +34,7 @@ impl BatchBuilder { fields: Select, capacity: usize, ) -> crate::Result { - let model = Model::new(bed_schema, fields)?; + let model = Model::new(bed_schema, fields, CoordSystem::ZeroHalfOpen)?; Self::from_model(&model, capacity) } @@ -44,6 +49,7 @@ impl BatchBuilder { Ok(Self { schema: model.schema().clone(), row_count: 0, + coord_offset: model.coord_system().start_offset_from(SOURCE_CS), bed_schema: model.bed_schema().clone(), bed_schema_field_defs: model.bed_schema_field_defs(), builders, @@ -104,7 +110,8 @@ impl Push<&BigBedRecord<'_>> for BatchBuilder { if let Some(builder) = self.builders.get_mut(def) { match builder { FieldBuilder::Uint(b) => { - b.append_value(record.start); + let adjusted = (record.start as i64 + self.coord_offset as i64) as u32; + b.append_value(adjusted); } _ => { return Err(crate::OxbowError::invalid_data( @@ -186,7 +193,8 @@ impl Push<&BigWigRecord<'_>> for BatchBuilder { if let Some(builder) = self.builders.get_mut(def) { match builder { FieldBuilder::Uint(b) => { - b.append_value(record.start); + let adjusted = (record.start as i64 + self.coord_offset as i64) as u32; + b.append_value(adjusted); } _ => { return Err(crate::OxbowError::invalid_data( @@ -245,7 +253,7 @@ mod tests { #[test] fn test_batch_builder_new() { let bed_schema = create_test_bedschema(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let builder = BatchBuilder::from_model(&model, 10).unwrap(); assert_eq!(builder.schema().fields().len(), 4); @@ -255,7 +263,7 @@ mod tests { 
#[test] fn test_schema() { let bed_schema = create_test_bedschema(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let builder = BatchBuilder::from_model(&model, 10).unwrap(); let schema = builder.schema(); @@ -270,7 +278,7 @@ mod tests { #[test] fn test_push_bigbed_record() { let schema = create_test_bedschema(); - let model = Model::new(schema, Select::All).unwrap(); + let model = Model::new(schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); let record = BigBedRecord { @@ -315,7 +323,7 @@ mod tests { #[test] fn test_push_bigwig_record() { let schema = create_test_bedschema(); - let model = Model::new(schema, Select::All).unwrap(); + let model = Model::new(schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); let record = BigWigRecord { @@ -369,7 +377,7 @@ mod tests { #[test] fn test_finish_empty_batch() { let schema = create_test_bedschema(); - let model = Model::new(schema, Select::All).unwrap(); + let model = Model::new(schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); let batch = builder.finish().unwrap(); @@ -381,7 +389,7 @@ mod tests { fn test_bigbed_bed6_no_custom() { // bed6 with no custom fields — standard fields 4-6 are in rest let bed_schema: BedSchema = "bed6".parse().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); let record = BigBedRecord { @@ -424,6 +432,7 @@ mod tests { let model = Model::new( bed_schema, Select::Some(vec!["chrom".into(), "strand".into()]), + CoordSystem::ZeroHalfOpen, ) .unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); diff 
--git a/oxbow/src/bbi/model/zoom.rs b/oxbow/src/bbi/model/zoom.rs index 69003e4..71cf28a 100644 --- a/oxbow/src/bbi/model/zoom.rs +++ b/oxbow/src/bbi/model/zoom.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use arrow::datatypes::{Field as ArrowField, Schema, SchemaRef}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; use field::{Field, DEFAULT_FIELD_NAMES}; pub struct BBIZoomRecord<'a> { @@ -45,17 +45,18 @@ impl<'a> BBIZoomRecord<'a> { /// /// ``` /// use oxbow::bbi::model::zoom::Model; -/// use oxbow::Select; +/// use oxbow::{CoordSystem, Select}; /// -/// let model = Model::new(Select::All).unwrap(); +/// let model = Model::new(Select::All, CoordSystem::ZeroHalfOpen).unwrap(); /// assert_eq!(model.field_names().len(), 8); /// -/// let model = Model::new(Select::Some(vec!["chrom".into(), "start".into(), "end".into(), "sum".into()])).unwrap(); +/// let model = Model::new(Select::Some(vec!["chrom".into(), "start".into(), "end".into(), "sum".into()]), CoordSystem::ZeroHalfOpen).unwrap(); /// assert_eq!(model.field_names().len(), 4); /// ``` #[derive(Clone, Debug)] pub struct Model { fields: Vec, + coord_system: CoordSystem, schema: SchemaRef, } @@ -63,7 +64,7 @@ impl Model { /// Create a new BBI zoom model. /// /// `fields`: field names. `None` → all 8 default fields. - pub fn new(fields: Select) -> crate::Result { + pub fn new(fields: Select, coord_system: CoordSystem) -> crate::Result { let field_names = match fields { Select::All => DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect(), Select::Some(names) => names, @@ -84,6 +85,7 @@ impl Model { Ok(Self { fields: parsed_fields, + coord_system, schema, }) } @@ -98,6 +100,11 @@ impl Model { self.fields.iter().map(|f| f.name().to_string()).collect() } + /// The output coordinate system. + pub fn coord_system(&self) -> CoordSystem { + self.coord_system + } + /// The Arrow schema. 
pub fn schema(&self) -> &SchemaRef { &self.schema @@ -134,13 +141,13 @@ impl Model { .map(|f| f.name().to_string()) .collect(); - Self::new(Select::Some(projected)) + Self::new(Select::Some(projected), self.coord_system) } } impl PartialEq for Model { fn eq(&self, other: &Self) -> bool { - self.fields == other.fields + self.fields == other.fields && self.coord_system == other.coord_system } } @@ -152,25 +159,24 @@ mod tests { #[test] fn test_defaults() { - let model = Model::new(Select::All).unwrap(); + let model = Model::new(Select::All, CoordSystem::ZeroHalfOpen).unwrap(); assert_eq!(model.field_names().len(), 8); assert_eq!(model.schema().fields().len(), 8); } #[test] fn test_custom() { - let model = Model::new(Select::Some(vec![ - "chrom".into(), - "start".into(), - "sum".into(), - ])) + let model = Model::new( + Select::Some(vec!["chrom".into(), "start".into(), "sum".into()]), + CoordSystem::ZeroHalfOpen, + ) .unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "sum"]); } #[test] fn test_project() { - let model = Model::new(Select::All).unwrap(); + let model = Model::new(Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let projected = model .project(&["chrom".into(), "min".into(), "max".into()]) .unwrap(); @@ -179,7 +185,10 @@ mod tests { #[test] fn test_invalid_field() { - let result = Model::new(Select::Some(vec!["invalid".into()])); + let result = Model::new( + Select::Some(vec!["invalid".into()]), + CoordSystem::ZeroHalfOpen, + ); assert!(result.is_err()); } } diff --git a/oxbow/src/bbi/model/zoom/batch.rs b/oxbow/src/bbi/model/zoom/batch.rs index be7bd34..293f133 100644 --- a/oxbow/src/bbi/model/zoom/batch.rs +++ b/oxbow/src/bbi/model/zoom/batch.rs @@ -7,6 +7,10 @@ use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use indexmap::IndexMap; use crate::batch::{Push, RecordBatchBuilder}; +use crate::CoordSystem; + +/// The coordinate system in which bigtools returns BBI positions. 
+const SOURCE_CS: CoordSystem = CoordSystem::ZeroHalfOpen; use super::field::{Field, FieldBuilder, DEFAULT_FIELD_NAMES}; use super::BBIZoomRecord; @@ -15,6 +19,7 @@ use super::BBIZoomRecord; pub struct BatchBuilder { schema: SchemaRef, row_count: usize, + coord_offset: i32, fields: Vec, field_builders: IndexMap, } @@ -24,6 +29,7 @@ impl BatchBuilder { pub fn new( ref_names: &[String], field_names: Option>, + coord_system: CoordSystem, capacity: usize, ) -> crate::Result { let default_field_names: Vec = DEFAULT_FIELD_NAMES @@ -49,6 +55,7 @@ impl BatchBuilder { Ok(Self { schema, row_count: 0, + coord_offset: coord_system.start_offset_from(SOURCE_CS), fields, field_builders, }) @@ -89,7 +96,10 @@ impl Push<&BBIZoomRecord<'_>> for BatchBuilder { for (_, builder) in &mut self.field_builders { match builder { FieldBuilder::Chrom(builder) => builder.append_value(record.chrom), - FieldBuilder::Start(builder) => builder.append_value(record.start), + FieldBuilder::Start(builder) => { + let adjusted = (record.start as i64 + self.coord_offset as i64) as u32; + builder.append_value(adjusted); + } FieldBuilder::End(builder) => builder.append_value(record.end), FieldBuilder::BasesCovered(builder) => builder.append_value(record.bases_covered), FieldBuilder::Min(builder) => builder.append_value(record.min), @@ -119,7 +129,8 @@ mod tests { ]); let capacity = 10; - let builder = BatchBuilder::new(&ref_names, field_names, capacity); + let builder = + BatchBuilder::new(&ref_names, field_names, CoordSystem::ZeroHalfOpen, capacity); assert!(builder.is_ok()); let builder = builder.unwrap(); @@ -137,7 +148,9 @@ mod tests { ]); let capacity = 10; - let builder = BatchBuilder::new(&ref_names, field_names, capacity).unwrap(); + let builder = + BatchBuilder::new(&ref_names, field_names, CoordSystem::ZeroHalfOpen, capacity) + .unwrap(); let schema = builder.schema(); assert_eq!(schema.fields().len(), 3); @@ -156,7 +169,9 @@ mod tests { ]); let capacity = 10; - let mut builder = 
BatchBuilder::new(&ref_names, field_names, capacity).unwrap(); + let mut builder = + BatchBuilder::new(&ref_names, field_names, CoordSystem::ZeroHalfOpen, capacity) + .unwrap(); let record = BBIZoomRecord { chrom: "chr1", @@ -182,7 +197,9 @@ mod tests { ]); let capacity = 10; - let mut builder = BatchBuilder::new(&ref_names, field_names, capacity).unwrap(); + let mut builder = + BatchBuilder::new(&ref_names, field_names, CoordSystem::ZeroHalfOpen, capacity) + .unwrap(); let record1 = BBIZoomRecord { chrom: "chr1", diff --git a/oxbow/src/bbi/scanner/bbizoom.rs b/oxbow/src/bbi/scanner/bbizoom.rs index 4574b41..b1b853b 100644 --- a/oxbow/src/bbi/scanner/bbizoom.rs +++ b/oxbow/src/bbi/scanner/bbizoom.rs @@ -7,7 +7,7 @@ pub use super::BBIReader; use crate::bbi::model::zoom::BatchBuilder; use crate::bbi::model::zoom::Model; use crate::bbi::scanner::batch_iterator::zoom::{BBIZoomBatchIterator, BBIZoomQueryBatchIterator}; -use crate::Select; +use crate::{CoordSystem, Select}; /// A scanner for the summary statistics from BBI file zoom level. 
/// @@ -24,8 +24,8 @@ use crate::Select; /// let info = fmt_reader.info(); /// let ref_names = info.chrom_info.iter().map(|c| c.name.clone()).collect(); /// let zoom_levels: Vec = info.zoom_headers.iter().map(|h| h.reduction_level).collect(); -/// use oxbow::Select; -/// let scanner = Scanner::new(ref_names, zoom_levels[0], Select::All).unwrap(); +/// use oxbow::{CoordSystem, Select}; +/// let scanner = Scanner::new(ref_names, zoom_levels[0], Select::All, CoordSystem::ZeroHalfOpen).unwrap(); /// let batches = scanner.scan(BBIReader::BigWig(fmt_reader), None, None, Some(1000)); pub struct Scanner { ref_names: Vec, @@ -39,8 +39,9 @@ impl Scanner { ref_names: Vec, zoom_level: u32, fields: Select, + coord_system: CoordSystem, ) -> crate::Result { - let model = Model::new(fields)?; + let model = Model::new(fields, coord_system)?; Ok(Self { ref_names, zoom_level, @@ -69,11 +70,17 @@ impl Scanner { columns: Option>, capacity: usize, ) -> crate::Result { + let cs = self.model.coord_system(); match columns { - None => BatchBuilder::new(&self.ref_names, Some(self.model.field_names()), capacity), + None => BatchBuilder::new( + &self.ref_names, + Some(self.model.field_names()), + cs, + capacity, + ), Some(cols) => { let projected = self.model.project(&cols)?; - BatchBuilder::new(&self.ref_names, Some(projected.field_names()), capacity) + BatchBuilder::new(&self.ref_names, Some(projected.field_names()), cs, capacity) } } } diff --git a/oxbow/src/bbi/scanner/bigbed.rs b/oxbow/src/bbi/scanner/bigbed.rs index 4d1f7f3..92edd69 100644 --- a/oxbow/src/bbi/scanner/bigbed.rs +++ b/oxbow/src/bbi/scanner/bigbed.rs @@ -8,7 +8,7 @@ use crate::bbi::model::base::BatchBuilder; use crate::bbi::model::base::BedSchema; use crate::bbi::model::base::Model; use crate::bbi::scanner::batch_iterator::base::{BigBedBatchIterator, BigBedQueryBatchIterator}; -use crate::Select; +use crate::{CoordSystem, Select}; /// A BigBed scanner. 
/// @@ -24,7 +24,8 @@ use crate::Select; /// let info = fmt_reader.info(); /// /// use oxbow::Select; -/// let scanner = Scanner::new("bed12".parse().unwrap(), info.clone(), Select::All).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new("bed12".parse().unwrap(), info.clone(), Select::All, CoordSystem::ZeroHalfOpen).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); pub struct Scanner { model: Model, @@ -37,8 +38,9 @@ impl Scanner { bed_schema: BedSchema, info: bigtools::BBIFileInfo, fields: Select, + coord_system: CoordSystem, ) -> crate::Result { - let model = Model::new(bed_schema, fields)?; + let model = Model::new(bed_schema, fields, coord_system)?; Ok(Self { model, info }) } diff --git a/oxbow/src/bbi/scanner/bigwig.rs b/oxbow/src/bbi/scanner/bigwig.rs index 847dc91..10cab05 100644 --- a/oxbow/src/bbi/scanner/bigwig.rs +++ b/oxbow/src/bbi/scanner/bigwig.rs @@ -8,7 +8,7 @@ use crate::bbi::model::base::BatchBuilder; use crate::bbi::model::base::BedSchema; use crate::bbi::model::base::Model; use crate::bbi::scanner::batch_iterator::base::{BigWigBatchIterator, BigWigQueryBatchIterator}; -use crate::Select; +use crate::{CoordSystem, Select}; /// A BigWig scanner. /// @@ -23,7 +23,8 @@ use crate::Select; /// let info = fmt_reader.info(); /// /// use oxbow::Select; -/// let scanner = Scanner::new(info.clone(), Select::All).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new(info.clone(), Select::All, CoordSystem::ZeroHalfOpen).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -33,9 +34,13 @@ pub struct Scanner { impl Scanner { /// Creates a BigWig scanner from BBI file info and optional field names. 
- pub fn new(info: bigtools::BBIFileInfo, fields: Select) -> crate::Result { + pub fn new( + info: bigtools::BBIFileInfo, + fields: Select, + coord_system: CoordSystem, + ) -> crate::Result { let bed_schema: BedSchema = "bedGraph".parse().unwrap(); - let model = Model::new(bed_schema, fields)?; + let model = Model::new(bed_schema, fields, coord_system)?; Ok(Self { model, info }) } diff --git a/oxbow/src/bed/model.rs b/oxbow/src/bed/model.rs index 8bd3659..2d25cd7 100644 --- a/oxbow/src/bed/model.rs +++ b/oxbow/src/bed/model.rs @@ -14,7 +14,7 @@ use std::sync::Arc; use arrow::datatypes::{Field as ArrowField, Schema, SchemaRef}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; use field::Field; /// A data model for BED records. @@ -29,22 +29,23 @@ use field::Field; /// /// ``` /// use oxbow::bed::model::{Model, BedSchema}; -/// use oxbow::Select; +/// use oxbow::{CoordSystem, Select}; /// /// // BED6 with all fields. /// let bed_schema: BedSchema = "bed6".parse().unwrap(); -/// let model = Model::new(bed_schema, Select::All).unwrap(); +/// let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); /// assert_eq!(model.field_names().len(), 6); /// /// // BED6 projected to 3 fields. /// let bed_schema: BedSchema = "bed6".parse().unwrap(); -/// let model = Model::new(bed_schema, Select::Some(vec!["chrom".into(), "start".into(), "end".into()])).unwrap(); +/// let model = Model::new(bed_schema, Select::Some(vec!["chrom".into(), "start".into(), "end".into()]), CoordSystem::ZeroHalfOpen).unwrap(); /// assert_eq!(model.field_names().len(), 3); /// ``` #[derive(Clone, Debug)] pub struct Model { bed_schema: BedSchema, field_names: Vec, + coord_system: CoordSystem, schema: SchemaRef, } @@ -53,7 +54,11 @@ impl Model { /// /// - `bed_schema`: the parsing interpretation. /// - `fields`: column names to project. `None` → all fields from the schema. 
- pub fn new(bed_schema: BedSchema, fields: Select) -> crate::Result { + pub fn new( + bed_schema: BedSchema, + fields: Select, + coord_system: CoordSystem, + ) -> crate::Result { let available_names = bed_schema.field_names(); let projected_names = match fields { Select::All => available_names.clone(), @@ -98,6 +103,7 @@ impl Model { Ok(Self { bed_schema, field_names: projected_names, + coord_system, schema, }) } @@ -112,6 +118,11 @@ impl Model { self.field_names.clone() } + /// The output coordinate system for the start position column. + pub fn coord_system(&self) -> CoordSystem { + self.coord_system + } + /// The Arrow schema for the projected fields. pub fn schema(&self) -> &SchemaRef { &self.schema @@ -148,13 +159,19 @@ impl Model { .cloned() .collect(); - Self::new(self.bed_schema.clone(), Select::Some(projected)) + Self::new( + self.bed_schema.clone(), + Select::Some(projected), + self.coord_system, + ) } } impl PartialEq for Model { fn eq(&self, other: &Self) -> bool { - self.bed_schema == other.bed_schema && self.field_names == other.field_names + self.bed_schema == other.bed_schema + && self.field_names == other.field_names + && self.coord_system == other.coord_system } } @@ -167,7 +184,7 @@ mod tests { #[test] fn test_bed6_all_fields() { let bed_schema: BedSchema = "bed6".parse().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); assert_eq!(model.field_names().len(), 6); assert_eq!(model.schema().fields().len(), 6); } @@ -178,6 +195,7 @@ mod tests { let model = Model::new( bed_schema, Select::Some(vec!["chrom".into(), "start".into(), "end".into()]), + CoordSystem::ZeroHalfOpen, ) .unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end"]); @@ -187,14 +205,14 @@ mod tests { #[test] fn test_bed3_plus() { let bed_schema: BedSchema = "bed3+".parse().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = 
Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end", "rest"]); } #[test] fn test_bedgraph() { let bed_schema = BedSchema::new_bedgraph().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end", "value"]); } @@ -205,7 +223,7 @@ mod tests { FieldDef::new("pValue".into(), FieldType::Float), ]; let bed_schema = BedSchema::new(3, Some(defs)).unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); assert_eq!( model.field_names(), vec!["chrom", "start", "end", "signalValue", "pValue"] @@ -215,7 +233,7 @@ mod tests { #[test] fn test_project() { let bed_schema: BedSchema = "bed6+3".parse().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let projected = model .project(&["chrom".into(), "end".into(), "BED6+1".into()]) .unwrap(); @@ -225,7 +243,7 @@ mod tests { #[test] fn test_project_unknown() { let bed_schema: BedSchema = "bed3".parse().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); let result = model.project(&["nonexistent".into()]); assert!(result.is_err()); } @@ -233,15 +251,23 @@ mod tests { #[test] fn test_invalid_field_name() { let bed_schema: BedSchema = "bed3".parse().unwrap(); - let result = Model::new(bed_schema, Select::Some(vec!["nonexistent".into()])); + let result = Model::new( + bed_schema, + Select::Some(vec!["nonexistent".into()]), + CoordSystem::ZeroHalfOpen, + ); assert!(result.is_err()); } #[test] fn test_bed3_projected_subset() { let bed_schema: BedSchema = "bed3".parse().unwrap(); - let model = - 
Model::new(bed_schema, Select::Some(vec!["chrom".into(), "end".into()])).unwrap(); + let model = Model::new( + bed_schema, + Select::Some(vec!["chrom".into(), "end".into()]), + CoordSystem::ZeroHalfOpen, + ) + .unwrap(); assert_eq!(model.field_names(), vec!["chrom", "end"]); } @@ -251,6 +277,7 @@ mod tests { let model = Model::new( bed_schema, Select::Some(vec!["chrom".into(), "strand".into(), "itemRgb".into()]), + CoordSystem::ZeroHalfOpen, ) .unwrap(); assert_eq!(model.field_names(), vec!["chrom", "strand", "itemRgb"]); @@ -264,7 +291,7 @@ mod tests { FieldDef::new("extra2".into(), FieldType::String), ]; let bed_schema = BedSchema::new(12, Some(defs)).unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); assert_eq!(model.field_names().len(), 14); let projected = model @@ -281,7 +308,7 @@ mod tests { use arrow::datatypes::DataType; let bed_schema = BedSchema::new_bedgraph().unwrap(); - let model = Model::new(bed_schema, Select::All).unwrap(); + let model = Model::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); // Standard fields use BED types (Int64 for positions) assert_eq!(model.schema().field(1).data_type(), &DataType::Int64); // Custom "value" field uses FieldDef type (Float32) diff --git a/oxbow/src/bed/model/batch.rs b/oxbow/src/bed/model/batch.rs index a034d95..a9f9c8b 100644 --- a/oxbow/src/bed/model/batch.rs +++ b/oxbow/src/bed/model/batch.rs @@ -8,9 +8,16 @@ use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use indexmap::IndexMap; use crate::batch::{Push, RecordBatchBuilder}; +use crate::{CoordSystem, Select}; use super::field::Push as _; use super::field::{Field, FieldBuilder}; +use super::Model; + +/// The coordinate system in which noodles returns BED start positions. +/// noodles converts 0-based BED file values to 1-based `Position`, so the +/// source is always 1-based closed. 
+const SOURCE_CS: CoordSystem = CoordSystem::OneClosed; use super::field_def::{FieldBuilder as GenericFieldBuilder, FieldDef, Push as GenericPush}; use super::schema::BedSchema; @@ -18,65 +25,45 @@ use super::schema::BedSchema; pub struct BatchBuilder { schema: SchemaRef, row_count: usize, + _coord_offset: i32, bed_schema: BedSchema, standard_field_builders: IndexMap, custom_field_builders: IndexMap, } impl BatchBuilder { - /// Creates a new `BatchBuilder` from a [`Model`]. - pub fn from_model(model: &super::Model, capacity: usize) -> crate::Result { - Self::new(model.bed_schema(), Some(model.field_names()), capacity) - } - - /// Creates a new `BatchBuilder` for BED records. pub fn new( - bed_schema: &BedSchema, - field_names: Option>, + bed_schema: BedSchema, + fields: Select, capacity: usize, ) -> crate::Result { - // All the standard and custom field definitions for the given BED schema. - let standard_field_names = bed_schema.standard_field_names(); + let model = Model::new(bed_schema, fields, CoordSystem::ZeroHalfOpen)?; + Self::from_model(&model, capacity) + } + + /// Creates a new `BatchBuilder` from a [`Model`]. + pub fn from_model(model: &Model, capacity: usize) -> crate::Result { + let bed_schema = model.bed_schema().clone(); let custom_field_defs = bed_schema.custom_fields(); + let coord_offset = model.coord_system().start_offset_from(SOURCE_CS); - // Determine the projected fields and create builders. 
let mut projected_standard_fields = Vec::new(); let mut projected_custom_defs = Vec::new(); let mut standard_field_builders = IndexMap::new(); let mut custom_field_builders = IndexMap::new(); - match &field_names { - Some(projection) => { - for name in projection { - if let Some(def) = custom_field_defs.iter().find(|d| &d.name == name) { - projected_custom_defs.push(def.clone()); - let builder = GenericFieldBuilder::new(&def.ty, capacity)?; - custom_field_builders.insert(def.clone(), builder); - } else { - let field = Field::from_str(name)?; - projected_standard_fields.push(field.clone()); - let builder = FieldBuilder::new(field.clone(), capacity); - standard_field_builders.insert(field.clone(), builder); - } - } - } - None => { - projected_standard_fields.extend( - standard_field_names - .into_iter() - .map(|name| Field::from_str(&name)) - .collect::, _>>()?, - ); - for field in &projected_standard_fields { - let builder = FieldBuilder::new(field.clone(), capacity); - standard_field_builders.insert(field.clone(), builder); - } - for def in custom_field_defs { - projected_custom_defs.push(def.clone()); - let builder = GenericFieldBuilder::new(&def.ty, capacity)?; - custom_field_builders.insert(def.clone(), builder); - } + for name in &model.field_names() { + if let Some(def) = custom_field_defs.iter().find(|d| &d.name == name) { + projected_custom_defs.push(def.clone()); + let builder = GenericFieldBuilder::new(&def.ty, capacity)?; + custom_field_builders.insert(def.clone(), builder); + } else { + let field = Field::from_str(name)?; + projected_standard_fields.push(field.clone()); + let builder = + FieldBuilder::new(field.clone(), capacity).with_coord_offset(coord_offset); + standard_field_builders.insert(field.clone(), builder); } - }; + } // Build schema once let mut arrow_fields: Vec = projected_standard_fields @@ -91,6 +78,7 @@ impl BatchBuilder { Ok(Self { schema, row_count: 0, + _coord_offset: coord_offset, bed_schema: bed_schema.clone(), 
standard_field_builders, custom_field_builders, @@ -205,7 +193,7 @@ mod tests { #[test] fn test_batch_builder_new() { let bed_schema = BedSchema::new_from_nm(3, Some(2)).unwrap(); - let batch_builder = BatchBuilder::new(&bed_schema, None, 10).unwrap(); + let batch_builder = BatchBuilder::new(bed_schema, Select::All, 10).unwrap(); assert_eq!(batch_builder.schema().fields().len(), 5); } @@ -213,7 +201,7 @@ mod tests { #[test] fn test_push_bed_record() { let bed_schema = BedSchema::new_from_nm(3, Some(2)).unwrap(); - let mut batch_builder = BatchBuilder::new(&bed_schema, None, 10).unwrap(); + let mut batch_builder = BatchBuilder::new(bed_schema, Select::All, 10).unwrap(); let record = create_bed_record(); let result = batch_builder.push(&record); @@ -225,7 +213,7 @@ mod tests { let record = create_bed_record(); let bed_schema = BedSchema::new_from_nm(3, Some(0)).unwrap(); - let mut batch_builder = BatchBuilder::new(&bed_schema, None, 10).unwrap(); + let mut batch_builder = BatchBuilder::new(bed_schema, Select::All, 10).unwrap(); batch_builder.push(&record).unwrap(); let record_batch = batch_builder.finish(); assert!(record_batch.is_ok()); @@ -246,7 +234,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - assert_eq!(start_array.value(0), 101); + assert_eq!(start_array.value(0), 100); let end_array = record_batch .column(2) @@ -259,7 +247,7 @@ mod tests { #[test] fn test_finish_bedn_plus_m() { let bed_schema = BedSchema::new_from_nm(3, Some(2)).unwrap(); - let mut batch_builder = BatchBuilder::new(&bed_schema, None, 10).unwrap(); + let mut batch_builder = BatchBuilder::new(bed_schema, Select::All, 10).unwrap(); let record = create_bed_record(); batch_builder.push(&record).unwrap(); @@ -282,7 +270,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - assert_eq!(start_array.value(0), 101); + assert_eq!(start_array.value(0), 100); let end_array = record_batch .column(2) @@ -319,7 +307,7 @@ mod tests { let record = create_bed_record(); let bed_schema = 
BedSchema::new_from_nm(3, None).unwrap(); - let mut batch_builder = BatchBuilder::new(&bed_schema, None, 10).unwrap(); + let mut batch_builder = BatchBuilder::new(bed_schema, Select::All, 10).unwrap(); batch_builder.push(&record).unwrap(); let record_batch = batch_builder.finish(); assert!(record_batch.is_ok()); @@ -340,7 +328,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - assert_eq!(start_array.value(0), 101); + assert_eq!(start_array.value(0), 100); let end_array = record_batch .column(2) diff --git a/oxbow/src/bed/model/field.rs b/oxbow/src/bed/model/field.rs index fd6e3ef..2a1f052 100644 --- a/oxbow/src/bed/model/field.rs +++ b/oxbow/src/bed/model/field.rs @@ -120,7 +120,7 @@ impl FromStr for Field { /// A builder for an Arrow array (column) corresponding to a BED field. pub enum FieldBuilder { Chrom(GenericStringBuilder), - Start(Int64Builder), + Start(Int64Builder, i64), End(Int64Builder), Name(GenericStringBuilder), Score(UInt16Builder), @@ -144,7 +144,7 @@ impl FieldBuilder { pub fn new(field: Field, capacity: usize) -> Self { match field { Field::Chrom => Self::Chrom(GenericStringBuilder::::with_capacity(capacity, 1024)), - Field::Start => Self::Start(Int64Builder::with_capacity(capacity)), + Field::Start => Self::Start(Int64Builder::with_capacity(capacity), 0), Field::End => Self::End(Int64Builder::with_capacity(capacity)), Field::Name => Self::Name(GenericStringBuilder::::with_capacity(capacity, 1024)), Field::Score => Self::Score(UInt16Builder::with_capacity(capacity)), @@ -176,10 +176,18 @@ impl FieldBuilder { } } + /// Sets the coordinate offset for the start position field. 
+ pub fn with_coord_offset(self, offset: i32) -> Self { + match self { + Self::Start(b, _) => Self::Start(b, offset as i64), + other => other, + } + } + pub fn finish(&mut self) -> arrow::array::ArrayRef { match self { Self::Chrom(builder) => Arc::new(builder.finish()), - Self::Start(builder) => Arc::new(builder.finish()), + Self::Start(builder, _) => Arc::new(builder.finish()), Self::End(builder) => Arc::new(builder.finish()), Self::Name(builder) => Arc::new(builder.finish()), Self::Score(builder) => Arc::new(builder.finish()), @@ -210,8 +218,11 @@ impl Push<&noodles::bed::Record<3>> for FieldBuilder { Self::Chrom(builder) => { builder.append_value(record.reference_sequence_name().to_string()); } - Self::Start(builder) => { - let start = record.feature_start().ok().map(|pos| pos.get() as i64); + Self::Start(builder, offset) => { + let start = record + .feature_start() + .ok() + .map(|pos| pos.get() as i64 + *offset); builder.append_option(start); } Self::End(builder) => { diff --git a/oxbow/src/bed/scanner/bed.rs b/oxbow/src/bed/scanner/bed.rs index 8bccd82..72f91c9 100644 --- a/oxbow/src/bed/scanner/bed.rs +++ b/oxbow/src/bed/scanner/bed.rs @@ -10,7 +10,7 @@ use crate::bed::model::BedSchema; use crate::bed::model::Model; use crate::bed::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; /// A BED scanner. /// @@ -29,7 +29,8 @@ use crate::{OxbowError, Select}; /// /// use oxbow::Select; /// let bed_schema = "bed6+3".parse().unwrap(); -/// let scanner = Scanner::new(bed_schema, Select::All).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new(bed_schema, Select::All, CoordSystem::ZeroHalfOpen).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)).unwrap(); /// ``` pub struct Scanner { @@ -41,8 +42,12 @@ impl Scanner { /// /// - `bed_schema`: the parsing interpretation. 
/// - `fields`: column names to project. `None` → all fields from the schema. - pub fn new(bed_schema: BedSchema, fields: Select) -> crate::Result { - let model = Model::new(bed_schema, fields)?; + pub fn new( + bed_schema: BedSchema, + fields: Select, + coord_system: CoordSystem, + ) -> crate::Result { + let model = Model::new(bed_schema, fields, coord_system)?; Ok(Self { model }) } diff --git a/oxbow/src/gxf/model.rs b/oxbow/src/gxf/model.rs index c8d4c0c..9406bde 100644 --- a/oxbow/src/gxf/model.rs +++ b/oxbow/src/gxf/model.rs @@ -8,7 +8,7 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field as ArrowField, Schema, SchemaRef}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; use attribute::AttributeDef; use field::{Field, DEFAULT_FIELD_NAMES}; @@ -24,6 +24,9 @@ use field::{Field, DEFAULT_FIELD_NAMES}; /// - `attr_defs` controls the attributes struct column independently. /// `None` → no attributes column. `Some(vec![])` → empty struct column. /// `Some(vec![...])` → struct column with the specified sub-fields. +/// - `coord_system` controls the coordinate system representation of the +/// output. The default is 1-based closed, which is the standard for GFF/GTF +/// files. Practically, this only affects start position values. /// /// The model can produce an Arrow schema independently of any file content. /// @@ -31,10 +34,10 @@ use field::{Field, DEFAULT_FIELD_NAMES}; /// /// ``` /// use oxbow::gxf::model::Model; -/// use oxbow::Select; +/// use oxbow::{CoordSystem, Select}; /// /// // Default: all 8 standard fields, no attributes column. 
-/// let model = Model::new(Select::All, None).unwrap(); +/// let model = Model::new(Select::All, None, CoordSystem::OneClosed).unwrap(); /// assert_eq!(model.field_names().len(), 8); /// assert!(!model.has_attributes()); /// @@ -42,6 +45,7 @@ use field::{Field, DEFAULT_FIELD_NAMES}; /// let model = Model::new( /// Select::Some(vec!["seqid".into(), "start".into(), "end".into()]), /// Some(vec![("gene_id".into(), "String".into())]), +/// CoordSystem::OneClosed, /// ).unwrap(); /// assert_eq!(model.field_names(), vec!["seqid", "start", "end"]); /// assert!(model.has_attributes()); @@ -53,6 +57,7 @@ use field::{Field, DEFAULT_FIELD_NAMES}; pub struct Model { fields: Vec, attr_defs: Option>, + coord_system: CoordSystem, schema: SchemaRef, } @@ -67,6 +72,7 @@ impl Model { pub fn new( fields: Select, attr_defs: Option>, + coord_system: CoordSystem, ) -> crate::Result { let field_names = match fields { Select::All => DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect(), @@ -94,15 +100,11 @@ impl Model { Ok(Self { fields: parsed_fields, attr_defs, + coord_system, schema, }) } - /// Create a model with all 8 default standard fields and no attributes. - pub fn default_fields() -> Self { - Self::new(Select::All, None).expect("default fields are always valid") - } - fn build_schema(fields: &[Field], attr_defs: Option<&[AttributeDef]>) -> SchemaRef { let mut arrow_fields: Vec = fields.iter().map(|f| f.get_arrow_field()).collect(); @@ -139,6 +141,11 @@ impl Model { self.attr_defs.is_some() } + /// The output coordinate system for the start position column. + pub fn coord_system(&self) -> CoordSystem { + self.coord_system + } + /// The Arrow schema for this model. 
pub fn schema(&self) -> &SchemaRef { &self.schema @@ -188,13 +195,22 @@ impl Model { None }; - Self::new(Select::Some(projected_fields), attr_defs) + Self::new(Select::Some(projected_fields), attr_defs, self.coord_system) + } +} + +impl Default for Model { + fn default() -> Self { + Self::new(Select::All, None, CoordSystem::OneClosed) + .expect("default fields are always valid") } } impl PartialEq for Model { fn eq(&self, other: &Self) -> bool { - self.fields == other.fields && self.attr_defs == other.attr_defs + self.fields == other.fields + && self.attr_defs == other.attr_defs + && self.coord_system == other.coord_system } } @@ -207,7 +223,7 @@ mod tests { #[test] fn test_default_model() { - let model = Model::new(Select::All, None).unwrap(); + let model = Model::new(Select::All, None, CoordSystem::OneClosed).unwrap(); assert_eq!(model.field_names().len(), 8); assert!(!model.has_attributes()); assert!(model.attr_defs().is_none()); @@ -216,8 +232,11 @@ mod tests { #[test] fn test_default_fields_constructor() { - let model = Model::default_fields(); - assert_eq!(model, Model::new(Select::All, None).unwrap()); + let model = Model::default(); + assert_eq!( + model, + Model::new(Select::All, None, CoordSystem::OneClosed).unwrap() + ); } #[test] @@ -225,6 +244,7 @@ mod tests { let model = Model::new( Select::Some(vec!["seqid".into(), "start".into(), "end".into()]), None, + CoordSystem::OneClosed, ) .unwrap(); assert_eq!(model.field_names(), vec!["seqid", "start", "end"]); @@ -237,6 +257,7 @@ mod tests { let model = Model::new( Select::Some(vec!["seqid".into(), "start".into()]), Some(vec![("gene_id".into(), "String".into())]), + CoordSystem::OneClosed, ) .unwrap(); assert_eq!(model.field_names(), vec!["seqid", "start"]); @@ -248,7 +269,12 @@ mod tests { #[test] fn test_attrs_empty_defs_is_empty_struct() { - let model = Model::new(Select::Some(vec!["seqid".into()]), Some(vec![])).unwrap(); + let model = Model::new( + Select::Some(vec!["seqid".into()]), + Some(vec![]), + 
CoordSystem::OneClosed, + ) + .unwrap(); assert!(model.has_attributes()); assert!(model.attr_defs().unwrap().is_empty()); assert_eq!(model.schema().fields().len(), 2); @@ -261,7 +287,12 @@ mod tests { #[test] fn test_no_attrs_when_attr_defs_none() { - let model = Model::new(Select::Some(vec!["seqid".into(), "start".into()]), None).unwrap(); + let model = Model::new( + Select::Some(vec!["seqid".into(), "start".into()]), + None, + CoordSystem::OneClosed, + ) + .unwrap(); assert!(!model.has_attributes()); assert!(model.attr_defs().is_none()); assert_eq!(model.schema().fields().len(), 2); @@ -269,7 +300,11 @@ mod tests { #[test] fn test_invalid_field() { - let result = Model::new(Select::Some(vec!["invalid".into()]), None); + let result = Model::new( + Select::Some(vec!["invalid".into()]), + None, + CoordSystem::OneClosed, + ); assert!(result.is_err()); } @@ -278,6 +313,7 @@ mod tests { let result = Model::new( Select::All, Some(vec![("gene_id".into(), "InvalidType".into())]), + CoordSystem::OneClosed, ); assert!(result.is_err()); } @@ -290,6 +326,7 @@ mod tests { ("gene_id".into(), "String".into()), ("tag".into(), "Array".into()), ]), + CoordSystem::OneClosed, ) .unwrap(); let tuples: Vec<_> = model @@ -312,6 +349,7 @@ mod tests { let model = Model::new( Select::Some(vec!["seqid".into(), "start".into(), "end".into()]), Some(vec![("gene_id".into(), "String".into())]), + CoordSystem::OneClosed, ) .unwrap(); @@ -325,6 +363,7 @@ mod tests { let model = Model::new( Select::Some(vec!["seqid".into(), "start".into()]), Some(vec![("gene_id".into(), "String".into())]), + CoordSystem::OneClosed, ) .unwrap(); @@ -338,7 +377,7 @@ mod tests { #[test] fn test_project_unknown_column() { - let model = Model::default_fields(); + let model = Model::default(); let result = model.project(&["nonexistent".into()]); assert!(result.is_err()); } diff --git a/oxbow/src/gxf/model/batch.rs b/oxbow/src/gxf/model/batch.rs index 994cc9e..87df25a 100644 --- a/oxbow/src/gxf/model/batch.rs +++ 
b/oxbow/src/gxf/model/batch.rs @@ -11,11 +11,14 @@ use crate::batch::{Push, RecordBatchBuilder}; use crate::gxf::model::attribute::{AttributeBuilder, AttributeDef, AttributeValue}; use crate::gxf::model::field::Push as _; use crate::gxf::model::field::{Field, FieldBuilder}; -use crate::Select; +use crate::{CoordSystem, Select}; + +/// The coordinate system in which noodles returns GTF/GFF positions. +const SOURCE_CS: CoordSystem = CoordSystem::OneClosed; use super::Model; -/// A builder for an Arrow record batch of GXF (GTF/GFF) features. +/// A builder for an Arrow record batch of GTF/GFF (GXF) features. pub struct BatchBuilder { schema: SchemaRef, row_count: usize, @@ -34,15 +37,22 @@ impl BatchBuilder { attr_defs: Option>, capacity: usize, ) -> crate::Result { - let model = Model::new(fields, attr_defs)?; + let model = Model::new(fields, attr_defs, CoordSystem::OneClosed)?; Self::from_model(&model, capacity) } /// Creates a new `BatchBuilder` from a [`Model`]. pub fn from_model(model: &Model, capacity: usize) -> crate::Result { + let coord_offset = model.coord_system().start_offset_from(SOURCE_CS); + let mut field_builders = IndexMap::new(); for field in model.fields() { - let builder = FieldBuilder::new(field.clone(), capacity); + let builder = match field { + Field::Start => { + FieldBuilder::new(field.clone(), capacity).with_coord_offset(coord_offset) + } + _ => FieldBuilder::new(field.clone(), capacity), + }; field_builders.insert(field.clone(), builder); } diff --git a/oxbow/src/gxf/model/field.rs b/oxbow/src/gxf/model/field.rs index 9db5e8b..f5fbd5e 100644 --- a/oxbow/src/gxf/model/field.rs +++ b/oxbow/src/gxf/model/field.rs @@ -81,7 +81,7 @@ pub enum FieldBuilder { SeqId(GenericStringBuilder), Source(GenericStringBuilder), Type(GenericStringBuilder), - Start(Int32Builder), + Start(Int32Builder, i32), End(Int32Builder), Score(Float32Builder), Strand(GenericStringBuilder), @@ -101,7 +101,7 @@ impl FieldBuilder { 
Self::Source(GenericStringBuilder::::with_capacity(capacity, 1024)) } Field::Type => Self::Type(GenericStringBuilder::::with_capacity(capacity, 1024)), - Field::Start => Self::Start(Int32Builder::with_capacity(capacity)), + Field::Start => Self::Start(Int32Builder::with_capacity(capacity), 0), Field::End => Self::End(Int32Builder::with_capacity(capacity)), Field::Score => Self::Score(Float32Builder::with_capacity(capacity)), Field::Strand => { @@ -111,12 +111,22 @@ impl FieldBuilder { } } + /// Sets the coordinate offset for the start position field. + /// + /// Has no effect on other field variants. + pub fn with_coord_offset(self, offset: i32) -> Self { + match self { + Self::Start(b, _) => Self::Start(b, offset), + other => other, + } + } + pub fn finish(&mut self) -> ArrayRef { match self { Self::SeqId(builder) => Arc::new(builder.finish()), Self::Source(builder) => Arc::new(builder.finish()), Self::Type(builder) => Arc::new(builder.finish()), - Self::Start(builder) => Arc::new(builder.finish()), + Self::Start(builder, _) => Arc::new(builder.finish()), Self::End(builder) => Arc::new(builder.finish()), Self::Score(builder) => Arc::new(builder.finish()), Self::Strand(builder) => Arc::new(builder.finish()), @@ -145,8 +155,11 @@ impl<'a> Push<&'a noodles::gff::Record<'a>> for FieldBuilder { let ty = record.ty().to_string(); builder.append_value(ty); } - Self::Start(builder) => { - let start = record.start().ok().map(|pos| usize::from(pos) as i32); + Self::Start(builder, offset) => { + let start = record + .start() + .ok() + .map(|pos| usize::from(pos) as i32 + *offset); builder.append_option(start); } Self::End(builder) => { @@ -195,8 +208,11 @@ impl<'a> Push<&'a noodles::gtf::Record<'a>> for FieldBuilder { let ty = record.ty().to_string(); builder.append_value(ty); } - Self::Start(builder) => { - let start = record.start().ok().map(|pos| usize::from(pos) as i32); + Self::Start(builder, offset) => { + let start = record + .start() + .ok() + .map(|pos| 
usize::from(pos) as i32 + *offset); builder.append_option(start); } Self::End(builder) => { diff --git a/oxbow/src/gxf/scanner/gff.rs b/oxbow/src/gxf/scanner/gff.rs index ce038af..00d1cc2 100644 --- a/oxbow/src/gxf/scanner/gff.rs +++ b/oxbow/src/gxf/scanner/gff.rs @@ -12,7 +12,7 @@ use crate::gxf::model::BatchBuilder; use crate::gxf::model::Model; use crate::gxf::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; /// A GFF scanner. /// @@ -32,7 +32,8 @@ use crate::{OxbowError, Select}; /// let mut fmt_reader = noodles::gff::io::Reader::new(inner); /// /// let attr_defs = Scanner::attribute_defs(&mut fmt_reader, Some(1000)).unwrap(); -/// let scanner = Scanner::new(None, Select::All, Some(attr_defs)).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new(None, Select::All, Some(attr_defs), CoordSystem::OneClosed).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -49,8 +50,9 @@ impl Scanner { header: Option, fields: Select, attr_defs: Option>, + coord_system: CoordSystem, ) -> crate::Result { - let model = Model::new(fields, attr_defs)?; + let model = Model::new(fields, attr_defs, coord_system)?; Ok(Self { header, model }) } diff --git a/oxbow/src/gxf/scanner/gtf.rs b/oxbow/src/gxf/scanner/gtf.rs index 9c54fc1..6f80f4d 100644 --- a/oxbow/src/gxf/scanner/gtf.rs +++ b/oxbow/src/gxf/scanner/gtf.rs @@ -12,7 +12,7 @@ use crate::gxf::model::BatchBuilder; use crate::gxf::model::Model; use crate::gxf::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; /// A GTF scanner. 
/// @@ -32,7 +32,8 @@ use crate::{OxbowError, Select}; /// let mut fmt_reader = noodles::gtf::io::Reader::new(inner); /// /// let attr_defs = Scanner::attribute_defs(&mut fmt_reader, Some(1000)).unwrap(); -/// let scanner = Scanner::new(None, Select::All, Some(attr_defs)).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new(None, Select::All, Some(attr_defs), CoordSystem::OneClosed).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -49,8 +50,9 @@ impl Scanner { header: Option, fields: Select, attr_defs: Option>, + coord_system: CoordSystem, ) -> crate::Result { - let model = Model::new(fields, attr_defs)?; + let model = Model::new(fields, attr_defs, coord_system)?; Ok(Self { header, model }) } diff --git a/oxbow/src/lib.rs b/oxbow/src/lib.rs index b1979e8..15f48f7 100644 --- a/oxbow/src/lib.rs +++ b/oxbow/src/lib.rs @@ -71,6 +71,64 @@ pub mod variant; pub use error::{OxbowError, Result}; +/// Genomic coordinate system. +/// +/// The notation `XY` encodes the base of the start coordinate (`X`) and the +/// base of the end coordinate (`Y`): +/// +/// - `"11"` — 1-based start, 1-based end (closed; SAM/VCF/GFF convention) +/// - `"01"` — 0-based start, 1-based end (half-open; BED/BBI convention) +/// +/// End coordinates are numerically identical in both systems; only start +/// positions differ. Use [`CoordSystem::start_offset_from`] to get the +/// additive offset needed to convert a start value from one system to another. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum CoordSystem { + /// 1-based start, closed end. + OneClosed, + /// 0-based start, half-open end. + ZeroHalfOpen, +} + +impl CoordSystem { + /// Returns the additive offset to apply to a start coordinate when + /// converting from `source_cs` to `self`. 
+ /// + /// - `OneClosed` → `ZeroHalfOpen`: `-1` + /// - `ZeroHalfOpen` → `OneClosed`: `+1` + /// - same → same: `0` + pub fn start_offset_from(self, source_cs: CoordSystem) -> i32 { + match (source_cs, self) { + (CoordSystem::OneClosed, CoordSystem::ZeroHalfOpen) => -1, + (CoordSystem::ZeroHalfOpen, CoordSystem::OneClosed) => 1, + _ => 0, + } + } +} + +impl std::fmt::Display for CoordSystem { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CoordSystem::OneClosed => write!(f, "11"), + CoordSystem::ZeroHalfOpen => write!(f, "01"), + } + } +} + +impl std::str::FromStr for CoordSystem { + type Err = OxbowError; + + fn from_str(s: &str) -> Result { + match s { + "11" => Ok(CoordSystem::OneClosed), + "01" => Ok(CoordSystem::ZeroHalfOpen), + other => Err(OxbowError::invalid_input(format!( + "invalid coordinate system '{other}'; expected \"01\" or \"11\"" + ))), + } + } +} + #[derive(Debug, Clone)] pub enum Select { /// Select specific items explicitly diff --git a/oxbow/src/sequence/model.rs b/oxbow/src/sequence/model.rs index 0192df5..a1644c8 100644 --- a/oxbow/src/sequence/model.rs +++ b/oxbow/src/sequence/model.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use arrow::datatypes::{Field as ArrowField, Schema, SchemaRef}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; use field::{Field, FASTA_DEFAULT_FIELD_NAMES, FASTQ_DEFAULT_FIELD_NAMES}; /// A data model for sequence records (FASTA/FASTQ). @@ -23,10 +23,10 @@ use field::{Field, FASTA_DEFAULT_FIELD_NAMES, FASTQ_DEFAULT_FIELD_NAMES}; /// /// ``` /// use oxbow::sequence::model::Model; -/// use oxbow::Select; +/// use oxbow::{CoordSystem, Select}; /// /// // FASTA defaults: name, description, sequence. -/// let model = Model::new_fasta(Select::All).unwrap(); +/// let model = Model::new_fasta(Select::All, CoordSystem::OneClosed).unwrap(); /// assert_eq!(model.field_names().len(), 3); /// /// // FASTQ defaults: name, description, sequence, quality. 
@@ -40,6 +40,7 @@ use field::{Field, FASTA_DEFAULT_FIELD_NAMES, FASTQ_DEFAULT_FIELD_NAMES}; #[derive(Clone, Debug)] pub struct Model { fields: Vec, + coord_system: CoordSystem, schema: SchemaRef, } @@ -48,7 +49,7 @@ impl Model { /// /// `fields`: `All` → `["name", "description", "sequence"]`. `Omit` → no /// fields. `Some(vec)` → specific fields. - pub fn new_fasta(fields: Select) -> crate::Result { + pub fn new_fasta(fields: Select, coord_system: CoordSystem) -> crate::Result { let defaults = || { FASTA_DEFAULT_FIELD_NAMES .iter() @@ -60,7 +61,7 @@ impl Model { Select::Some(names) => names, Select::Omit => Vec::new(), }; - Self::new(field_names) + Self::new(field_names, coord_system) } /// Create a new FASTQ model. @@ -79,10 +80,10 @@ impl Model { Select::Some(names) => names, Select::Omit => Vec::new(), }; - Self::new(field_names) + Self::new(field_names, CoordSystem::OneClosed) } - fn new(field_names: Vec) -> crate::Result { + fn new(field_names: Vec, coord_system: CoordSystem) -> crate::Result { let mut parsed_fields = Vec::new(); for name in &field_names { let field: Field = name @@ -97,6 +98,7 @@ impl Model { Ok(Self { fields: parsed_fields, + coord_system, schema, }) } @@ -111,6 +113,11 @@ impl Model { self.fields.iter().map(|f| f.name().to_string()).collect() } + /// The output coordinate system for query regions. + pub fn coord_system(&self) -> CoordSystem { + self.coord_system + } + /// The Arrow schema for this model. 
pub fn schema(&self) -> &SchemaRef { &self.schema @@ -149,13 +156,13 @@ impl Model { .map(|f| f.name().to_string()) .collect(); - Self::new(projected) + Self::new(projected, self.coord_system) } } impl PartialEq for Model { fn eq(&self, other: &Self) -> bool { - self.fields == other.fields + self.fields == other.fields && self.coord_system == other.coord_system } } @@ -167,7 +174,7 @@ mod tests { #[test] fn test_fasta_defaults() { - let model = Model::new_fasta(Select::All).unwrap(); + let model = Model::new_fasta(Select::All, CoordSystem::OneClosed).unwrap(); assert_eq!(model.field_names(), vec!["name", "description", "sequence"]); assert_eq!(model.schema().fields().len(), 3); } @@ -191,7 +198,7 @@ mod tests { #[test] fn test_invalid_field() { - let result = Model::new_fasta(Select::Some(vec!["invalid".into()])); + let result = Model::new_fasta(Select::Some(vec!["invalid".into()]), CoordSystem::OneClosed); assert!(result.is_err()); } @@ -204,7 +211,7 @@ mod tests { #[test] fn test_project_unknown() { - let model = Model::new_fasta(Select::All).unwrap(); + let model = Model::new_fasta(Select::All, CoordSystem::OneClosed).unwrap(); let result = model.project(&["nonexistent".into()]); assert!(result.is_err()); } diff --git a/oxbow/src/sequence/model/batch.rs b/oxbow/src/sequence/model/batch.rs index 2f384a0..4fbf61a 100644 --- a/oxbow/src/sequence/model/batch.rs +++ b/oxbow/src/sequence/model/batch.rs @@ -5,7 +5,7 @@ use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use indexmap::IndexMap; use crate::batch::{Push, RecordBatchBuilder}; -use crate::Select; +use crate::{CoordSystem, Select}; use super::field::Push as _; use super::field::{Field, FieldBuilder}; @@ -27,7 +27,7 @@ impl BatchBuilder { /// Creates a new `BatchBuilder` for FASTA records. 
pub fn new_fasta(fields: Select, capacity: usize) -> crate::Result { - let model = Model::new_fasta(fields)?; + let model = Model::new_fasta(fields, CoordSystem::OneClosed)?; Self::from_model(&model, capacity) } diff --git a/oxbow/src/sequence/scanner/fasta.rs b/oxbow/src/sequence/scanner/fasta.rs index b913d55..fb141ef 100644 --- a/oxbow/src/sequence/scanner/fasta.rs +++ b/oxbow/src/sequence/scanner/fasta.rs @@ -7,7 +7,7 @@ use noodles::core::Region; use crate::sequence::model::BatchBuilder; use crate::sequence::model::Model; use crate::sequence::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; -use crate::Select; +use crate::{CoordSystem, Select}; /// A FASTA scanner. /// @@ -18,7 +18,7 @@ use crate::Select; /// /// ```no_run /// use oxbow::sequence::scanner::fasta::Scanner; -/// use oxbow::Select; +/// use oxbow::{CoordSystem, Select}; /// use std::fs::File; /// use std::io::BufReader; /// use noodles::core::Region; @@ -27,7 +27,7 @@ use crate::Select; /// let fmt_reader = noodles::fasta::io::Reader::new(inner); /// let index = noodles::fasta::fai::fs::read("sample.fa.fai").unwrap(); /// -/// let scanner = Scanner::new(Select::All).unwrap(); +/// let scanner = Scanner::new(Select::All, CoordSystem::OneClosed).unwrap(); /// let regions = vec!["chr1:1-1000", "chr1:1001-2000"]; /// let regions: Vec = regions.iter().map(|s| s.parse().unwrap()).collect(); /// let batches = scanner.scan_query(fmt_reader, regions, index, None, Some(2)); @@ -40,8 +40,8 @@ impl Scanner { /// Creates a FASTA scanner from schema parameters. /// /// `fields`: `All` → `["name", "description", "sequence"]`. 
- pub fn new(fields: Select) -> crate::Result { - let model = Model::new_fasta(fields)?; + pub fn new(fields: Select, coord_system: CoordSystem) -> crate::Result { + let model = Model::new_fasta(fields, coord_system)?; Ok(Self { model }) } @@ -127,7 +127,7 @@ mod tests { #[test] fn test_scanner_default() { - let scanner = Scanner::new(Select::All).unwrap(); + let scanner = Scanner::new(Select::All, CoordSystem::OneClosed).unwrap(); assert_eq!( scanner.field_names(), vec!["name", "description", "sequence"] @@ -136,12 +136,12 @@ mod tests { #[test] fn test_scanner_schema() { - let scanner = Scanner::new(Select::All).unwrap(); + let scanner = Scanner::new(Select::All, CoordSystem::OneClosed).unwrap(); assert_eq!(scanner.schema().fields().len(), 3); - let scanner = Scanner::new(Select::Some(vec![ - "name".to_string(), - "sequence".to_string(), - ])) + let scanner = Scanner::new( + Select::Some(vec!["name".to_string(), "sequence".to_string()]), + CoordSystem::OneClosed, + ) .unwrap(); assert_eq!(scanner.schema().fields().len(), 2); } @@ -153,7 +153,7 @@ mod tests { let reader = BufReader::new(file); let fmt_reader = noodles::fasta::io::Reader::new(reader); - let scanner = Scanner::new(Select::All).unwrap(); + let scanner = Scanner::new(Select::All, CoordSystem::OneClosed).unwrap(); let mut batch_iter = scanner.scan(fmt_reader, None, Some(2), Some(10)).unwrap(); let batch = batch_iter.next().unwrap().unwrap(); @@ -175,7 +175,7 @@ mod tests { fai::Record::new("seq3", 12, 24, 13, 13), ]); - let scanner = Scanner::new(Select::All).unwrap(); + let scanner = Scanner::new(Select::All, CoordSystem::OneClosed).unwrap(); let regions = ["seq1:1-4", "seq2:1-4", "seq3:1-4"]; let regions: Vec = regions.iter().map(|s| s.parse().unwrap()).collect(); let mut batch_iter = scanner diff --git a/oxbow/src/variant/model.rs b/oxbow/src/variant/model.rs index 9751103..95b193c 100644 --- a/oxbow/src/variant/model.rs +++ b/oxbow/src/variant/model.rs @@ -9,7 +9,7 @@ use std::sync::Arc; use 
arrow::datatypes::{DataType, Field as ArrowField, Schema, SchemaRef}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; use field::{Field, DEFAULT_FIELD_NAMES}; use genotype::GenotypeDef; use info::InfoDef; @@ -30,6 +30,7 @@ use info::InfoDef; /// - `samples_nested` controls whether genotype columns are wrapped in a /// single `"samples"` struct column (`true`) or are top-level (`false`, /// default). +/// - `coord_system` controls the coordinate system representation of the output. /// /// The model can produce an Arrow schema independently of any file header. /// Use `from_header()` to derive definitions from a VCF header. @@ -41,6 +42,7 @@ pub struct Model { genotype_by: GenotypeBy, samples: Option>, samples_nested: bool, + coord_system: CoordSystem, schema: SchemaRef, } @@ -55,6 +57,7 @@ impl Model { /// - `samples_nested`: if `true`, genotype columns are wrapped in a /// single `"samples"` struct column. If `false` (default), they are /// top-level. + /// - `coord_system`: coordinate system for position column. #[allow(clippy::too_many_arguments)] pub fn new( fields: Select, @@ -63,6 +66,7 @@ impl Model { genotype_by: Option, samples: Option>, samples_nested: Option, + coord_system: CoordSystem, ) -> crate::Result { let field_names = match fields { Select::All => DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect(), @@ -97,6 +101,7 @@ impl Model { genotype_by, samples, samples_nested, + coord_system, schema, }) } @@ -119,6 +124,7 @@ impl Model { genotype_by: Option, samples: Select, samples_nested: Option, + coord_system: CoordSystem, ) -> crate::Result { // Derive info defs from header // Omit → no info column. All/Some → info column present (even if empty struct). @@ -194,6 +200,7 @@ impl Model { genotype_by, samples, samples_nested, + coord_system, ) } @@ -307,6 +314,11 @@ impl Model { self.samples_nested } + /// The output coordinate system for the position column. 
+ pub fn coord_system(&self) -> CoordSystem { + self.coord_system + } + /// The Arrow schema for this model. pub fn schema(&self) -> &SchemaRef { &self.schema @@ -401,6 +413,7 @@ impl Model { Some(self.genotype_by.clone()), samples, Some(self.samples_nested), + self.coord_system, ) } } @@ -413,6 +426,7 @@ impl PartialEq for Model { && self.samples == other.samples && self.genotype_by == other.genotype_by && self.samples_nested == other.samples_nested + && self.coord_system == other.coord_system } } @@ -424,6 +438,8 @@ mod tests { use noodles::vcf::header::record::value::map::{Contig, Format, Info, Map}; use noodles::vcf::Header; + const CS: CoordSystem = CoordSystem::OneClosed; + fn create_test_header() -> Header { Header::builder() .add_contig("sq0", Map::::new()) @@ -436,7 +452,7 @@ mod tests { #[test] fn test_default_model() { - let model = Model::new(Select::All, None, None, None, None, None).unwrap(); + let model = Model::new(Select::All, None, None, None, None, None, CS).unwrap(); assert_eq!(model.field_names().len(), 7); assert!(!model.has_info()); assert!(model.genotype_defs().is_none()); @@ -455,6 +471,7 @@ mod tests { None, Select::All, None, + CS, ) .unwrap(); assert_eq!(model.field_names().len(), 7); @@ -477,6 +494,7 @@ mod tests { None, Select::Some(vec!["sample1".into()]), None, + CS, ) .unwrap(); assert_eq!(model.field_names(), vec!["chrom", "pos"]); @@ -498,6 +516,7 @@ mod tests { None, Select::Omit, None, + CS, ) .unwrap(); assert_eq!(model.field_names().len(), 7); @@ -516,6 +535,7 @@ mod tests { None, None, None, + CS, ) .unwrap(); assert!(!model.has_info()); @@ -534,6 +554,7 @@ mod tests { None, Select::All, None, + CS, ) .unwrap(); let projected = model.project(&["chrom".into(), "pos".into()]).unwrap(); @@ -553,6 +574,7 @@ mod tests { None, Select::All, None, + CS, ) .unwrap(); let projected = model.project(&["chrom".into(), "info".into()]).unwrap(); @@ -571,6 +593,7 @@ mod tests { None, Select::All, None, + CS, ) .unwrap(); let projected = 
model.project(&["chrom".into(), "sample1".into()]).unwrap(); @@ -587,6 +610,7 @@ mod tests { None, None, None, + CS, ); assert!(result.is_err()); } @@ -602,6 +626,7 @@ mod tests { None, Select::All, Some(true), + CS, ) .unwrap(); assert!(model.samples_nested()); @@ -629,6 +654,7 @@ mod tests { None, Select::All, Some(true), + CS, ) .unwrap(); // "samples" is an atomic column in nested mode @@ -650,6 +676,7 @@ mod tests { None, Select::All, Some(true), + CS, ) .unwrap(); let projected = model.project(&["chrom".into(), "info".into()]).unwrap(); @@ -670,6 +697,7 @@ mod tests { None, Select::All, None, + CS, ) .unwrap(); assert!(!model.samples_nested()); @@ -690,6 +718,7 @@ mod tests { None, Select::Omit, None, + CS, ) .unwrap(); assert!(model.has_info()); @@ -714,6 +743,7 @@ mod tests { None, Select::Omit, None, + CS, ) .unwrap(); assert!(!model.has_info()); @@ -731,6 +761,7 @@ mod tests { Some(GenotypeBy::Sample), Select::Some(vec!["sample1".into()]), None, + CS, ) .unwrap(); assert!(model.genotype_defs().is_some()); @@ -756,6 +787,7 @@ mod tests { Some(GenotypeBy::Field), Select::Some(vec!["sample1".into()]), None, + CS, ) .unwrap(); assert_eq!(model.schema().fields().len(), 7); @@ -772,6 +804,7 @@ mod tests { None, Select::Some(vec!["sample1".into()]), Some(true), + CS, ) .unwrap(); assert!(model.genotype_defs().is_none()); @@ -789,6 +822,7 @@ mod tests { None, Select::Some(vec![]), // samples active, none selected Some(true), + CS, ) .unwrap(); assert!(model.samples().is_some()); @@ -814,6 +848,7 @@ mod tests { None, Select::Some(vec![]), // samples active, none selected Some(false), + CS, ) .unwrap(); assert_eq!(model.schema().fields().len(), 7); @@ -830,6 +865,7 @@ mod tests { None, Select::Omit, // samples deactivated Some(true), + CS, ) .unwrap(); assert!(model.samples().is_none()); diff --git a/oxbow/src/variant/model/batch.rs b/oxbow/src/variant/model/batch.rs index 3208b49..dc69349 100644 --- a/oxbow/src/variant/model/batch.rs +++ 
b/oxbow/src/variant/model/batch.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; use arrow::array::{ArrayRef, StructArray}; use arrow::datatypes::{Field as ArrowField, FieldRef, SchemaRef}; @@ -15,6 +15,9 @@ use noodles::vcf::variant::record::samples::Series; use crate::batch::{Push, RecordBatchBuilder}; +/// The coordinate system in which noodles returns variant positions. +const SOURCE_CS: CoordSystem = CoordSystem::OneClosed; + use super::field::Push as _; use super::field::{Field, FieldBuilder}; use super::genotype::{GenotypeDef, SampleStructBuilder, SeriesStructBuilder}; @@ -69,6 +72,7 @@ impl BatchBuilder { Some(genotype_by), sample_names, None, + CoordSystem::OneClosed, )?; Self::from_model(&model, header, capacity) } @@ -85,11 +89,16 @@ impl BatchBuilder { .map(|(name, _)| name.to_string()) .collect(); + let coord_offset = model.coord_system().start_offset_from(SOURCE_CS); + let mut field_builders = IndexMap::new(); for field in model.fields() { let builder = match field { Field::Chrom => FieldBuilder::with_refs(field.clone(), capacity, &ref_names) .map_err(|e| crate::OxbowError::invalid_data(e.to_string()))?, + Field::Pos => { + FieldBuilder::new(field.clone(), capacity).with_coord_offset(coord_offset) + } _ => FieldBuilder::new(field.clone(), capacity), }; field_builders.insert(field.clone(), builder); diff --git a/oxbow/src/variant/model/field.rs b/oxbow/src/variant/model/field.rs index 3879e19..55f046b 100644 --- a/oxbow/src/variant/model/field.rs +++ b/oxbow/src/variant/model/field.rs @@ -92,7 +92,7 @@ impl FromStr for Field { /// Builds an Arrow array (column) corresponding to a variant standard field. 
pub enum FieldBuilder { Chrom(StringDictionaryBuilder), - Pos(Int32Builder), + Pos(Int32Builder, i32), Id(ListBuilder>), Ref(GenericStringBuilder), Alt(ListBuilder>), @@ -104,7 +104,7 @@ impl FieldBuilder { pub fn new(field: Field, capacity: usize) -> Self { match field { Field::Chrom => Self::Chrom(StringDictionaryBuilder::::new()), - Field::Pos => Self::Pos(Int32Builder::with_capacity(capacity)), + Field::Pos => Self::Pos(Int32Builder::with_capacity(capacity), 0), Field::Id => Self::Id(ListBuilder::with_capacity( GenericStringBuilder::new(), capacity, @@ -122,6 +122,16 @@ impl FieldBuilder { } } + /// Sets the coordinate offset for the position field (`pos`). + /// + /// Has no effect on other field variants. + pub fn with_coord_offset(self, offset: i32) -> Self { + match self { + Self::Pos(b, _) => Self::Pos(b, offset), + other => other, + } + } + pub fn with_refs( field: Field, capacity: usize, @@ -150,7 +160,7 @@ impl FieldBuilder { let array = reset_dictarray_builder(builder); Arc::new(array) } - Self::Pos(builder) => Arc::new(builder.finish()), + Self::Pos(builder, _) => Arc::new(builder.finish()), Self::Id(builder) => Arc::new(builder.finish()), Self::Ref(builder) => Arc::new(builder.finish()), Self::Alt(builder) => Arc::new(builder.finish()), @@ -176,13 +186,13 @@ impl Push<&noodles::vcf::Record> for FieldBuilder { let rname = record.reference_sequence_name(); builder.append_value(rname); } - Self::Pos(builder) => { + Self::Pos(builder, offset) => { builder.append_option( record .variant_start() .transpose() .unwrap_or(None) - .map(|x| x.get() as i32), + .map(|x| x.get() as i32 + *offset), ); } Self::Id(builder) => { @@ -268,13 +278,13 @@ impl Push<&noodles::bcf::Record> for FieldBuilder { .map(|(name, _)| name.to_string()); builder.append_option(rname); } - Self::Pos(builder) => { + Self::Pos(builder, offset) => { builder.append_option( record .variant_start() .transpose() .unwrap_or(None) - .map(|x| x.get() as i32), + .map(|x| x.get() as i32 + *offset), 
); } Self::Id(builder) => { diff --git a/oxbow/src/variant/scanner/bcf.rs b/oxbow/src/variant/scanner/bcf.rs index 31c1e83..9573968 100644 --- a/oxbow/src/variant/scanner/bcf.rs +++ b/oxbow/src/variant/scanner/bcf.rs @@ -8,7 +8,7 @@ use noodles::csi::BinningIndex; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; use crate::variant::model::{BatchBuilder, GenotypeBy, Model}; use crate::variant::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; /// A BCF scanner. /// @@ -28,7 +28,8 @@ use crate::{OxbowError, Select}; /// let mut fmt_reader = noodles::bcf::io::Reader::new(inner); /// let header = fmt_reader.read_header().unwrap(); /// -/// let scanner = Scanner::new(header, Select::All, Select::All, Select::All, None, Select::All, None).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new(header, Select::All, Select::All, Select::All, None, Select::All, None, CoordSystem::OneClosed).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -47,6 +48,7 @@ impl Scanner { genotype_by: Option, samples: Select, samples_nested: Option, + coord_system: CoordSystem, ) -> crate::Result { let model = Model::from_header( &header, @@ -56,6 +58,7 @@ impl Scanner { genotype_by, samples, samples_nested, + coord_system, )?; Ok(Self { header, model }) } diff --git a/oxbow/src/variant/scanner/vcf.rs b/oxbow/src/variant/scanner/vcf.rs index 1417b61..c025fa5 100644 --- a/oxbow/src/variant/scanner/vcf.rs +++ b/oxbow/src/variant/scanner/vcf.rs @@ -8,7 +8,7 @@ use noodles::csi::BinningIndex; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; use crate::variant::model::{BatchBuilder, GenotypeBy, Model}; use crate::variant::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; -use crate::{OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Select}; /// A VCF scanner. 
/// @@ -28,7 +28,8 @@ use crate::{OxbowError, Select}; /// let mut fmt_reader = noodles::vcf::io::Reader::new(inner); /// let header = fmt_reader.read_header().unwrap(); /// -/// let scanner = Scanner::new(header, Select::All, Select::All, Select::All, None, Select::All, None).unwrap(); +/// use oxbow::CoordSystem; +/// let scanner = Scanner::new(header, Select::All, Select::All, Select::All, None, Select::All, None, CoordSystem::OneClosed).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -47,6 +48,7 @@ impl Scanner { genotype_by: Option, samples: Select, samples_nested: Option, + coord_system: CoordSystem, ) -> crate::Result { let model = Model::from_header( &header, @@ -56,6 +58,7 @@ impl Scanner { genotype_by, samples, samples_nested, + coord_system, )?; Ok(Self { header, model }) } diff --git a/py-oxbow/oxbow/_core/alignment.py b/py-oxbow/oxbow/_core/alignment.py index 9b7c6a1..31a658e 100644 --- a/py-oxbow/oxbow/_core/alignment.py +++ b/py-oxbow/oxbow/_core/alignment.py @@ -24,6 +24,7 @@ def __init__( *, fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -35,7 +36,7 @@ def __init__( self._regions = regions self._scanner_kwargs = dict( - compressed=compressed, fields=fields, tag_defs=tag_defs + compressed=compressed, fields=fields, tag_defs=tag_defs, coords=coords ) def _tag_discovery_kwargs(self) -> dict: @@ -146,6 +147,7 @@ def __init__( *, fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, reference: str | Callable[[], IO[bytes] | str] | None = None, @@ -159,6 +161,7 @@ def __init__( compressed=compressed, 
fields=fields, tag_defs=tag_defs, + coords=coords, regions=regions, index=index, batch_size=batch_size, @@ -180,6 +183,7 @@ def from_sam( *, fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -248,6 +252,7 @@ def from_sam( compressed=bgzf_compressed, fields=fields, tag_defs=tag_defs, + coords=coords, regions=regions, index=index, batch_size=batch_size, @@ -260,6 +265,7 @@ def from_bam( *, fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -325,6 +331,7 @@ def from_bam( compressed=bgzf_compressed, fields=fields, tag_defs=tag_defs, + coords=coords, regions=regions, index=index, batch_size=batch_size, @@ -336,6 +343,7 @@ def from_cram( *, fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, reference: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, @@ -403,6 +411,7 @@ def from_cram( source=source, fields=fields, tag_defs=tag_defs, + coords=coords, regions=regions, index=index, reference=reference, diff --git a/py-oxbow/oxbow/_core/bbi.py b/py-oxbow/oxbow/_core/bbi.py index 3cdf9ee..c5f30b3 100644 --- a/py-oxbow/oxbow/_core/bbi.py +++ b/py-oxbow/oxbow/_core/bbi.py @@ -74,6 +74,7 @@ def __init__( schema: str = "bed3+", *, fields: Literal["*"] | list[str] | None = "*", + coords: Literal["01", "11"] = "01", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ): @@ -83,7 +84,7 @@ 
def __init__( regions = [regions] self._regions = regions - self._scanner_kwargs = dict(schema=schema, fields=fields) + self._scanner_kwargs = dict(schema=schema, fields=fields, coords=coords) def regions(self, regions: str | list[str]) -> Self: return type(self)( @@ -102,6 +103,7 @@ def __init__( source: str | Callable[[], IO[bytes] | str], *, fields: Literal["*"] | list[str] | None = "*", + coords: Literal["01", "11"] = "01", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ): @@ -111,7 +113,7 @@ def __init__( regions = [regions] self._regions = regions - self._scanner_kwargs = dict(fields=fields) + self._scanner_kwargs = dict(fields=fields, coords=coords) def regions(self, regions: str | list[str]) -> Self: return type(self)( @@ -165,6 +167,7 @@ def from_bigbed( schema: str = "bed3+", *, fields: Literal["*"] | list[str] | None = "*", + coords: Literal["01", "11"] = "01", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ) -> BigBedFile: @@ -207,6 +210,7 @@ def from_bigbed( source=source, schema=schema, fields=fields, + coords=coords, regions=regions, batch_size=batch_size, ) @@ -216,6 +220,7 @@ def from_bigwig( source: str | pathlib.Path | Callable[[], IO[bytes] | str], *, fields: Literal["*"] | list[str] | None = "*", + coords: Literal["01", "11"] = "01", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ) -> BigWigFile: @@ -251,6 +256,7 @@ def from_bigwig( return BigWigFile( source=source, fields=fields, + coords=coords, regions=regions, batch_size=batch_size, ) diff --git a/py-oxbow/oxbow/_core/bed.py b/py-oxbow/oxbow/_core/bed.py index 7ae19de..666446f 100644 --- a/py-oxbow/oxbow/_core/bed.py +++ b/py-oxbow/oxbow/_core/bed.py @@ -30,6 +30,7 @@ def __init__( compressed: bool = False, *, fields: Literal["*"] | list[str] | None = "*", + coords: Literal["01", "11"] = "01", regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, 
batch_size: int = DEFAULT_BATCH_SIZE, @@ -44,6 +45,7 @@ def __init__( bed_schema=bed_schema, compressed=compressed, fields=fields, + coords=coords, ) def _scan_query(self, scanner, region, columns, batch_size): @@ -70,6 +72,7 @@ def from_bed( compression: Literal["infer", "bgzf", "gzip", None] = "infer", *, fields: Literal["*"] | list[str] | None = "*", + coords: Literal["01", "11"] = "01", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -130,6 +133,7 @@ def from_bed( bed_schema=bed_schema, compressed=bgzf_compressed, fields=fields, + coords=coords, regions=regions, index=index, batch_size=batch_size, diff --git a/py-oxbow/oxbow/_core/gxf.py b/py-oxbow/oxbow/_core/gxf.py index b28f893..7ffe22e 100644 --- a/py-oxbow/oxbow/_core/gxf.py +++ b/py-oxbow/oxbow/_core/gxf.py @@ -24,6 +24,7 @@ def __init__( *, fields: Literal["*"] | list[str] | None = "*", attribute_defs: list[tuple[str, str]] | None = None, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -35,7 +36,10 @@ def __init__( self._regions = regions self._scanner_kwargs = dict( - compressed=compressed, fields=fields, attribute_defs=attribute_defs + compressed=compressed, + fields=fields, + attribute_defs=attribute_defs, + coords=coords, ) def _scan_query(self, scanner, region, columns, batch_size): @@ -123,6 +127,7 @@ def from_gtf( *, fields: Literal["*"] | list[str] | None = "*", attribute_defs: list[tuple[str, str]] | None = None, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -190,6 +195,7 @@ def from_gtf( compressed=bgzf_compressed, fields=fields, attribute_defs=attribute_defs, + coords=coords, regions=regions, index=index, 
batch_size=batch_size, @@ -202,6 +208,7 @@ def from_gff( *, fields: Literal["*"] | list[str] | None = "*", attribute_defs: list[tuple[str, str]] | None = None, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, diff --git a/py-oxbow/oxbow/_core/sequence.py b/py-oxbow/oxbow/_core/sequence.py index 7a2c09f..03b0e65 100644 --- a/py-oxbow/oxbow/_core/sequence.py +++ b/py-oxbow/oxbow/_core/sequence.py @@ -88,6 +88,7 @@ def __init__( compressed: bool = False, *, fields: Literal["*"] | list[str] | None = "*", + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, gzi: str | Callable[[], IO[bytes]] | None = None, @@ -102,6 +103,7 @@ def __init__( gzi=gzi, batch_size=batch_size, ) + self._scanner_kwargs["coords"] = coords class FastqFile(SequenceFile): @@ -134,6 +136,7 @@ def from_fasta( compression: Literal["infer", "bgzf", "gzip", None] = "infer", *, fields: Literal["*"] | list[str] | None = "*", + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, gzi: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, @@ -190,6 +193,7 @@ def from_fasta( source=source, compressed=bgzf_compressed, fields=fields, + coords=coords, regions=regions, index=index, gzi=gzi, diff --git a/py-oxbow/oxbow/_core/variant.py b/py-oxbow/oxbow/_core/variant.py index 451bb5e..9ee3b64 100644 --- a/py-oxbow/oxbow/_core/variant.py +++ b/py-oxbow/oxbow/_core/variant.py @@ -28,6 +28,7 @@ def __init__( genotype_by: Literal["sample", "field"] = "sample", samples: Literal["*"] | list[str] | None = None, samples_nested: bool = False, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, batch_size: int = 
DEFAULT_BATCH_SIZE, @@ -46,6 +47,7 @@ def __init__( genotype_by=genotype_by, samples=samples, samples_nested=samples_nested, + coords=coords, ) def _scan_query(self, scanner, region, columns, batch_size): @@ -153,6 +155,7 @@ def from_vcf( genotype_by: Literal["sample", "field"] = "sample", samples: Literal["*"] | list[str] | None = None, samples_nested: bool = False, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -239,6 +242,7 @@ def from_vcf( genotype_by=genotype_by, samples=samples, samples_nested=samples_nested, + coords=coords, regions=regions, index=index, batch_size=batch_size, @@ -255,6 +259,7 @@ def from_bcf( genotype_by: Literal["sample", "field"] = "sample", samples: Literal["*"] | list[str] | None = None, samples_nested: bool = False, + coords: Literal["01", "11"] = "11", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -339,6 +344,7 @@ def from_bcf( genotype_by=genotype_by, samples=samples, samples_nested=samples_nested, + coords=coords, regions=regions, index=index, batch_size=batch_size, diff --git a/py-oxbow/src/alignment.rs b/py-oxbow/src/alignment.rs index 6bd89c9..99454f8 100644 --- a/py-oxbow/src/alignment.rs +++ b/py-oxbow/src/alignment.rs @@ -13,12 +13,13 @@ use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; use crate::util::{ - pyobject_to_bufreader, resolve_cram_index, resolve_fasta_repository, resolve_fields, - resolve_index, PyVirtualPosition, Reader, + pyobject_to_bufreader, resolve_coord_system, resolve_cram_index, resolve_fasta_repository, + resolve_fields, resolve_index, PyVirtualPosition, Reader, }; use oxbow::alignment::{BamScanner, CramScanner, SamScanner}; use oxbow::util::batches_to_ipc; use oxbow::util::index::IndexType; +use oxbow::CoordSystem; /// A SAM file 
scanner. /// @@ -45,20 +46,28 @@ pub struct PySamScanner { #[pymethods] impl PySamScanner { #[new] - #[pyo3(signature = (src, compressed=false, fields=None, tag_defs=None))] + #[pyo3(signature = (src, compressed=false, fields=None, tag_defs=None, coords=None))] fn new( py: Python, src: Py, compressed: bool, fields: Option>, tag_defs: Option>, + coords: Option, ) -> PyResult { let fields = resolve_fields(fields, py)?; + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::sam::io::Reader::new(reader); let header = fmt_reader.read_header()?; let reader = fmt_reader.into_inner(); - let scanner = SamScanner::new(header, fields, tag_defs).map_err(to_py)?; + let scanner = SamScanner::new( + header, + fields, + tag_defs, + coord_system.unwrap_or(CoordSystem::OneClosed), + ) + .map_err(to_py)?; Ok(Self { src, reader, @@ -84,6 +93,7 @@ impl PySamScanner { .collect::>(); kwargs.set_item("tag_defs", tag_defs_raw)?; } + kwargs.set_item("coords", model.coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -483,20 +493,28 @@ pub struct PyBamScanner { #[pymethods] impl PyBamScanner { #[new] - #[pyo3(signature = (src, compressed=true, fields=None, tag_defs=None))] + #[pyo3(signature = (src, compressed=true, fields=None, tag_defs=None, coords=None))] fn new( py: Python, src: Py, compressed: bool, fields: Option>, tag_defs: Option>, + coords: Option, ) -> PyResult { let fields = resolve_fields(fields, py)?; + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::bam::io::Reader::from(reader); let header = fmt_reader.read_header()?; let reader = fmt_reader.into_inner(); - let scanner = BamScanner::new(header, fields, tag_defs).map_err(to_py)?; + let scanner = BamScanner::new( + header, + fields, + tag_defs, + coord_system.unwrap_or(CoordSystem::OneClosed), 
+ ) + .map_err(to_py)?; Ok(Self { src, reader, @@ -522,6 +540,7 @@ impl PyBamScanner { .collect::>(); kwargs.set_item("tag_defs", tag_defs_raw)?; } + kwargs.set_item("coords", model.coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -921,7 +940,8 @@ pub struct PyCramScanner { impl PyCramScanner { #[new] #[allow(unused_variables)] - #[pyo3(signature = (src, compressed=None, fields=None, tag_defs=None, reference=None, reference_index=None))] + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = (src, compressed=None, fields=None, tag_defs=None, reference=None, reference_index=None, coords=None))] fn new( py: Python, src: Py, @@ -930,8 +950,10 @@ impl PyCramScanner { tag_defs: Option>, reference: Option>, reference_index: Option>, + coords: Option, ) -> PyResult { let fields = resolve_fields(fields, py)?; + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let mut fmt_reader = noodles::cram::io::Reader::new(reader); let header = fmt_reader.read_header()?; @@ -941,7 +963,14 @@ impl PyCramScanner { reference.as_ref().map(|r| r.clone_ref(py)), reference_index.as_ref().map(|r| r.clone_ref(py)), )?; - let scanner = CramScanner::new(header, fields, tag_defs, repo).map_err(to_py)?; + let scanner = CramScanner::new( + header, + fields, + tag_defs, + repo, + coord_system.unwrap_or(CoordSystem::OneClosed), + ) + .map_err(to_py)?; Ok(Self { src, reader, @@ -970,6 +999,7 @@ impl PyCramScanner { if let Some(ref reference_index) = self.reference_index { kwargs.set_item("reference_index", reference_index)?; } + kwargs.set_item("coords", model.coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -1166,7 +1196,8 @@ pub fn read_sam( let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::sam::io::Reader::new(reader); let header = fmt_reader.read_header()?; - let scanner = SamScanner::new(header, 
fields, tag_defs).map_err(to_py)?; + let scanner = + SamScanner::new(header, fields, tag_defs, CoordSystem::OneClosed).map_err(to_py)?; let reader = fmt_reader.into_inner(); let ipc = if let Some(region) = region { @@ -1238,7 +1269,8 @@ pub fn read_bam( let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::bam::io::Reader::from(reader); let header = fmt_reader.read_header()?; - let scanner = BamScanner::new(header, fields, tag_defs).map_err(to_py)?; + let scanner = + BamScanner::new(header, fields, tag_defs, CoordSystem::OneClosed).map_err(to_py)?; let reader = fmt_reader.into_inner(); let ipc = if let Some(region) = region { @@ -1311,7 +1343,8 @@ pub fn read_cram( let mut fmt_reader = noodles::cram::io::Reader::new(reader); let header = fmt_reader.read_header()?; let repo = resolve_fasta_repository(py, reference, reference_index)?; - let scanner = CramScanner::new(header, fields, tag_defs, repo).map_err(to_py)?; + let scanner = + CramScanner::new(header, fields, tag_defs, repo, CoordSystem::OneClosed).map_err(to_py)?; let reader = fmt_reader.into_inner(); let ipc = if let Some(region) = region { diff --git a/py-oxbow/src/bbi.rs b/py-oxbow/src/bbi.rs index 9a7ac94..b5c349b 100644 --- a/py-oxbow/src/bbi.rs +++ b/py-oxbow/src/bbi.rs @@ -12,10 +12,11 @@ use bigtools::bed::autosql::parse::parse_autosql; use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; -use crate::util::{pyobject_to_bufreader, resolve_fields, Reader}; +use crate::util::{pyobject_to_bufreader, resolve_coord_system, resolve_fields, Reader}; use oxbow::bbi::model::base::field::FieldDef; use oxbow::bbi::{BBIReader, BBIZoomScanner, BedSchema, BigBedScanner, BigWigScanner}; use oxbow::util::batches_to_ipc; +use oxbow::CoordSystem; #[pyclass(eq, eq_int, from_py_object, module = "oxbow.oxbow")] #[derive(Clone, PartialEq)] @@ -42,13 +43,24 @@ pub struct PyBigWigScanner { #[pymethods] impl PyBigWigScanner { #[new] - #[pyo3(signature = (src, 
fields=None))] - fn new(py: Python, src: Py, fields: Option>) -> PyResult { + #[pyo3(signature = (src, fields=None, coords=None))] + fn new( + py: Python, + src: Py, + fields: Option>, + coords: Option, + ) -> PyResult { + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let fmt_reader = bigtools::BigWigRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); let reader = fmt_reader.into_inner(); - let scanner = BigWigScanner::new(info, resolve_fields(fields, py)?).map_err(to_py)?; + let scanner = BigWigScanner::new( + info, + resolve_fields(fields, py)?, + coord_system.unwrap_or(CoordSystem::ZeroHalfOpen), + ) + .map_err(to_py)?; Ok(Self { _src: src, reader, @@ -64,6 +76,7 @@ impl PyBigWigScanner { let args = (self._src.clone_ref(py),); let kwargs = PyDict::new(py); kwargs.set_item("fields", self.scanner.model().field_names())?; + kwargs.set_item("coords", self.scanner.model().coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -113,6 +126,7 @@ impl PyBigWigScanner { PyBBIFileType::BigWig, zoom_level, fields, + Some(self.scanner.model().coord_system().to_string()), ) }) } @@ -226,13 +240,15 @@ pub struct PyBigBedScanner { #[pymethods] impl PyBigBedScanner { #[new] - #[pyo3(signature = (src, schema=None, fields=None))] + #[pyo3(signature = (src, schema=None, fields=None, coords=None))] fn new( py: Python, src: Py, schema: Option>, fields: Option>, + coords: Option, ) -> PyResult { + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let mut fmt_reader = bigtools::BigBedRead::open(reader).unwrap(); let bed_schema = match &schema { @@ -266,8 +282,13 @@ impl PyBigBedScanner { }; let info = fmt_reader.info().clone(); let reader = fmt_reader.into_inner(); - let scanner = - BigBedScanner::new(bed_schema, info, resolve_fields(fields, py)?).map_err(to_py)?; + let scanner = BigBedScanner::new( + 
bed_schema, + info, + resolve_fields(fields, py)?, + coord_system.unwrap_or(CoordSystem::ZeroHalfOpen), + ) + .map_err(to_py)?; let _schema: Option = schema.as_ref().and_then(|s| s.extract::(py).ok()); Ok(Self { _src: src, @@ -288,6 +309,7 @@ impl PyBigBedScanner { ); let kwargs = PyDict::new(py); kwargs.set_item("fields", self.scanner.model().field_names())?; + kwargs.set_item("coords", self.scanner.model().coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -350,6 +372,7 @@ impl PyBigBedScanner { PyBBIFileType::BigBed, zoom_level, fields, + Some(self.scanner.model().coord_system().to_string()), ) }) } @@ -463,14 +486,16 @@ pub struct PyBBIZoomScanner { #[pymethods] impl PyBBIZoomScanner { #[new] - #[pyo3(signature = (src, bbi_type, zoom_level, fields=None))] + #[pyo3(signature = (src, bbi_type, zoom_level, fields=None, coords=None))] pub fn new( py: Python, src: Py, bbi_type: PyBBIFileType, zoom_level: u32, fields: Option>, + coords: Option, ) -> PyResult { + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), false) .expect("Failed to convert Py to BufReader"); match bbi_type { @@ -494,9 +519,13 @@ impl PyBBIZoomScanner { ))); } let reader = fmt_reader.into_inner(); - let scanner = - BBIZoomScanner::new(ref_names, zoom_level, resolve_fields(fields, py)?) - .map_err(to_py)?; + let scanner = BBIZoomScanner::new( + ref_names, + zoom_level, + resolve_fields(fields, py)?, + coord_system.unwrap_or(CoordSystem::ZeroHalfOpen), + ) + .map_err(to_py)?; Ok(Self { src, reader, @@ -525,9 +554,13 @@ impl PyBBIZoomScanner { ))); } let reader = fmt_reader.into_inner(); - let scanner = - BBIZoomScanner::new(ref_names, zoom_level, resolve_fields(fields, py)?) 
- .map_err(to_py)?; + let scanner = BBIZoomScanner::new( + ref_names, + zoom_level, + resolve_fields(fields, py)?, + coord_system.unwrap_or(CoordSystem::ZeroHalfOpen), + ) + .map_err(to_py)?; Ok(Self { src, reader, @@ -551,6 +584,7 @@ impl PyBBIZoomScanner { ); let kwargs = PyDict::new(py); kwargs.set_item("fields", self.scanner.field_names())?; + kwargs.set_item("coords", self.scanner.model().coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -701,7 +735,9 @@ pub fn read_bigwig( let fmt_reader = bigtools::BigWigRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; let info = fmt_reader.info().clone(); - let scanner = BigWigScanner::new(info, resolve_fields(fields, py)?).map_err(to_py)?; + let scanner = + BigWigScanner::new(info, resolve_fields(fields, py)?, CoordSystem::ZeroHalfOpen) + .map_err(to_py)?; let batches = scanner .scan_query(fmt_reader, region, None, None, None) .map_err(to_py)?; @@ -710,7 +746,9 @@ pub fn read_bigwig( let fmt_reader = bigtools::BigWigRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; let info = fmt_reader.info().clone(); - let scanner = BigWigScanner::new(info, resolve_fields(fields, py)?).map_err(to_py)?; + let scanner = + BigWigScanner::new(info, resolve_fields(fields, py)?, CoordSystem::ZeroHalfOpen) + .map_err(to_py)?; let batches = scanner.scan(fmt_reader, None, None, None).map_err(to_py)?; batches_to_ipc(batches) }; @@ -753,8 +791,13 @@ pub fn read_bigbed( let fmt_reader = bigtools::BigBedRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; let info = fmt_reader.info().clone(); - let scanner = - BigBedScanner::new(bed_schema, info, resolve_fields(fields, py)?).map_err(to_py)?; + let scanner = BigBedScanner::new( + bed_schema, + info, + resolve_fields(fields, py)?, + CoordSystem::ZeroHalfOpen, + ) + .map_err(to_py)?; let batches = scanner .scan_query(fmt_reader, region, None, None, None) .map_err(to_py)?; @@ -763,8 +806,13 @@ pub fn read_bigbed( let fmt_reader = 
bigtools::BigBedRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; let info = fmt_reader.info().clone(); - let scanner = - BigBedScanner::new(bed_schema, info, resolve_fields(fields, py)?).map_err(to_py)?; + let scanner = BigBedScanner::new( + bed_schema, + info, + resolve_fields(fields, py)?, + CoordSystem::ZeroHalfOpen, + ) + .map_err(to_py)?; let batches = scanner.scan(fmt_reader, None, None, None).map_err(to_py)?; batches_to_ipc(batches) }; diff --git a/py-oxbow/src/bed.rs b/py-oxbow/src/bed.rs index a5a31e1..10334f3 100644 --- a/py-oxbow/src/bed.rs +++ b/py-oxbow/src/bed.rs @@ -10,12 +10,14 @@ use pyo3_arrow::PySchema; use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; +use crate::util::resolve_coord_system; use crate::util::{ pyobject_to_bufreader, resolve_fields, resolve_index, PyVirtualPosition, Reader, }; use oxbow::bed::{BedScanner, BedSchema, FieldDef, FieldType}; use oxbow::util::batches_to_ipc; use oxbow::util::index::IndexType; +use oxbow::CoordSystem; /// Extract custom field definitions from a Python list or dict. 
fn extract_custom_defs(obj: &Bound<'_, PyAny>) -> PyResult> { @@ -102,17 +104,24 @@ pub struct PyBedScanner { #[pymethods] impl PyBedScanner { #[new] - #[pyo3(signature = (src, bed_schema, compressed=false, fields=None))] + #[pyo3(signature = (src, bed_schema, compressed=false, fields=None, coords=None))] fn new( py: Python, src: Py, bed_schema: Py, compressed: bool, fields: Option>, + coords: Option, ) -> PyResult { + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let parsed_schema = resolve_bed_schema(py, &bed_schema)?; - let scanner = BedScanner::new(parsed_schema, resolve_fields(fields, py)?).map_err(to_py)?; + let scanner = BedScanner::new( + parsed_schema, + resolve_fields(fields, py)?, + coord_system.unwrap_or(CoordSystem::ZeroHalfOpen), + ) + .map_err(to_py)?; Ok(Self { src, reader, @@ -134,6 +143,7 @@ impl PyBedScanner { let kwargs = PyDict::new(py); kwargs.set_item("compressed", self.compressed)?; kwargs.set_item("fields", model.field_names())?; + kwargs.set_item("coords", model.coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -399,7 +409,12 @@ pub fn read_bed( ) -> PyResult> { let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let bed_schema = resolve_bed_schema(py, &bed_schema)?; - let scanner = BedScanner::new(bed_schema, resolve_fields(fields, py)?).map_err(to_py)?; + let scanner = BedScanner::new( + bed_schema, + resolve_fields(fields, py)?, + CoordSystem::ZeroHalfOpen, + ) + .map_err(to_py)?; let ipc = if let Some(region) = region { let region = region diff --git a/py-oxbow/src/gxf.rs b/py-oxbow/src/gxf.rs index 48ded30..9e168fb 100644 --- a/py-oxbow/src/gxf.rs +++ b/py-oxbow/src/gxf.rs @@ -13,11 +13,13 @@ use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; use crate::util::{ - pyobject_to_bufreader, resolve_fields, resolve_index, PyVirtualPosition, Reader, + pyobject_to_bufreader, 
resolve_coord_system, resolve_fields, resolve_index, PyVirtualPosition, + Reader, }; use oxbow::gxf::{GffScanner, GtfScanner}; use oxbow::util::batches_to_ipc; use oxbow::util::index::IndexType; +use oxbow::CoordSystem; /// A GTF file scanner. /// @@ -43,18 +45,26 @@ pub struct PyGtfScanner { #[pymethods] impl PyGtfScanner { #[new] - #[pyo3(signature = (src, compressed=false, fields=None, attribute_defs=None))] + #[pyo3(signature = (src, compressed=false, fields=None, attribute_defs=None, coords=None))] fn new( py: Python, src: Py, compressed: Option, fields: Option>, attribute_defs: Option>, + coords: Option, ) -> PyResult { let fields = resolve_fields(fields, py)?; + let coord_system = resolve_coord_system(coords)?; let compressed = compressed.unwrap_or(false); let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; - let scanner = GtfScanner::new(None, fields, attribute_defs).map_err(to_py)?; + let scanner = GtfScanner::new( + None, + fields, + attribute_defs, + coord_system.unwrap_or(CoordSystem::OneClosed), + ) + .map_err(to_py)?; Ok(Self { src, reader, @@ -76,6 +86,7 @@ impl PyGtfScanner { let attr_defs: Vec<_> = defs.iter().map(|d| d.to_tuple()).collect(); kwargs.set_item("attribute_defs", attr_defs)?; } + kwargs.set_item("coords", model.coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -359,18 +370,26 @@ pub struct PyGffScanner { #[pymethods] impl PyGffScanner { #[new] - #[pyo3(signature = (src, compressed=false, fields=None, attribute_defs=None))] + #[pyo3(signature = (src, compressed=false, fields=None, attribute_defs=None, coords=None))] fn new( py: Python, src: Py, compressed: Option, fields: Option>, attribute_defs: Option>, + coords: Option, ) -> PyResult { let fields = resolve_fields(fields, py)?; + let coord_system = resolve_coord_system(coords)?; let compressed = compressed.unwrap_or(false); let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; - let scanner = 
GffScanner::new(None, fields, attribute_defs).map_err(to_py)?; + let scanner = GffScanner::new( + None, + fields, + attribute_defs, + coord_system.unwrap_or(CoordSystem::OneClosed), + ) + .map_err(to_py)?; Ok(Self { src, reader, @@ -392,6 +411,7 @@ impl PyGffScanner { let attr_defs: Vec<_> = defs.iter().map(|d| d.to_tuple()).collect(); kwargs.set_item("attribute_defs", attr_defs)?; } + kwargs.set_item("coords", model.coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -680,7 +700,8 @@ pub fn read_gtf( ) -> PyResult> { let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; - let scanner = GtfScanner::new(None, fields, attr_defs).map_err(to_py)?; + let scanner = + GtfScanner::new(None, fields, attr_defs, CoordSystem::OneClosed).map_err(to_py)?; let ipc = if let Some(region) = region { let region = region @@ -749,7 +770,8 @@ pub fn read_gff( ) -> PyResult> { let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; - let scanner = GffScanner::new(None, fields, attr_defs).map_err(to_py)?; + let scanner = + GffScanner::new(None, fields, attr_defs, CoordSystem::OneClosed).map_err(to_py)?; let ipc = if let Some(region) = region { let region = region diff --git a/py-oxbow/src/sequence.rs b/py-oxbow/src/sequence.rs index ca77acc..eac98ae 100644 --- a/py-oxbow/src/sequence.rs +++ b/py-oxbow/src/sequence.rs @@ -14,7 +14,8 @@ use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; use crate::util::{ - pyobject_to_bufreader, resolve_faidx, resolve_fields, PyVirtualPosition, Reader, + pyobject_to_bufreader, resolve_coord_system, resolve_faidx, resolve_fields, PyVirtualPosition, + Reader, }; use oxbow::sequence::{FastaScanner, FastqScanner}; use oxbow::util::batches_to_ipc; @@ -224,16 +225,22 @@ pub struct PyFastaScanner { #[pymethods] impl PyFastaScanner { #[new] - #[pyo3(signature = (src, compressed=false, 
fields=None))] + #[pyo3(signature = (src, compressed=false, fields=None, coords=None))] fn new( py: Python, src: Py, compressed: bool, fields: Option>, + coords: Option, ) -> PyResult { let fields = resolve_fields(fields, py)?; + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; - let scanner = FastaScanner::new(fields).map_err(to_py)?; + let scanner = FastaScanner::new( + fields, + coord_system.unwrap_or(oxbow::CoordSystem::OneClosed), + ) + .map_err(to_py)?; Ok(Self { src, reader, @@ -250,6 +257,7 @@ impl PyFastaScanner { let args = (self.src.clone_ref(py), self.compressed.into_py_any(py)?); let kwargs = PyDict::new(py); kwargs.set_item("fields", self.scanner.model().field_names())?; + kwargs.set_item("coords", self.scanner.model().coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -455,7 +463,7 @@ pub fn read_fasta( ) -> PyResult> { let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; - let scanner = FastaScanner::new(fields).map_err(to_py)?; + let scanner = FastaScanner::new(fields, oxbow::CoordSystem::OneClosed).map_err(to_py)?; let ipc = if let Some(regions) = regions { let index = resolve_faidx(py, &src, index)?; diff --git a/py-oxbow/src/util.rs index 3984557..d621ec0 100644 --- a/py-oxbow/src/util.rs +++ b/py-oxbow/src/util.rs @@ -5,7 +5,7 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{PyAny, PyString}; -use oxbow::Select; +use oxbow::{CoordSystem, Select}; use noodles::bgzf::gzi::Index as GzIndex; use noodles::bgzf::io::Seek as BgzfSeek; @@ -408,3 +408,13 @@ pub fn resolve_fields(fields: Option<Py<PyAny>>, py: Python) -> PyResult<Select> + +pub fn resolve_coord_system(coords: Option<String>) -> PyResult<Option<CoordSystem>> { + match coords { + None => Ok(None), + Some(s) => s + .parse::<CoordSystem>() + .map(Some) + .map_err(|e| PyErr::new::<PyValueError, _>(e.to_string())), + } +} diff --git a/py-oxbow/src/variant.rs index
10ee3ec..f450892 100644 --- a/py-oxbow/src/variant.rs +++ b/py-oxbow/src/variant.rs @@ -11,11 +11,13 @@ use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; use crate::util::{ - pyobject_to_bufreader, resolve_fields, resolve_index, PyVirtualPosition, Reader, + pyobject_to_bufreader, resolve_coord_system, resolve_fields, resolve_index, PyVirtualPosition, + Reader, }; use oxbow::util::batches_to_ipc; use oxbow::util::index::IndexType; use oxbow::variant::{BcfScanner, GenotypeBy, VcfScanner}; +use oxbow::CoordSystem; /// A VCF file scanner. /// @@ -53,7 +55,7 @@ pub struct PyVcfScanner { #[pymethods] impl PyVcfScanner { #[new] - #[pyo3(signature = (src, compressed=false, fields=None, info_fields=None, genotype_fields=None, genotype_by=None, samples=None,samples_nested=false))] + #[pyo3(signature = (src, compressed=false, fields=None, info_fields=None, genotype_fields=None, genotype_by=None, samples=None, samples_nested=false, coords=None))] #[allow(clippy::too_many_arguments)] fn new( py: Python, @@ -65,7 +67,9 @@ impl PyVcfScanner { genotype_by: Option, samples: Option>, samples_nested: bool, + coords: Option, ) -> PyResult { + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::vcf::io::Reader::new(reader); let header = fmt_reader.read_header()?; @@ -79,6 +83,7 @@ impl PyVcfScanner { gt_by, resolve_fields(samples, py)?, Some(samples_nested), + coord_system.unwrap_or(CoordSystem::OneClosed), ) .map_err(to_py)?; Ok(Self { @@ -115,6 +120,7 @@ impl PyVcfScanner { }; kwargs.set_item("genotype_by", gt_by)?; kwargs.set_item("samples_nested", model.samples_nested())?; + kwargs.set_item("coords", model.coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -421,7 +427,7 @@ pub struct PyBcfScanner { #[pymethods] impl PyBcfScanner { #[new] - #[pyo3(signature = (src, compressed=true, fields=None, info_fields=None, 
genotype_fields=None, genotype_by=None, samples=None, samples_nested=false))] + #[pyo3(signature = (src, compressed=true, fields=None, info_fields=None, genotype_fields=None, genotype_by=None, samples=None, samples_nested=false, coords=None))] #[allow(clippy::too_many_arguments)] fn new( py: Python, @@ -433,7 +439,9 @@ impl PyBcfScanner { genotype_by: Option, samples: Option>, samples_nested: bool, + coords: Option, ) -> PyResult { + let coord_system = resolve_coord_system(coords)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::bcf::io::Reader::from(reader); let header = fmt_reader.read_header()?; @@ -447,6 +455,7 @@ impl PyBcfScanner { gt_by, resolve_fields(samples, py)?, Some(samples_nested), + coord_system.unwrap_or(CoordSystem::OneClosed), ) .map_err(to_py)?; Ok(Self { @@ -483,6 +492,7 @@ impl PyBcfScanner { }; kwargs.set_item("genotype_by", gt_by)?; kwargs.set_item("samples_nested", model.samples_nested())?; + kwargs.set_item("coords", model.coord_system().to_string())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -824,6 +834,7 @@ pub fn read_vcf( genotype_by, resolve_fields(samples, py)?, Some(samples_nested), + CoordSystem::OneClosed, ) .map_err(to_py)?; @@ -924,6 +935,7 @@ pub fn read_bcf( genotype_by, resolve_fields(samples, py)?, Some(samples_nested), + CoordSystem::OneClosed, ) .map_err(to_py)?; diff --git a/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_init_callstack.yaml b/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_init_callstack.yaml index e54d727..bcc0232 100644 --- a/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_init_callstack.yaml +++ b/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_init_callstack.yaml @@ -1,20 +1,20 @@ CramFile("data/does-not-exist.cram"): |- -> oxbow._core.alignment.CramFile.__init__("data/does-not-exist.cram") - -> oxbow._core.alignment.AlignmentFile.__init__("data/does-not-exist.cram", compressed=False, 
fields="*", tag_defs=None, regions=None, index=None, batch_size=131072) + -> oxbow._core.alignment.AlignmentFile.__init__("data/does-not-exist.cram", compressed=False, fields="*", tag_defs=None, coords="11", regions=None, index=None, batch_size=131072) -> oxbow._core.base.DataSource.__init__("data/does-not-exist.cram", None, 131072) <- None <- None <- None CramFile("data/malformed.cram"): |- -> oxbow._core.alignment.CramFile.__init__("data/malformed.cram") - -> oxbow._core.alignment.AlignmentFile.__init__("data/malformed.cram", compressed=False, fields="*", tag_defs=None, regions=None, index=None, batch_size=131072) + -> oxbow._core.alignment.AlignmentFile.__init__("data/malformed.cram", compressed=False, fields="*", tag_defs=None, coords="11", regions=None, index=None, batch_size=131072) -> oxbow._core.base.DataSource.__init__("data/malformed.cram", None, 131072) <- None <- None <- None CramFile("data/sample.cram"): |- -> oxbow._core.alignment.CramFile.__init__("data/sample.cram") - -> oxbow._core.alignment.AlignmentFile.__init__("data/sample.cram", compressed=False, fields="*", tag_defs=None, regions=None, index=None, batch_size=131072) + -> oxbow._core.alignment.AlignmentFile.__init__("data/sample.cram", compressed=False, fields="*", tag_defs=None, coords="11", regions=None, index=None, batch_size=131072) -> oxbow._core.base.DataSource.__init__("data/sample.cram", None, 131072) <- None <- None diff --git a/py-oxbow/tests/manifests/test_bed.TestBedFile.test_batches.yaml b/py-oxbow/tests/manifests/test_bed.TestBedFile.test_batches.yaml index 02dd333..25ed295 100644 --- a/py-oxbow/tests/manifests/test_bed.TestBedFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_bed.TestBedFile.test_batches.yaml @@ -135,72 +135,72 @@ fields=('chrom', 'start', 'end'): - 500000 - 750000 start: - - 1100001 - - 1550001 - - 1900001 - - 50001 - - 250001 - - 650001 - - 200001 - - 800001 - - 900001 - - 50001 - - 650001 - - 700001 - - 19300001 - - 19550001 - - 19650001 - - 19850001 
- - 20000001 - - 20250001 - - 20650001 - - 22750001 - - 23450001 - - 50001 - - 200001 - - 300001 - - 150001 - - 350001 - - 500001 - - 150001 - - 850001 - - 2650001 - - 250001 - - 900001 - - 1000001 - - 1 - - 300001 - - 450001 - - 50001 - - 200001 - - 300001 - - 14050001 - - 14100001 - - 14250001 - - 16700001 - - 16900001 - - 17100001 - - 1 - - 3100001 - - 3200001 - - 50001 - - 350001 - - 450001 - - 1 - - 100001 - - 150001 - - 150001 - - 250001 - - 400001 - - 50001 - - 250001 - - 450001 - - 200001 - - 250001 - - 450001 - - 200001 - - 350001 - - 500001 + - 1100000 + - 1550000 + - 1900000 + - 50000 + - 250000 + - 650000 + - 200000 + - 800000 + - 900000 + - 50000 + - 650000 + - 700000 + - 19300000 + - 19550000 + - 19650000 + - 19850000 + - 20000000 + - 20250000 + - 20650000 + - 22750000 + - 23450000 + - 50000 + - 200000 + - 300000 + - 150000 + - 350000 + - 500000 + - 150000 + - 850000 + - 2650000 + - 250000 + - 900000 + - 1000000 + - 0 + - 300000 + - 450000 + - 50000 + - 200000 + - 300000 + - 14050000 + - 14100000 + - 14250000 + - 16700000 + - 16900000 + - 17100000 + - 0 + - 3100000 + - 3200000 + - 50000 + - 350000 + - 450000 + - 0 + - 100000 + - 150000 + - 150000 + - 250000 + - 400000 + - 50000 + - 250000 + - 450000 + - 200000 + - 250000 + - 450000 + - 200000 + - 350000 + - 500000 fields=('nonexistent-field',): 'Field ''nonexistent-field'' not in BED schema. 
Available: ["chrom", "start", "end", "rest"]' fields=*: @@ -407,71 +407,71 @@ fields=*: - "A3\t.\t.\t350000\t500000\t255,185,0" - "A3\t.\t.\t500000\t750000\t255,185,0" start: - - 1100001 - - 1550001 - - 1900001 - - 50001 - - 250001 - - 650001 - - 200001 - - 800001 - - 900001 - - 50001 - - 650001 - - 700001 - - 19300001 - - 19550001 - - 19650001 - - 19850001 - - 20000001 - - 20250001 - - 20650001 - - 22750001 - - 23450001 - - 50001 - - 200001 - - 300001 - - 150001 - - 350001 - - 500001 - - 150001 - - 850001 - - 2650001 - - 250001 - - 900001 - - 1000001 - - 1 - - 300001 - - 450001 - - 50001 - - 200001 - - 300001 - - 14050001 - - 14100001 - - 14250001 - - 16700001 - - 16900001 - - 17100001 - - 1 - - 3100001 - - 3200001 - - 50001 - - 350001 - - 450001 - - 1 - - 100001 - - 150001 - - 150001 - - 250001 - - 400001 - - 50001 - - 250001 - - 450001 - - 200001 - - 250001 - - 450001 - - 200001 - - 350001 - - 500001 + - 1100000 + - 1550000 + - 1900000 + - 50000 + - 250000 + - 650000 + - 200000 + - 800000 + - 900000 + - 50000 + - 650000 + - 700000 + - 19300000 + - 19550000 + - 19650000 + - 19850000 + - 20000000 + - 20250000 + - 20650000 + - 22750000 + - 23450000 + - 50000 + - 200000 + - 300000 + - 150000 + - 350000 + - 500000 + - 150000 + - 850000 + - 2650000 + - 250000 + - 900000 + - 1000000 + - 0 + - 300000 + - 450000 + - 50000 + - 200000 + - 300000 + - 14050000 + - 14100000 + - 14250000 + - 16700000 + - 16900000 + - 17100000 + - 0 + - 3100000 + - 3200000 + - 50000 + - 350000 + - 450000 + - 0 + - 100000 + - 150000 + - 150000 + - 250000 + - 400000 + - 50000 + - 250000 + - 450000 + - 200000 + - 250000 + - 450000 + - 200000 + - 350000 + - 500000 fields=None: batch-00: {} diff --git a/py-oxbow/tests/manifests/test_scanners.TestPyBedScanner.test_scan.yaml b/py-oxbow/tests/manifests/test_scanners.TestPyBedScanner.test_scan.yaml index 929cb6c..b94ae2d 100644 --- a/py-oxbow/tests/manifests/test_scanners.TestPyBedScanner.test_scan.yaml +++ 
b/py-oxbow/tests/manifests/test_scanners.TestPyBedScanner.test_scan.yaml @@ -4,7 +4,7 @@ batch_size=1, bed_schema="bed3": end: - 1200000 start: - - 1100001 + - 1100000 batch_size=2, bed_schema="bed3": chrom: - chr1 @@ -13,8 +13,8 @@ batch_size=2, bed_schema="bed3": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed3": chrom: - chr1 @@ -23,8 +23,8 @@ batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed3": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed3+": chrom: - chr1 @@ -33,8 +33,8 @@ batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed3+": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed3+3": chrom: - chr1 @@ -43,8 +43,8 @@ batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed3+3": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed3+6": chrom: - chr1 @@ -53,8 +53,8 @@ batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed3+6": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed6": chrom: - chr1 @@ -63,8 +63,8 @@ batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed6": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed6+": chrom: - chr1 @@ -73,8 +73,8 @@ batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed6+": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed9": chrom: - chr1 @@ -83,8 +83,8 @@ batch_size=2, columns=("chrom", "start", "end"), bed_schema="bed9": - 1200000 - 1600000 
start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=None, bed_schema="bed3": chrom: - chr1 @@ -93,8 +93,8 @@ batch_size=2, columns=None, bed_schema="bed3": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=None, bed_schema="bed3+": chrom: - chr1 @@ -106,8 +106,8 @@ batch_size=2, columns=None, bed_schema="bed3+": - "A1\t.\t.\t1100000\t1200000\t226,56,56" - "A1\t.\t.\t1550000\t1600000\t226,56,56" start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=None, bed_schema="bed3+3": BED3+1: - A1 @@ -125,8 +125,8 @@ batch_size=2, columns=None, bed_schema="bed3+3": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=None, bed_schema="bed3+6": BED3+1: - A1 @@ -153,8 +153,8 @@ batch_size=2, columns=None, bed_schema="bed3+6": - 1200000 - 1600000 start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 batch_size=2, columns=None, bed_schema="bed6": chrom: - chr1 @@ -169,8 +169,8 @@ batch_size=2, columns=None, bed_schema="bed6": - null - null start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 strand: - null - null @@ -191,8 +191,8 @@ batch_size=2, columns=None, bed_schema="bed6+": - null - null start: - - 1100001 - - 1550001 + - 1100000 + - 1550000 strand: - null - null @@ -217,221 +217,8 @@ batch_size=2, columns=None, bed_schema="bed9": - null - null start: - - 1100001 - - 1550001 - strand: - - null - - null - thickEnd: - - 1200000 - - 1600000 - thickStart: - 1100000 - 1550000 -batch_size=2, fields=("chrom", "start", "end"), bed_schema="bed3": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 1550001 -batch_size=2, fields=("chrom", "start", "end"), bed_schema="bed3+": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 1550001 -batch_size=2, fields=("chrom", "start", "end"), bed_schema="bed3+3": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 
1550001 -batch_size=2, fields=("chrom", "start", "end"), bed_schema="bed3+6": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 1550001 -batch_size=2, fields=("chrom", "start", "end"), bed_schema="bed6": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 1550001 -batch_size=2, fields=("chrom", "start", "end"), bed_schema="bed6+": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 1550001 -batch_size=2, fields=("chrom", "start", "end"), bed_schema="bed9": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 1550001 -batch_size=2, fields=None, bed_schema="bed3": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 1550001 -batch_size=2, fields=None, bed_schema="bed3+": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - rest: - - "A1\t.\t.\t1100000\t1200000\t226,56,56" - - "A1\t.\t.\t1550000\t1600000\t226,56,56" - start: - - 1100001 - - 1550001 -batch_size=2, fields=None, bed_schema="bed3+3": - BED3+1: - - A1 - - A1 - BED3+2: - - . - - . - BED3+3: - - . - - . - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 1550001 -batch_size=2, fields=None, bed_schema="bed3+6": - BED3+1: - - A1 - - A1 - BED3+2: - - . - - . - BED3+3: - - . - - . 
- BED3+4: - - '1100000' - - '1550000' - BED3+5: - - '1200000' - - '1600000' - BED3+6: - - 226,56,56 - - 226,56,56 - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - start: - - 1100001 - - 1550001 -batch_size=2, fields=None, bed_schema="bed6": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - name: - - A1 - - A1 - score: - - null - - null - start: - - 1100001 - - 1550001 - strand: - - null - - null -batch_size=2, fields=None, bed_schema="bed6+": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - name: - - A1 - - A1 - rest: - - "A1\t.\t.\t1100000\t1200000\t226,56,56" - - "A1\t.\t.\t1550000\t1600000\t226,56,56" - score: - - null - - null - start: - - 1100001 - - 1550001 - strand: - - null - - null -batch_size=2, fields=None, bed_schema="bed9": - chrom: - - chr1 - - chr1 - end: - - 1200000 - - 1600000 - itemRgb: - - - 226 - - 56 - - 56 - - - 226 - - 56 - - 56 - name: - - A1 - - A1 - score: - - null - - null - start: - - 1100001 - - 1550001 strand: - null - null @@ -451,9 +238,9 @@ batch_size=3, bed_schema="bed3": - 1600000 - 2450000 start: - - 1100001 - - 1550001 - - 1900001 + - 1100000 + - 1550000 + - 1900000 batch_size=4, bed_schema="bed3": chrom: - chr1 @@ -466,7 +253,7 @@ batch_size=4, bed_schema="bed3": - 2450000 - 250000 start: - - 1100001 - - 1550001 - - 1900001 - - 50001 + - 1100000 + - 1550000 + - 1900000 + - 50000 diff --git a/py-oxbow/tests/test_coords.py b/py-oxbow/tests/test_coords.py new file mode 100644 index 0000000..bbc645c --- /dev/null +++ b/py-oxbow/tests/test_coords.py @@ -0,0 +1,153 @@ +"""Tests for the coords parameter on scanner classes. + +Each format has a native coordinate system: +- SAM/BAM/CRAM, VCF/BCF, GFF/GTF: 1-based closed (default "11") +- BED, BigBed, BigWig: 0-based half-open (default "01") + +Passing coords="01" to a 1-based format shifts start positions by -1. +Passing coords="11" to a 0-based format shifts start positions by +1. 
+""" + +import pyarrow as pa + +import oxbow.oxbow as ox + + +def _read_pos(scanner, col="pos"): + """Read a single batch from a scanner and return the named column as a list.""" + stream = scanner.scan() + schema = scanner.schema() + reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) + batch = reader.read_all() + return batch.column(col).to_pylist() + + +# -- Alignment (1-based native) ------------------------------------------------ + + +class TestSamCoords: + def test_default_is_one_based(self): + scanner = ox.PySamScanner("data/sample.sam", fields=["pos"]) + pos = _read_pos(scanner) + assert pos == [16, 29, 37] + + def test_zero_based(self): + scanner = ox.PySamScanner("data/sample.sam", fields=["pos"], coords="01") + pos = _read_pos(scanner) + assert pos == [15, 28, 36] + + def test_explicit_one_based_matches_default(self): + default = _read_pos(ox.PySamScanner("data/sample.sam", fields=["pos"])) + explicit = _read_pos( + ox.PySamScanner("data/sample.sam", fields=["pos"], coords="11") + ) + assert default == explicit + + +class TestBamCoords: + def test_zero_based_shifts_by_minus_one(self): + default = _read_pos(ox.PyBamScanner("data/sample.bam", fields=["pos"])) + zero_based = _read_pos( + ox.PyBamScanner("data/sample.bam", fields=["pos"], coords="01") + ) + assert zero_based == [v - 1 for v in default] + + +# -- Variant (1-based native) -------------------------------------------------- + + +class TestVcfCoords: + def test_default_is_one_based(self): + scanner = ox.PyVcfScanner("data/sample.vcf", fields=["pos"]) + pos = _read_pos(scanner) + # Just check the offset relationship, not exact values + assert all(isinstance(v, int) for v in pos) + + def test_zero_based_shifts_by_minus_one(self): + default = _read_pos(ox.PyVcfScanner("data/sample.vcf", fields=["pos"])) + zero_based = _read_pos( + ox.PyVcfScanner("data/sample.vcf", fields=["pos"], coords="01") + ) + assert zero_based == [v - 1 for v in default] + + +# -- GXF (1-based 
native) ------------------------------------------------------ + + +class TestGtfCoords: + def test_default_is_one_based(self): + scanner = ox.PyGtfScanner("data/sample.gtf", fields=["start"]) + start = _read_pos(scanner, col="start") + assert all(isinstance(v, int) for v in start) + + def test_zero_based_shifts_by_minus_one(self): + default = _read_pos( + ox.PyGtfScanner("data/sample.gtf", fields=["start"]), col="start" + ) + zero_based = _read_pos( + ox.PyGtfScanner("data/sample.gtf", fields=["start"], coords="01"), + col="start", + ) + assert zero_based == [v - 1 for v in default] + + +class TestGffCoords: + def test_zero_based_shifts_by_minus_one(self): + default = _read_pos( + ox.PyGffScanner("data/sample.gff", fields=["start"]), col="start" + ) + zero_based = _read_pos( + ox.PyGffScanner("data/sample.gff", fields=["start"], coords="01"), + col="start", + ) + assert zero_based == [v - 1 for v in default] + + +# -- BED (0-based native) ------------------------------------------------------ + + +class TestBedCoords: + def test_default_is_zero_based(self): + scanner = ox.PyBedScanner("data/sample.bed", "bed9", fields=["start"]) + start = _read_pos(scanner, col="start") + # BED file has 0-based starts; noodles converts to 1-based Position + # internally, then the default ZeroHalfOpen coord_system subtracts 1 + # back to 0-based. So default output == file values. 
+ assert start[0] == 1100000 + + def test_one_based_shifts_by_plus_one(self): + default = _read_pos( + ox.PyBedScanner("data/sample.bed", "bed9", fields=["start"]), col="start" + ) + one_based = _read_pos( + ox.PyBedScanner("data/sample.bed", "bed9", fields=["start"], coords="11"), + col="start", + ) + assert one_based == [v + 1 for v in default] + + +# -- BBI (0-based native) ------------------------------------------------------ + + +class TestBigWigCoords: + def test_one_based_shifts_by_plus_one(self): + default = _read_pos( + ox.PyBigWigScanner("data/sample.bw", fields=["start"]), col="start" + ) + one_based = _read_pos( + ox.PyBigWigScanner("data/sample.bw", fields=["start"], coords="11"), + col="start", + ) + assert one_based == [v + 1 for v in default] + + +class TestBigBedCoords: + def test_one_based_shifts_by_plus_one(self): + default = _read_pos( + ox.PyBigBedScanner("data/sample.bb", fields=["start"]), col="start" + ) + one_based = _read_pos( + ox.PyBigBedScanner("data/sample.bb", fields=["start"], coords="11"), + col="start", + ) + assert one_based == [v + 1 for v in default] diff --git a/r-oxbow/src/rust/src/lib.rs b/r-oxbow/src/rust/src/lib.rs index e92e353..dc9cb1f 100644 --- a/r-oxbow/src/rust/src/lib.rs +++ b/r-oxbow/src/rust/src/lib.rs @@ -13,7 +13,7 @@ use oxbow::gxf::{GffScanner, GtfScanner}; use oxbow::sequence::{FastaScanner, FastqScanner}; use oxbow::util::batches_to_ipc; use oxbow::variant::{BcfScanner, GenotypeBy, VcfScanner}; -use oxbow::Select; +use oxbow::{CoordSystem, Select}; pub const BUFFER_SIZE_BYTES: usize = const { 1024 * 1024 }; @@ -66,7 +66,7 @@ fn read_fasta_impl( let reader = std::fs::File::open(path) .map(|f| BufReader::with_capacity(BUFFER_SIZE_BYTES, f)) .unwrap(); - let scanner = FastaScanner::new(resolve_r_fields(fields)).unwrap(); + let scanner = FastaScanner::new(resolve_r_fields(fields), CoordSystem::OneClosed).unwrap(); let ipc = if let Some(regions) = regions { let index_path = index.unwrap_or(format!("{}.fai", 
path)); @@ -125,7 +125,13 @@ pub fn read_sam_impl( let mut fmt_reader = noodles::sam::io::Reader::new(bgzf_reader); let header = fmt_reader.read_header().unwrap(); let tag_defs = SamScanner::tag_defs(&mut fmt_reader, scan_rows).unwrap(); - let scanner = SamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); + let scanner = SamScanner::new( + header, + resolve_r_fields(fields), + Some(tag_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner .scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -137,7 +143,13 @@ pub fn read_sam_impl( let pos = fmt_reader.get_mut().virtual_position(); let tag_defs = SamScanner::tag_defs(&mut fmt_reader, scan_rows).unwrap(); fmt_reader.get_mut().seek(pos).unwrap(); - let scanner = SamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); + let scanner = SamScanner::new( + header, + resolve_r_fields(fields), + Some(tag_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) } else { @@ -149,7 +161,13 @@ pub fn read_sam_impl( .get_mut() .seek(std::io::SeekFrom::Start(pos)) .unwrap(); - let scanner = SamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); + let scanner = SamScanner::new( + header, + resolve_r_fields(fields), + Some(tag_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -181,7 +199,13 @@ pub fn read_bam_impl( let mut fmt_reader = noodles::bam::io::Reader::from(bgzf_reader); let header = fmt_reader.read_header().unwrap(); let tag_defs = BamScanner::tag_defs(&mut fmt_reader, scan_rows).unwrap(); - let scanner = BamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); + let scanner = BamScanner::new( + header, + resolve_r_fields(fields), + Some(tag_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner 
.scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -193,7 +217,13 @@ pub fn read_bam_impl( let pos = fmt_reader.get_mut().virtual_position(); let tag_defs = BamScanner::tag_defs(&mut fmt_reader, scan_rows).unwrap(); fmt_reader.get_mut().seek(pos).unwrap(); - let scanner = BamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); + let scanner = BamScanner::new( + header, + resolve_r_fields(fields), + Some(tag_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) } else { @@ -205,7 +235,13 @@ pub fn read_bam_impl( .get_mut() .seek(std::io::SeekFrom::Start(pos)) .unwrap(); - let scanner = BamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); + let scanner = BamScanner::new( + header, + resolve_r_fields(fields), + Some(tag_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -256,8 +292,14 @@ pub fn read_cram_impl( .build_from_reader(reader); let header = fmt_reader.read_header().unwrap(); let tag_defs = CramScanner::tag_defs(&mut fmt_reader, &header, scan_rows).unwrap(); - let scanner = - CramScanner::new(header, resolve_r_fields(fields), Some(tag_defs), repo).unwrap(); + let scanner = CramScanner::new( + header, + resolve_r_fields(fields), + Some(tag_defs), + repo, + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner .scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -270,8 +312,14 @@ pub fn read_cram_impl( let pos = fmt_reader.position().unwrap(); let tag_defs = CramScanner::tag_defs(&mut fmt_reader, &header, scan_rows).unwrap(); fmt_reader.seek(std::io::SeekFrom::Start(pos)).unwrap(); - let scanner = - CramScanner::new(header, resolve_r_fields(fields), Some(tag_defs), repo).unwrap(); + let scanner = CramScanner::new( + header, + resolve_r_fields(fields), + Some(tag_defs), + repo, + 
CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -318,6 +366,7 @@ pub fn read_vcf_impl( Some(genotype_by), resolve_r_fields(samples), Some(samples_nested), + CoordSystem::OneClosed, ) .unwrap(); let batches = scanner @@ -336,6 +385,7 @@ pub fn read_vcf_impl( Some(genotype_by), resolve_r_fields(samples), Some(samples_nested), + CoordSystem::OneClosed, ) .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); @@ -351,6 +401,7 @@ pub fn read_vcf_impl( Some(genotype_by), resolve_r_fields(samples), Some(samples_nested), + CoordSystem::OneClosed, ) .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); @@ -399,6 +450,7 @@ pub fn read_bcf_impl( Some(genotype_by), resolve_r_fields(samples), Some(samples_nested), + CoordSystem::OneClosed, ) .unwrap(); let batches = scanner @@ -417,6 +469,7 @@ pub fn read_bcf_impl( Some(genotype_by), resolve_r_fields(samples), Some(samples_nested), + CoordSystem::OneClosed, ) .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); @@ -432,6 +485,7 @@ pub fn read_bcf_impl( Some(genotype_by), resolve_r_fields(samples), Some(samples_nested), + CoordSystem::OneClosed, ) .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); @@ -463,7 +517,13 @@ pub fn read_gtf_impl( let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::gtf::io::Reader::new(bgzf_reader); let attr_defs = GtfScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); - let scanner = GtfScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); + let scanner = GtfScanner::new( + None, + resolve_r_fields(fields), + Some(attr_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner .scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -474,7 +534,13 @@ pub fn read_gtf_impl( let pos = fmt_reader.get_mut().virtual_position(); let 
attr_defs = GtfScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); fmt_reader.get_mut().seek(pos).unwrap(); - let scanner = GtfScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); + let scanner = GtfScanner::new( + None, + resolve_r_fields(fields), + Some(attr_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) } else { @@ -485,7 +551,13 @@ pub fn read_gtf_impl( .get_mut() .seek(std::io::SeekFrom::Start(pos)) .unwrap(); - let scanner = GtfScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); + let scanner = GtfScanner::new( + None, + resolve_r_fields(fields), + Some(attr_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -515,7 +587,13 @@ pub fn read_gff_impl( let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::gff::io::Reader::new(bgzf_reader); let attr_defs = GffScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); - let scanner = GffScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); + let scanner = GffScanner::new( + None, + resolve_r_fields(fields), + Some(attr_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner .scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -526,7 +604,13 @@ pub fn read_gff_impl( let pos = fmt_reader.get_mut().virtual_position(); let attr_defs = GffScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); fmt_reader.get_mut().seek(pos).unwrap(); - let scanner = GffScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); + let scanner = GffScanner::new( + None, + resolve_r_fields(fields), + Some(attr_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) } else { @@ -537,7 +621,13 @@ pub fn read_gff_impl( 
.get_mut() .seek(std::io::SeekFrom::Start(pos)) .unwrap(); - let scanner = GffScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); + let scanner = GffScanner::new( + None, + resolve_r_fields(fields), + Some(attr_defs), + CoordSystem::OneClosed, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -559,7 +649,12 @@ pub fn read_bed_impl( let reader = std::fs::File::open(path) .map(|f| BufReader::with_capacity(BUFFER_SIZE_BYTES, f)) .unwrap(); - let scanner = BedScanner::new(bed_schema, resolve_r_fields(fields)).unwrap(); + let scanner = BedScanner::new( + bed_schema, + resolve_r_fields(fields), + CoordSystem::ZeroHalfOpen, + ) + .unwrap(); let ipc = if let Some(region) = region { let index_path = index.unwrap_or(format!("{}.tbi", path)); @@ -600,7 +695,8 @@ pub fn read_bigwig_impl( let region = region.parse::().unwrap(); let fmt_reader = bigtools::BigWigRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); - let scanner = BigWigScanner::new(info, resolve_r_fields(fields)).unwrap(); + let scanner = + BigWigScanner::new(info, resolve_r_fields(fields), CoordSystem::ZeroHalfOpen).unwrap(); let batches = scanner .scan_query(fmt_reader, region, None, None, None) .unwrap(); @@ -608,7 +704,8 @@ pub fn read_bigwig_impl( } else { let fmt_reader = bigtools::BigWigRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); - let scanner = BigWigScanner::new(info, resolve_r_fields(fields)).unwrap(); + let scanner = + BigWigScanner::new(info, resolve_r_fields(fields), CoordSystem::ZeroHalfOpen).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -633,7 +730,13 @@ pub fn read_bigbed_impl( let region = region.parse::().unwrap(); let fmt_reader = bigtools::BigBedRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); - let scanner = BigBedScanner::new(bed_schema, info, resolve_r_fields(fields)).unwrap(); + let scanner = 
BigBedScanner::new( + bed_schema, + info, + resolve_r_fields(fields), + CoordSystem::ZeroHalfOpen, + ) + .unwrap(); let batches = scanner .scan_query(fmt_reader, region, None, None, None) .unwrap(); @@ -641,7 +744,13 @@ pub fn read_bigbed_impl( } else { let fmt_reader = bigtools::BigBedRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); - let scanner = BigBedScanner::new(bed_schema, info, resolve_r_fields(fields)).unwrap(); + let scanner = BigBedScanner::new( + bed_schema, + info, + resolve_r_fields(fields), + CoordSystem::ZeroHalfOpen, + ) + .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; From 001ab8d9847fbec0f11d55f2ba59f67fc8dd9676 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 20 Mar 2026 15:51:27 -0400 Subject: [PATCH 2/3] feat: Add Region type with coordinate-system-aware parsing Introduce `oxbow::Region`: a coordinate-system-aware genomic region type that normalizes all coordinates to 0-based half-open internally. Supports two parsing styles: * Ambiguous UCSC notation (chr1:10,000-20,000) interpreted using a provided CoordSystem. Accepts , and _ as thousands separators. * Explicit bracket notation (chr1:[10_000,20_000) or chr1:[10_001,20_000]) that is self-describing and overrides any provided coordinate system. Only _ is accepted as a thousands separator (since , delimits start and end). `Region::to_noodles()` converts to a noodles `Region` for index-based seeking. All `scan_query` methods now accept `oxbow::Region` instead of `noodles::core::Region`, performing the conversion internally. `CoordSystem` and `Region` are extracted into a new `oxbow::coords` module and re-exported from the crate root. py-oxbow scanner classes parse user region strings using the scanner's `model().coord_system()` when using ambiguous notation. Standalone `read_*` functions use the format-native default. r-oxbow follows the same convention. 
--- oxbow/src/alignment/scanner/bam.rs | 3 +- oxbow/src/alignment/scanner/cram.rs | 3 +- oxbow/src/alignment/scanner/sam.rs | 3 +- oxbow/src/bbi/scanner/bbizoom.rs | 3 +- oxbow/src/bbi/scanner/bigbed.rs | 3 +- oxbow/src/bbi/scanner/bigwig.rs | 3 +- oxbow/src/bed/scanner/bed.rs | 3 +- oxbow/src/coords.rs | 433 ++++++++++++++++++++++++++++ oxbow/src/gxf/scanner/gff.rs | 3 +- oxbow/src/gxf/scanner/gtf.rs | 3 +- oxbow/src/lib.rs | 60 +--- oxbow/src/sequence/scanner/fasta.rs | 12 +- oxbow/src/variant/scanner/bcf.rs | 3 +- oxbow/src/variant/scanner/vcf.rs | 3 +- py-oxbow/src/alignment.rs | 31 +- py-oxbow/src/bbi.rs | 29 +- py-oxbow/src/bed.rs | 12 +- py-oxbow/src/gxf.rs | 20 +- py-oxbow/src/sequence.rs | 29 +- py-oxbow/src/variant.rs | 20 +- r-oxbow/src/rust/src/lib.rs | 24 +- 21 files changed, 525 insertions(+), 178 deletions(-) create mode 100644 oxbow/src/coords.rs diff --git a/oxbow/src/alignment/scanner/bam.rs b/oxbow/src/alignment/scanner/bam.rs index 09f2eba..93e607d 100644 --- a/oxbow/src/alignment/scanner/bam.rs +++ b/oxbow/src/alignment/scanner/bam.rs @@ -175,13 +175,14 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::bam::io::Reader, - region: noodles::core::Region, + region: crate::Region, index: impl BinningIndex, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let interval = region.interval(); let batch_builder = self.build_batch_builder(columns, batch_size)?; diff --git a/oxbow/src/alignment/scanner/cram.rs b/oxbow/src/alignment/scanner/cram.rs index 0c1516d..240b95a 100644 --- a/oxbow/src/alignment/scanner/cram.rs +++ b/oxbow/src/alignment/scanner/cram.rs @@ -210,13 +210,14 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::cram::io::Reader, - region: noodles::core::Region, + region: crate::Region, index: noodles::cram::crai::Index, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size 
= batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let interval = region.interval(); let batch_builder = self.build_batch_builder(columns, batch_size)?; diff --git a/oxbow/src/alignment/scanner/sam.rs b/oxbow/src/alignment/scanner/sam.rs index 75b51bb..f4ebab4 100644 --- a/oxbow/src/alignment/scanner/sam.rs +++ b/oxbow/src/alignment/scanner/sam.rs @@ -175,13 +175,14 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::sam::io::Reader, - region: noodles::core::Region, + region: crate::Region, index: impl BinningIndex, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let interval = region.interval(); let batch_builder = self.build_batch_builder(columns, batch_size)?; diff --git a/oxbow/src/bbi/scanner/bbizoom.rs b/oxbow/src/bbi/scanner/bbizoom.rs index b1b853b..38a6b9d 100644 --- a/oxbow/src/bbi/scanner/bbizoom.rs +++ b/oxbow/src/bbi/scanner/bbizoom.rs @@ -125,12 +125,13 @@ impl Scanner { pub fn scan_query( &self, reader: BBIReader, - region: noodles::core::Region, + region: crate::Region, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let batch_builder = self.build_batch_builder(columns, batch_size)?; match reader { BBIReader::BigWig(reader) => { diff --git a/oxbow/src/bbi/scanner/bigbed.rs b/oxbow/src/bbi/scanner/bigbed.rs index 92edd69..4463faf 100644 --- a/oxbow/src/bbi/scanner/bigbed.rs +++ b/oxbow/src/bbi/scanner/bigbed.rs @@ -130,12 +130,13 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: BigBedRead, - region: noodles::core::Region, + region: crate::Region, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let batch_builder = self.build_batch_builder(columns, batch_size)?; let batch_iter = 
BigBedQueryBatchIterator::new(fmt_reader, region, batch_builder, batch_size, limit); diff --git a/oxbow/src/bbi/scanner/bigwig.rs b/oxbow/src/bbi/scanner/bigwig.rs index 10cab05..ca7faa1 100644 --- a/oxbow/src/bbi/scanner/bigwig.rs +++ b/oxbow/src/bbi/scanner/bigwig.rs @@ -130,12 +130,13 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: BigWigRead, - region: noodles::core::Region, + region: crate::Region, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let batch_builder = self.build_batch_builder(columns, batch_size)?; let batch_iter = BigWigQueryBatchIterator::new(fmt_reader, region, batch_builder, batch_size, limit); diff --git a/oxbow/src/bed/scanner/bed.rs b/oxbow/src/bed/scanner/bed.rs index 72f91c9..e78c7e7 100644 --- a/oxbow/src/bed/scanner/bed.rs +++ b/oxbow/src/bed/scanner/bed.rs @@ -113,13 +113,14 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::bed::io::Reader<3, R>, - region: noodles::core::Region, + region: crate::Region, index: impl BinningIndex, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let reference_sequence_name = region.name().to_string(); let interval = region.interval(); diff --git a/oxbow/src/coords.rs b/oxbow/src/coords.rs new file mode 100644 index 0000000..a8c2163 --- /dev/null +++ b/oxbow/src/coords.rs @@ -0,0 +1,433 @@ +//! Genomic coordinate systems and region types. + +use crate::{OxbowError, Result}; + +/// Genomic coordinate system. 
+/// +/// The notation `XY` encodes the base of the start coordinate (`X`) and the +/// base of the end coordinate (`Y`): +/// +/// - `"11"` — 1-based start, 1-based end (closed; SAM/VCF/GFF convention) +/// - `"01"` — 0-based start, 1-based end (half-open; BED/BBI convention) +/// +/// End coordinates are numerically identical in both systems; only start +/// positions differ. Use [`CoordSystem::start_offset_from`] to get the +/// additive offset needed to convert a start value from one system to another. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum CoordSystem { + /// 1-based start, closed end. + OneClosed, + /// 0-based start, half-open end. + ZeroHalfOpen, +} + +impl CoordSystem { + /// Returns the additive offset to apply to a start coordinate when + /// converting from `source_cs` to `self`. + /// + /// - `OneClosed` → `ZeroHalfOpen`: `-1` + /// - `ZeroHalfOpen` → `OneClosed`: `+1` + /// - same → same: `0` + pub fn start_offset_from(self, source_cs: CoordSystem) -> i32 { + match (source_cs, self) { + (CoordSystem::OneClosed, CoordSystem::ZeroHalfOpen) => -1, + (CoordSystem::ZeroHalfOpen, CoordSystem::OneClosed) => 1, + _ => 0, + } + } +} + +impl std::fmt::Display for CoordSystem { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CoordSystem::OneClosed => write!(f, "11"), + CoordSystem::ZeroHalfOpen => write!(f, "01"), + } + } +} + +impl std::str::FromStr for CoordSystem { + type Err = OxbowError; + + fn from_str(s: &str) -> Result { + match s { + "11" => Ok(CoordSystem::OneClosed), + "01" => Ok(CoordSystem::ZeroHalfOpen), + other => Err(OxbowError::invalid_input(format!( + "invalid coordinate system '{other}'; expected \"01\" or \"11\"" + ))), + } + } +} + +/// A genomic region. +/// +/// Represents a query region on a named reference sequence. Internally, +/// coordinates are always stored as **0-based half-open** `[start, end)`. 
+/// +/// # Parsing +/// +/// Regions can be parsed from strings in two styles: +/// +/// **UCSC notation** — coordinate system is ambiguous and must be supplied: +/// ```text +/// "chr1" → whole chromosome (start=0, end=None) +/// "chr1:10000-20000" → depends on coord_system +/// "chr1:10,000-20,000" → same, with comma separators +/// "chr1:10_000-20_000" → same, with underscore separators +/// ``` +/// +/// **Explicit bracket notation** — self-describing coordinate system: +/// ```text +/// "chr1:[10000,20000)" → 0-based half-open +/// "chr1:[10001,20000]" → 1-based closed (normalized to 0-based internally) +/// ``` +/// +/// Use [`Region::parse`] with a default [`CoordSystem`] for UCSC notation, +/// or [`Region::from_str`] which assumes 1-based closed for bare UCSC strings. +/// +/// # Examples +/// +/// ``` +/// use oxbow::{CoordSystem, Region}; +/// +/// // Construct directly (0-based half-open) +/// let r = Region::new("chr1", Some(10000), Some(20000)); +/// assert_eq!(r.start, 10000); +/// assert_eq!(r.end, Some(20000)); +/// +/// let r = Region::new("chr1", None, None); +/// assert_eq!(r.start, 0); +/// assert_eq!(r.end, None); +/// +/// // UCSC notation with explicit coord system +/// let r = Region::parse("chr1:10,001-20,000", CoordSystem::OneClosed).unwrap(); +/// assert_eq!(r.start, 10000); // normalized to 0-based +/// assert_eq!(r.end, Some(20000)); +/// +/// // Explicit bracket notation (coord system in the string itself) +/// let r: Region = "chr1:[10000,20000)".parse().unwrap(); +/// assert_eq!(r.start, 10000); +/// assert_eq!(r.end, Some(20000)); +/// +/// let r: Region = "chr1:[10001,20000]".parse().unwrap(); +/// assert_eq!(r.start, 10000); // 1-based 10001 → 0-based 10000 +/// assert_eq!(r.end, Some(20000)); +/// ``` +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Region { + /// Reference sequence name. + pub name: String, + /// 0-based start position (inclusive). + pub start: u64, + /// 0-based end position (exclusive). 
`None` means to the end of the + /// reference sequence. + pub end: Option, +} + +impl Region { + /// Create a new region with 0-based half-open coordinates. + /// + /// `start` defaults to 0 if `None`. + pub fn new(name: impl Into, start: Option, end: Option) -> Self { + Self { + name: name.into(), + start: start.unwrap_or(0), + end, + } + } + + /// Parse a region string using the given coordinate system for UCSC + /// notation. Explicit bracket notation overrides `coord_system`. + pub fn parse(s: &str, coord_system: CoordSystem) -> Result { + // Try explicit bracket notation first. + if let Some(result) = Self::try_parse_bracket(s) { + return result; + } + // Fall back to UCSC notation with the provided coord system. + Self::parse_ucsc(s, coord_system) + } + + /// Parse UCSC-style `name[:start[-end]]`. + fn parse_ucsc(s: &str, coord_system: CoordSystem) -> Result { + if s.is_empty() { + return Err(OxbowError::invalid_input("empty region string")); + } + + let (name, interval) = match s.rsplit_once(':') { + Some((name, "")) => (name, None), + Some((name, suffix)) => (name, Some(suffix)), + None => (s, None), + }; + + if name.is_empty() { + return Err(OxbowError::invalid_input("empty reference name")); + } + + let (start, end) = match interval { + None => (None, None), + Some(iv) => { + let parts: Vec<&str> = iv.splitn(2, '-').collect(); + let start = parse_number(parts[0])?; + let end = if parts.len() == 2 { + Some(parse_number(parts[1])?) + } else { + None + }; + (Some(start), end) + } + }; + + // Normalize to 0-based half-open. + let (start, end) = match coord_system { + CoordSystem::OneClosed => { + // 1-based closed → 0-based half-open: start -= 1, end unchanged + (start.map(|s| s.saturating_sub(1)), end) + } + CoordSystem::ZeroHalfOpen => (start, end), + }; + + Ok(Self::new(name, start, end)) + } + + /// Try to parse explicit bracket notation `name:[start,end)` or + /// `name:[start,end]`. Returns `None` if the string doesn't match + /// bracket notation. 
+ fn try_parse_bracket(s: &str) -> Option> { + let (name, rest) = s.rsplit_once(':')?; + if !rest.starts_with('[') { + return None; + } + + let result = (|| { + let rest = &rest[1..]; // strip leading '[' + + let (half_open, body) = if let Some(body) = rest.strip_suffix(')') { + (true, body) + } else if let Some(body) = rest.strip_suffix(']') { + (false, body) + } else { + return Err(OxbowError::invalid_input(format!( + "bracket notation must end with ')' or ']': '{s}'" + ))); + }; + + // Strip underscores first (commas are ambiguous in bracket notation). + let body: String = body.chars().filter(|c| *c != '_').collect(); + let (start_str, end_str) = body.split_once(',').ok_or_else(|| { + OxbowError::invalid_input(format!("bracket notation requires 'start,end': '{s}'")) + })?; + + let start = start_str.parse::().map_err(|_| { + OxbowError::invalid_input(format!("invalid start in bracket notation: '{s}'")) + })?; + let end = end_str.parse::().map_err(|_| { + OxbowError::invalid_input(format!("invalid end in bracket notation: '{s}'")) + })?; + + // Normalize to 0-based half-open. + let (start, end) = if half_open { + // [start, end) — 0-based half-open, already normalized + (start, end) + } else { + // [start, end] — 1-based closed + // start: 1-based → 0-based = start - 1 + // end: closed → half-open = end (numerically same) + (start.saturating_sub(1), end) + }; + + Ok(Self::new(name, Some(start), Some(end))) + })(); + + Some(result) + } + + /// Convert to a noodles `Region` for index-based seeking. + /// + /// Noodles regions are 1-based with inclusive bounds. 
+ pub fn to_noodles(&self) -> std::result::Result { + use noodles::core::Position; + + match (self.start, self.end) { + (0, None) => Ok(noodles::core::Region::new(self.name.as_str(), ..)), + (s, None) => { + let start = Position::try_from(s as usize + 1) + .map_err(|_| OxbowError::invalid_input("start position out of range"))?; + Ok(noodles::core::Region::new(self.name.as_str(), start..)) + } + (s, Some(e)) => { + let start = Position::try_from(s as usize + 1) + .map_err(|_| OxbowError::invalid_input("start position out of range"))?; + let end = Position::try_from(e as usize) + .map_err(|_| OxbowError::invalid_input("end position out of range"))?; + Ok(noodles::core::Region::new(self.name.as_str(), start..=end)) + } + } + } +} + +impl std::str::FromStr for Region { + type Err = OxbowError; + + /// Parse a region string. Bracket notation is self-describing; bare + /// UCSC notation assumes 1-based closed (the most common convention). + fn from_str(s: &str) -> Result { + Self::parse(s, CoordSystem::OneClosed) + } +} + +impl std::fmt::Display for Region { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.name)?; + match (self.start, self.end) { + (0, None) => {} + (s, None) => write!(f, ":[{s},)")?, + (s, Some(e)) => write!(f, ":[{s},{e})")?, + } + Ok(()) + } +} + +/// Parse a number string, stripping `,` and `_` thousands separators. 
+fn parse_number(s: &str) -> Result { + let cleaned: String = s.chars().filter(|c| *c != ',' && *c != '_').collect(); + cleaned + .parse::() + .map_err(|_| OxbowError::invalid_input(format!("invalid number: '{s}'"))) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_new() { + let r = Region::new("chr1", Some(100), Some(200)); + assert_eq!(r.name, "chr1"); + assert_eq!(r.start, 100); + assert_eq!(r.end, Some(200)); + } + + #[test] + fn test_new_defaults() { + let r = Region::new("chr1", None, None); + assert_eq!(r.start, 0); + assert_eq!(r.end, None); + } + + #[test] + fn test_ucsc_one_closed() { + let r = Region::parse("chr1:10001-20000", CoordSystem::OneClosed).unwrap(); + assert_eq!(r.name, "chr1"); + assert_eq!(r.start, 10000); + assert_eq!(r.end, Some(20000)); + } + + #[test] + fn test_ucsc_zero_half_open() { + let r = Region::parse("chr1:10000-20000", CoordSystem::ZeroHalfOpen).unwrap(); + assert_eq!(r.name, "chr1"); + assert_eq!(r.start, 10000); + assert_eq!(r.end, Some(20000)); + } + + #[test] + fn test_ucsc_whole_chrom() { + let r = Region::parse("chr1", CoordSystem::OneClosed).unwrap(); + assert_eq!(r.name, "chr1"); + assert_eq!(r.start, 0); + assert_eq!(r.end, None); + } + + #[test] + fn test_ucsc_start_only() { + let r = Region::parse("chr1:5000", CoordSystem::OneClosed).unwrap(); + assert_eq!(r.start, 4999); + assert_eq!(r.end, None); + } + + #[test] + fn test_ucsc_thousands_separators() { + let r = Region::parse("chr1:10,001-20,000", CoordSystem::OneClosed).unwrap(); + assert_eq!(r.start, 10000); + assert_eq!(r.end, Some(20000)); + + let r = Region::parse("chr1:10_001-20_000", CoordSystem::OneClosed).unwrap(); + assert_eq!(r.start, 10000); + assert_eq!(r.end, Some(20000)); + } + + #[test] + fn test_bracket_half_open() { + let r: Region = "chr1:[10000,20000)".parse().unwrap(); + assert_eq!(r.start, 10000); + assert_eq!(r.end, Some(20000)); + } + + #[test] + fn test_bracket_closed() { + let r: Region = 
"chr1:[10001,20000]".parse().unwrap(); + assert_eq!(r.start, 10000); + assert_eq!(r.end, Some(20000)); + } + + #[test] + fn test_bracket_overrides_coord_system() { + // Even with ZeroHalfOpen context, bracket notation is self-describing + let r = Region::parse("chr1:[10001,20000]", CoordSystem::ZeroHalfOpen).unwrap(); + assert_eq!(r.start, 10000); // interpreted as 1-based closed + } + + #[test] + fn test_bracket_with_separators() { + let r: Region = "chr1:[10_000,20_000)".parse().unwrap(); + assert_eq!(r.start, 10000); + assert_eq!(r.end, Some(20000)); + } + + #[test] + fn test_display_roundtrip() { + let r = Region::new("chr1", Some(10000), Some(20000)); + assert_eq!(r.to_string(), "chr1:[10000,20000)"); + + let parsed: Region = r.to_string().parse().unwrap(); + assert_eq!(r, parsed); + } + + #[test] + fn test_display_whole_chrom() { + let r = Region::new("chr1", None, None); + assert_eq!(r.to_string(), "chr1"); + } + + #[test] + fn test_to_noodles_full_range() { + let r = Region::new("chr1", Some(10000), Some(20000)); + let nr = r.to_noodles().unwrap(); + assert_eq!(nr.name(), &b"chr1"[..]); + // 0-based 10000 → 1-based 10001 + let start = noodles::core::Position::try_from(10001).unwrap(); + let end = noodles::core::Position::try_from(20000).unwrap(); + assert_eq!(nr.start(), std::ops::Bound::Included(start)); + assert_eq!(nr.end(), std::ops::Bound::Included(end)); + } + + #[test] + fn test_to_noodles_whole_chrom() { + let r = Region::new("chr1", None, None); + let nr = r.to_noodles().unwrap(); + assert_eq!(nr.start(), std::ops::Bound::Unbounded); + assert_eq!(nr.end(), std::ops::Bound::Unbounded); + } + + #[test] + fn test_empty_string_errors() { + assert!(Region::parse("", CoordSystem::OneClosed).is_err()); + } + + #[test] + fn test_invalid_bracket_notation() { + assert!("chr1:[10000,20000".parse::().is_err()); + assert!("chr1:[10000)".parse::().is_err()); + } +} diff --git a/oxbow/src/gxf/scanner/gff.rs b/oxbow/src/gxf/scanner/gff.rs index 00d1cc2..09bdbbe 
100644 --- a/oxbow/src/gxf/scanner/gff.rs +++ b/oxbow/src/gxf/scanner/gff.rs @@ -168,13 +168,14 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::gff::io::Reader, - region: noodles::core::Region, + region: crate::Region, index: impl BinningIndex, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let reference_sequence_name = region.name().to_string(); let interval = region.interval(); diff --git a/oxbow/src/gxf/scanner/gtf.rs b/oxbow/src/gxf/scanner/gtf.rs index 6f80f4d..3825535 100644 --- a/oxbow/src/gxf/scanner/gtf.rs +++ b/oxbow/src/gxf/scanner/gtf.rs @@ -168,13 +168,14 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::gtf::io::Reader, - region: noodles::core::Region, + region: crate::Region, index: impl BinningIndex, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let reference_sequence_name = region.name().to_string(); let interval = region.interval(); diff --git a/oxbow/src/lib.rs b/oxbow/src/lib.rs index 15f48f7..0f5d74d 100644 --- a/oxbow/src/lib.rs +++ b/oxbow/src/lib.rs @@ -63,72 +63,16 @@ pub mod alignment; pub mod batch; pub mod bbi; pub mod bed; +pub mod coords; pub mod error; pub mod gxf; pub mod sequence; pub mod util; pub mod variant; +pub use coords::{CoordSystem, Region}; pub use error::{OxbowError, Result}; -/// Genomic coordinate system. -/// -/// The notation `XY` encodes the base of the start coordinate (`X`) and the -/// base of the end coordinate (`Y`): -/// -/// - `"11"` — 1-based start, 1-based end (closed; SAM/VCF/GFF convention) -/// - `"01"` — 0-based start, 1-based end (half-open; BED/BBI convention) -/// -/// End coordinates are numerically identical in both systems; only start -/// positions differ. 
Use [`CoordSystem::start_offset_from`] to get the -/// additive offset needed to convert a start value from one system to another. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum CoordSystem { - /// 1-based start, closed end. - OneClosed, - /// 0-based start, half-open end. - ZeroHalfOpen, -} - -impl CoordSystem { - /// Returns the additive offset to apply to a start coordinate when - /// converting from `source_cs` to `self`. - /// - /// - `OneClosed` → `ZeroHalfOpen`: `-1` - /// - `ZeroHalfOpen` → `OneClosed`: `+1` - /// - same → same: `0` - pub fn start_offset_from(self, source_cs: CoordSystem) -> i32 { - match (source_cs, self) { - (CoordSystem::OneClosed, CoordSystem::ZeroHalfOpen) => -1, - (CoordSystem::ZeroHalfOpen, CoordSystem::OneClosed) => 1, - _ => 0, - } - } -} - -impl std::fmt::Display for CoordSystem { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - CoordSystem::OneClosed => write!(f, "11"), - CoordSystem::ZeroHalfOpen => write!(f, "01"), - } - } -} - -impl std::str::FromStr for CoordSystem { - type Err = OxbowError; - - fn from_str(s: &str) -> Result { - match s { - "11" => Ok(CoordSystem::OneClosed), - "01" => Ok(CoordSystem::ZeroHalfOpen), - other => Err(OxbowError::invalid_input(format!( - "invalid coordinate system '{other}'; expected \"01\" or \"11\"" - ))), - } - } -} - #[derive(Debug, Clone)] pub enum Select { /// Select specific items explicitly diff --git a/oxbow/src/sequence/scanner/fasta.rs b/oxbow/src/sequence/scanner/fasta.rs index fb141ef..0dad1dd 100644 --- a/oxbow/src/sequence/scanner/fasta.rs +++ b/oxbow/src/sequence/scanner/fasta.rs @@ -2,7 +2,6 @@ use std::io::{BufRead, Seek}; use arrow::array::RecordBatchReader; use arrow::datatypes::Schema; -use noodles::core::Region; use crate::sequence::model::BatchBuilder; use crate::sequence::model::Model; @@ -18,10 +17,9 @@ use crate::{CoordSystem, Select}; /// /// ```no_run /// use oxbow::sequence::scanner::fasta::Scanner; -/// use 
oxbow::{CoordSystem, Select}; +/// use oxbow::{CoordSystem, Region, Select}; /// use std::fs::File; /// use std::io::BufReader; -/// use noodles::core::Region; /// /// let inner = File::open("sample.fa").map(BufReader::new).unwrap(); /// let fmt_reader = noodles::fasta::io::Reader::new(inner); @@ -106,12 +104,16 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::fasta::io::Reader, - regions: Vec, + regions: Vec, index: noodles::fasta::fai::Index, columns: Option>, batch_size: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let regions: Vec = regions + .iter() + .map(|r| r.to_noodles()) + .collect::>>()?; let batch_builder = self.build_batch_builder(columns, batch_size)?; let batch_iter = QueryBatchIterator::new(fmt_reader, index, regions, batch_builder, batch_size); @@ -177,7 +179,7 @@ mod tests { let scanner = Scanner::new(Select::All, CoordSystem::OneClosed).unwrap(); let regions = ["seq1:1-4", "seq2:1-4", "seq3:1-4"]; - let regions: Vec = regions.iter().map(|s| s.parse().unwrap()).collect(); + let regions: Vec = regions.iter().map(|s| s.parse().unwrap()).collect(); let mut batch_iter = scanner .scan_query(fmt_reader, regions, index, None, Some(2)) .unwrap(); diff --git a/oxbow/src/variant/scanner/bcf.rs b/oxbow/src/variant/scanner/bcf.rs index 9573968..7a64d92 100644 --- a/oxbow/src/variant/scanner/bcf.rs +++ b/oxbow/src/variant/scanner/bcf.rs @@ -205,13 +205,14 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::bcf::io::Reader, - region: noodles::core::Region, + region: crate::Region, index: impl BinningIndex, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let reference_sequence_name = region.name().to_string(); let interval = region.interval(); diff --git a/oxbow/src/variant/scanner/vcf.rs b/oxbow/src/variant/scanner/vcf.rs index c025fa5..7d6c4fa 100644 --- a/oxbow/src/variant/scanner/vcf.rs +++ 
b/oxbow/src/variant/scanner/vcf.rs @@ -205,13 +205,14 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::vcf::io::Reader, - region: noodles::core::Region, + region: crate::Region, index: impl BinningIndex, columns: Option>, batch_size: Option, limit: Option, ) -> crate::Result { let batch_size = batch_size.unwrap_or(1024); + let region = region.to_noodles()?; let reference_sequence_name = region.name().to_string(); let interval = region.interval(); diff --git a/py-oxbow/src/alignment.rs b/py-oxbow/src/alignment.rs index 99454f8..f49a6b9 100644 --- a/py-oxbow/src/alignment.rs +++ b/py-oxbow/src/alignment.rs @@ -8,14 +8,12 @@ use pyo3::IntoPyObjectExt; use pyo3_arrow::PyRecordBatchReader; use pyo3_arrow::PySchema; -use noodles::bgzf::io::Seek as _; -use noodles::core::Region; - use crate::error::{err_on_unwind, to_py}; use crate::util::{ pyobject_to_bufreader, resolve_coord_system, resolve_cram_index, resolve_fasta_repository, resolve_fields, resolve_index, PyVirtualPosition, Reader, }; +use noodles::bgzf::io::Seek as _; use oxbow::alignment::{BamScanner, CramScanner, SamScanner}; use oxbow::util::batches_to_ipc; use oxbow::util::index::IndexType; @@ -334,9 +332,8 @@ impl PySamScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, self.scanner.model().coord_system()).map_err(to_py)?; match self.reader.clone() { Reader::BgzfFile(bgzf_reader) => { @@ -781,9 +778,8 @@ impl PyBamScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, self.scanner.model().coord_system()).map_err(to_py)?; match self.reader.clone() { Reader::BgzfFile(bgzf_reader) => { @@ -1135,9 +1131,8 @@ impl PyCramScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| 
PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, self.scanner.model().coord_system()).map_err(to_py)?; let index = resolve_cram_index(py, &self.src, index)?; match self.reader.clone() { @@ -1201,9 +1196,7 @@ pub fn read_sam( let reader = fmt_reader.into_inner(); let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = oxbow::Region::parse(®ion, oxbow::CoordSystem::OneClosed).map_err(to_py)?; match reader { Reader::BgzfFile(bgzf_reader) => { @@ -1274,9 +1267,7 @@ pub fn read_bam( let reader = fmt_reader.into_inner(); let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = oxbow::Region::parse(®ion, oxbow::CoordSystem::OneClosed).map_err(to_py)?; match reader { Reader::BgzfFile(bgzf_reader) => { @@ -1348,9 +1339,7 @@ pub fn read_cram( let reader = fmt_reader.into_inner(); let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = oxbow::Region::parse(®ion, oxbow::CoordSystem::OneClosed).map_err(to_py)?; match reader { Reader::File(reader) => { diff --git a/py-oxbow/src/bbi.rs b/py-oxbow/src/bbi.rs index b5c349b..e292bf4 100644 --- a/py-oxbow/src/bbi.rs +++ b/py-oxbow/src/bbi.rs @@ -8,11 +8,9 @@ use pyo3::IntoPyObjectExt; use pyo3_arrow::PyRecordBatchReader; use pyo3_arrow::PySchema; -use bigtools::bed::autosql::parse::parse_autosql; -use noodles::core::Region; - use crate::error::{err_on_unwind, to_py}; use crate::util::{pyobject_to_bufreader, resolve_coord_system, resolve_fields, Reader}; +use bigtools::bed::autosql::parse::parse_autosql; use oxbow::bbi::model::base::field::FieldDef; use oxbow::bbi::{BBIReader, BBIZoomScanner, BedSchema, BigBedScanner, BigWigScanner}; use oxbow::util::batches_to_ipc; @@ -199,9 +197,8 @@ impl PyBigWigScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region 
= region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, self.scanner.model().coord_system()).map_err(to_py)?; let reader = self.reader.clone(); let info = self.scanner.info().clone(); @@ -445,9 +442,8 @@ impl PyBigBedScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, self.scanner.model().coord_system()).map_err(to_py)?; let reader = self.reader.clone(); let info = self.scanner.info().clone(); @@ -675,9 +671,8 @@ impl PyBBIZoomScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, self.scanner.model().coord_system()).map_err(to_py)?; self.reader.seek(std::io::SeekFrom::Start(0)).unwrap(); let reader = self.reader.clone(); @@ -728,9 +723,8 @@ pub fn read_bigwig( let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, oxbow::CoordSystem::ZeroHalfOpen).map_err(to_py)?; let fmt_reader = bigtools::BigWigRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; @@ -784,9 +778,8 @@ pub fn read_bigbed( let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, oxbow::CoordSystem::ZeroHalfOpen).map_err(to_py)?; let fmt_reader = bigtools::BigBedRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; diff --git a/py-oxbow/src/bed.rs b/py-oxbow/src/bed.rs index 10334f3..8e1be22 100644 --- a/py-oxbow/src/bed.rs +++ b/py-oxbow/src/bed.rs @@ -7,8 +7,6 @@ use pyo3::IntoPyObjectExt; use 
pyo3_arrow::PyRecordBatchReader; use pyo3_arrow::PySchema; -use noodles::core::Region; - use crate::error::{err_on_unwind, to_py}; use crate::util::resolve_coord_system; use crate::util::{ @@ -325,9 +323,8 @@ impl PyBedScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(region, self.scanner.model().coord_system()).map_err(to_py)?; match self.reader.clone() { Reader::BgzfFile(bgzf_reader) => { @@ -417,9 +414,8 @@ pub fn read_bed( .map_err(to_py)?; let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, oxbow::CoordSystem::ZeroHalfOpen).map_err(to_py)?; match reader { Reader::BgzfFile(bgzf_reader) => { diff --git a/py-oxbow/src/gxf.rs b/py-oxbow/src/gxf.rs index 9e168fb..fbc6d12 100644 --- a/py-oxbow/src/gxf.rs +++ b/py-oxbow/src/gxf.rs @@ -9,8 +9,6 @@ use pyo3::IntoPyObjectExt; use pyo3_arrow::PyRecordBatchReader; use pyo3_arrow::PySchema; -use noodles::core::Region; - use crate::error::{err_on_unwind, to_py}; use crate::util::{ pyobject_to_bufreader, resolve_coord_system, resolve_fields, resolve_index, PyVirtualPosition, @@ -292,9 +290,8 @@ impl PyGtfScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(region, self.scanner.model().coord_system()).map_err(to_py)?; match self.reader.clone() { Reader::BgzfFile(bgzf_reader) => { @@ -616,9 +613,8 @@ impl PyGffScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(region, self.scanner.model().coord_system()).map_err(to_py)?; match self.reader.clone() { Reader::BgzfFile(bgzf_reader) => { @@ -704,9 +700,7 @@ pub fn read_gtf( 
GtfScanner::new(None, fields, attr_defs, CoordSystem::OneClosed).map_err(to_py)?; let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = oxbow::Region::parse(®ion, oxbow::CoordSystem::OneClosed).map_err(to_py)?; match reader { Reader::BgzfFile(bgzf_reader) => { @@ -774,9 +768,7 @@ pub fn read_gff( GffScanner::new(None, fields, attr_defs, CoordSystem::OneClosed).map_err(to_py)?; let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = oxbow::Region::parse(®ion, oxbow::CoordSystem::OneClosed).map_err(to_py)?; match reader { Reader::BgzfFile(bgzf_reader) => { diff --git a/py-oxbow/src/sequence.rs b/py-oxbow/src/sequence.rs index eac98ae..0bd8473 100644 --- a/py-oxbow/src/sequence.rs +++ b/py-oxbow/src/sequence.rs @@ -7,16 +7,14 @@ use pyo3::IntoPyObjectExt; use pyo3_arrow::PyRecordBatchReader; use pyo3_arrow::PySchema; -use flate2::read::MultiGzDecoder; -use noodles::bgzf::gzi::io::Reader as GziReader; -use noodles::bgzf::io::IndexedReader as IndexedBgzfReader; -use noodles::core::Region; - use crate::error::{err_on_unwind, to_py}; use crate::util::{ pyobject_to_bufreader, resolve_coord_system, resolve_faidx, resolve_fields, PyVirtualPosition, Reader, }; +use flate2::read::MultiGzDecoder; +use noodles::bgzf::gzi::io::Reader as GziReader; +use noodles::bgzf::io::IndexedReader as IndexedBgzfReader; use oxbow::sequence::{FastaScanner, FastqScanner}; use oxbow::util::batches_to_ipc; @@ -352,13 +350,12 @@ impl PyFastaScanner { let index = resolve_faidx(py, &self.src, index)?; let reader = self.reader.clone(); - let regions: Vec = regions + let coord_system = self.scanner.model().coord_system(); + let regions: Vec = regions .into_iter() - .map(|s| { - s.parse::() - .map_err(|e| PyErr::new::(e.to_string())) - }) - .collect::, _>>()?; + .map(|s| oxbow::Region::parse(&s, coord_system)) + .collect::>>() + 
.map_err(to_py)?; if self.compressed { let gzi_source = match gzi { @@ -467,13 +464,11 @@ pub fn read_fasta( let ipc = if let Some(regions) = regions { let index = resolve_faidx(py, &src, index)?; - let regions: Vec = regions + let regions: Vec = regions .into_iter() - .map(|s| { - s.parse::() - .map_err(|e| PyErr::new::(e.to_string())) - }) - .collect::, _>>()?; + .map(|s| oxbow::Region::parse(&s, oxbow::CoordSystem::OneClosed)) + .collect::>>() + .map_err(to_py)?; if compressed { let gzi_source = match gzi { Some(gzi) => pyobject_to_bufreader(py, gzi.clone_ref(py), false)?, diff --git a/py-oxbow/src/variant.rs b/py-oxbow/src/variant.rs index f450892..3f10547 100644 --- a/py-oxbow/src/variant.rs +++ b/py-oxbow/src/variant.rs @@ -7,8 +7,6 @@ use pyo3::IntoPyObjectExt; use pyo3_arrow::PyRecordBatchReader; use pyo3_arrow::PySchema; -use noodles::core::Region; - use crate::error::{err_on_unwind, to_py}; use crate::util::{ pyobject_to_bufreader, resolve_coord_system, resolve_fields, resolve_index, PyVirtualPosition, @@ -337,9 +335,8 @@ impl PyVcfScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, self.scanner.model().coord_system()).map_err(to_py)?; match self.reader.clone() { Reader::BgzfFile(bgzf_reader) => { let fmt_reader = noodles::vcf::io::Reader::new(bgzf_reader); @@ -709,9 +706,8 @@ impl PyBcfScanner { batch_size: Option, limit: Option, ) -> PyResult { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = + oxbow::Region::parse(®ion, self.scanner.model().coord_system()).map_err(to_py)?; match self.reader.clone() { Reader::BgzfFile(bgzf_reader) => { @@ -839,9 +835,7 @@ pub fn read_vcf( .map_err(to_py)?; let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = oxbow::Region::parse(®ion, 
oxbow::CoordSystem::OneClosed).map_err(to_py)?; match reader { Reader::BgzfFile(bgzf_reader) => { @@ -940,9 +934,7 @@ pub fn read_bcf( .map_err(to_py)?; let ipc = if let Some(region) = region { - let region = region - .parse::() - .map_err(|e| PyErr::new::(e.to_string()))?; + let region = oxbow::Region::parse(®ion, oxbow::CoordSystem::OneClosed).map_err(to_py)?; match reader { Reader::BgzfFile(bgzf_reader) => { diff --git a/r-oxbow/src/rust/src/lib.rs b/r-oxbow/src/rust/src/lib.rs index dc9cb1f..6f65444 100644 --- a/r-oxbow/src/rust/src/lib.rs +++ b/r-oxbow/src/rust/src/lib.rs @@ -4,7 +4,7 @@ use extendr_api::prelude::*; use flate2::bufread::MultiGzDecoder; use noodles::bgzf::io::IndexedReader as IndexedBgzfReader; -use noodles::core::Region; +use oxbow::Region; use oxbow::alignment::{BamScanner, CramScanner, SamScanner}; use oxbow::bbi::{BigBedScanner, BigWigScanner}; @@ -74,7 +74,7 @@ fn read_fasta_impl( noodles::fasta::fai::fs::read(index_path).expect("Could not read FASTA index file."); let regions: Vec = regions .into_iter() - .map(|s| s.parse::().unwrap()) + .map(|s| Region::parse(&s, CoordSystem::OneClosed).unwrap()) .collect(); if compressed { let gzi_path = gzi.unwrap_or(format!("{}.gzi", path)); @@ -120,7 +120,7 @@ pub fn read_sam_impl( let ipc = if let Some(region) = region { let index_path = index.unwrap_or(format!("{}.tbi", path)); let index = noodles::tabix::fs::read(index_path).expect("Could not read TBI index file."); - let region = region.parse::().unwrap(); + let region = Region::parse(®ion, CoordSystem::OneClosed).unwrap(); let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::sam::io::Reader::new(bgzf_reader); let header = fmt_reader.read_header().unwrap(); @@ -194,7 +194,7 @@ pub fn read_bam_impl( let index_path = index.unwrap_or(format!("{}.bai", path)); let index = noodles::bam::bai::fs::read(index_path).expect("Could not read BAI index file."); - let region = region.parse::().unwrap(); + let region = 
Region::parse(®ion, CoordSystem::OneClosed).unwrap(); let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::bam::io::Reader::from(bgzf_reader); let header = fmt_reader.read_header().unwrap(); @@ -286,7 +286,7 @@ pub fn read_cram_impl( let index_path = index.unwrap_or(format!("{}.crai", path)); let index = noodles::cram::crai::fs::read(index_path).expect("Could not read CRAI index file."); - let region = region.parse::().unwrap(); + let region = Region::parse(®ion, CoordSystem::OneClosed).unwrap(); let mut fmt_reader = noodles::cram::io::reader::Builder::default() .set_reference_sequence_repository(repo.clone()) .build_from_reader(reader); @@ -354,7 +354,7 @@ pub fn read_vcf_impl( let ipc = if let Some(region) = region { let index_path = index.unwrap_or(format!("{}.tbi", path)); let index = noodles::tabix::fs::read(index_path).expect("Could not read TBI index file."); - let region = region.parse::().unwrap(); + let region = Region::parse(®ion, CoordSystem::OneClosed).unwrap(); let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::vcf::io::Reader::new(bgzf_reader); let header = fmt_reader.read_header().unwrap(); @@ -438,7 +438,7 @@ pub fn read_bcf_impl( let ipc = if let Some(region) = region { let index_path = index.unwrap_or(format!("{}.csi", path)); let index = noodles::csi::fs::read(index_path).expect("Could not read CSI index file."); - let region = region.parse::().unwrap(); + let region = Region::parse(®ion, CoordSystem::OneClosed).unwrap(); let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::bcf::io::Reader::from(bgzf_reader); let header = fmt_reader.read_header().unwrap(); @@ -513,7 +513,7 @@ pub fn read_gtf_impl( let ipc = if let Some(region) = region { let index_path = index.unwrap_or(format!("{}.tbi", path)); let index = noodles::tabix::fs::read(index_path).expect("Could not read TBI index file."); - let region = region.parse::().unwrap(); + let region = 
Region::parse(®ion, CoordSystem::OneClosed).unwrap(); let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::gtf::io::Reader::new(bgzf_reader); let attr_defs = GtfScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); @@ -583,7 +583,7 @@ pub fn read_gff_impl( let ipc = if let Some(region) = region { let index_path = index.unwrap_or(format!("{}.tbi", path)); let index = noodles::tabix::fs::read(index_path).expect("Could not read TBI index file."); - let region = region.parse::().unwrap(); + let region = Region::parse(®ion, CoordSystem::OneClosed).unwrap(); let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::gff::io::Reader::new(bgzf_reader); let attr_defs = GffScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); @@ -659,7 +659,7 @@ pub fn read_bed_impl( let ipc = if let Some(region) = region { let index_path = index.unwrap_or(format!("{}.tbi", path)); let index = noodles::tabix::fs::read(index_path).expect("Could not read TBI index file."); - let region = region.parse::().unwrap(); + let region = Region::parse(®ion, CoordSystem::ZeroHalfOpen).unwrap(); let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let fmt_reader = noodles::bed::io::Reader::new(bgzf_reader); let batches = scanner @@ -692,7 +692,7 @@ pub fn read_bigwig_impl( .unwrap(); let ipc = if let Some(region) = region { - let region = region.parse::().unwrap(); + let region = Region::parse(®ion, CoordSystem::ZeroHalfOpen).unwrap(); let fmt_reader = bigtools::BigWigRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); let scanner = @@ -727,7 +727,7 @@ pub fn read_bigbed_impl( .unwrap(); let ipc = if let Some(region) = region { - let region = region.parse::().unwrap(); + let region = Region::parse(®ion, CoordSystem::ZeroHalfOpen).unwrap(); let fmt_reader = bigtools::BigBedRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); let scanner = BigBedScanner::new( From 
a7ced213efa5dc49c9a4f9637377db491ddde00a Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Sat, 21 Mar 2026 09:02:52 -0400 Subject: [PATCH 3/3] Update docstrings --- oxbow/src/alignment/scanner/bam.rs | 6 +++--- oxbow/src/alignment/scanner/cram.rs | 6 +++--- oxbow/src/alignment/scanner/sam.rs | 6 +++--- oxbow/src/bbi/model/base.rs | 3 +++ oxbow/src/bbi/model/zoom.rs | 2 ++ oxbow/src/bbi/scanner/bbizoom.rs | 9 +++++++-- oxbow/src/bbi/scanner/bigbed.rs | 9 +++++++-- oxbow/src/bbi/scanner/bigwig.rs | 8 ++++++-- oxbow/src/bed/model.rs | 3 +++ oxbow/src/bed/scanner/bed.rs | 7 ++++--- oxbow/src/gxf/scanner/gff.rs | 5 +++-- oxbow/src/gxf/scanner/gtf.rs | 5 +++-- oxbow/src/sequence/model.rs | 3 +++ oxbow/src/sequence/scanner/fasta.rs | 7 ++++--- oxbow/src/variant/scanner/bcf.rs | 13 +++++++++++-- oxbow/src/variant/scanner/vcf.rs | 13 +++++++++++-- py-oxbow/oxbow/_core/base.py | 9 +++++++++ py-oxbow/src/alignment.rs | 27 ++++++++++++++++++++++++--- py-oxbow/src/bbi.rs | 24 +++++++++++++++++++++--- py-oxbow/src/bed.rs | 9 ++++++++- py-oxbow/src/gxf.rs | 18 ++++++++++++++++-- py-oxbow/src/sequence.rs | 9 +++++++-- py-oxbow/src/variant.rs | 22 ++++++++++++++++------ 23 files changed, 177 insertions(+), 46 deletions(-) diff --git a/oxbow/src/alignment/scanner/bam.rs b/oxbow/src/alignment/scanner/bam.rs index 93e607d..96fe5c8 100644 --- a/oxbow/src/alignment/scanner/bam.rs +++ b/oxbow/src/alignment/scanner/bam.rs @@ -10,7 +10,7 @@ use crate::alignment::model::BatchBuilder; use crate::alignment::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::alignment::AlignmentModel; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::{CoordSystem, Select}; +use crate::{CoordSystem, Region, Select}; /// A BAM scanner. /// @@ -44,7 +44,7 @@ impl Scanner { /// /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. - /// - `coord_system`: output coordinate system. 
`None` → 1-based closed. + /// - `coord_system`: output coordinate system for position columns. pub fn new( header: noodles::sam::Header, fields: Select, @@ -175,7 +175,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::bam::io::Reader, - region: crate::Region, + region: Region, index: impl BinningIndex, columns: Option>, batch_size: Option, diff --git a/oxbow/src/alignment/scanner/cram.rs b/oxbow/src/alignment/scanner/cram.rs index 240b95a..6e0c37c 100644 --- a/oxbow/src/alignment/scanner/cram.rs +++ b/oxbow/src/alignment/scanner/cram.rs @@ -10,7 +10,7 @@ use crate::alignment::model::tag::TagScanner; use crate::alignment::model::BatchBuilder; use crate::alignment::AlignmentModel; use crate::batch::{Push, RecordBatchBuilder as _}; -use crate::{CoordSystem, Select}; +use crate::{CoordSystem, Region, Select}; /// A CRAM scanner. /// @@ -50,7 +50,7 @@ impl Scanner { /// /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. - /// - `coord_system`: output coordinate system. `None` → 1-based closed. + /// - `coord_system`: output coordinate system for position columns. /// /// The FASTA repository is stored and used by scan methods for decoding. 
pub fn new( @@ -210,7 +210,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::cram::io::Reader, - region: crate::Region, + region: Region, index: noodles::cram::crai::Index, columns: Option>, batch_size: Option, diff --git a/oxbow/src/alignment/scanner/sam.rs b/oxbow/src/alignment/scanner/sam.rs index f4ebab4..1125a9e 100644 --- a/oxbow/src/alignment/scanner/sam.rs +++ b/oxbow/src/alignment/scanner/sam.rs @@ -10,7 +10,7 @@ use crate::alignment::model::BatchBuilder; use crate::alignment::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::alignment::AlignmentModel; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::{CoordSystem, Select}; +use crate::{CoordSystem, Region, Select}; /// A SAM scanner. /// @@ -44,7 +44,7 @@ impl Scanner { /// /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. - /// - `coord_system`: output coordinate system. `None` → 1-based closed. + /// - `coord_system`: output coordinate system for position columns. pub fn new( header: noodles::sam::Header, fields: Select, @@ -175,7 +175,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::sam::io::Reader, - region: crate::Region, + region: Region, index: impl BinningIndex, columns: Option>, batch_size: Option, diff --git a/oxbow/src/bbi/model/base.rs b/oxbow/src/bbi/model/base.rs index 39baf9d..40cc348 100644 --- a/oxbow/src/bbi/model/base.rs +++ b/oxbow/src/bbi/model/base.rs @@ -61,6 +61,9 @@ fn bed_schema_field_defs(bed_schema: &BedSchema) -> Vec { /// /// Wraps a [`BedSchema`] with field projection, using AutoSql-based Arrow /// types (e.g., UInt32 for positions) rather than BED's noodles-based types. +/// +/// `coord_system` controls the coordinate system to return the positions in. +/// The default is 0-based half-open (BED/BBI convention). 
#[derive(Clone, Debug)] pub struct Model { bed_schema: BedSchema, diff --git a/oxbow/src/bbi/model/zoom.rs b/oxbow/src/bbi/model/zoom.rs index 71cf28a..c7a4350 100644 --- a/oxbow/src/bbi/model/zoom.rs +++ b/oxbow/src/bbi/model/zoom.rs @@ -40,6 +40,8 @@ impl<'a> BBIZoomRecord<'a> { /// /// Fixed schema: 8 fields (chrom, start, end, bases_covered, min, max, /// sum, sum_squares). The `fields` parameter projects which to include. +/// The `coord_system` parameter controls the coordinate system to return the +/// positions in. /// /// # Examples /// diff --git a/oxbow/src/bbi/scanner/bbizoom.rs b/oxbow/src/bbi/scanner/bbizoom.rs index 38a6b9d..c351ec4 100644 --- a/oxbow/src/bbi/scanner/bbizoom.rs +++ b/oxbow/src/bbi/scanner/bbizoom.rs @@ -7,7 +7,7 @@ pub use super::BBIReader; use crate::bbi::model::zoom::BatchBuilder; use crate::bbi::model::zoom::Model; use crate::bbi::scanner::batch_iterator::zoom::{BBIZoomBatchIterator, BBIZoomQueryBatchIterator}; -use crate::{CoordSystem, Select}; +use crate::{CoordSystem, Region, Select}; /// A scanner for the summary statistics from BBI file zoom level. /// @@ -35,6 +35,11 @@ pub struct Scanner { impl Scanner { /// Creates a BBI zoom level scanner. + /// + /// - `ref_names`: the reference sequence names in the BBI file. + /// - `zoom_level`: the zoom level to read from. + /// - `fields`: column names to project. + /// - `coord_system`: output coordinate system for position columns. 
pub fn new( ref_names: Vec, zoom_level: u32, @@ -125,7 +130,7 @@ impl Scanner { pub fn scan_query( &self, reader: BBIReader, - region: crate::Region, + region: Region, columns: Option>, batch_size: Option, limit: Option, diff --git a/oxbow/src/bbi/scanner/bigbed.rs b/oxbow/src/bbi/scanner/bigbed.rs index 4463faf..b597319 100644 --- a/oxbow/src/bbi/scanner/bigbed.rs +++ b/oxbow/src/bbi/scanner/bigbed.rs @@ -8,7 +8,7 @@ use crate::bbi::model::base::BatchBuilder; use crate::bbi::model::base::BedSchema; use crate::bbi::model::base::Model; use crate::bbi::scanner::batch_iterator::base::{BigBedBatchIterator, BigBedQueryBatchIterator}; -use crate::{CoordSystem, Select}; +use crate::{CoordSystem, Region, Select}; /// A BigBed scanner. /// @@ -34,6 +34,11 @@ pub struct Scanner { impl Scanner { /// Creates a BigBed scanner from a BED schema, BBI file info, and optional field names. + /// + /// - `bed_schema`: the parsing interpretation. + /// - `info`: the BBI file info. + /// - `fields`: column names to project. + /// - `coord_system`: output coordinate system for position columns. pub fn new( bed_schema: BedSchema, info: bigtools::BBIFileInfo, @@ -130,7 +135,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: BigBedRead, - region: crate::Region, + region: Region, columns: Option>, batch_size: Option, limit: Option, diff --git a/oxbow/src/bbi/scanner/bigwig.rs b/oxbow/src/bbi/scanner/bigwig.rs index ca7faa1..69d86d0 100644 --- a/oxbow/src/bbi/scanner/bigwig.rs +++ b/oxbow/src/bbi/scanner/bigwig.rs @@ -8,7 +8,7 @@ use crate::bbi::model::base::BatchBuilder; use crate::bbi::model::base::BedSchema; use crate::bbi::model::base::Model; use crate::bbi::scanner::batch_iterator::base::{BigWigBatchIterator, BigWigQueryBatchIterator}; -use crate::{CoordSystem, Select}; +use crate::{CoordSystem, Region, Select}; /// A BigWig scanner. /// @@ -34,6 +34,10 @@ pub struct Scanner { impl Scanner { /// Creates a BigWig scanner from BBI file info and optional field names. 
+ /// + /// - `info`: the BBI file info. + /// - `fields`: column names to project. + /// - `coord_system`: output coordinate system for position columns. pub fn new( info: bigtools::BBIFileInfo, fields: Select, @@ -130,7 +134,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: BigWigRead, - region: crate::Region, + region: Region, columns: Option>, batch_size: Option, limit: Option, diff --git a/oxbow/src/bed/model.rs b/oxbow/src/bed/model.rs index 2d25cd7..8c42e6e 100644 --- a/oxbow/src/bed/model.rs +++ b/oxbow/src/bed/model.rs @@ -25,6 +25,9 @@ use field::Field; /// Uses BED-specific Arrow types for standard fields (e.g., Int64 for /// positions) and FieldDef types for custom fields. /// +/// `coord_system` controls the coordinate system to return the positions in. +/// The default is 0-based half-open (BED convention). +/// /// # Examples /// /// ``` diff --git a/oxbow/src/bed/scanner/bed.rs b/oxbow/src/bed/scanner/bed.rs index e78c7e7..d1fb35d 100644 --- a/oxbow/src/bed/scanner/bed.rs +++ b/oxbow/src/bed/scanner/bed.rs @@ -10,7 +10,7 @@ use crate::bed::model::BedSchema; use crate::bed::model::Model; use crate::bed::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::{CoordSystem, OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Region, Select}; /// A BED scanner. /// @@ -41,7 +41,8 @@ impl Scanner { /// Creates a BED scanner from a BED schema and optional field projection. /// /// - `bed_schema`: the parsing interpretation. - /// - `fields`: column names to project. `None` → all fields from the schema. + /// - `fields`: column names to project. + /// - `coord_system`: output coordinate system for position columns. 
pub fn new( bed_schema: BedSchema, fields: Select, @@ -113,7 +114,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::bed::io::Reader<3, R>, - region: crate::Region, + region: Region, index: impl BinningIndex, columns: Option>, batch_size: Option, diff --git a/oxbow/src/gxf/scanner/gff.rs b/oxbow/src/gxf/scanner/gff.rs index 09bdbbe..00931ff 100644 --- a/oxbow/src/gxf/scanner/gff.rs +++ b/oxbow/src/gxf/scanner/gff.rs @@ -12,7 +12,7 @@ use crate::gxf::model::BatchBuilder; use crate::gxf::model::Model; use crate::gxf::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::{CoordSystem, OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Region, Select}; /// A GFF scanner. /// @@ -46,6 +46,7 @@ impl Scanner { /// /// - `fields`: standard GXF field selection. `All` → all 8 standard fields. /// - `attr_defs`: `None` → no attributes column. `Some(vec![])` → empty struct. + /// - `coord_system`: output coordinate system for position columns. pub fn new( header: Option, fields: Select, @@ -168,7 +169,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::gff::io::Reader, - region: crate::Region, + region: Region, index: impl BinningIndex, columns: Option>, batch_size: Option, diff --git a/oxbow/src/gxf/scanner/gtf.rs b/oxbow/src/gxf/scanner/gtf.rs index 3825535..00db307 100644 --- a/oxbow/src/gxf/scanner/gtf.rs +++ b/oxbow/src/gxf/scanner/gtf.rs @@ -12,7 +12,7 @@ use crate::gxf::model::BatchBuilder; use crate::gxf::model::Model; use crate::gxf::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::{CoordSystem, OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Region, Select}; /// A GTF scanner. /// @@ -46,6 +46,7 @@ impl Scanner { /// /// - `fields`: standard GXF field selection. `All` → all 8 standard fields. /// - `attr_defs`: `None` → no attributes column. 
`Some(vec![])` → empty struct. + /// - `coord_system`: output coordinate system for position columns. pub fn new( header: Option, fields: Select, @@ -168,7 +169,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::gtf::io::Reader, - region: crate::Region, + region: Region, index: impl BinningIndex, columns: Option>, batch_size: Option, diff --git a/oxbow/src/sequence/model.rs b/oxbow/src/sequence/model.rs index a1644c8..5511145 100644 --- a/oxbow/src/sequence/model.rs +++ b/oxbow/src/sequence/model.rs @@ -18,6 +18,9 @@ use field::{Field, FASTA_DEFAULT_FIELD_NAMES, FASTQ_DEFAULT_FIELD_NAMES}; /// - `fields` selects which fields become Arrow columns. /// `All` → format-specific defaults (3 for FASTA, 4 for FASTQ). /// `Omit` → no fields. `Some(vec)` → specific fields. +/// - `coord_system` can be used to define how ambiguous input query region +/// strings should be interpreted. There are no position columns in sequence +/// output. /// /// # Examples /// diff --git a/oxbow/src/sequence/scanner/fasta.rs b/oxbow/src/sequence/scanner/fasta.rs index 0dad1dd..ed2df70 100644 --- a/oxbow/src/sequence/scanner/fasta.rs +++ b/oxbow/src/sequence/scanner/fasta.rs @@ -6,7 +6,7 @@ use arrow::datatypes::Schema; use crate::sequence::model::BatchBuilder; use crate::sequence::model::Model; use crate::sequence::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; -use crate::{CoordSystem, Select}; +use crate::{CoordSystem, Region, Select}; /// A FASTA scanner. /// @@ -37,7 +37,8 @@ pub struct Scanner { impl Scanner { /// Creates a FASTA scanner from schema parameters. /// - /// `fields`: `All` → `["name", "description", "sequence"]`. + /// - `fields`: `All` → `["name", "description", "sequence"]`. + /// - `coord_system`: coordinate system for query region interpretation. 
pub fn new(fields: Select, coord_system: CoordSystem) -> crate::Result { let model = Model::new_fasta(fields, coord_system)?; Ok(Self { model }) } @@ -104,7 +105,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::fasta::io::Reader, - regions: Vec, + regions: Vec, index: noodles::fasta::fai::Index, columns: Option>, batch_size: Option, diff --git a/oxbow/src/variant/scanner/bcf.rs b/oxbow/src/variant/scanner/bcf.rs index 7a64d92..8b1222d 100644 --- a/oxbow/src/variant/scanner/bcf.rs +++ b/oxbow/src/variant/scanner/bcf.rs @@ -8,7 +8,7 @@ use noodles::csi::BinningIndex; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; use crate::variant::model::{BatchBuilder, GenotypeBy, Model}; use crate::variant::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; -use crate::{CoordSystem, OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Region, Select}; /// A BCF scanner. /// @@ -39,6 +39,15 @@ pub struct Scanner { impl Scanner { /// Creates a BCF scanner from a VCF header and schema parameters. + /// + /// - `header`: the VCF header, used for schema inference and validation. + /// - `fields`: standard VCF field selection. + /// - `info_fields`: INFO field selection. + /// - `genotype_fields`: FORMAT field selection. + /// - `genotype_by`: how to group genotype fields and samples. + /// - `samples`: sample selection for genotype fields. + /// - `samples_nested`: whether to nest sample-genotype columns under a single samples column. + /// - `coord_system`: output coordinate system for position columns.
#[allow(clippy::too_many_arguments)] pub fn new( header: noodles::vcf::Header, @@ -205,7 +214,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::bcf::io::Reader, - region: crate::Region, + region: Region, index: impl BinningIndex, columns: Option>, batch_size: Option, diff --git a/oxbow/src/variant/scanner/vcf.rs b/oxbow/src/variant/scanner/vcf.rs index 7d6c4fa..909f826 100644 --- a/oxbow/src/variant/scanner/vcf.rs +++ b/oxbow/src/variant/scanner/vcf.rs @@ -8,7 +8,7 @@ use noodles::csi::BinningIndex; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; use crate::variant::model::{BatchBuilder, GenotypeBy, Model}; use crate::variant::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; -use crate::{CoordSystem, OxbowError, Select}; +use crate::{CoordSystem, OxbowError, Region, Select}; /// A VCF scanner. /// @@ -39,6 +39,15 @@ pub struct Scanner { impl Scanner { /// Creates a VCF scanner from a VCF header and schema parameters. + /// + /// - `header`: the VCF header, used for schema inference and validation. + /// - `fields`: standard VCF field selection. + /// - `info_fields`: INFO field selection. + /// - `genotype_fields`: FORMAT field selection. + /// - `genotype_by`: how to group genotype fields and samples. + /// - `samples`: sample selection for genotype fields. + /// - `samples_nested`: whether to nest sample-genotype columns under a single samples column. + /// - `coord_system`: output coordinate system for position columns. 
#[allow(clippy::too_many_arguments)] pub fn new( header: noodles::vcf::Header, @@ -205,7 +214,7 @@ impl Scanner { pub fn scan_query( &self, fmt_reader: noodles::vcf::io::Reader, - region: crate::Region, + region: Region, index: impl BinningIndex, columns: Option>, batch_size: Option, diff --git a/py-oxbow/oxbow/_core/base.py b/py-oxbow/oxbow/_core/base.py index 37547a8..b550dd2 100644 --- a/py-oxbow/oxbow/_core/base.py +++ b/py-oxbow/oxbow/_core/base.py @@ -144,6 +144,15 @@ def regions(self, regions: str | list[str]) -> Self: Returns ------- DataSource + + Notes + ----- + Genomic range strings can be in the following formats: + + - UCSC-style ``"chr:start-end"``: interpreted using the coordinate + system of the data source. + - Bracket-style ``"chr:[start,end]"``: explicitly 1-based, end-inclusive. + - Bracket-style ``"chr:[start,end)"``: explicitly 0-based, end-exclusive. """ ... diff --git a/py-oxbow/src/alignment.rs b/py-oxbow/src/alignment.rs index f49a6b9..59dcde6 100644 --- a/py-oxbow/src/alignment.rs +++ b/py-oxbow/src/alignment.rs @@ -33,6 +33,9 @@ use oxbow::CoordSystem; /// tag_defs : list[tuple[str, str]], optional [default: None] /// Tag definitions for the ``"tags"`` struct column. ``None`` omits the /// tags column. Use the ``tag_defs()`` method to discover definitions. +/// coords : Literal["01", "11"], optional [default: "11"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PySamScanner { src: Py, @@ -305,7 +308,8 @@ impl PySamScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// index : path or file-like, optional /// The index file to use for querying the region. 
If None and the /// source was provided as a path, we will attempt to load the index @@ -479,6 +483,9 @@ impl PySamScanner { /// tag_defs : list[tuple[str, str]], optional [default: None] /// Tag definitions for the ``"tags"`` struct column. ``None`` omits the /// tags column. Use the ``tag_defs()`` method to discover definitions. +/// coords : Literal["01", "11"], optional [default: "11"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyBamScanner { src: Py, @@ -751,7 +758,8 @@ impl PyBamScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// index : path or file-like, optional /// The index file to use for querying the region. If None and the /// source was provided as a path, we will attempt to load the index @@ -923,6 +931,9 @@ impl PyBamScanner { /// tag_defs : list[tuple[str, str]], optional [default: None] /// Tag definitions for the ``"tags"`` struct column. ``None`` omits the /// tags column. Use the ``tag_defs()`` method to discover definitions. +/// coords : Literal["01", "11"], optional [default: "11"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass] pub struct PyCramScanner { src: Py, @@ -1104,7 +1115,8 @@ impl PyCramScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// index : path or file-like, optional /// The index file to use for querying the region. 
If None and the /// source was provided as a path, we will attempt to load the index @@ -1165,6 +1177,9 @@ impl PyCramScanner { /// ---------- /// src : str or file-like /// The path to the source file or a file-like object. +/// region : str +/// Genomic range string in the format "chr:start-end", +/// "chr:[start,end]" or "chr:[start,end)". /// fields : str or list[str] or None, optional /// Standard SAM fields to project. /// tag_defs : list[tuple[str, str]], optional @@ -1236,6 +1251,9 @@ pub fn read_sam( /// ---------- /// src : str or file-like /// The path to the source file or a file-like object. +/// region : str +/// Genomic range string in the format "chr:start-end", +/// "chr:[start,end]" or "chr:[start,end)". /// fields : str or list[str] or None, optional /// Standard SAM fields to project. /// tag_defs : list[tuple[str, str]], optional @@ -1307,6 +1325,9 @@ pub fn read_bam( /// ---------- /// src : str or file-like /// The path to the source file or a file-like object. +/// region : str +/// Genomic range string in the format "chr:start-end", +/// "chr:[start,end]" or "chr:[start,end)". /// fields : str or list[str] or None, optional /// Standard SAM fields to project. /// tag_defs : list[tuple[str, str]], optional diff --git a/py-oxbow/src/bbi.rs b/py-oxbow/src/bbi.rs index e292bf4..4581f98 100644 --- a/py-oxbow/src/bbi.rs +++ b/py-oxbow/src/bbi.rs @@ -31,6 +31,9 @@ pub enum PyBBIFileType { /// The path to the BigWig file or a file-like object. /// fields : list[str], optional /// Names of the fields to include in the schema. +/// coords : Literal["01", "11"], optional [default: "01"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyBigWigScanner { _src: Py, @@ -176,7 +179,8 @@ impl PyBigWigScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". 
+ /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// columns : list[str], optional /// Names of the columns to project. /// batch_size : int, optional [default: 1024] @@ -226,6 +230,9 @@ impl PyBigWigScanner { /// records, if it exists. /// fields : list[str], optional /// Names of the fields to include in the schema. +/// coords : Literal["01", "11"], optional [default: "01"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyBigBedScanner { _src: Py, @@ -421,7 +428,8 @@ impl PyBigBedScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// columns : list[str], optional /// Names of the columns to project. /// batch_size : int, optional [default: 1024] @@ -470,6 +478,9 @@ impl PyBigBedScanner { /// The zoom level resolution in bp. /// fields : list[str], optional /// Names of the fields to include in the schema. +/// coords : Literal["01", "11"], optional [default: "01"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyBBIZoomScanner { src: Py, @@ -650,7 +661,8 @@ impl PyBBIZoomScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// columns : list[str], optional /// Names of the columns to project. /// batch_size : int, optional [default: 1024] @@ -705,6 +717,9 @@ impl PyBBIZoomScanner { /// ---------- /// src : str or file-like /// The path to the source file or a file-like object. 
+/// region : str +/// Genomic range string in the format "chr:start-end", +/// "chr:[start,end]" or "chr:[start,end)". /// fields : list[str], optional /// Names of the fixed fields to project. /// @@ -758,6 +773,9 @@ pub fn read_bigwig( /// The path to the source file or a file-like object. /// bed_schema : str /// The BED schema to use for parsing BigBed records. +/// region : str +/// Genomic range string in the format "chr:start-end", +/// "chr:[start,end]" or "chr:[start,end)". /// fields : list[str], optional /// Names of the fixed fields to project. /// diff --git a/py-oxbow/src/bed.rs b/py-oxbow/src/bed.rs index 8e1be22..db96c6b 100644 --- a/py-oxbow/src/bed.rs +++ b/py-oxbow/src/bed.rs @@ -91,6 +91,9 @@ pub fn resolve_bed_schema(py: Python, obj: &Py) -> PyResult { /// Whether the source is BGZF-compressed. /// fields : list[str], optional /// Names of the BED fields to include in the schema. +/// coords : Literal["01", "11"], optional [default: "01"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyBedScanner { src: Py, @@ -296,7 +299,8 @@ impl PyBedScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// index : path or file-like, optional /// The index file to use for querying the region. If None and the /// source was provided as a path, we will attempt to load the index @@ -384,6 +388,9 @@ impl PyBedScanner { /// The path to the source file or a file-like object. /// bed_schema : str, list[tuple[str, str]], or dict[str, str] /// The BED schema. +/// region : str +/// Genomic range string in the format "chr:start-end", +/// "chr:[start,end]" or "chr:[start,end)". /// fields : list[str], optional /// Names of the fields to project. 
/// compressed : bool, optional [default: False] diff --git a/py-oxbow/src/gxf.rs b/py-oxbow/src/gxf.rs index fbc6d12..6b70684 100644 --- a/py-oxbow/src/gxf.rs +++ b/py-oxbow/src/gxf.rs @@ -32,6 +32,9 @@ use oxbow::CoordSystem; /// attribute_defs : list[tuple[str, str]], optional [default: None] /// Definitions for the ``"attributes"`` struct column. ``None`` omits the /// attributes column. Use the ``attribute_defs()`` method to discover definitions. +/// coords : Literal["01", "11"], optional [default: "11"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyGtfScanner { src: Py, @@ -267,7 +270,8 @@ impl PyGtfScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// index : path or file-like, optional /// The index file to use for querying the region. /// columns : list[str], optional @@ -356,6 +360,9 @@ impl PyGtfScanner { /// attribute_defs : list[tuple[str, str]], optional [default: None] /// Definitions for the ``"attributes"`` struct column. ``None`` omits the /// attributes column. Use the ``attribute_defs()`` method to discover definitions. +/// coords : Literal["01", "11"], optional [default: "11"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyGffScanner { src: Py, @@ -590,7 +597,8 @@ impl PyGffScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// index : path or file-like, optional /// The index file to use for querying the region. 
/// columns : list[str], optional @@ -672,6 +680,9 @@ impl PyGffScanner { /// ---------- /// src : str or file-like /// The path to the source file or a file-like object. +/// region : str +/// Genomic range string in the format "chr:start-end", +/// "chr:[start,end]" or "chr:[start,end)". /// fields : list[str], optional /// Names of the fixed fields to project. /// attr_defs : list[tuple[str, str]], optional @@ -740,6 +751,9 @@ pub fn read_gtf( /// ---------- /// src : str or file-like /// The path to the source file or a file-like object. +/// region : str +/// Genomic range string in the format "chr:start-end", +/// "chr:[start,end]" or "chr:[start,end)". /// fields : list[str], optional /// Names of the fixed fields to project. /// attr_defs : list[tuple[str, str]], optional diff --git a/py-oxbow/src/sequence.rs b/py-oxbow/src/sequence.rs index 0bd8473..90c20ed 100644 --- a/py-oxbow/src/sequence.rs +++ b/py-oxbow/src/sequence.rs @@ -212,6 +212,9 @@ impl PyFastqScanner { /// Whether the source is BGZF-compressed. /// fields : list[str], optional /// Names of the fixed fields to project. +/// coords : Literal["01", "11"], optional [default: "11"] +/// Coordinate system for interpreting query ranges. "01" for 0-based +/// half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyFastaScanner { src: Py, @@ -324,7 +327,8 @@ impl PyFastaScanner { /// Parameters /// ---------- /// regions : list[str] - /// Genomic ranges in the format "chr:start-end". + /// Genomic ranges in the format "chr:start-end", "chr:[start,end]" or + /// "chr:[start,end)". /// index : path or file-like, optional /// The FAI index file. /// gzi : path or file-like, optional @@ -433,7 +437,8 @@ pub fn read_fastq( /// src : str or file-like /// The path to the source file or a file-like object. /// regions : list[str], optional -/// Genomic ranges in the format "chr:start-end". 
+/// Genomic ranges in the format "chr:start-end", "chr:[start,end]" or +/// "chr:[start,end)". /// index : path or file-like, optional /// The FAI index file. /// gzi : path or file-like, optional diff --git a/py-oxbow/src/variant.rs b/py-oxbow/src/variant.rs index 3f10547..4b98e82 100644 --- a/py-oxbow/src/variant.rs +++ b/py-oxbow/src/variant.rs @@ -42,6 +42,9 @@ use oxbow::CoordSystem; /// samples_nested : bool, optional [default: False] /// Whether to nest sample genotype data under a single ``"samples"`` struct /// column. +/// coords : Literal["01", "11"], optional [default: "11"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyVcfScanner { src: Py, @@ -308,7 +311,8 @@ impl PyVcfScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". /// index : path or file-like, optional /// The index file to use for querying the region. If None and the /// source was provided as a path, we will attempt to load the index @@ -413,6 +417,9 @@ impl PyVcfScanner { /// samples_nested : bool, optional [default: False] /// Whether to nest sample genotype data under a single ``"samples"`` struct /// column. +/// coords : Literal["01", "11"], optional [default: "11"] +/// Coordinate system for returning positions and interpreting query ranges. +/// "01" for 0-based half-open, "11" for 1-based closed. #[pyclass(module = "oxbow.oxbow")] pub struct PyBcfScanner { src: Py, @@ -679,7 +686,8 @@ impl PyBcfScanner { /// Parameters /// ---------- /// region : str - /// Genomic region in the format "chr:start-end". + /// Genomic range string in the format "chr:start-end", + /// "chr:[start,end]" or "chr:[start,end)". 
/// index : path or file-like, optional /// The index file to use for querying the region. If None and the /// source was provided as a path, we will attempt to load the index @@ -776,8 +784,9 @@ fn resolve_genotype_by(genotype_by: Option) -> PyResult