diff --git a/docs/api/python/oxbow.core.BamFile.rst b/docs/api/python/oxbow.core.BamFile.rst index 4531295f..7e7308fa 100644 --- a/docs/api/python/oxbow.core.BamFile.rst +++ b/docs/api/python/oxbow.core.BamFile.rst @@ -27,6 +27,7 @@ ~BamFile.to_ipc ~BamFile.to_pandas ~BamFile.to_polars + ~BamFile.with_tags diff --git a/docs/api/python/oxbow.core.BcfFile.rst b/docs/api/python/oxbow.core.BcfFile.rst index 27185b03..4e79ff25 100644 --- a/docs/api/python/oxbow.core.BcfFile.rst +++ b/docs/api/python/oxbow.core.BcfFile.rst @@ -27,6 +27,7 @@ ~BcfFile.to_ipc ~BcfFile.to_pandas ~BcfFile.to_polars + ~BcfFile.with_samples diff --git a/docs/api/python/oxbow.core.CramFile.rst b/docs/api/python/oxbow.core.CramFile.rst index 50ac2325..a400593a 100644 --- a/docs/api/python/oxbow.core.CramFile.rst +++ b/docs/api/python/oxbow.core.CramFile.rst @@ -27,6 +27,7 @@ ~CramFile.to_ipc ~CramFile.to_pandas ~CramFile.to_polars + ~CramFile.with_tags diff --git a/docs/api/python/oxbow.core.GffFile.rst b/docs/api/python/oxbow.core.GffFile.rst index 35bc2c54..79438f9f 100644 --- a/docs/api/python/oxbow.core.GffFile.rst +++ b/docs/api/python/oxbow.core.GffFile.rst @@ -27,6 +27,7 @@ ~GffFile.to_ipc ~GffFile.to_pandas ~GffFile.to_polars + ~GffFile.with_attributes diff --git a/docs/api/python/oxbow.core.GtfFile.rst b/docs/api/python/oxbow.core.GtfFile.rst index d432ff5f..3d45f8b1 100644 --- a/docs/api/python/oxbow.core.GtfFile.rst +++ b/docs/api/python/oxbow.core.GtfFile.rst @@ -27,6 +27,7 @@ ~GtfFile.to_ipc ~GtfFile.to_pandas ~GtfFile.to_polars + ~GtfFile.with_attributes diff --git a/docs/api/python/oxbow.core.PyBamScanner.rst b/docs/api/python/oxbow.core.PyBamScanner.rst index 6df37dc0..ac280974 100644 --- a/docs/api/python/oxbow.core.PyBamScanner.rst +++ b/docs/api/python/oxbow.core.PyBamScanner.rst @@ -17,6 +17,7 @@ ~PyBamScanner.chrom_names ~PyBamScanner.chrom_sizes ~PyBamScanner.field_names + ~PyBamScanner.model ~PyBamScanner.scan ~PyBamScanner.scan_byte_ranges ~PyBamScanner.scan_query diff 
--git a/docs/api/python/oxbow.core.PyCramScanner.rst b/docs/api/python/oxbow.core.PyCramScanner.rst index ed2fdefe..30312ae4 100644 --- a/docs/api/python/oxbow.core.PyCramScanner.rst +++ b/docs/api/python/oxbow.core.PyCramScanner.rst @@ -17,6 +17,7 @@ ~PyCramScanner.chrom_names ~PyCramScanner.chrom_sizes ~PyCramScanner.field_names + ~PyCramScanner.model ~PyCramScanner.scan ~PyCramScanner.scan_query ~PyCramScanner.schema diff --git a/docs/api/python/oxbow.core.PySamScanner.rst b/docs/api/python/oxbow.core.PySamScanner.rst index 537b2a2d..cd274c0e 100644 --- a/docs/api/python/oxbow.core.PySamScanner.rst +++ b/docs/api/python/oxbow.core.PySamScanner.rst @@ -17,6 +17,7 @@ ~PySamScanner.chrom_names ~PySamScanner.chrom_sizes ~PySamScanner.field_names + ~PySamScanner.model ~PySamScanner.scan ~PySamScanner.scan_byte_ranges ~PySamScanner.scan_query diff --git a/docs/api/python/oxbow.core.SamFile.rst b/docs/api/python/oxbow.core.SamFile.rst index fae730a5..291c1ca4 100644 --- a/docs/api/python/oxbow.core.SamFile.rst +++ b/docs/api/python/oxbow.core.SamFile.rst @@ -27,6 +27,7 @@ ~SamFile.to_ipc ~SamFile.to_pandas ~SamFile.to_polars + ~SamFile.with_tags diff --git a/docs/api/python/oxbow.core.VcfFile.rst b/docs/api/python/oxbow.core.VcfFile.rst index 266003ed..93a376e8 100644 --- a/docs/api/python/oxbow.core.VcfFile.rst +++ b/docs/api/python/oxbow.core.VcfFile.rst @@ -27,6 +27,7 @@ ~VcfFile.to_ipc ~VcfFile.to_pandas ~VcfFile.to_polars + ~VcfFile.with_samples diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index a5072710..d060cfaf 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -103,7 +103,6 @@ import polars as pl ox.from_bam( "data/sample.bam", fields=["rname", "pos", "end", "mapq"], - tag_defs=[], ).regions( "chr1" ).pl() @@ -127,7 +126,7 @@ df = ( df ``` -## Nested and complex fields +## Nested and composite fields Oxbow can handle the complex field structures of genomics file formats because 
they can all be mapped to Arrow constructs like lists, arrays, and structs. @@ -141,7 +140,7 @@ The htslib alignment formats, SAM and BAM, have optional fields called `tags` th df = ( ox.from_bam( "data/sample.bam", - fields=[], + fields=None, tag_defs=[('MD', 'Z'), ('NM', 'C')] ) .regions("chr1") @@ -153,20 +152,22 @@ df = ( df ``` -By default, oxbow will scan an initial number of rows to discover tag definitions (determined by `tag_scan_rows`). Set `tag_defs=[]` to ignore tags entirely. +By calling the `with_tags()` method, oxbow will scan an initial number of rows to discover tag definitions to add to the schema (determined by `scan_rows`). ```{code-cell} ipython3 df = ( - ox.from_bam( - "data/sample.bam", - tag_defs=[], - ) + ox.from_bam("data/sample.bam") + .with_tags() .regions("chr1") .pl() ) df ``` +```{code-cell} ipython3 +df['tags'].struct.unnest().head() +``` + ### GTF/GFF attributes GTF/GFF attributes are analogous to SAM tags. For GTF, the type is always `"String"`. For GFF, attributes can be `"String"` or `"Array"`, the latter materializing as a list column. @@ -174,6 +175,7 @@ GTF/GFF attributes are analogous to SAM tags. For GTF, the type is always `"Stri ```{code-cell} ipython3 df = ( ox.from_gff("data/sample.gff") + .with_attributes() .pl() ) df.head() @@ -183,18 +185,22 @@ df.head() df['attributes'].struct.unnest().head() ``` +:::{important} +As of oxbow v0.7, alignment file tag definitions and annotation file attribute definitions are no longer auto-discovered by default---this behavior is **opt-in**. Use the {py:meth}`~oxbow.core.BamFile.with_tags` or {py:meth}`~oxbow.core.GffFile.with_attributes` methods, respectively, to discover or specify tag/attribute definitions. +::: + + ### VCF/BCF info fields For the htslib variant call formats, VCF and BCF, the subfields of the `INFO` field are defined in the VCF header, so they do not need to be discovered by sniffing rows and you do not need to specify types. -By default, all info fields are parsed. 
You can project any subset or ignore them entirely using the `info_fields` argument. +By default, all info fields are parsed (`info_fields="*"`). You can project any subset or ignore them entirely by setting the `info_fields` argument to `None`. ```{code-cell} ipython3 ( ox.from_vcf( "data/sample.vcf.gz", - info_fields=[], - samples=[], + info_fields=None, ) .pl() ).head() @@ -205,7 +211,6 @@ df = ( ox.from_vcf( "data/sample.vcf.gz", info_fields=["TYPE", "snpeff.Effect", "snpeff.Gene_Name", "snpeff.Transcript_BioType"], - samples=[], ) .pl() ) @@ -220,40 +225,70 @@ df.unnest("info").head() For the htslib variant call formats, each variant call record is associated with an arbitrary number of so-called `FORMAT` fields that provide genotype-related information for each sample. Like `INFO`, these fields are defined in the header. -Using the `samples` and `genotype_fields` arguments, you can project any subset of samples as separate struct columns and project any subset of their associated genotype fields. +Using the `samples` and `genotype_fields` arguments, you can project any subset of samples as separate struct columns and project any subset of their associated genotype fields. Use `samples="*"` to select all samples or a list to select a subset. ```{code-cell} ipython3 df = ox.from_vcf( "data/sample.vcf.gz", - info_fields=[], + info_fields=None, samples=['NA12891', 'NA12892'], ).pl() df.head() ``` -Each sample column is essentially a sub-dataframe of genotype fields. +Each sample column is essentially a sub-dataframe of that sample's genotype fields. ```{code-cell} ipython3 df['NA12892'].struct.unnest().head() ``` -You can also customize _how_ sample genotype data are nested by using the `genotype_by` argument. By default (`genotype_by="sample"`), the columns are grouped first by sample name, then by genotype field name. By setting `genotype_by="field"`, you can swap the nesting order to group columns first by genotype field name, then by sample name. 
+:::{important} +As of oxbow v0.7, variant file sample columns are no longer projected by default---they are **opt-in**. We recommend using the {py:meth}`~oxbow.core.VcfFile.with_samples` API, below, to do this. +::: + +The recommended approach to project sample genotype data is to use the `with_samples()` method. Declaring samples this way further nests all sample-related data in a single "samples" struct column for convenience. ```{code-cell} ipython3 -df = ox.from_vcf( - "data/sample.vcf.gz", - info_fields=[], - samples=['NA12891', 'NA12892'], - genotype_fields=['AD', 'DP', 'GQ', 'PL', 'TP'], - genotype_by="field", +df = ( + ox.from_vcf( + "data/sample.vcf.gz", + info_fields=None, + ) + .with_samples() + .pl() +) +df.head() +``` + +```{code-cell} ipython3 +df.unnest("samples").head() +``` + +You can also customize _how_ sample genotype data are nested by using the `group_by` argument to `with_samples()`. By default (`group_by="sample"`), the columns are grouped first by sample name, then by genotype field name. By setting `group_by="field"`, you can swap the nesting order to group columns first by genotype field name, then by sample name. + +```{code-cell} ipython3 +df = ( + ox.from_vcf( + "data/sample.vcf.gz", + info_fields=None, + ) + .with_samples( + ['NA12891', 'NA12892'], + genotype_fields=['AD', 'DP', 'GQ', 'PL', 'TP'], + group_by="field", + ) ).pl() df.head() ``` +```{code-cell} ipython3 +df.unnest("samples").head() +``` + In this case, each genotype field column is a data series containing the values of that field associated with each of the samples. ```{code-cell} ipython3 -df['DP'].struct.unnest().head() +df.unnest("samples")['DP'].struct.unnest().head() ``` ### BED schemas @@ -274,16 +309,51 @@ ox.from_bed("data/sample.bed", bed_schema="bed9").pl().head() ### BigBed AutoSql -Oxbow can also parse BigBed records that contain AutoSql definitions of the records. 
+BigBed records natively store genomic coordinate fields and a flat string containing the "rest" of the data (equivalent to a `bed3+` schema). ```{code-cell} ipython3 ox.from_bigbed("data/autosql-sample.bb").pl().head() ``` +If a BigBed file contains [AutoSql](https://genomewiki.ucsc.edu/index.php/AutoSql) definitions of its record fields and types, Oxbow can parse them. + ```{code-cell} ipython3 ox.from_bigbed("data/autosql-sample.bb", schema="autosql").pl().head() ``` +### Custom BED schemas + +You can impose a custom parsing interpretation---field names and types (beyond the first three fields)---on a BED or BigBed file as long as the text values in those fields are compatible with the types you impose. + +Pass in a BED schema as a tuple of `(str, dict[str, str])`, representing 3-12 standard BED fields (`"bed{n}"`) + custom extended fields encoded as a dictionary of field name to type name. Types can be declared using C-style AutoSql names (`string`, `short`, `float`, `double`, etc.) or Rust integer shorthands (`i8`, `u8`, `i32`, `f32`, `f64`, etc.). Fixed and variable-length array types can be declared using `int[]`, `int[10]` (AutoSql style) or `[i32]`, `[i32; 10]` (Rust shorthand style). + +```{code-cell} ipython3 +( + ox.from_bigbed( + "data/autosql-sample.bb", + schema=("bed4", {"score": "double", "strand": "string"}) + ) + .pl() + .head() +) +``` + +```{code-cell} ipython3 +narrowpeak = ( + "bed6", + {"fold_change": "f64", "-log10p": "f64", "-log10q": "f64", "relSummit": "i64"} +) +( + ox.from_bed( + "data/ENCFF758CQW.100.bed.gz", + bed_schema=narrowpeak, + compression="gzip" + ) + .pl() + .head() +) +``` + ## Zoom levels The UCSC BBI formats store multiple "zoom" or "reduction" levels. These are tables of fixed-resolution genomic bins containing summary statistics of the signal of a BigWig track track or the interval coverage depth of a BigBed track. 
diff --git a/fixtures/ENCFF758CQW.100.bed.gz b/fixtures/ENCFF758CQW.100.bed.gz new file mode 100644 index 00000000..b8131905 Binary files /dev/null and b/fixtures/ENCFF758CQW.100.bed.gz differ diff --git a/oxbow/src/alignment/model.rs b/oxbow/src/alignment/model.rs index 730fb8d6..64b66cab 100644 --- a/oxbow/src/alignment/model.rs +++ b/oxbow/src/alignment/model.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field as ArrowField, Schema, SchemaRef}; -use crate::OxbowError; +use crate::{OxbowError, Select}; use field::{Field, DEFAULT_FIELD_NAMES}; use tag::TagDef; @@ -32,15 +32,16 @@ use tag::TagDef; /// /// ``` /// use oxbow::alignment::model::Model; +/// use oxbow::Select; /// /// // Default: all 12 standard fields, no tags column. -/// let model = Model::new(None, None).unwrap(); +/// let model = Model::new(Select::All, None).unwrap(); /// assert_eq!(model.field_names().len(), 12); /// assert!(!model.has_tags()); /// /// // Custom: selected fields with tags. /// let model = Model::new( -/// Some(vec!["qname".into(), "pos".into()]), +/// Select::Some(vec!["qname".into(), "pos".into()]), /// Some(vec![("NM".into(), "i".into()), ("MD".into(), "Z".into())]), /// ).unwrap(); /// assert_eq!(model.field_names(), vec!["qname", "pos"]); @@ -59,16 +60,19 @@ pub struct Model { impl Model { /// Create a new alignment model. /// - /// - `fields`: standard SAM field names to include. `None` → all 12 - /// standard fields. + /// - `fields`: standard SAM field selection. `All` → all 12 standard + /// fields. `Some(vec)` → specific fields. `Omit` → no fields. /// - `tag_defs`: tag definitions as `(name, type_code)` pairs. `None` → /// no tags column. `Some(vec![])` → tags column with empty struct.
pub fn new( - fields: Option>, + fields: Select, tag_defs: Option>, ) -> crate::Result { - let field_names = - fields.unwrap_or_else(|| DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect()); + let field_names = match fields { + Select::All => DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect(), + Select::Some(names) => names, + Select::Omit => Vec::new(), + }; let mut parsed_fields = Vec::new(); for name in &field_names { @@ -96,7 +100,7 @@ impl Model { /// Create a model with all 12 default standard fields and no tags. pub fn default_fields() -> Self { - Self::new(None, None).expect("default fields are always valid") + Self::new(Select::All, None).expect("default fields are always valid") } fn build_schema(fields: &[Field], tag_defs: Option<&[TagDef]>) -> SchemaRef { @@ -188,7 +192,7 @@ impl Model { None }; - Self::new(Some(projected_fields), tag_defs) + Self::new(Select::Some(projected_fields), tag_defs) } } @@ -271,6 +275,10 @@ impl FromStr for Model { } } + let fields = match fields { + Some(names) => Select::Some(names), + None => Select::All, + }; Self::new(fields, tag_defs) } } @@ -281,7 +289,7 @@ mod tests { #[test] fn test_default_model() { - let model = Model::new(None, None).unwrap(); + let model = Model::new(Select::All, None).unwrap(); assert_eq!(model.field_names().len(), 12); assert!(!model.has_tags()); assert!(model.tag_defs().is_none()); @@ -291,13 +299,13 @@ mod tests { #[test] fn test_default_fields_constructor() { let model = Model::default_fields(); - assert_eq!(model, Model::new(None, None).unwrap()); + assert_eq!(model, Model::new(Select::All, None).unwrap()); } #[test] fn test_custom_fields_no_tags() { let model = Model::new( - Some(vec!["qname".into(), "flag".into(), "pos".into()]), + Select::Some(vec!["qname".into(), "flag".into(), "pos".into()]), None, ) .unwrap(); @@ -309,7 +317,7 @@ mod tests { #[test] fn test_fields_with_tags() { let model = Model::new( - Some(vec!["qname".into(), "pos".into()]), + 
Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into()), ("MD".into(), "Z".into())]), ) .unwrap(); @@ -323,7 +331,7 @@ mod tests { #[test] fn test_tags_empty_defs_is_empty_struct() { - let model = Model::new(Some(vec!["qname".into()]), Some(vec![])).unwrap(); + let model = Model::new(Select::Some(vec!["qname".into()]), Some(vec![])).unwrap(); assert!(model.has_tags()); assert!(model.tag_defs().unwrap().is_empty()); assert_eq!(model.schema().fields().len(), 2); @@ -336,7 +344,7 @@ mod tests { #[test] fn test_no_tags_when_tag_defs_none() { - let model = Model::new(Some(vec!["qname".into(), "pos".into()]), None).unwrap(); + let model = Model::new(Select::Some(vec!["qname".into(), "pos".into()]), None).unwrap(); assert!(!model.has_tags()); assert!(model.tag_defs().is_none()); assert_eq!(model.schema().fields().len(), 2); @@ -344,26 +352,26 @@ mod tests { #[test] fn test_invalid_field() { - let result = Model::new(Some(vec!["invalid".into()]), None); + let result = Model::new(Select::Some(vec!["invalid".into()]), None); assert!(result.is_err()); } #[test] fn test_invalid_tag_name() { - let result = Model::new(None, Some(vec![("X".into(), "i".into())])); + let result = Model::new(Select::All, Some(vec![("X".into(), "i".into())])); assert!(result.is_err()); } #[test] fn test_invalid_tag_type() { - let result = Model::new(None, Some(vec![("NM".into(), "Q".into())])); + let result = Model::new(Select::All, Some(vec![("NM".into(), "Q".into())])); assert!(result.is_err()); } #[test] fn test_project() { let model = Model::new( - Some(vec!["qname".into(), "flag".into(), "pos".into()]), + Select::Some(vec!["qname".into(), "flag".into(), "pos".into()]), Some(vec![("NM".into(), "i".into())]), ) .unwrap(); @@ -376,7 +384,7 @@ mod tests { #[test] fn test_project_with_tags() { let model = Model::new( - Some(vec!["qname".into(), "pos".into()]), + Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into())]), ) .unwrap(); @@ 
-403,7 +411,7 @@ mod tests { #[test] fn test_display_custom_with_tags() { let model = Model::new( - Some(vec!["qname".into(), "pos".into()]), + Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into()), ("MD".into(), "Z".into())]), ) .unwrap(); @@ -412,7 +420,7 @@ mod tests { #[test] fn test_display_tags_no_defs() { - let model = Model::new(Some(vec!["qname".into()]), Some(vec![])).unwrap(); + let model = Model::new(Select::Some(vec!["qname".into()]), Some(vec![])).unwrap(); assert_eq!(model.to_string(), "fields=qname;tags"); } @@ -425,7 +433,7 @@ mod tests { #[test] fn test_from_str_roundtrip() { let model = Model::new( - Some(vec!["qname".into(), "pos".into()]), + Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into()), ("MD".into(), "Z".into())]), ) .unwrap(); @@ -444,7 +452,7 @@ mod tests { #[test] fn test_from_str_roundtrip_empty_tags() { - let model = Model::new(Some(vec!["qname".into()]), Some(vec![])).unwrap(); + let model = Model::new(Select::Some(vec!["qname".into()]), Some(vec![])).unwrap(); let s = model.to_string(); let parsed: Model = s.parse().unwrap(); assert_eq!(model, parsed); @@ -453,7 +461,7 @@ mod tests { #[test] fn test_clone_eq() { let model = Model::new( - Some(vec!["qname".into()]), + Select::Some(vec!["qname".into()]), Some(vec![("NM".into(), "i".into())]), ) .unwrap(); @@ -465,12 +473,12 @@ mod tests { fn test_schema_independence() { // Schema should not depend on any file header content. 
let m1 = Model::new( - Some(vec!["qname".into(), "rname".into(), "pos".into()]), + Select::Some(vec!["qname".into(), "rname".into(), "pos".into()]), None, ) .unwrap(); let m2 = Model::new( - Some(vec!["qname".into(), "rname".into(), "pos".into()]), + Select::Some(vec!["qname".into(), "rname".into(), "pos".into()]), None, ) .unwrap(); diff --git a/oxbow/src/alignment/model/batch.rs b/oxbow/src/alignment/model/batch.rs index 8cc00a78..676d7e5c 100644 --- a/oxbow/src/alignment/model/batch.rs +++ b/oxbow/src/alignment/model/batch.rs @@ -8,6 +8,7 @@ use indexmap::IndexMap; use noodles::sam::alignment::record::data::field::Tag; use crate::batch::{Push, RecordBatchBuilder}; +use crate::Select; use super::field::Push as _; use super::field::{Field, FieldBuilder}; @@ -27,11 +28,11 @@ pub struct BatchBuilder { impl BatchBuilder { /// Creates a new `BatchBuilder` for SAM/BAM records. /// - /// - `fields`: standard SAM field names. `None` → all 12 standard fields. + /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. 
pub fn new( header: noodles::sam::Header, - fields: Option>, + fields: Select, tag_defs: Option>, capacity: usize, ) -> crate::Result { @@ -254,7 +255,7 @@ mod tests { #[test] fn test_batch_builder_new() { let header = noodles::sam::Header::default(); - let fields = Some(vec!["QNAME".to_string(), "FLAG".to_string()]); + let fields = Select::Some(vec!["QNAME".to_string(), "FLAG".to_string()]); let tag_defs = Some(vec![("NM".to_string(), "i".to_string())]); let capacity = 10; @@ -267,7 +268,7 @@ mod tests { #[test] fn test_schema() { let header = noodles::sam::Header::default(); - let fields = Some(vec!["QNAME".to_string(), "FLAG".to_string()]); + let fields = Select::Some(vec!["QNAME".to_string(), "FLAG".to_string()]); let tag_defs = Some(vec![("NM".to_string(), "i".to_string())]); let capacity = 10; @@ -286,7 +287,7 @@ mod tests { #[test] fn test_no_tags_when_tag_defs_none() { let header = noodles::sam::Header::default(); - let fields = Some(vec!["QNAME".to_string(), "FLAG".to_string()]); + let fields = Select::Some(vec!["QNAME".to_string(), "FLAG".to_string()]); let capacity = 10; let batch_builder = BatchBuilder::new(header, fields, None, capacity).unwrap(); @@ -298,7 +299,7 @@ mod tests { #[test] fn test_from_model() { let model = Model::new( - Some(vec!["qname".into(), "pos".into()]), + Select::Some(vec!["qname".into(), "pos".into()]), Some(vec![("NM".into(), "i".into())]), ) .unwrap(); @@ -312,7 +313,7 @@ mod tests { #[test] fn test_push_sam_record() { let header = noodles::sam::Header::default(); - let fields = Some(vec!["QNAME".to_string(), "FLAG".to_string()]); + let fields = Select::Some(vec!["QNAME".to_string(), "FLAG".to_string()]); let tag_defs = Some(vec![("NM".to_string(), "i".to_string())]); let capacity = 10; @@ -326,7 +327,7 @@ mod tests { #[test] fn test_push_bam_record() { let header = noodles::sam::Header::default(); - let fields = Some(vec!["QNAME".to_string(), "FLAG".to_string()]); + let fields = Select::Some(vec!["QNAME".to_string(), 
"FLAG".to_string()]); let tag_defs = Some(vec![("NM".to_string(), "i".to_string())]); let capacity = 10; @@ -340,7 +341,7 @@ mod tests { #[test] fn test_finish() { let header = noodles::sam::Header::default(); - let fields = Some(vec!["QNAME".to_string(), "FLAG".to_string()]); + let fields = Select::Some(vec!["QNAME".to_string(), "FLAG".to_string()]); let tag_defs = Some(vec![("NM".to_string(), "i".to_string())]); let capacity = 10; @@ -359,7 +360,7 @@ mod tests { #[test] fn test_finish_empty_tags() { let header = noodles::sam::Header::default(); - let fields = Some(vec!["QNAME".to_string()]); + let fields = Select::Some(vec!["QNAME".to_string()]); let capacity = 10; let mut batch_builder = BatchBuilder::new(header, fields, Some(vec![]), capacity).unwrap(); @@ -374,7 +375,7 @@ mod tests { #[test] fn test_finish_no_tags() { let header = noodles::sam::Header::default(); - let fields = Some(vec!["QNAME".to_string(), "FLAG".to_string()]); + let fields = Select::Some(vec!["QNAME".to_string(), "FLAG".to_string()]); let capacity = 10; let mut batch_builder = BatchBuilder::new(header, fields, None, capacity).unwrap(); diff --git a/oxbow/src/alignment/scanner/bam.rs b/oxbow/src/alignment/scanner/bam.rs index b12c5f2a..5f852183 100644 --- a/oxbow/src/alignment/scanner/bam.rs +++ b/oxbow/src/alignment/scanner/bam.rs @@ -10,6 +10,7 @@ use crate::alignment::model::BatchBuilder; use crate::alignment::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::alignment::AlignmentModel; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; +use crate::Select; /// A BAM scanner. 
/// @@ -20,6 +21,7 @@ use crate::util::query::{BgzfChunkReader, ByteRangeReader}; /// /// ```no_run /// use oxbow::alignment::scanner::bam::Scanner; +/// use oxbow::Select; /// use std::fs::File; /// use std::io::BufReader; /// @@ -28,7 +30,7 @@ use crate::util::query::{BgzfChunkReader, ByteRangeReader}; /// let header = fmt_reader.read_header().unwrap(); /// /// let tag_defs = Scanner::tag_defs(&mut fmt_reader, Some(1000)).unwrap(); -/// let scanner = Scanner::new(header, None, Some(tag_defs)).unwrap(); +/// let scanner = Scanner::new(header, Select::All, Some(tag_defs)).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -39,11 +41,11 @@ pub struct Scanner { impl Scanner { /// Creates a BAM scanner from a SAM header and schema parameters. /// - /// - `fields`: standard SAM field names. `None` → all 12 standard fields. + /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. 
pub fn new( header: noodles::sam::Header, - fields: Option>, + fields: Select, tag_defs: Option>, ) -> crate::Result { let model = AlignmentModel::new(fields, tag_defs)?; @@ -295,7 +297,7 @@ mod tests { #[test] fn test_scan_with_multithreaded_reader() { let (header, fmt_reader) = mt_reader(); - let scanner = Scanner::new(header, None, None).unwrap(); + let scanner = Scanner::new(header, Select::All, None).unwrap(); let mut batches = scanner.scan(fmt_reader, None, None, Some(10)).unwrap(); let batch = batches.next().unwrap().unwrap(); @@ -306,7 +308,7 @@ mod tests { #[test] fn test_scan_query_with_multithreaded_reader() { let (header, fmt_reader) = mt_reader(); - let scanner = Scanner::new(header, None, None).unwrap(); + let scanner = Scanner::new(header, Select::All, None).unwrap(); let index = noodles::bam::bai::fs::read("../fixtures/sample.bam.bai").unwrap(); diff --git a/oxbow/src/alignment/scanner/cram.rs b/oxbow/src/alignment/scanner/cram.rs index f3f0a34f..a117c7f5 100644 --- a/oxbow/src/alignment/scanner/cram.rs +++ b/oxbow/src/alignment/scanner/cram.rs @@ -10,6 +10,7 @@ use crate::alignment::model::tag::TagScanner; use crate::alignment::model::BatchBuilder; use crate::alignment::AlignmentModel; use crate::batch::{Push, RecordBatchBuilder as _}; +use crate::Select; /// A CRAM scanner. 
/// @@ -20,6 +21,7 @@ use crate::batch::{Push, RecordBatchBuilder as _}; /// /// ```no_run /// use oxbow::alignment::scanner::cram::Scanner; +/// use oxbow::Select; /// use std::fs::File; /// use noodles::fasta::io::indexed_reader::Builder as FastaIndexedReaderBuilder; /// use noodles::fasta::repository::adapters::IndexedReader as FastaIndexedReaderAdapter; @@ -33,7 +35,7 @@ use crate::batch::{Push, RecordBatchBuilder as _}; /// let header = fmt_reader.read_header().unwrap(); /// /// let tag_defs = Scanner::tag_defs(&mut fmt_reader, &header, Some(1000)).unwrap(); -/// let scanner = Scanner::new(header, None, Some(tag_defs), repository).unwrap(); +/// let scanner = Scanner::new(header, Select::All, Some(tag_defs), repository).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -45,13 +47,13 @@ pub struct Scanner { impl Scanner { /// Creates a CRAM scanner from a SAM header and schema parameters. /// - /// - `fields`: standard SAM field names. `None` → all 12 standard fields. + /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. /// /// The FASTA repository is stored and used by scan methods for decoding. pub fn new( header: noodles::sam::Header, - fields: Option>, + fields: Select, tag_defs: Option>, repo: noodles::fasta::Repository, ) -> crate::Result { diff --git a/oxbow/src/alignment/scanner/sam.rs b/oxbow/src/alignment/scanner/sam.rs index 05820881..ea0daeb3 100644 --- a/oxbow/src/alignment/scanner/sam.rs +++ b/oxbow/src/alignment/scanner/sam.rs @@ -10,6 +10,7 @@ use crate::alignment::model::BatchBuilder; use crate::alignment::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::alignment::AlignmentModel; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; +use crate::Select; /// A SAM scanner. 
/// @@ -20,6 +21,7 @@ use crate::util::query::{BgzfChunkReader, ByteRangeReader}; /// /// ```no_run /// use oxbow::alignment::scanner::sam::Scanner; +/// use oxbow::Select; /// use std::fs::File; /// use std::io::BufReader; /// @@ -28,7 +30,7 @@ use crate::util::query::{BgzfChunkReader, ByteRangeReader}; /// let header = fmt_reader.read_header().unwrap(); /// /// let tag_defs = Scanner::tag_defs(&mut fmt_reader, Some(1000)).unwrap(); -/// let scanner = Scanner::new(header, None, Some(tag_defs)).unwrap(); +/// let scanner = Scanner::new(header, Select::All, Some(tag_defs)).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -39,11 +41,11 @@ pub struct Scanner { impl Scanner { /// Creates a SAM scanner from a SAM header and schema parameters. /// - /// - `fields`: standard SAM field names. `None` → all 12 standard fields. + /// - `fields`: standard SAM field selection. /// - `tag_defs`: `None` → no tags column. `Some(vec![])` → empty struct. pub fn new( header: noodles::sam::Header, - fields: Option>, + fields: Select, tag_defs: Option>, ) -> crate::Result { let model = AlignmentModel::new(fields, tag_defs)?; diff --git a/oxbow/src/bbi/model/base.rs b/oxbow/src/bbi/model/base.rs index 075e3e07..ee29bbfc 100644 --- a/oxbow/src/bbi/model/base.rs +++ b/oxbow/src/bbi/model/base.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use arrow::datatypes::{Field as ArrowField, Schema, SchemaRef}; pub use crate::bed::model::schema::BedSchema; -use crate::OxbowError; +use crate::{OxbowError, Select}; pub use batch::BatchBuilder; use field::{bed_standard_fields, FieldDef}; @@ -73,11 +73,12 @@ impl Model { /// /// - `bed_schema`: the parsing interpretation. /// - `fields`: column names to project. `None` → all fields from the schema. 
- pub fn new(bed_schema: BedSchema, fields: Option>) -> crate::Result { + pub fn new(bed_schema: BedSchema, fields: Select) -> crate::Result { let all_defs = bed_schema_field_defs(&bed_schema); let projected = match fields { - None => all_defs, - Some(names) => { + Select::All => all_defs, + Select::Omit => Vec::new(), + Select::Some(names) => { let mut projected = Vec::new(); for name in &names { let def = all_defs @@ -155,7 +156,7 @@ impl Model { .map(|d| d.name.clone()) .collect(); - Self::new(self.bed_schema.clone(), Some(projected)) + Self::new(self.bed_schema.clone(), Select::Some(projected)) } } @@ -168,7 +169,7 @@ mod tests { #[test] fn test_bedgraph_model() { let bed_schema = BedSchema::new_bedgraph().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end", "value"]); // BBI uses UInt32 for positions (AutoSql types) assert_eq!(model.schema().field(1).data_type(), &DataType::UInt32); @@ -178,7 +179,7 @@ mod tests { #[test] fn test_bed6_model() { let bed_schema: BedSchema = "bed6".parse().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); assert_eq!(model.field_names().len(), 6); assert_eq!(model.schema().field(1).data_type(), &DataType::UInt32); } @@ -186,7 +187,7 @@ mod tests { #[test] fn test_bed6_projection() { let bed_schema: BedSchema = "bed6".parse().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); let projected = model.project(&["chrom".into(), "strand".into()]).unwrap(); assert_eq!(projected.field_names(), vec!["chrom", "strand"]); } @@ -195,7 +196,7 @@ mod tests { fn test_custom_fields() { let defs = vec![FieldDef::new("extra".into(), FieldType::Float)]; let bed_schema = BedSchema::new(3, Some(defs)).unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = 
Model::new(bed_schema, Select::All).unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end", "extra"]); assert_eq!(model.schema().field(3).data_type(), &DataType::Float32); } @@ -208,7 +209,7 @@ mod tests { FieldDef::new("c".into(), FieldType::String), ]; let bed_schema = BedSchema::new(3, Some(defs)).unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); let projected = model.project(&["chrom".into(), "c".into()]).unwrap(); assert_eq!(projected.field_names(), vec!["chrom", "c"]); } diff --git a/oxbow/src/bbi/model/base/batch.rs b/oxbow/src/bbi/model/base/batch.rs index 0e481a4f..a70c61cf 100644 --- a/oxbow/src/bbi/model/base/batch.rs +++ b/oxbow/src/bbi/model/base/batch.rs @@ -7,6 +7,7 @@ use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use indexmap::IndexMap; use crate::batch::{Push, RecordBatchBuilder}; +use crate::Select; use super::field::Push as _; pub use super::field::{FieldBuilder, FieldDef, FieldType}; @@ -25,7 +26,7 @@ impl BatchBuilder { /// Creates a new `BatchBuilder` for BigWig or BigBed records. 
pub fn new( bed_schema: BedSchema, - fields: Option>, + fields: Select, capacity: usize, ) -> crate::Result { let model = Model::new(bed_schema, fields)?; @@ -244,7 +245,7 @@ mod tests { #[test] fn test_batch_builder_new() { let bed_schema = create_test_bedschema(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); let builder = BatchBuilder::from_model(&model, 10).unwrap(); assert_eq!(builder.schema().fields().len(), 4); @@ -254,7 +255,7 @@ mod tests { #[test] fn test_schema() { let bed_schema = create_test_bedschema(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); let builder = BatchBuilder::from_model(&model, 10).unwrap(); let schema = builder.schema(); @@ -269,7 +270,7 @@ mod tests { #[test] fn test_push_bigbed_record() { let schema = create_test_bedschema(); - let model = Model::new(schema, None).unwrap(); + let model = Model::new(schema, Select::All).unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); let record = BigBedRecord { @@ -314,7 +315,7 @@ mod tests { #[test] fn test_push_bigwig_record() { let schema = create_test_bedschema(); - let model = Model::new(schema, None).unwrap(); + let model = Model::new(schema, Select::All).unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); let record = BigWigRecord { @@ -368,7 +369,7 @@ mod tests { #[test] fn test_finish_empty_batch() { let schema = create_test_bedschema(); - let model = Model::new(schema, None).unwrap(); + let model = Model::new(schema, Select::All).unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); let batch = builder.finish().unwrap(); @@ -380,7 +381,7 @@ mod tests { fn test_bigbed_bed6_no_custom() { // bed6 with no custom fields — standard fields 4-6 are in rest let bed_schema: BedSchema = "bed6".parse().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, 
Select::All).unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); let record = BigBedRecord { @@ -420,7 +421,11 @@ mod tests { fn test_bigbed_bed6_projected() { // bed6 projected to chrom + strand — strand is field 6, skipping name/score let bed_schema: BedSchema = "bed6".parse().unwrap(); - let model = Model::new(bed_schema, Some(vec!["chrom".into(), "strand".into()])).unwrap(); + let model = Model::new( + bed_schema, + Select::Some(vec!["chrom".into(), "strand".into()]), + ) + .unwrap(); let mut builder = BatchBuilder::from_model(&model, 10).unwrap(); let record = BigBedRecord { diff --git a/oxbow/src/bbi/model/zoom.rs b/oxbow/src/bbi/model/zoom.rs index 1419d478..69003e41 100644 --- a/oxbow/src/bbi/model/zoom.rs +++ b/oxbow/src/bbi/model/zoom.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use arrow::datatypes::{Field as ArrowField, Schema, SchemaRef}; -use crate::OxbowError; +use crate::{OxbowError, Select}; use field::{Field, DEFAULT_FIELD_NAMES}; pub struct BBIZoomRecord<'a> { @@ -45,11 +45,12 @@ impl<'a> BBIZoomRecord<'a> { /// /// ``` /// use oxbow::bbi::model::zoom::Model; +/// use oxbow::Select; /// -/// let model = Model::new(None).unwrap(); +/// let model = Model::new(Select::All).unwrap(); /// assert_eq!(model.field_names().len(), 8); /// -/// let model = Model::new(Some(vec!["chrom".into(), "start".into(), "end".into(), "sum".into()])).unwrap(); +/// let model = Model::new(Select::Some(vec!["chrom".into(), "start".into(), "end".into(), "sum".into()])).unwrap(); /// assert_eq!(model.field_names().len(), 4); /// ``` #[derive(Clone, Debug)] @@ -62,9 +63,12 @@ impl Model { /// Create a new BBI zoom model. /// /// `fields`: field names. `None` → all 8 default fields. 
- pub fn new(fields: Option>) -> crate::Result { - let field_names = - fields.unwrap_or_else(|| DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect()); + pub fn new(fields: Select) -> crate::Result { + let field_names = match fields { + Select::All => DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect(), + Select::Some(names) => names, + Select::Omit => Vec::new(), + }; let mut parsed_fields = Vec::new(); for name in &field_names { @@ -130,7 +134,7 @@ impl Model { .map(|f| f.name().to_string()) .collect(); - Self::new(Some(projected)) + Self::new(Select::Some(projected)) } } @@ -148,20 +152,25 @@ mod tests { #[test] fn test_defaults() { - let model = Model::new(None).unwrap(); + let model = Model::new(Select::All).unwrap(); assert_eq!(model.field_names().len(), 8); assert_eq!(model.schema().fields().len(), 8); } #[test] fn test_custom() { - let model = Model::new(Some(vec!["chrom".into(), "start".into(), "sum".into()])).unwrap(); + let model = Model::new(Select::Some(vec![ + "chrom".into(), + "start".into(), + "sum".into(), + ])) + .unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "sum"]); } #[test] fn test_project() { - let model = Model::new(None).unwrap(); + let model = Model::new(Select::All).unwrap(); let projected = model .project(&["chrom".into(), "min".into(), "max".into()]) .unwrap(); @@ -170,7 +179,7 @@ mod tests { #[test] fn test_invalid_field() { - let result = Model::new(Some(vec!["invalid".into()])); + let result = Model::new(Select::Some(vec!["invalid".into()])); assert!(result.is_err()); } } diff --git a/oxbow/src/bbi/scanner/bbizoom.rs b/oxbow/src/bbi/scanner/bbizoom.rs index 6e3ceb45..4574b415 100644 --- a/oxbow/src/bbi/scanner/bbizoom.rs +++ b/oxbow/src/bbi/scanner/bbizoom.rs @@ -7,6 +7,7 @@ pub use super::BBIReader; use crate::bbi::model::zoom::BatchBuilder; use crate::bbi::model::zoom::Model; use crate::bbi::scanner::batch_iterator::zoom::{BBIZoomBatchIterator, BBIZoomQueryBatchIterator}; +use crate::Select; /// A 
scanner for the summary statistics from BBI file zoom level. /// @@ -23,7 +24,8 @@ use crate::bbi::scanner::batch_iterator::zoom::{BBIZoomBatchIterator, BBIZoomQue /// let info = fmt_reader.info(); /// let ref_names = info.chrom_info.iter().map(|c| c.name.clone()).collect(); /// let zoom_levels: Vec = info.zoom_headers.iter().map(|h| h.reduction_level).collect(); -/// let scanner = Scanner::new(ref_names, zoom_levels[0], None).unwrap(); +/// use oxbow::Select; +/// let scanner = Scanner::new(ref_names, zoom_levels[0], Select::All).unwrap(); /// let batches = scanner.scan(BBIReader::BigWig(fmt_reader), None, None, Some(1000)); pub struct Scanner { ref_names: Vec, @@ -36,7 +38,7 @@ impl Scanner { pub fn new( ref_names: Vec, zoom_level: u32, - fields: Option>, + fields: Select, ) -> crate::Result { let model = Model::new(fields)?; Ok(Self { diff --git a/oxbow/src/bbi/scanner/bigbed.rs b/oxbow/src/bbi/scanner/bigbed.rs index 50e150ca..4d1f7f33 100644 --- a/oxbow/src/bbi/scanner/bigbed.rs +++ b/oxbow/src/bbi/scanner/bigbed.rs @@ -8,6 +8,7 @@ use crate::bbi::model::base::BatchBuilder; use crate::bbi::model::base::BedSchema; use crate::bbi::model::base::Model; use crate::bbi::scanner::batch_iterator::base::{BigBedBatchIterator, BigBedQueryBatchIterator}; +use crate::Select; /// A BigBed scanner. 
/// @@ -22,7 +23,8 @@ use crate::bbi::scanner::batch_iterator::base::{BigBedBatchIterator, BigBedQuery /// let mut fmt_reader = bigtools::BigBedRead::open_file("sample.bigBed").unwrap(); /// let info = fmt_reader.info(); /// -/// let scanner = Scanner::new("bed12".parse().unwrap(), info.clone(), None).unwrap(); +/// use oxbow::Select; +/// let scanner = Scanner::new("bed12".parse().unwrap(), info.clone(), Select::All).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); pub struct Scanner { model: Model, @@ -34,7 +36,7 @@ impl Scanner { pub fn new( bed_schema: BedSchema, info: bigtools::BBIFileInfo, - fields: Option>, + fields: Select, ) -> crate::Result { let model = Model::new(bed_schema, fields)?; Ok(Self { model, info }) diff --git a/oxbow/src/bbi/scanner/bigwig.rs b/oxbow/src/bbi/scanner/bigwig.rs index 7f666649..847dc914 100644 --- a/oxbow/src/bbi/scanner/bigwig.rs +++ b/oxbow/src/bbi/scanner/bigwig.rs @@ -8,6 +8,7 @@ use crate::bbi::model::base::BatchBuilder; use crate::bbi::model::base::BedSchema; use crate::bbi::model::base::Model; use crate::bbi::scanner::batch_iterator::base::{BigWigBatchIterator, BigWigQueryBatchIterator}; +use crate::Select; /// A BigWig scanner. /// @@ -21,7 +22,8 @@ use crate::bbi::scanner::batch_iterator::base::{BigWigBatchIterator, BigWigQuery /// let mut fmt_reader = bigtools::BigWigRead::open_file("sample.bigWig").unwrap(); /// let info = fmt_reader.info(); /// -/// let scanner = Scanner::new(info.clone(), None).unwrap(); +/// use oxbow::Select; +/// let scanner = Scanner::new(info.clone(), Select::All).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -31,7 +33,7 @@ pub struct Scanner { impl Scanner { /// Creates a BigWig scanner from BBI file info and optional field names. 
- pub fn new(info: bigtools::BBIFileInfo, fields: Option>) -> crate::Result { + pub fn new(info: bigtools::BBIFileInfo, fields: Select) -> crate::Result { let bed_schema: BedSchema = "bedGraph".parse().unwrap(); let model = Model::new(bed_schema, fields)?; Ok(Self { model, info }) diff --git a/oxbow/src/bed/model.rs b/oxbow/src/bed/model.rs index cae24823..8bd3659a 100644 --- a/oxbow/src/bed/model.rs +++ b/oxbow/src/bed/model.rs @@ -14,7 +14,7 @@ use std::sync::Arc; use arrow::datatypes::{Field as ArrowField, Schema, SchemaRef}; -use crate::OxbowError; +use crate::{OxbowError, Select}; use field::Field; /// A data model for BED records. @@ -29,15 +29,16 @@ use field::Field; /// /// ``` /// use oxbow::bed::model::{Model, BedSchema}; +/// use oxbow::Select; /// /// // BED6 with all fields. /// let bed_schema: BedSchema = "bed6".parse().unwrap(); -/// let model = Model::new(bed_schema, None).unwrap(); +/// let model = Model::new(bed_schema, Select::All).unwrap(); /// assert_eq!(model.field_names().len(), 6); /// /// // BED6 projected to 3 fields. /// let bed_schema: BedSchema = "bed6".parse().unwrap(); -/// let model = Model::new(bed_schema, Some(vec!["chrom".into(), "start".into(), "end".into()])).unwrap(); +/// let model = Model::new(bed_schema, Select::Some(vec!["chrom".into(), "start".into(), "end".into()])).unwrap(); /// assert_eq!(model.field_names().len(), 3); /// ``` #[derive(Clone, Debug)] @@ -52,11 +53,12 @@ impl Model { /// /// - `bed_schema`: the parsing interpretation. /// - `fields`: column names to project. `None` → all fields from the schema. 
- pub fn new(bed_schema: BedSchema, fields: Option>) -> crate::Result { + pub fn new(bed_schema: BedSchema, fields: Select) -> crate::Result { let available_names = bed_schema.field_names(); let projected_names = match fields { - None => available_names.clone(), - Some(names) => { + Select::All => available_names.clone(), + Select::Omit => Vec::new(), + Select::Some(names) => { for name in &names { if !available_names.iter().any(|a| a.eq_ignore_ascii_case(name)) { return Err(OxbowError::invalid_input(format!( @@ -146,7 +148,7 @@ impl Model { .cloned() .collect(); - Self::new(self.bed_schema.clone(), Some(projected)) + Self::new(self.bed_schema.clone(), Select::Some(projected)) } } @@ -165,7 +167,7 @@ mod tests { #[test] fn test_bed6_all_fields() { let bed_schema: BedSchema = "bed6".parse().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); assert_eq!(model.field_names().len(), 6); assert_eq!(model.schema().fields().len(), 6); } @@ -175,7 +177,7 @@ mod tests { let bed_schema: BedSchema = "bed6".parse().unwrap(); let model = Model::new( bed_schema, - Some(vec!["chrom".into(), "start".into(), "end".into()]), + Select::Some(vec!["chrom".into(), "start".into(), "end".into()]), ) .unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end"]); @@ -185,14 +187,14 @@ mod tests { #[test] fn test_bed3_plus() { let bed_schema: BedSchema = "bed3+".parse().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end", "rest"]); } #[test] fn test_bedgraph() { let bed_schema = BedSchema::new_bedgraph().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); assert_eq!(model.field_names(), vec!["chrom", "start", "end", "value"]); } @@ -203,7 +205,7 @@ mod tests { FieldDef::new("pValue".into(), FieldType::Float), ]; 
let bed_schema = BedSchema::new(3, Some(defs)).unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); assert_eq!( model.field_names(), vec!["chrom", "start", "end", "signalValue", "pValue"] @@ -213,7 +215,7 @@ mod tests { #[test] fn test_project() { let bed_schema: BedSchema = "bed6+3".parse().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); let projected = model .project(&["chrom".into(), "end".into(), "BED6+1".into()]) .unwrap(); @@ -223,7 +225,7 @@ mod tests { #[test] fn test_project_unknown() { let bed_schema: BedSchema = "bed3".parse().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); let result = model.project(&["nonexistent".into()]); assert!(result.is_err()); } @@ -231,14 +233,15 @@ mod tests { #[test] fn test_invalid_field_name() { let bed_schema: BedSchema = "bed3".parse().unwrap(); - let result = Model::new(bed_schema, Some(vec!["nonexistent".into()])); + let result = Model::new(bed_schema, Select::Some(vec!["nonexistent".into()])); assert!(result.is_err()); } #[test] fn test_bed3_projected_subset() { let bed_schema: BedSchema = "bed3".parse().unwrap(); - let model = Model::new(bed_schema, Some(vec!["chrom".into(), "end".into()])).unwrap(); + let model = + Model::new(bed_schema, Select::Some(vec!["chrom".into(), "end".into()])).unwrap(); assert_eq!(model.field_names(), vec!["chrom", "end"]); } @@ -247,7 +250,7 @@ mod tests { let bed_schema: BedSchema = "bed9".parse().unwrap(); let model = Model::new( bed_schema, - Some(vec!["chrom".into(), "strand".into(), "itemRgb".into()]), + Select::Some(vec!["chrom".into(), "strand".into(), "itemRgb".into()]), ) .unwrap(); assert_eq!(model.field_names(), vec!["chrom", "strand", "itemRgb"]); @@ -261,7 +264,7 @@ mod tests { FieldDef::new("extra2".into(), FieldType::String), ]; let bed_schema = 
BedSchema::new(12, Some(defs)).unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); assert_eq!(model.field_names().len(), 14); let projected = model @@ -278,7 +281,7 @@ mod tests { use arrow::datatypes::DataType; let bed_schema = BedSchema::new_bedgraph().unwrap(); - let model = Model::new(bed_schema, None).unwrap(); + let model = Model::new(bed_schema, Select::All).unwrap(); // Standard fields use BED types (Int64 for positions) assert_eq!(model.schema().field(1).data_type(), &DataType::Int64); // Custom "value" field uses FieldDef type (Float32) diff --git a/oxbow/src/bed/model/batch.rs b/oxbow/src/bed/model/batch.rs index 096d633b..a034d95a 100644 --- a/oxbow/src/bed/model/batch.rs +++ b/oxbow/src/bed/model/batch.rs @@ -24,6 +24,11 @@ pub struct BatchBuilder { } impl BatchBuilder { + /// Creates a new `BatchBuilder` from a [`Model`]. + pub fn from_model(model: &super::Model, capacity: usize) -> crate::Result { + Self::new(model.bed_schema(), Some(model.field_names()), capacity) + } + /// Creates a new `BatchBuilder` for BED records. pub fn new( bed_schema: &BedSchema, diff --git a/oxbow/src/bed/scanner/bed.rs b/oxbow/src/bed/scanner/bed.rs index 4feca10a..8bccd820 100644 --- a/oxbow/src/bed/scanner/bed.rs +++ b/oxbow/src/bed/scanner/bed.rs @@ -10,7 +10,7 @@ use crate::bed::model::BedSchema; use crate::bed::model::Model; use crate::bed::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::OxbowError; +use crate::{OxbowError, Select}; /// A BED scanner. 
/// @@ -27,8 +27,9 @@ use crate::OxbowError; /// let inner = File::open("sample.bed").map(BufReader::new).unwrap(); /// let mut fmt_reader = noodles::bed::io::Reader::new(inner); /// +/// use oxbow::Select; /// let bed_schema = "bed6+3".parse().unwrap(); -/// let scanner = Scanner::new(bed_schema, None).unwrap(); +/// let scanner = Scanner::new(bed_schema, Select::All).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)).unwrap(); /// ``` pub struct Scanner { @@ -40,7 +41,7 @@ impl Scanner { /// /// - `bed_schema`: the parsing interpretation. /// - `fields`: column names to project. `None` → all fields from the schema. - pub fn new(bed_schema: BedSchema, fields: Option>) -> crate::Result { + pub fn new(bed_schema: BedSchema, fields: Select) -> crate::Result { let model = Model::new(bed_schema, fields)?; Ok(Self { model }) } @@ -71,17 +72,11 @@ impl Scanner { columns: Option>, capacity: usize, ) -> crate::Result { - match columns { - None => BatchBuilder::new(self.model.bed_schema(), None, capacity), - Some(cols) => { - let projected = self.model.project(&cols)?; - BatchBuilder::new( - projected.bed_schema(), - Some(projected.field_names()), - capacity, - ) - } - } + let model = match columns { + None => self.model.clone(), + Some(cols) => self.model.project(&cols)?, + }; + BatchBuilder::from_model(&model, capacity) } } diff --git a/oxbow/src/gxf/model.rs b/oxbow/src/gxf/model.rs index ad005108..c8d4c0cc 100644 --- a/oxbow/src/gxf/model.rs +++ b/oxbow/src/gxf/model.rs @@ -8,7 +8,7 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field as ArrowField, Schema, SchemaRef}; -use crate::OxbowError; +use crate::{OxbowError, Select}; use attribute::AttributeDef; use field::{Field, DEFAULT_FIELD_NAMES}; @@ -19,7 +19,8 @@ use field::{Field, DEFAULT_FIELD_NAMES}; /// to materialize. /// /// - `fields` selects which standard GXF fields become Arrow columns. -/// `None` → all 8 standard fields. +/// `All` → all 8 standard fields. 
`Omit` → no fields. `Some(vec)` → +/// specific fields. /// - `attr_defs` controls the attributes struct column independently. /// `None` → no attributes column. `Some(vec![])` → empty struct column. /// `Some(vec![...])` → struct column with the specified sub-fields. @@ -30,15 +31,16 @@ use field::{Field, DEFAULT_FIELD_NAMES}; /// /// ``` /// use oxbow::gxf::model::Model; +/// use oxbow::Select; /// /// // Default: all 8 standard fields, no attributes column. -/// let model = Model::new(None, None).unwrap(); +/// let model = Model::new(Select::All, None).unwrap(); /// assert_eq!(model.field_names().len(), 8); /// assert!(!model.has_attributes()); /// /// // Custom: selected fields with attributes. /// let model = Model::new( -/// Some(vec!["seqid".into(), "start".into(), "end".into()]), +/// Select::Some(vec!["seqid".into(), "start".into(), "end".into()]), /// Some(vec![("gene_id".into(), "String".into())]), /// ).unwrap(); /// assert_eq!(model.field_names(), vec!["seqid", "start", "end"]); @@ -57,16 +59,20 @@ pub struct Model { impl Model { /// Create a new GXF model. /// - /// - `fields`: standard GXF field names. `None` → all 8 standard fields. + /// - `fields`: standard GXF field selection. `All` → all 8 standard + /// fields. `Some(vec)` → specific fields. `Omit` → no fields. /// - `attr_defs`: attribute definitions as `(name, type)` pairs. `None` → /// no attributes column. `Some(vec![])` → attributes column with empty /// struct. 
pub fn new( - fields: Option>, + fields: Select, attr_defs: Option>, ) -> crate::Result { - let field_names = - fields.unwrap_or_else(|| DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect()); + let field_names = match fields { + Select::All => DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect(), + Select::Some(names) => names, + Select::Omit => Vec::new(), + }; let mut parsed_fields = Vec::new(); for name in &field_names { @@ -94,7 +100,7 @@ impl Model { /// Create a model with all 8 default standard fields and no attributes. pub fn default_fields() -> Self { - Self::new(None, None).expect("default fields are always valid") + Self::new(Select::All, None).expect("default fields are always valid") } fn build_schema(fields: &[Field], attr_defs: Option<&[AttributeDef]>) -> SchemaRef { @@ -182,7 +188,7 @@ impl Model { None }; - Self::new(Some(projected_fields), attr_defs) + Self::new(Select::Some(projected_fields), attr_defs) } } @@ -201,7 +207,7 @@ mod tests { #[test] fn test_default_model() { - let model = Model::new(None, None).unwrap(); + let model = Model::new(Select::All, None).unwrap(); assert_eq!(model.field_names().len(), 8); assert!(!model.has_attributes()); assert!(model.attr_defs().is_none()); @@ -211,13 +217,13 @@ mod tests { #[test] fn test_default_fields_constructor() { let model = Model::default_fields(); - assert_eq!(model, Model::new(None, None).unwrap()); + assert_eq!(model, Model::new(Select::All, None).unwrap()); } #[test] fn test_custom_fields_no_attrs() { let model = Model::new( - Some(vec!["seqid".into(), "start".into(), "end".into()]), + Select::Some(vec!["seqid".into(), "start".into(), "end".into()]), None, ) .unwrap(); @@ -229,7 +235,7 @@ mod tests { #[test] fn test_fields_with_attrs() { let model = Model::new( - Some(vec!["seqid".into(), "start".into()]), + Select::Some(vec!["seqid".into(), "start".into()]), Some(vec![("gene_id".into(), "String".into())]), ) .unwrap(); @@ -242,7 +248,7 @@ mod tests { #[test] fn 
test_attrs_empty_defs_is_empty_struct() { - let model = Model::new(Some(vec!["seqid".into()]), Some(vec![])).unwrap(); + let model = Model::new(Select::Some(vec!["seqid".into()]), Some(vec![])).unwrap(); assert!(model.has_attributes()); assert!(model.attr_defs().unwrap().is_empty()); assert_eq!(model.schema().fields().len(), 2); @@ -255,7 +261,7 @@ mod tests { #[test] fn test_no_attrs_when_attr_defs_none() { - let model = Model::new(Some(vec!["seqid".into(), "start".into()]), None).unwrap(); + let model = Model::new(Select::Some(vec!["seqid".into(), "start".into()]), None).unwrap(); assert!(!model.has_attributes()); assert!(model.attr_defs().is_none()); assert_eq!(model.schema().fields().len(), 2); @@ -263,20 +269,23 @@ mod tests { #[test] fn test_invalid_field() { - let result = Model::new(Some(vec!["invalid".into()]), None); + let result = Model::new(Select::Some(vec!["invalid".into()]), None); assert!(result.is_err()); } #[test] fn test_invalid_attr_type() { - let result = Model::new(None, Some(vec![("gene_id".into(), "InvalidType".into())])); + let result = Model::new( + Select::All, + Some(vec![("gene_id".into(), "InvalidType".into())]), + ); assert!(result.is_err()); } #[test] fn test_attr_defs_to_tuple() { let model = Model::new( - None, + Select::All, Some(vec![ ("gene_id".into(), "String".into()), ("tag".into(), "Array".into()), @@ -301,7 +310,7 @@ mod tests { #[test] fn test_project() { let model = Model::new( - Some(vec!["seqid".into(), "start".into(), "end".into()]), + Select::Some(vec!["seqid".into(), "start".into(), "end".into()]), Some(vec![("gene_id".into(), "String".into())]), ) .unwrap(); @@ -314,7 +323,7 @@ mod tests { #[test] fn test_project_with_attrs() { let model = Model::new( - Some(vec!["seqid".into(), "start".into()]), + Select::Some(vec!["seqid".into(), "start".into()]), Some(vec![("gene_id".into(), "String".into())]), ) .unwrap(); diff --git a/oxbow/src/gxf/model/batch.rs b/oxbow/src/gxf/model/batch.rs index 0d957101..994cc9e8 100644 --- 
a/oxbow/src/gxf/model/batch.rs +++ b/oxbow/src/gxf/model/batch.rs @@ -11,6 +11,7 @@ use crate::batch::{Push, RecordBatchBuilder}; use crate::gxf::model::attribute::{AttributeBuilder, AttributeDef, AttributeValue}; use crate::gxf::model::field::Push as _; use crate::gxf::model::field::{Field, FieldBuilder}; +use crate::Select; use super::Model; @@ -26,10 +27,10 @@ pub struct BatchBuilder { impl BatchBuilder { /// Creates a new `BatchBuilder` for GTF/GFF records. /// - /// - `fields`: standard GXF field names. `None` → all 8 standard fields. + /// - `fields`: standard GXF field selection. `All` → all 8 standard fields. /// - `attr_defs`: `None` → no attributes column. `Some(vec![])` → empty struct. pub fn new( - fields: Option>, + fields: Select, attr_defs: Option>, capacity: usize, ) -> crate::Result { @@ -204,7 +205,7 @@ mod tests { #[test] fn test_batch_builder_new() { - let field_names = Some(vec!["seqid".to_string(), "source".to_string()]); + let field_names = Select::Some(vec!["seqid".to_string(), "source".to_string()]); let attr_defs = Some(vec![("gene_id".to_string(), "String".to_string())]); let capacity = 10; @@ -216,7 +217,7 @@ mod tests { #[test] fn test_schema() { - let field_names = Some(vec!["seqid".to_string(), "source".to_string()]); + let field_names = Select::Some(vec!["seqid".to_string(), "source".to_string()]); let attr_defs = Some(vec![("gene_id".to_string(), "String".to_string())]); let capacity = 10; @@ -235,7 +236,7 @@ mod tests { #[test] fn test_push_gff_record() { - let field_names = Some(vec!["seqid".to_string(), "source".to_string()]); + let field_names = Select::Some(vec!["seqid".to_string(), "source".to_string()]); let attr_defs = Some(vec![("gene_id".to_string(), "String".to_string())]); let capacity = 10; @@ -250,7 +251,7 @@ mod tests { #[test] fn test_push_gtf_record() { - let field_names = Some(vec!["seqid".to_string(), "source".to_string()]); + let field_names = Select::Some(vec!["seqid".to_string(), "source".to_string()]); let 
attr_defs = Some(vec![("gene_id".to_string(), "String".to_string())]); let capacity = 10; @@ -265,7 +266,7 @@ mod tests { #[test] fn test_finish() { - let field_names = Some(vec!["seqid".to_string(), "source".to_string()]); + let field_names = Select::Some(vec!["seqid".to_string(), "source".to_string()]); let attr_defs = Some(vec![("gene_id".to_string(), "String".to_string())]); let capacity = 10; diff --git a/oxbow/src/gxf/scanner/gff.rs b/oxbow/src/gxf/scanner/gff.rs index 2c370ff0..ce038afc 100644 --- a/oxbow/src/gxf/scanner/gff.rs +++ b/oxbow/src/gxf/scanner/gff.rs @@ -12,7 +12,7 @@ use crate::gxf::model::BatchBuilder; use crate::gxf::model::Model; use crate::gxf::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::OxbowError; +use crate::{OxbowError, Select}; /// A GFF scanner. /// @@ -24,6 +24,7 @@ use crate::OxbowError; /// /// ```no_run /// use oxbow::gxf::scanner::gff::Scanner; +/// use oxbow::Select; /// use std::fs::File; /// use std::io::BufReader; /// @@ -31,7 +32,7 @@ use crate::OxbowError; /// let mut fmt_reader = noodles::gff::io::Reader::new(inner); /// /// let attr_defs = Scanner::attribute_defs(&mut fmt_reader, Some(1000)).unwrap(); -/// let scanner = Scanner::new(None, None, Some(attr_defs)).unwrap(); +/// let scanner = Scanner::new(None, Select::All, Some(attr_defs)).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -42,11 +43,11 @@ pub struct Scanner { impl Scanner { /// Creates a GFF scanner from schema parameters. /// - /// - `fields`: standard GXF field names. `None` → all 8 standard fields. + /// - `fields`: standard GXF field selection. `All` → all 8 standard fields. /// - `attr_defs`: `None` → no attributes column. `Some(vec![])` → empty struct. 
pub fn new( header: Option, - fields: Option>, + fields: Select, attr_defs: Option>, ) -> crate::Result { let model = Model::new(fields, attr_defs)?; diff --git a/oxbow/src/gxf/scanner/gtf.rs b/oxbow/src/gxf/scanner/gtf.rs index 87fd2e89..9c54fc13 100644 --- a/oxbow/src/gxf/scanner/gtf.rs +++ b/oxbow/src/gxf/scanner/gtf.rs @@ -12,7 +12,7 @@ use crate::gxf::model::BatchBuilder; use crate::gxf::model::Model; use crate::gxf::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; -use crate::OxbowError; +use crate::{OxbowError, Select}; /// A GTF scanner. /// @@ -24,6 +24,7 @@ use crate::OxbowError; /// /// ```no_run /// use oxbow::gxf::scanner::gtf::Scanner; +/// use oxbow::Select; /// use std::fs::File; /// use std::io::BufReader; /// @@ -31,7 +32,7 @@ use crate::OxbowError; /// let mut fmt_reader = noodles::gtf::io::Reader::new(inner); /// /// let attr_defs = Scanner::attribute_defs(&mut fmt_reader, Some(1000)).unwrap(); -/// let scanner = Scanner::new(None, None, Some(attr_defs)).unwrap(); +/// let scanner = Scanner::new(None, Select::All, Some(attr_defs)).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -42,11 +43,11 @@ pub struct Scanner { impl Scanner { /// Creates a GTF scanner from schema parameters. /// - /// - `fields`: standard GXF field names. `None` → all 8 standard fields. + /// - `fields`: standard GXF field selection. `All` → all 8 standard fields. /// - `attr_defs`: `None` → no attributes column. `Some(vec![])` → empty struct. 
pub fn new( header: Option, - fields: Option<Vec<String>>, + fields: Select, attr_defs: Option<Vec<(String, String)>>, ) -> crate::Result<Self> { let model = Model::new(fields, attr_defs)?; diff --git a/oxbow/src/lib.rs b/oxbow/src/lib.rs index 9b63d460..b1979e86 100644 --- a/oxbow/src/lib.rs +++ b/oxbow/src/lib.rs @@ -70,3 +70,13 @@ pub mod util; pub mod variant; pub use error::{OxbowError, Result}; + +#[derive(Debug, Clone)] +pub enum Select { + /// Select specific items explicitly + Some(Vec<String>), + /// Omit (explicitly empty) + Omit, + /// Select all items (wildcard) + All, +} diff --git a/oxbow/src/sequence/model.rs b/oxbow/src/sequence/model.rs index 83297908..0192df54 100644 --- a/oxbow/src/sequence/model.rs +++ b/oxbow/src/sequence/model.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use arrow::datatypes::{Field as ArrowField, Schema, SchemaRef}; -use crate::OxbowError; +use crate::{OxbowError, Select}; use field::{Field, FASTA_DEFAULT_FIELD_NAMES, FASTQ_DEFAULT_FIELD_NAMES}; /// A data model for sequence records (FASTA/FASTQ). @@ -16,23 +16,25 @@ use field::{Field, FASTA_DEFAULT_FIELD_NAMES, FASTQ_DEFAULT_FIELD_NAMES}; /// which fields to include. /// /// - `fields` selects which fields become Arrow columns. -/// `None` → format-specific defaults (3 for FASTA, 4 for FASTQ). +/// `All` → format-specific defaults (3 for FASTA, 4 for FASTQ). +/// `Omit` → no fields. `Some(vec)` → specific fields. /// /// # Examples /// /// ``` /// use oxbow::sequence::model::Model; +/// use oxbow::Select; /// /// // FASTA defaults: name, description, sequence. -/// let model = Model::new_fasta(None).unwrap(); +/// let model = Model::new_fasta(Select::All).unwrap(); /// assert_eq!(model.field_names().len(), 3); /// /// // FASTQ defaults: name, description, sequence, quality. -/// let model = Model::new_fastq(None).unwrap(); +/// let model = Model::new_fastq(Select::All).unwrap(); /// assert_eq!(model.field_names().len(), 4); /// /// // Custom field selection.
-/// let model = Model::new_fastq(Some(vec!["name".into(), "sequence".into()])).unwrap(); +/// let model = Model::new_fastq(Select::Some(vec!["name".into(), "sequence".into()])).unwrap(); /// assert_eq!(model.field_names(), vec!["name", "sequence"]); /// ``` #[derive(Clone, Debug)] @@ -44,24 +46,40 @@ pub struct Model { impl Model { /// Create a new FASTA model. /// - /// `fields`: field names. `None` → `["name", "description", "sequence"]`. - pub fn new_fasta(fields: Option>) -> crate::Result { - let defaults = FASTA_DEFAULT_FIELD_NAMES - .iter() - .map(|&s| s.to_string()) - .collect(); - Self::new(fields.unwrap_or(defaults)) + /// `fields`: `All` → `["name", "description", "sequence"]`. `Omit` → no + /// fields. `Some(vec)` → specific fields. + pub fn new_fasta(fields: Select) -> crate::Result { + let defaults = || { + FASTA_DEFAULT_FIELD_NAMES + .iter() + .map(|&s| s.to_string()) + .collect() + }; + let field_names = match fields { + Select::All => defaults(), + Select::Some(names) => names, + Select::Omit => Vec::new(), + }; + Self::new(field_names) } /// Create a new FASTQ model. /// - /// `fields`: field names. `None` → `["name", "description", "sequence", "quality"]`. - pub fn new_fastq(fields: Option>) -> crate::Result { - let defaults = FASTQ_DEFAULT_FIELD_NAMES - .iter() - .map(|&s| s.to_string()) - .collect(); - Self::new(fields.unwrap_or(defaults)) + /// `fields`: `All` → `["name", "description", "sequence", "quality"]`. + /// `Omit` → no fields. `Some(vec)` → specific fields. 
+ pub fn new_fastq(fields: Select) -> crate::Result { + let defaults = || { + FASTQ_DEFAULT_FIELD_NAMES + .iter() + .map(|&s| s.to_string()) + .collect() + }; + let field_names = match fields { + Select::All => defaults(), + Select::Some(names) => names, + Select::Omit => Vec::new(), + }; + Self::new(field_names) } fn new(field_names: Vec) -> crate::Result { @@ -149,14 +167,14 @@ mod tests { #[test] fn test_fasta_defaults() { - let model = Model::new_fasta(None).unwrap(); + let model = Model::new_fasta(Select::All).unwrap(); assert_eq!(model.field_names(), vec!["name", "description", "sequence"]); assert_eq!(model.schema().fields().len(), 3); } #[test] fn test_fastq_defaults() { - let model = Model::new_fastq(None).unwrap(); + let model = Model::new_fastq(Select::All).unwrap(); assert_eq!( model.field_names(), vec!["name", "description", "sequence", "quality"] @@ -166,27 +184,27 @@ mod tests { #[test] fn test_custom_fields() { - let model = Model::new_fastq(Some(vec!["name".into(), "sequence".into()])).unwrap(); + let model = Model::new_fastq(Select::Some(vec!["name".into(), "sequence".into()])).unwrap(); assert_eq!(model.field_names(), vec!["name", "sequence"]); assert_eq!(model.schema().fields().len(), 2); } #[test] fn test_invalid_field() { - let result = Model::new_fasta(Some(vec!["invalid".into()])); + let result = Model::new_fasta(Select::Some(vec!["invalid".into()])); assert!(result.is_err()); } #[test] fn test_project() { - let model = Model::new_fastq(None).unwrap(); + let model = Model::new_fastq(Select::All).unwrap(); let projected = model.project(&["name".into(), "quality".into()]).unwrap(); assert_eq!(projected.field_names(), vec!["name", "quality"]); } #[test] fn test_project_unknown() { - let model = Model::new_fasta(None).unwrap(); + let model = Model::new_fasta(Select::All).unwrap(); let result = model.project(&["nonexistent".into()]); assert!(result.is_err()); } diff --git a/oxbow/src/sequence/model/batch.rs b/oxbow/src/sequence/model/batch.rs 
index fe5addd9..2f384a0d 100644 --- a/oxbow/src/sequence/model/batch.rs +++ b/oxbow/src/sequence/model/batch.rs @@ -5,6 +5,7 @@ use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use indexmap::IndexMap; use crate::batch::{Push, RecordBatchBuilder}; +use crate::Select; use super::field::Push as _; use super::field::{Field, FieldBuilder}; @@ -19,13 +20,13 @@ pub struct BatchBuilder { impl BatchBuilder { /// Creates a new `BatchBuilder` for FASTQ records. - pub fn new_fastq(fields: Option>, capacity: usize) -> crate::Result { + pub fn new_fastq(fields: Select, capacity: usize) -> crate::Result { let model = Model::new_fastq(fields)?; Self::from_model(&model, capacity) } /// Creates a new `BatchBuilder` for FASTA records. - pub fn new_fasta(fields: Option>, capacity: usize) -> crate::Result { + pub fn new_fasta(fields: Select, capacity: usize) -> crate::Result { let model = Model::new_fasta(fields)?; Self::from_model(&model, capacity) } @@ -100,19 +101,19 @@ mod tests { #[test] fn test_new_fastq_with_default_fields() { - let batch_builder = BatchBuilder::new_fastq(None, 10).unwrap(); + let batch_builder = BatchBuilder::new_fastq(Select::All, 10).unwrap(); assert_eq!(batch_builder.schema().fields().len(), 4); } #[test] fn test_new_fasta_with_default_fields() { - let batch_builder = BatchBuilder::new_fasta(None, 10).unwrap(); + let batch_builder = BatchBuilder::new_fasta(Select::All, 10).unwrap(); assert_eq!(batch_builder.schema().fields().len(), 3); } #[test] fn test_schema() { - let batch_builder = BatchBuilder::new_fastq(None, 10).unwrap(); + let batch_builder = BatchBuilder::new_fastq(Select::All, 10).unwrap(); let schema = batch_builder.schema(); assert_eq!(schema.fields().len(), 4); assert_eq!(schema.field(0).name(), "name"); @@ -122,7 +123,7 @@ mod tests { #[test] fn test_push_fasta_record() { let capacity = 10; - let mut batch_builder = BatchBuilder::new_fasta(None, capacity).unwrap(); + let mut batch_builder = BatchBuilder::new_fasta(Select::All, 
capacity).unwrap(); let record = noodles::fasta::Record::new( noodles::fasta::record::Definition::new(b"s0", Some(b"description".into())), @@ -137,7 +138,7 @@ mod tests { #[test] fn test_push_fastq_record() { let capacity = 10; - let mut batch_builder = BatchBuilder::new_fastq(None, capacity).unwrap(); + let mut batch_builder = BatchBuilder::new_fastq(Select::All, capacity).unwrap(); let record = noodles::fastq::Record::new( noodles::fastq::record::Definition::new(b"s0", b""), @@ -153,7 +154,7 @@ mod tests { #[test] fn test_finish_empty_batch() { let capacity = 10; - let mut batch_builder = BatchBuilder::new_fastq(None, capacity).unwrap(); + let mut batch_builder = BatchBuilder::new_fastq(Select::All, capacity).unwrap(); let record_batch = batch_builder.finish().unwrap(); assert_eq!(record_batch.num_rows(), 0); diff --git a/oxbow/src/sequence/scanner/fasta.rs b/oxbow/src/sequence/scanner/fasta.rs index 674e5838..b913d55f 100644 --- a/oxbow/src/sequence/scanner/fasta.rs +++ b/oxbow/src/sequence/scanner/fasta.rs @@ -7,6 +7,7 @@ use noodles::core::Region; use crate::sequence::model::BatchBuilder; use crate::sequence::model::Model; use crate::sequence::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; +use crate::Select; /// A FASTA scanner. 
/// @@ -17,6 +18,7 @@ use crate::sequence::scanner::batch_iterator::{BatchIterator, QueryBatchIterator /// /// ```no_run /// use oxbow::sequence::scanner::fasta::Scanner; +/// use oxbow::Select; /// use std::fs::File; /// use std::io::BufReader; /// use noodles::core::Region; @@ -25,7 +27,7 @@ use crate::sequence::scanner::batch_iterator::{BatchIterator, QueryBatchIterator /// let fmt_reader = noodles::fasta::io::Reader::new(inner); /// let index = noodles::fasta::fai::fs::read("sample.fa.fai").unwrap(); /// -/// let scanner = Scanner::new(None).unwrap(); +/// let scanner = Scanner::new(Select::All).unwrap(); /// let regions = vec!["chr1:1-1000", "chr1:1001-2000"]; /// let regions: Vec = regions.iter().map(|s| s.parse().unwrap()).collect(); /// let batches = scanner.scan_query(fmt_reader, regions, index, None, Some(2)); @@ -37,8 +39,8 @@ pub struct Scanner { impl Scanner { /// Creates a FASTA scanner from schema parameters. /// - /// `fields`: field names. `None` → `["name", "description", "sequence"]`. - pub fn new(fields: Option>) -> crate::Result { + /// `fields`: `All` → `["name", "description", "sequence"]`. 
+ pub fn new(fields: Select) -> crate::Result { let model = Model::new_fasta(fields)?; Ok(Self { model }) } @@ -125,7 +127,7 @@ mod tests { #[test] fn test_scanner_default() { - let scanner = Scanner::new(None).unwrap(); + let scanner = Scanner::new(Select::All).unwrap(); assert_eq!( scanner.field_names(), vec!["name", "description", "sequence"] @@ -134,9 +136,13 @@ mod tests { #[test] fn test_scanner_schema() { - let scanner = Scanner::new(None).unwrap(); + let scanner = Scanner::new(Select::All).unwrap(); assert_eq!(scanner.schema().fields().len(), 3); - let scanner = Scanner::new(Some(vec!["name".to_string(), "sequence".to_string()])).unwrap(); + let scanner = Scanner::new(Select::Some(vec![ + "name".to_string(), + "sequence".to_string(), + ])) + .unwrap(); assert_eq!(scanner.schema().fields().len(), 2); } @@ -147,7 +153,7 @@ mod tests { let reader = BufReader::new(file); let fmt_reader = noodles::fasta::io::Reader::new(reader); - let scanner = Scanner::new(None).unwrap(); + let scanner = Scanner::new(Select::All).unwrap(); let mut batch_iter = scanner.scan(fmt_reader, None, Some(2), Some(10)).unwrap(); let batch = batch_iter.next().unwrap().unwrap(); @@ -169,7 +175,7 @@ mod tests { fai::Record::new("seq3", 12, 24, 13, 13), ]); - let scanner = Scanner::new(None).unwrap(); + let scanner = Scanner::new(Select::All).unwrap(); let regions = ["seq1:1-4", "seq2:1-4", "seq3:1-4"]; let regions: Vec = regions.iter().map(|s| s.parse().unwrap()).collect(); let mut batch_iter = scanner diff --git a/oxbow/src/sequence/scanner/fastq.rs b/oxbow/src/sequence/scanner/fastq.rs index 3e1c8264..6fe03a9d 100644 --- a/oxbow/src/sequence/scanner/fastq.rs +++ b/oxbow/src/sequence/scanner/fastq.rs @@ -7,6 +7,7 @@ use crate::sequence::model::BatchBuilder; use crate::sequence::model::Model; use crate::sequence::scanner::batch_iterator::BatchIterator; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; +use crate::Select; use noodles::bgzf::VirtualPosition; /// A FASTQ scanner. 
@@ -18,13 +19,14 @@ use noodles::bgzf::VirtualPosition; /// /// ```no_run /// use oxbow::sequence::scanner::fastq::Scanner; +/// use oxbow::Select; /// use std::fs::File; /// use std::io::BufReader; /// /// let inner = File::open("sample.R1.fastq").map(BufReader::new).unwrap(); /// let fmt_reader = noodles::fastq::io::Reader::new(inner); /// -/// let scanner = Scanner::new(None).unwrap(); +/// let scanner = Scanner::new(Select::All).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -34,8 +36,8 @@ pub struct Scanner { impl Scanner { /// Creates a FASTQ scanner from schema parameters. /// - /// `fields`: field names. `None` → `["name", "description", "sequence", "quality"]`. - pub fn new(fields: Option>) -> crate::Result { + /// `fields`: `All` → `["name", "description", "sequence", "quality"]`. + pub fn new(fields: Select) -> crate::Result { let model = Model::new_fastq(fields)?; Ok(Self { model }) } @@ -136,7 +138,7 @@ mod tests { #[test] fn test_scanner_default() { - let scanner = Scanner::new(None).unwrap(); + let scanner = Scanner::new(Select::All).unwrap(); assert_eq!( scanner.field_names(), vec!["name", "description", "sequence", "quality"] @@ -145,9 +147,13 @@ mod tests { #[test] fn test_scanner_schema() { - let scanner = Scanner::new(None).unwrap(); + let scanner = Scanner::new(Select::All).unwrap(); assert_eq!(scanner.schema().fields().len(), 4); - let scanner = Scanner::new(Some(vec!["name".to_string(), "quality".to_string()])).unwrap(); + let scanner = Scanner::new(Select::Some(vec![ + "name".to_string(), + "quality".to_string(), + ])) + .unwrap(); assert_eq!(scanner.schema().fields().len(), 2); } @@ -158,7 +164,7 @@ mod tests { let file = std::io::Cursor::new(data); let fmt_reader = noodles::fastq::io::Reader::new(file); - let scanner = Scanner::new(None).unwrap(); + let scanner = Scanner::new(Select::All).unwrap(); let mut batch_iter = scanner.scan(fmt_reader, None, Some(2), None).unwrap(); let 
batch = batch_iter.next().unwrap().unwrap(); @@ -175,7 +181,7 @@ mod tests { let file = std::io::Cursor::new(data); let fmt_reader = noodles::fastq::io::Reader::new(file); - let scanner = Scanner::new(None).unwrap(); + let scanner = Scanner::new(Select::All).unwrap(); let mut batch_iter = scanner.scan(fmt_reader, None, Some(3), Some(2)).unwrap(); let batch = batch_iter.next().unwrap().unwrap(); diff --git a/oxbow/src/variant/model.rs b/oxbow/src/variant/model.rs index 73b80f6a..97511039 100644 --- a/oxbow/src/variant/model.rs +++ b/oxbow/src/variant/model.rs @@ -9,7 +9,7 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field as ArrowField, Schema, SchemaRef}; -use crate::OxbowError; +use crate::{OxbowError, Select}; use field::{Field, DEFAULT_FIELD_NAMES}; use genotype::GenotypeDef; use info::InfoDef; @@ -27,9 +27,9 @@ use info::InfoDef; /// - `genotype_defs` + `samples` control per-sample/per-field genotype columns. /// Both must be `Some` (and non-empty) to produce genotype columns. /// - `genotype_by` controls the layout: `Sample` (default) or `Field`. -/// - `unnest_samples` controls whether genotype columns are top-level -/// (`true`, default) or wrapped in a single `"samples"` struct column -/// (`false`). +/// - `samples_nested` controls whether genotype columns are wrapped in a +/// single `"samples"` struct column (`true`) or are top-level (`false`, +/// default). /// /// The model can produce an Arrow schema independently of any file header. /// Use `from_header()` to derive definitions from a VCF header. @@ -38,34 +38,37 @@ pub struct Model { fields: Vec, info_defs: Option>, genotype_defs: Option>, - samples: Option>, genotype_by: GenotypeBy, - unnest_samples: bool, + samples: Option>, + samples_nested: bool, schema: SchemaRef, } impl Model { /// Create a new variant model from validated definitions. /// - /// - `fields`: standard VCF field names. `None` → all 7 standard fields. + /// - `fields`: standard VCF field selection. 
`All` → all 7 standard fields. /// - `info_defs`: validated INFO definitions. `None` → no info column. /// - `genotype_defs`: validated FORMAT definitions. `None` → no genotype columns. /// - `samples`: sample names. `None` → no genotype columns. /// - `genotype_by`: layout mode. Defaults to `GenotypeBy::Sample`. - /// - `unnest_samples`: if `true` (default), genotype columns are - /// top-level. If `false`, they are wrapped in a single `"samples"` - /// struct column. + /// - `samples_nested`: if `true`, genotype columns are wrapped in a + /// single `"samples"` struct column. If `false` (default), they are + /// top-level. #[allow(clippy::too_many_arguments)] pub fn new( - fields: Option>, + fields: Select, info_defs: Option>, genotype_defs: Option>, - samples: Option>, genotype_by: Option, - unnest_samples: Option, + samples: Option>, + samples_nested: Option, ) -> crate::Result { - let field_names = - fields.unwrap_or_else(|| DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect()); + let field_names = match fields { + Select::All => DEFAULT_FIELD_NAMES.iter().map(|&s| s.to_string()).collect(), + Select::Some(names) => names, + Select::Omit => Vec::new(), + }; let mut parsed_fields = Vec::new(); for name in &field_names { @@ -76,7 +79,7 @@ impl Model { } let genotype_by = genotype_by.unwrap_or(GenotypeBy::Sample); - let unnest_samples = unnest_samples.unwrap_or(true); + let samples_nested = samples_nested.unwrap_or(false); let schema = Self::build_schema( &parsed_fields, @@ -84,93 +87,113 @@ impl Model { genotype_defs.as_deref(), samples.as_deref(), &genotype_by, - unnest_samples, + samples_nested, ); Ok(Self { fields: parsed_fields, info_defs, genotype_defs, - samples, genotype_by, - unnest_samples, + samples, + samples_nested, schema, }) } /// Create a model by deriving INFO and FORMAT definitions from a VCF header. /// - /// Explicit name lists filter which definitions are included. - /// `None` → include all definitions from the header. 
+ /// - `fields`: standard VCF field selection. `All` → all 7 standard fields. + /// - `info_field_names`: `All` → all INFO from header. `Some(vec)` → filter + /// by name. `Omit` → no info column. + /// - `genotype_field_names`: `All` → all FORMAT from header. `Some(vec)` → + /// filter by name. `Omit` → no genotype columns. + /// - `samples`: `All` → all samples from header. `Some(vec)` → filter by + /// name. `Omit` → no sample columns. #[allow(clippy::too_many_arguments)] pub fn from_header( header: &noodles::vcf::Header, - fields: Option>, - info_field_names: Option>, - genotype_field_names: Option>, - samples: Option>, + fields: Select, + info_field_names: Select, + genotype_field_names: Select, genotype_by: Option, - unnest_samples: Option, + samples: Select, + samples_nested: Option, ) -> crate::Result { // Derive info defs from header - let info_names: Vec = info_field_names.unwrap_or_else(|| { - header - .infos() - .iter() - .map(|(name, _)| name.to_string()) - .collect() - }); - let info_defs: Vec = info_names - .into_iter() - .filter_map(|name| { - let info = header.infos().get(&name)?; - Some(InfoDef::new(name, &info.number(), &info.ty())) - }) - .collect(); + // Omit → no info column. All/Some → info column present (even if empty struct). 
+ let info_defs = match info_field_names { + Select::Omit => None, + sel => { + let names: Vec = match sel { + Select::All => header + .infos() + .iter() + .map(|(name, _)| name.to_string()) + .collect(), + Select::Some(names) => names, + Select::Omit => unreachable!(), + }; + let defs: Vec = names + .into_iter() + .filter_map(|name| { + let info = header.infos().get(&name)?; + Some(InfoDef::new(name, &info.number(), &info.ty())) + }) + .collect(); + Some(defs) + } + }; // Derive genotype defs from header - let gt_names: Vec = genotype_field_names.unwrap_or_else(|| { - header - .formats() - .iter() - .map(|(name, _)| name.to_string()) - .collect() - }); - let genotype_defs: Vec = gt_names - .into_iter() - .filter_map(|name| { - let format = header.formats().get(&name)?; - Some(GenotypeDef::new(name, &format.number(), &format.ty())) - }) - .collect(); + // Omit → deactivate genotype output. All/Some → genotype active (even if empty). + let genotype_defs = match genotype_field_names { + Select::Omit => None, + sel => { + let names: Vec = match sel { + Select::All => header + .formats() + .iter() + .map(|(name, _)| name.to_string()) + .collect(), + Select::Some(names) => names, + Select::Omit => unreachable!(), + }; + let defs: Vec = names + .into_iter() + .filter_map(|name| { + let format = header.formats().get(&name)?; + Some(GenotypeDef::new(name, &format.number(), &format.ty())) + }) + .collect(); + Some(defs) + } + }; // Derive sample names from header - let samples = samples.unwrap_or_else(|| header.sample_names().iter().cloned().collect()); - - // Wrap in Option: empty → None for info/genotype, Some for samples - let info_defs = if info_defs.is_empty() { - None - } else { - Some(info_defs) - }; - let genotype_defs = if genotype_defs.is_empty() { - None - } else { - Some(genotype_defs) - }; - let samples = if samples.is_empty() { - None - } else { - Some(samples) + // Omit → no sample output. All (with samples in header) / Some → sample output active. 
+ // Select::All with no samples in header → None (nothing to include). + // Select::Some([]) → Some(vec![]) (column present, empty). + let samples = match samples { + Select::Omit => None, + Select::All => { + let s: Vec = header.sample_names().iter().cloned().collect(); + if s.is_empty() { + None + } else { + Some(s) + } + } + Select::Some(names) => Some(names), }; Self::new( fields, info_defs, genotype_defs, - samples, genotype_by, - unnest_samples, + samples, + samples_nested, ) } @@ -180,7 +203,7 @@ impl Model { genotype_defs: Option<&[GenotypeDef]>, samples: Option<&[String]>, genotype_by: &GenotypeBy, - unnest_samples: bool, + samples_nested: bool, ) -> SchemaRef { let mut arrow_fields: Vec = fields.iter().map(|f| f.get_arrow_field()).collect(); @@ -195,10 +218,9 @@ impl Model { )); } - // Genotype columns (require both samples and genotype_defs) - let samples = samples.unwrap_or(&[]); - let gt_defs = genotype_defs.unwrap_or(&[]); - if !samples.is_empty() && !gt_defs.is_empty() { + // Genotype columns: both samples and genotype_defs must be Some to activate. + // Some([]) produces empty struct content; None deactivates entirely. + if let (Some(samples), Some(gt_defs)) = (samples, genotype_defs) { let genotype_columns: Vec = match genotype_by { GenotypeBy::Sample => samples .iter() @@ -230,7 +252,7 @@ impl Model { .collect(), }; - if unnest_samples { + if !samples_nested { arrow_fields.extend(genotype_columns); } else { arrow_fields.push(ArrowField::new( @@ -279,10 +301,10 @@ impl Model { &self.genotype_by } - /// Whether genotype columns are unnested (top-level) or wrapped in - /// a single `"samples"` struct column. - pub fn unnest_samples(&self) -> bool { - self.unnest_samples + /// Whether genotype columns are nested (wrapped in a `"samples"` struct + /// column) or top-level. + pub fn samples_nested(&self) -> bool { + self.samples_nested } /// The Arrow schema for this model. 
@@ -333,7 +355,7 @@ impl Model { }; // Genotype columns - let (genotype_defs, samples) = if !self.unnest_samples { + let (genotype_defs, samples) = if self.samples_nested { // Nested mode: "samples" is an atomic top-level column if columns.iter().any(|c| c.eq_ignore_ascii_case("samples")) { (self.genotype_defs.clone(), self.samples.clone()) @@ -373,12 +395,12 @@ impl Model { }; Self::new( - Some(projected_fields), + Select::Some(projected_fields), info_defs, genotype_defs, - samples, Some(self.genotype_by.clone()), - Some(self.unnest_samples), + samples, + Some(self.samples_nested), ) } } @@ -390,7 +412,7 @@ impl PartialEq for Model { && self.genotype_defs == other.genotype_defs && self.samples == other.samples && self.genotype_by == other.genotype_by - && self.unnest_samples == other.unnest_samples + && self.samples_nested == other.samples_nested } } @@ -414,7 +436,7 @@ mod tests { #[test] fn test_default_model() { - let model = Model::new(None, None, None, None, None, None).unwrap(); + let model = Model::new(Select::All, None, None, None, None, None).unwrap(); assert_eq!(model.field_names().len(), 7); assert!(!model.has_info()); assert!(model.genotype_defs().is_none()); @@ -425,7 +447,16 @@ mod tests { #[test] fn test_from_header() { let header = create_test_header(); - let model = Model::from_header(&header, None, None, None, None, None, None).unwrap(); + let model = Model::from_header( + &header, + Select::All, + Select::All, + Select::All, + None, + Select::All, + None, + ) + .unwrap(); assert_eq!(model.field_names().len(), 7); assert!(model.has_info()); assert_eq!(model.info_defs().unwrap().len(), 1); @@ -440,11 +471,11 @@ mod tests { let header = create_test_header(); let model = Model::from_header( &header, - Some(vec!["chrom".into(), "pos".into()]), - Some(vec!["DP".into()]), - Some(vec!["GT".into()]), - Some(vec!["sample1".into()]), + Select::Some(vec!["chrom".into(), "pos".into()]), + Select::Some(vec!["DP".into()]), + Select::Some(vec!["GT".into()]), 
None, + Select::Some(vec!["sample1".into()]), None, ) .unwrap(); @@ -456,10 +487,30 @@ mod tests { assert_eq!(model.schema().fields().len(), 4); } + #[test] + fn test_from_header_omit() { + let header = create_test_header(); + let model = Model::from_header( + &header, + Select::All, + Select::Omit, + Select::Omit, + None, + Select::Omit, + None, + ) + .unwrap(); + assert_eq!(model.field_names().len(), 7); + assert!(!model.has_info()); + assert!(model.genotype_defs().is_none()); + assert!(model.samples().is_none()); + assert_eq!(model.schema().fields().len(), 7); + } + #[test] fn test_no_info_no_genotype() { let model = Model::new( - Some(vec!["chrom".into(), "pos".into()]), + Select::Some(vec!["chrom".into(), "pos".into()]), None, None, None, @@ -475,7 +526,16 @@ mod tests { #[test] fn test_project_drops_info() { let header = create_test_header(); - let model = Model::from_header(&header, None, None, None, None, None, None).unwrap(); + let model = Model::from_header( + &header, + Select::All, + Select::All, + Select::All, + None, + Select::All, + None, + ) + .unwrap(); let projected = model.project(&["chrom".into(), "pos".into()]).unwrap(); assert_eq!(projected.field_names(), vec!["chrom", "pos"]); assert!(!projected.has_info()); @@ -485,7 +545,16 @@ mod tests { #[test] fn test_project_keeps_info() { let header = create_test_header(); - let model = Model::from_header(&header, None, None, None, None, None, None).unwrap(); + let model = Model::from_header( + &header, + Select::All, + Select::All, + Select::All, + None, + Select::All, + None, + ) + .unwrap(); let projected = model.project(&["chrom".into(), "info".into()]).unwrap(); assert_eq!(projected.field_names(), vec!["chrom"]); assert!(projected.has_info()); @@ -494,7 +563,16 @@ mod tests { #[test] fn test_project_samples() { let header = create_test_header(); - let model = Model::from_header(&header, None, None, None, None, None, None).unwrap(); + let model = Model::from_header( + &header, + Select::All, + 
Select::All, + Select::All, + None, + Select::All, + None, + ) + .unwrap(); let projected = model.project(&["chrom".into(), "sample1".into()]).unwrap(); assert_eq!(projected.samples().unwrap(), &["sample1"]); assert!(projected.genotype_defs().is_some()); @@ -502,15 +580,31 @@ mod tests { #[test] fn test_invalid_field() { - let result = Model::new(Some(vec!["invalid".into()]), None, None, None, None, None); + let result = Model::new( + Select::Some(vec!["invalid".into()]), + None, + None, + None, + None, + None, + ); assert!(result.is_err()); } #[test] fn test_nested_samples() { let header = create_test_header(); - let model = Model::from_header(&header, None, None, None, None, None, Some(false)).unwrap(); - assert!(!model.unnest_samples()); + let model = Model::from_header( + &header, + Select::All, + Select::All, + Select::All, + None, + Select::All, + Some(true), + ) + .unwrap(); + assert!(model.samples_nested()); // 7 fields + info + 1 "samples" struct assert_eq!(model.schema().fields().len(), 9); let samples_field = model.schema().field_with_name("samples").unwrap(); @@ -527,7 +621,16 @@ mod tests { #[test] fn test_nested_samples_projection() { let header = create_test_header(); - let model = Model::from_header(&header, None, None, None, None, None, Some(false)).unwrap(); + let model = Model::from_header( + &header, + Select::All, + Select::All, + Select::All, + None, + Select::All, + Some(true), + ) + .unwrap(); // "samples" is an atomic column in nested mode let projected = model.project(&["chrom".into(), "samples".into()]).unwrap(); assert_eq!(projected.field_names(), vec!["chrom"]); @@ -539,7 +642,16 @@ mod tests { #[test] fn test_nested_samples_excluded() { let header = create_test_header(); - let model = Model::from_header(&header, None, None, None, None, None, Some(false)).unwrap(); + let model = Model::from_header( + &header, + Select::All, + Select::All, + Select::All, + None, + Select::All, + Some(true), + ) + .unwrap(); let projected = 
model.project(&["chrom".into(), "info".into()]).unwrap(); assert!(projected.samples().is_none()); assert!(projected.genotype_defs().is_none()); @@ -547,12 +659,180 @@ mod tests { } #[test] - fn test_unnest_samples_default() { + fn test_samples_nested_default() { let header = create_test_header(); - // Default (unnest_samples = true) produces top-level sample columns - let model = Model::from_header(&header, None, None, None, None, None, None).unwrap(); - assert!(model.unnest_samples()); + // Default (samples_nested = false) produces top-level sample columns + let model = Model::from_header( + &header, + Select::All, + Select::All, + Select::All, + None, + Select::All, + None, + ) + .unwrap(); + assert!(!model.samples_nested()); // 7 fields + info + 2 sample columns assert_eq!(model.schema().fields().len(), 10); } + + // --- Empty-select (Some([])) edge cases --- + + #[test] + fn test_info_empty_select_produces_empty_struct_column() { + let header = create_test_header(); + let model = Model::from_header( + &header, + Select::All, + Select::Some(vec![]), // info present, empty + Select::Omit, + None, + Select::Omit, + None, + ) + .unwrap(); + assert!(model.has_info()); + assert_eq!(model.info_defs().unwrap().len(), 0); + // 7 fields + empty info struct + assert_eq!(model.schema().fields().len(), 8); + let info_field = model.schema().field_with_name("info").unwrap(); + match info_field.data_type() { + DataType::Struct(fields) => assert!(fields.is_empty()), + other => panic!("Expected empty Struct, got {:?}", other), + } + } + + #[test] + fn test_info_omit_produces_no_column() { + let header = create_test_header(); + let model = Model::from_header( + &header, + Select::All, + Select::Omit, + Select::Omit, + None, + Select::Omit, + None, + ) + .unwrap(); + assert!(!model.has_info()); + assert_eq!(model.schema().fields().len(), 7); + } + + #[test] + fn test_genotype_empty_select_by_sample_produces_empty_struct_columns() { + let header = create_test_header(); + let model 
= Model::from_header( + &header, + Select::All, + Select::Omit, + Select::Some(vec![]), // genotype active, no fields + Some(GenotypeBy::Sample), + Select::Some(vec!["sample1".into()]), + None, + ) + .unwrap(); + assert!(model.genotype_defs().is_some()); + assert_eq!(model.genotype_defs().unwrap().len(), 0); + // 7 fields + 1 sample column (empty struct) + assert_eq!(model.schema().fields().len(), 8); + let sample_field = model.schema().field_with_name("sample1").unwrap(); + match sample_field.data_type() { + DataType::Struct(fields) => assert!(fields.is_empty()), + other => panic!("Expected empty Struct, got {:?}", other), + } + } + + #[test] + fn test_genotype_empty_select_by_field_produces_no_columns() { + // by_field: columns are keyed by FORMAT field name; no fields → no columns + let header = create_test_header(); + let model = Model::from_header( + &header, + Select::All, + Select::Omit, + Select::Some(vec![]), // genotype active, no fields + Some(GenotypeBy::Field), + Select::Some(vec!["sample1".into()]), + None, + ) + .unwrap(); + assert_eq!(model.schema().fields().len(), 7); + } + + #[test] + fn test_genotype_omit_deactivates_output_despite_samples() { + let header = create_test_header(); + let model = Model::from_header( + &header, + Select::All, + Select::Omit, + Select::Omit, // genotype deactivated + None, + Select::Some(vec!["sample1".into()]), + Some(true), + ) + .unwrap(); + assert!(model.genotype_defs().is_none()); + assert_eq!(model.schema().fields().len(), 7); + } + + #[test] + fn test_samples_empty_select_nested_produces_empty_struct_column() { + let header = create_test_header(); + let model = Model::from_header( + &header, + Select::All, + Select::Omit, + Select::Some(vec!["GT".into()]), + None, + Select::Some(vec![]), // samples active, none selected + Some(true), + ) + .unwrap(); + assert!(model.samples().is_some()); + assert_eq!(model.samples().unwrap().len(), 0); + // 7 fields + "samples" struct (empty: no samples → no sub-fields) + 
assert_eq!(model.schema().fields().len(), 8); + let samples_field = model.schema().field_with_name("samples").unwrap(); + match samples_field.data_type() { + DataType::Struct(fields) => assert!(fields.is_empty()), + other => panic!("Expected empty Struct, got {:?}", other), + } + } + + #[test] + fn test_samples_empty_select_unnested_produces_no_columns() { + // unnested: columns are keyed by sample name; no samples → no columns + let header = create_test_header(); + let model = Model::from_header( + &header, + Select::All, + Select::Omit, + Select::Some(vec!["GT".into()]), + None, + Select::Some(vec![]), // samples active, none selected + Some(false), + ) + .unwrap(); + assert_eq!(model.schema().fields().len(), 7); + } + + #[test] + fn test_samples_omit_deactivates_nested_column() { + let header = create_test_header(); + let model = Model::from_header( + &header, + Select::All, + Select::Omit, + Select::Some(vec!["GT".into()]), + None, + Select::Omit, // samples deactivated + Some(true), + ) + .unwrap(); + assert!(model.samples().is_none()); + assert_eq!(model.schema().fields().len(), 7); + } } diff --git a/oxbow/src/variant/model/batch.rs b/oxbow/src/variant/model/batch.rs index ea957ffd..3208b49d 100644 --- a/oxbow/src/variant/model/batch.rs +++ b/oxbow/src/variant/model/batch.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::OxbowError; +use crate::{OxbowError, Select}; use arrow::array::{ArrayRef, StructArray}; use arrow::datatypes::{Field as ArrowField, FieldRef, SchemaRef}; @@ -37,11 +37,12 @@ pub struct BatchBuilder { schema: SchemaRef, row_count: usize, header: noodles::vcf::Header, - info_defs: Vec, + has_info: bool, + has_genotype: bool, genotype_defs: Vec, - sample_names: Vec, genotype_by: GenotypeBy, - unnest_samples: bool, + sample_names: Vec, + samples_nested: bool, field_builders: IndexMap, info_builders: IndexMap, genotype_builders: IndexMap, @@ -53,11 +54,11 @@ impl BatchBuilder { /// Derives INFO and FORMAT 
definitions from the header. pub fn new( header: noodles::vcf::Header, - field_names: Option>, - info_field_names: Option>, - genotype_field_names: Option>, - sample_names: Option>, + field_names: Select, + info_field_names: Select, + genotype_field_names: Select, genotype_by: GenotypeBy, + sample_names: Select, capacity: usize, ) -> crate::Result { let model = Model::from_header( @@ -65,8 +66,8 @@ impl BatchBuilder { field_names, info_field_names, genotype_field_names, - sample_names, Some(genotype_by), + sample_names, None, )?; Self::from_model(&model, header, capacity) @@ -94,9 +95,11 @@ impl BatchBuilder { field_builders.insert(field.clone(), builder); } - let info_defs: Vec = model.info_defs().unwrap_or(&[]).to_vec(); + let has_info = model.info_defs().is_some(); + let has_genotype = model.genotype_defs().is_some() && model.samples().is_some(); + let mut info_builders = IndexMap::new(); - for def in &info_defs { + for def in model.info_defs().unwrap_or(&[]) { let builder = InfoBuilder::new(&def.ty); info_builders.insert(def.clone(), builder); } @@ -129,11 +132,12 @@ impl BatchBuilder { schema: model.schema().clone(), row_count: 0, header, - info_defs, + has_info, + has_genotype, genotype_defs, - sample_names, genotype_by, - unnest_samples: model.unnest_samples(), + sample_names, + samples_nested: model.samples_nested(), field_builders, info_builders, genotype_builders, @@ -157,23 +161,27 @@ impl RecordBatchBuilder for BatchBuilder { .map(|(_, builder)| builder.finish()) .collect(); - // info (optional) - if !self.info_defs.is_empty() { - let info_arrays: Vec<(FieldRef, ArrayRef)> = self - .info_builders - .iter_mut() - .map(|(def, builder)| { - let arrow_field = def.get_arrow_field(); - let array_ref = builder.finish(); - (Arc::new(arrow_field), array_ref) - }) - .collect(); - let info = StructArray::from(info_arrays); + // info (optional): has_info=true even when info_defs is empty (→ empty struct) + if self.has_info { + let info = if 
self.info_builders.is_empty() { + StructArray::new_empty_fields(self.row_count, None) + } else { + let info_arrays: Vec<(FieldRef, ArrayRef)> = self + .info_builders + .iter_mut() + .map(|(def, builder)| { + let arrow_field = def.get_arrow_field(); + let array_ref = builder.finish(); + (Arc::new(arrow_field), array_ref) + }) + .collect(); + StructArray::from(info_arrays) + }; columns.push(Arc::new(info)); } - // genotype data (optional) - if !self.sample_names.is_empty() && !self.genotype_defs.is_empty() { + // genotype data (optional): has_genotype=true even when defs/samples are empty + if self.has_genotype { let mut genotype_arrays: Vec<(FieldRef, ArrayRef)> = Vec::new(); match self.genotype_by { @@ -205,14 +213,18 @@ impl RecordBatchBuilder for BatchBuilder { } } - if self.unnest_samples { + if !self.samples_nested { // Top-level columns for (_, arr) in genotype_arrays { columns.push(arr); } } else { - // Wrap in a single "samples" struct - let samples_struct = StructArray::from(genotype_arrays); + // Wrap in a single "samples" struct (empty struct when no columns) + let samples_struct = if genotype_arrays.is_empty() { + StructArray::new_empty_fields(self.row_count, None) + } else { + StructArray::from(genotype_arrays) + }; columns.push(Arc::new(samples_struct)); } } @@ -252,7 +264,7 @@ impl Push<&noodles::vcf::Record> for BatchBuilder { } // info (optional) - if !self.info_defs.is_empty() { + if self.has_info { let info = record.info(); let parsed = collect_info_fields(&info, &self.header); for (def, builder) in self.info_builders.iter_mut() { @@ -274,7 +286,7 @@ impl Push<&noodles::vcf::Record> for BatchBuilder { } // genotype data (optional) - if !self.sample_names.is_empty() && !self.genotype_defs.is_empty() { + if self.has_genotype { let record_samples = record.samples(); let keys: Vec = self .genotype_defs @@ -360,15 +372,21 @@ impl Push<&noodles::vcf::Record> for BatchBuilder { .header .sample_names() .get_index_of(sample_name) - .unwrap(); + 
.ok_or_else(|| { + OxbowError::not_found(format!( + "Sample not found: {}", + sample_name + )) + })?; let maybe_result = series.get(&self.header, i).flatten(); let option = match maybe_result { Some(Ok(value)) => Some(value), _ => None, }; - (sample_name.clone(), option) + Ok((sample_name.clone(), option)) }) - .collect::>>(); + .collect::>>>( + )?; builder.push(data)?; } @@ -388,7 +406,7 @@ impl Push<&noodles::bcf::Record> for BatchBuilder { } // info (optional) - if !self.info_defs.is_empty() { + if self.has_info { let info = record.info(); let parsed = collect_info_fields(&info, &self.header); for (def, builder) in self.info_builders.iter_mut() { @@ -407,7 +425,7 @@ impl Push<&noodles::bcf::Record> for BatchBuilder { } // genotype data (optional) - if !self.sample_names.is_empty() && !self.genotype_defs.is_empty() { + if self.has_genotype { let record_samples = record.samples()?; let keys: Vec = self .genotype_defs @@ -507,15 +525,21 @@ impl Push<&noodles::bcf::Record> for BatchBuilder { .header .sample_names() .get_index_of(sample_name) - .unwrap(); + .ok_or_else(|| { + OxbowError::not_found(format!( + "Sample not found: {}", + sample_name + )) + })?; let maybe_result = series.get(&self.header, i).unwrap(); let option = match maybe_result { Some(Ok(value)) => Some(value), _ => None, }; - (sample_name.clone(), option) + Ok((sample_name.clone(), option)) }) - .collect::>>(); + .collect::>>>( + )?; builder.push(data)?; } @@ -579,11 +603,11 @@ mod tests { let header = create_test_header(); let batch_builder = BatchBuilder::new( header.clone(), - None, - None, - None, - None, + Select::All, + Select::All, + Select::All, GenotypeBy::Sample, + Select::All, 10, ) .unwrap(); @@ -596,8 +620,16 @@ mod tests { #[test] fn test_schema() { let header = create_test_header(); - let batch_builder = - BatchBuilder::new(header, None, None, None, None, GenotypeBy::Sample, 10).unwrap(); + let batch_builder = BatchBuilder::new( + header, + Select::All, + Select::All, + Select::All, + 
GenotypeBy::Sample, + Select::All, + 10, + ) + .unwrap(); let schema = batch_builder.schema(); assert!(schema.fields().len() > 0); @@ -608,11 +640,11 @@ mod tests { let header = create_test_header(); let mut batch_builder = BatchBuilder::new( header.clone(), - None, - None, - None, - None, + Select::All, + Select::All, + Select::All, GenotypeBy::Sample, + Select::All, 10, ) .unwrap(); @@ -626,11 +658,11 @@ mod tests { let header = create_test_header(); let mut batch_builder = BatchBuilder::new( header.clone(), - None, - None, - None, - None, + Select::All, + Select::All, + Select::All, GenotypeBy::Sample, + Select::All, 10, ) .unwrap(); @@ -667,11 +699,11 @@ mod tests { let mut batch_builder = BatchBuilder::new( header, - None, - None, - None, - Some(vec![]), + Select::All, + Select::All, + Select::All, GenotypeBy::Sample, + Select::Omit, 10, ) .unwrap(); diff --git a/oxbow/src/variant/model/genotype.rs b/oxbow/src/variant/model/genotype.rs index 22232886..6ce5881d 100644 --- a/oxbow/src/variant/model/genotype.rs +++ b/oxbow/src/variant/model/genotype.rs @@ -610,6 +610,7 @@ impl GenotypeBuilder { pub struct SampleStructBuilder { genotype_defs: Vec, builders: IndexMap, + row_count: usize, } impl SampleStructBuilder { @@ -627,6 +628,7 @@ impl SampleStructBuilder { Self { genotype_defs, builders, + row_count: 0, } } @@ -654,10 +656,16 @@ impl SampleStructBuilder { )) })?; } + self.row_count += 1; Ok(()) } pub fn finish(&mut self) -> StructArray { + let row_count = self.row_count; + self.row_count = 0; + if self.builders.is_empty() { + return StructArray::new_empty_fields(row_count, None); + } let fields = self.get_arrow_fields().into_iter().map(Arc::new); let arrays: Vec = self .genotype_defs @@ -678,6 +686,7 @@ impl SampleStructBuilder { pub struct SeriesStructBuilder { sample_names: Vec, builders: IndexMap, + row_count: usize, } impl SeriesStructBuilder { @@ -697,6 +706,7 @@ impl SeriesStructBuilder { Self { sample_names, builders, + row_count: 0, } } @@ -724,10 
+734,16 @@ impl SeriesStructBuilder { )) })?; } + self.row_count += 1; Ok(()) } pub fn finish(&mut self) -> StructArray { + let row_count = self.row_count; + self.row_count = 0; + if self.builders.is_empty() { + return StructArray::new_empty_fields(row_count, None); + } let fields = self.get_arrow_fields().into_iter().map(Arc::new); let arrays: Vec = self .sample_names diff --git a/oxbow/src/variant/scanner/bcf.rs b/oxbow/src/variant/scanner/bcf.rs index 27c73fc9..31c1e830 100644 --- a/oxbow/src/variant/scanner/bcf.rs +++ b/oxbow/src/variant/scanner/bcf.rs @@ -8,7 +8,7 @@ use noodles::csi::BinningIndex; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; use crate::variant::model::{BatchBuilder, GenotypeBy, Model}; use crate::variant::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; -use crate::OxbowError; +use crate::{OxbowError, Select}; /// A BCF scanner. /// @@ -20,6 +20,7 @@ use crate::OxbowError; /// /// ```no_run /// use oxbow::variant::scanner::bcf::Scanner; +/// use oxbow::Select; /// use std::fs::File; /// use std::io::BufReader; /// @@ -27,7 +28,7 @@ use crate::OxbowError; /// let mut fmt_reader = noodles::bcf::io::Reader::new(inner); /// let header = fmt_reader.read_header().unwrap(); /// -/// let scanner = Scanner::new(header, None, None, None, None, None, None).unwrap(); +/// let scanner = Scanner::new(header, Select::All, Select::All, Select::All, None, Select::All, None).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -40,21 +41,21 @@ impl Scanner { #[allow(clippy::too_many_arguments)] pub fn new( header: noodles::vcf::Header, - fields: Option>, - info_fields: Option>, - genotype_fields: Option>, - samples: Option>, + fields: Select, + info_fields: Select, + genotype_fields: Select, genotype_by: Option, - unnest_samples: Option, + samples: Select, + samples_nested: Option, ) -> crate::Result { let model = Model::from_header( &header, fields, info_fields, 
genotype_fields, - samples, genotype_by, - unnest_samples, + samples, + samples_nested, )?; Ok(Self { header, model }) } diff --git a/oxbow/src/variant/scanner/vcf.rs b/oxbow/src/variant/scanner/vcf.rs index 30e53c43..1417b614 100644 --- a/oxbow/src/variant/scanner/vcf.rs +++ b/oxbow/src/variant/scanner/vcf.rs @@ -8,7 +8,7 @@ use noodles::csi::BinningIndex; use crate::util::query::{BgzfChunkReader, ByteRangeReader}; use crate::variant::model::{BatchBuilder, GenotypeBy, Model}; use crate::variant::scanner::batch_iterator::{BatchIterator, QueryBatchIterator}; -use crate::OxbowError; +use crate::{OxbowError, Select}; /// A VCF scanner. /// @@ -20,6 +20,7 @@ use crate::OxbowError; /// /// ```no_run /// use oxbow::variant::scanner::vcf::Scanner; +/// use oxbow::Select; /// use std::fs::File; /// use std::io::BufReader; /// @@ -27,7 +28,7 @@ use crate::OxbowError; /// let mut fmt_reader = noodles::vcf::io::Reader::new(inner); /// let header = fmt_reader.read_header().unwrap(); /// -/// let scanner = Scanner::new(header, None, None, None, None, None, None).unwrap(); +/// let scanner = Scanner::new(header, Select::All, Select::All, Select::All, None, Select::All, None).unwrap(); /// let batches = scanner.scan(fmt_reader, None, None, Some(1000)); /// ``` pub struct Scanner { @@ -40,21 +41,21 @@ impl Scanner { #[allow(clippy::too_many_arguments)] pub fn new( header: noodles::vcf::Header, - fields: Option>, - info_fields: Option>, - genotype_fields: Option>, - samples: Option>, + fields: Select, + info_fields: Select, + genotype_fields: Select, genotype_by: Option, - unnest_samples: Option, + samples: Select, + samples_nested: Option, ) -> crate::Result { let model = Model::from_header( &header, fields, info_fields, genotype_fields, - samples, genotype_by, - unnest_samples, + samples, + samples_nested, )?; Ok(Self { header, model }) } diff --git a/py-oxbow/oxbow/_core/alignment.py b/py-oxbow/oxbow/_core/alignment.py index b177bf5b..9b7c6a1f 100644 --- 
a/py-oxbow/oxbow/_core/alignment.py +++ b/py-oxbow/oxbow/_core/alignment.py @@ -22,9 +22,8 @@ def __init__( source: str | Callable[[], IO[bytes] | str], compressed: bool = False, *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, - tag_scan_rows: int = 1024, regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -38,15 +37,9 @@ def __init__( self._scanner_kwargs = dict( compressed=compressed, fields=fields, tag_defs=tag_defs ) - if tag_defs is None: - discovered = self._scanner_type( - self._source, **self._tag_discovery_kwargs() - ).tag_defs(tag_scan_rows) - self._scanner_kwargs["tag_defs"] = discovered or None def _tag_discovery_kwargs(self) -> dict: - """Extra kwargs passed to the scanner used for tag discovery.""" - return dict(compressed=self._scanner_kwargs.get("compressed", False)) + return dict(compressed=self._scanner_kwargs["compressed"]) def _scan_query(self, scanner, region, columns, batch_size): if region == "*": @@ -58,6 +51,58 @@ def _scan_query(self, scanner, region, columns, batch_size): region=region, index=self._index, columns=columns, batch_size=batch_size ) + def with_tags( + self, tag_defs: list[tuple[str, str]] | None = None, *, scan_rows: int = 1024 + ) -> Self: + """ + Return a new data source with the specified tag definitions. + + Parameters + ---------- + tag_defs : list[tuple[str, str]] or None, optional [default: None] + Definitions for tags to project. These will be nested in a "tags" + column. If None (default), tag definitions are discovered by + scanning records in the file, which is controlled by the + ``scan_rows`` parameter. + scan_rows : int, optional [default: 1024] + Number of rows to scan for tag discovery if tag_defs is None. Set + to -1 to scan the entire file, which may be slow for large files. 
+ + Returns + ------- + Self + A new data source with the specified tag definitions. + + Notes + ----- + Tag definitions take the form of a list of (tag_name, tag_type) tuples, + where tag_name is a 2-character string and tag_type is a + single-character type code as defined in the SAM specification. + + Type codes: + + - A: Printable character + - i: Signed integer + - f: Floating point number + - Z: String + - H: Hex string + - B: Array (comma-separated values with type code prefix, e.g., "i,1,2,3") + """ + if tag_defs is None: + scan_rows = scan_rows if scan_rows >= 0 else None + discovered = self._scanner_type( + self._source, **self._tag_discovery_kwargs() + ).tag_defs(scan_rows) + self._scanner_kwargs["tag_defs"] = discovered or [] + + return type(self)( + self._src, + regions=self._regions, + index=self._index_src, + batch_size=self._batch_size, + **self._scanner_kwargs, + ) + def regions(self, regions: str | list[str]) -> Self: return type(self)( self._src, @@ -79,7 +124,7 @@ def chrom_sizes(self) -> list[tuple[str, int]]: @property def tag_defs(self) -> list[tuple[str, str]]: - """List of definitions for interpreting tag records.""" + """List of definitions for interpreting tags.""" return self._scanner_kwargs["tag_defs"] @@ -99,9 +144,8 @@ def __init__( source: str | Callable[[], IO[bytes] | str], compressed: bool = False, *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, - tag_scan_rows: int = 1024, regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, reference: str | Callable[[], IO[bytes] | str] | None = None, @@ -115,7 +159,6 @@ def __init__( compressed=compressed, fields=fields, tag_defs=tag_defs, - tag_scan_rows=tag_scan_rows, regions=regions, index=index, batch_size=batch_size, @@ -135,9 +178,8 @@ def from_sam( source: str | pathlib.Path | Callable[[], IO[bytes] | str], compression: Literal["infer", "bgzf", "gzip", None] 
= "infer", *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, - tag_scan_rows: int = 1024, regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -145,6 +187,14 @@ def from_sam( """ Create a SAM file data source. + .. versionchanged:: 0.7.0 + The ``tag_scan_rows`` parameter was removed and tag definitions are no + longer discovered by default. The ``tag_defs`` parameter now defaults + to omitting tag definitions (``None``). To perform tag discovery, + use the :meth:`~oxbow.core.SamFile.with_tags()` method on the returned + data source, which accepts a ``scan_rows`` parameter to control how + many records are scanned. + Parameters ---------- source : str, pathlib.Path, or Callable @@ -157,16 +207,13 @@ def from_sam( regular GZIP. If None, the source bytestream is assumed to be uncompressed. For more customized decoding, provide a callable ``source`` instead. - fields : list[str], optional + fields : list[str] or "*", optional [default: "*"] Standard SAM fields to include. By default, all standard fields are included. tag_defs : list[tuple[str, str]], optional [default: None] Definitions for tags to project. These will be nested in a "tags" - column. If None, tag definitions are discovered by scanning records in - the file, which is controlled by the ``tag_scan_rows`` parameter. To - omit tags entirely, set ``tag_defs=[]``. - tag_scan_rows : int, optional [default: 1024] - Number of rows to scan for tag definitions. + column. If None, tag definitions are omitted. To discover tag + definitions, use the ``with_tags()`` method on the returned data source. regions : str | list[str], optional One or more genomic regions to query. Only applicable if an associated index file is available. 
@@ -201,7 +248,6 @@ def from_sam( compressed=bgzf_compressed, fields=fields, tag_defs=tag_defs, - tag_scan_rows=tag_scan_rows, regions=regions, index=index, batch_size=batch_size, @@ -212,9 +258,8 @@ def from_bam( source: str | pathlib.Path | Callable[[], IO[bytes] | str], compression: Literal["bgzf", None] = "bgzf", *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, - tag_scan_rows: int = 1024, regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -222,6 +267,14 @@ def from_bam( """ Create a BAM file data source. + .. versionchanged:: 0.7.0 + The ``tag_scan_rows`` parameter was removed and tag definitions are no + longer discovered by default. The ``tag_defs`` parameter now defaults + to omitting tag definitions (``None``). To perform tag discovery, + use the :meth:`~oxbow.core.BamFile.with_tags()` method on the returned + data source, which accepts a ``scan_rows`` parameter to control how + many records are scanned. + Parameters ---------- source : str, pathlib.Path, or Callable @@ -232,21 +285,18 @@ def from_bam( assumed to be BGZF-compressed. If None, the source is assumed to be uncompressed. For more custom decoding, provide a callable ``source`` instead. - fields : list[str], optional + fields : list[str] or "*", optional [default: "*"] Standard SAM fields to include. By default, all standard fields are included. tag_defs : list[tuple[str, str]], optional [default: None] Definitions for tags to project. These will be nested in a "tags" - column. If None, tag definitions are discovered by scanning records in - the file, which is controlled by the ``tag_scan_rows`` parameter. To - omit tags entirely, set ``tag_defs=[]``. - tag_scan_rows : int, optional [default: 1024] - Number of rows to scan for tag definitions. + column. If None, tag definitions are omitted. 
To discover tag + definitions, use the ``with_tags()`` method on the returned data source. regions : str | list[str], optional One or more genomic regions to query. Only applicable if an associated index file is available. index : str, pathlib.Path, or Callable, optional - An optional index file associated with the SAM file. If ``source`` is a + An optional index file associated with the BAM file. If ``source`` is a URI or path, is BGZF-compressed, and the index file shares the same name with a ".tbi" or ".csi" extension, the index file is automatically detected. @@ -275,7 +325,6 @@ def from_bam( compressed=bgzf_compressed, fields=fields, tag_defs=tag_defs, - tag_scan_rows=tag_scan_rows, regions=regions, index=index, batch_size=batch_size, @@ -285,9 +334,8 @@ def from_bam( def from_cram( source: str | pathlib.Path | Callable[[], IO[bytes] | str], *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", tag_defs: list[tuple[str, str]] | None = None, - tag_scan_rows: int = 1024, regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, reference: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, @@ -297,21 +345,26 @@ def from_cram( """ Create a CRAM file data source. + .. versionchanged:: 0.7.0 + The ``tag_scan_rows`` parameter was removed and tag definitions are no + longer discovered by default. The ``tag_defs`` parameter now defaults + to omitting tag definitions (``None``). To perform tag discovery, + use the :meth:`~oxbow.core.CramFile.with_tags()` method on the returned + data source, which accepts a ``scan_rows`` parameter to control how + many records are scanned. + Parameters ---------- source : str, pathlib.Path, or Callable The URI or path to the CRAM file, or a callable that opens the file as a file-like object. - fields : list[str], optional + fields : list[str] or "*", optional [default: "*"] Standard SAM fields to include. 
By default, all standard fields are included. tag_defs : list[tuple[str, str]], optional [default: None] Definitions for tags to project. These will be nested in a "tags" - column. If None, tag definitions are discovered by scanning records in - the file, which is controlled by the ``tag_scan_rows`` parameter. To - omit tags entirely, set ``tag_defs=[]``. - tag_scan_rows : int, optional [default: 1024] - Number of rows to scan for tag definitions. + column. If None, tag definitions are omitted. To discover tag + definitions, use the ``with_tags()`` method on the returned data source. regions : str | list[str], optional One or more genomic regions to query. Only applicable if an associated index file is available. @@ -350,7 +403,6 @@ def from_cram( source=source, fields=fields, tag_defs=tag_defs, - tag_scan_rows=tag_scan_rows, regions=regions, index=index, reference=reference, diff --git a/py-oxbow/oxbow/_core/bbi.py b/py-oxbow/oxbow/_core/bbi.py index 699587d7..3cdf9ee6 100644 --- a/py-oxbow/oxbow/_core/bbi.py +++ b/py-oxbow/oxbow/_core/bbi.py @@ -5,7 +5,7 @@ from __future__ import annotations import pathlib -from typing import IO, Callable +from typing import IO, Callable, Literal try: from typing import Self @@ -43,7 +43,7 @@ def zoom( self, resolution: int, *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ) -> BbiZoom: @@ -73,7 +73,7 @@ def __init__( source: str | Callable[[], IO[bytes] | str], schema: str = "bed3+", *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ): @@ -101,7 +101,7 @@ def __init__( self, source: str | Callable[[], IO[bytes] | str], *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ): @@ -136,7 +136,7 
@@ def __init__( base: BbiFile, resolution: int, *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ): @@ -164,7 +164,7 @@ def from_bigbed( source: str | pathlib.Path | Callable[[], IO[bytes] | str], schema: str = "bed3+", *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ) -> BigBedFile: @@ -215,7 +215,7 @@ def from_bigbed( def from_bigwig( source: str | pathlib.Path | Callable[[], IO[bytes] | str], *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ) -> BigWigFile: diff --git a/py-oxbow/oxbow/_core/bed.py b/py-oxbow/oxbow/_core/bed.py index 45337628..7ae19de1 100644 --- a/py-oxbow/oxbow/_core/bed.py +++ b/py-oxbow/oxbow/_core/bed.py @@ -29,7 +29,7 @@ def __init__( bed_schema: BedSchemaLike = "bed3+", compressed: bool = False, *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -69,7 +69,7 @@ def from_bed( bed_schema: BedSchemaLike = "bed3+", compression: Literal["infer", "bgzf", "gzip", None] = "infer", *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, diff --git a/py-oxbow/oxbow/_core/gxf.py b/py-oxbow/oxbow/_core/gxf.py index 4deccaf3..b28f8934 100644 --- a/py-oxbow/oxbow/_core/gxf.py +++ b/py-oxbow/oxbow/_core/gxf.py @@ -22,9 +22,8 @@ def __init__( source: str | Callable[[], IO[bytes] | str], compressed: bool = False, *, - fields: list[str] | None = None, + fields: 
Literal["*"] | list[str] | None = "*", attribute_defs: list[tuple[str, str]] | None = None, - attribute_scan_rows: int = 1024, regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -38,17 +37,63 @@ def __init__( self._scanner_kwargs = dict( compressed=compressed, fields=fields, attribute_defs=attribute_defs ) - if attribute_defs is None: - discovered = self._scanner_type( - self._source, compressed=compressed - ).attribute_defs(attribute_scan_rows) - self._scanner_kwargs["attribute_defs"] = discovered or None def _scan_query(self, scanner, region, columns, batch_size): return scanner.scan_query( region=region, index=self._index, columns=columns, batch_size=batch_size ) + def with_attributes( + self, + attribute_defs: list[tuple[str, str]] | None = None, + *, + scan_rows: int = 1024, + ) -> Self: + """ + Return a new data source with the specified attribute definitions. + + Parameters + ---------- + attribute_defs : list[tuple[str, str]] or None, optional [default: None] + Definitions for attributes to project. These will be nested in an + "attributes" column. If None (default), attribute definitions are + discovered by scanning records in the file, which is controlled by + the ``scan_rows`` parameter. + scan_rows : int, optional [default: 1024] + Number of rows to scan for attribute discovery if attribute_defs is + None. Set to -1 to scan the entire file, which may be slow for + large files. + + Returns + ------- + Self + A new data source with the specified attribute definitions. + + Notes + ----- + Attribute definitions are tuples of (name, type), where type is a string + indicating how to interpret the attribute values. 
+ + Attribute types: + + - "String": a string value + - "Array": a comma-separated list of values + """ + if attribute_defs is None: + scan_rows = scan_rows if scan_rows >= 0 else None + discovered = self._scanner_type( + self._source, compressed=self._scanner_kwargs["compressed"] + ).attribute_defs(scan_rows) + self._scanner_kwargs["attribute_defs"] = discovered or [] + + return type(self)( + self._src, + regions=self._regions, + index=self._index_src, + batch_size=self._batch_size, + **self._scanner_kwargs, + ) + def regions(self, regions: str | list[str]) -> Self: return type(self)( self._src, @@ -76,9 +121,8 @@ def from_gtf( source: str | pathlib.Path | Callable[[], IO[bytes] | str], compression: Literal["infer", "bgzf", "gzip", None] = "infer", *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", attribute_defs: list[tuple[str, str]] | None = None, - attribute_scan_rows: int = 1024, regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -86,6 +130,15 @@ def from_gtf( """ Create a GTF file data source. + .. versionchanged:: 0.7.0 + The ``attribute_scan_rows`` parameter was removed and attribute + definitions are no longer discovered by default. The ``attribute_defs`` + parameter now defaults to omitting attribute definitions (``None``). + To perform attribute discovery, use the + :meth:`~oxbow.core.GtfFile.with_attributes()` method on the returned + data source, which accepts a ``scan_rows`` parameter to control how + many records are scanned. + Parameters ---------- source : str, pathlib.Path, or Callable @@ -98,17 +151,14 @@ def from_gtf( regular GZIP. If None, the source bytestream is assumed to be uncompressed. For more customized decoding, provide a callable ``source`` instead. - fields : list[str], optional + fields : list[str] or "*", optional [default: "*"] Specific fixed fields to project. 
By default, all fixed fields are included. attribute_defs : list[tuple[str, str]], optional [default: None] Definitions for variable attribute fields to project. These will be nested in an "attributes" column. If None, attribute definitions are - discovered by scanning records in the file, which is controlled by the - ``attribute_scan_rows`` parameter. To omit attributes entirely, - set ``attribute_defs=[]``. - attribute_scan_rows : int, optional [default: 1024] - Number of rows to scan for attribute definitions. + omitted. To discover attribute definitions, use the + ``with_attributes()`` method on the returned data source. regions : str | list[str], optional One or more genomic regions to query. Only applicable if an associated index file is available. @@ -140,7 +190,6 @@ def from_gtf( compressed=bgzf_compressed, fields=fields, attribute_defs=attribute_defs, - attribute_scan_rows=attribute_scan_rows, regions=regions, index=index, batch_size=batch_size, @@ -151,9 +200,8 @@ def from_gff( source: str | pathlib.Path | Callable[[], IO[bytes] | str], compression: Literal["infer", "bgzf", "gzip", None] = "infer", *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", attribute_defs: list[tuple[str, str]] | None = None, - attribute_scan_rows: int = 1024, regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -161,6 +209,15 @@ def from_gff( """ Create a GFF3 file data source. + .. versionchanged:: 0.7.0 + The ``attribute_scan_rows`` parameter was removed and attribute + definitions are no longer discovered by default. The ``attribute_defs`` + parameter now defaults to omitting attribute definitions (``None``). + To perform attribute discovery, use the + :meth:`~oxbow.core.GffFile.with_attributes()` method on the returned + data source, which accepts a ``scan_rows`` parameter to control how + many records are scanned. 
+ Parameters ---------- source : str, pathlib.Path, or Callable @@ -173,22 +230,19 @@ def from_gff( regular GZIP. If None, the source bytestream is assumed to be uncompressed. For more customized decoding, provide a callable ``source`` instead. - fields : list[str], optional + fields : list[str] or "*", optional [default: "*"] Specific fixed fields to project. By default, all fixed fields are included. attribute_defs : list[tuple[str, str]], optional [default: None] Definitions for variable attribute fields to project. These will be nested in an "attributes" column. If None, attribute definitions are - discovered by scanning records in the file, which is controlled by the - ``attribute_scan_rows`` parameter. To omit attributes entirely, - set ``attribute_defs=[]``. - attribute_scan_rows : int, optional [default: 1024] - Number of rows to scan for attribute definitions. + omitted. To discover attribute definitions, use the + ``with_attributes()`` method on the returned data source. regions : str | list[str], optional One or more genomic regions to query. Only applicable if an associated index file is available. index : str, pathlib.Path, or Callable, optional - An optional index file associated with the GTF file. If ``source`` is a + An optional index file associated with the GFF file. If ``source`` is a URI or path, is BGZF-compressed, and the index file shares the same name with a ".tbi" or ".csi" extension, the index file is automatically detected. 
@@ -217,6 +271,5 @@ def from_gff( compressed=bgzf_compressed, regions=regions, attribute_defs=attribute_defs, - attribute_scan_rows=attribute_scan_rows, batch_size=batch_size, ) diff --git a/py-oxbow/oxbow/_core/sequence.py b/py-oxbow/oxbow/_core/sequence.py index 64774bd3..7a2c09fd 100644 --- a/py-oxbow/oxbow/_core/sequence.py +++ b/py-oxbow/oxbow/_core/sequence.py @@ -87,7 +87,7 @@ def __init__( source: str | Callable[[], IO[bytes] | str], compressed: bool = False, *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, gzi: str | Callable[[], IO[bytes]] | None = None, @@ -112,7 +112,7 @@ def __init__( source: str | Callable[[], IO[bytes] | str], compressed: bool = False, *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", batch_size: int = DEFAULT_BATCH_SIZE, ): super().__init__( @@ -133,7 +133,7 @@ def from_fasta( source: str | pathlib.Path | Callable[[], IO[bytes] | str], compression: Literal["infer", "bgzf", "gzip", None] = "infer", *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, gzi: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, @@ -201,7 +201,7 @@ def from_fastq( source: str | pathlib.Path | Callable[[], IO[bytes] | str], compression: Literal["infer", "gzip", None] = "infer", *, - fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", batch_size: int = DEFAULT_BATCH_SIZE, ) -> FastqFile: """ diff --git a/py-oxbow/oxbow/_core/variant.py b/py-oxbow/oxbow/_core/variant.py index 5ccf6591..451bb5e1 100644 --- a/py-oxbow/oxbow/_core/variant.py +++ b/py-oxbow/oxbow/_core/variant.py @@ -22,12 +22,12 @@ def __init__( source: str | Callable[[], IO[bytes] | str], compressed: bool = False, *, - fields=None, - 
info_fields: list[str] | None = None, - samples: list[str] | None = None, - genotype_fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", + info_fields: Literal["*"] | list[str] | None = "*", + genotype_fields: Literal["*"] | list[str] | None = "*", genotype_by: Literal["sample", "field"] = "sample", - unnest_samples: bool = True, + samples: Literal["*"] | list[str] | None = None, + samples_nested: bool = False, regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -42,10 +42,10 @@ def __init__( compressed=compressed, fields=fields, info_fields=info_fields, - samples=samples, genotype_fields=genotype_fields, genotype_by=genotype_by, - unnest_samples=unnest_samples, + samples=samples, + samples_nested=samples_nested, ) def _scan_query(self, scanner, region, columns, batch_size): @@ -62,6 +62,53 @@ def regions(self, regions: str | list[str]) -> Self: **self._scanner_kwargs, ) + def with_samples( + self, + samples: Literal["*"] | list[str] | None = "*", + *, + genotype_fields: Literal["*"] | list[str] | None = "*", + group_by: Literal["sample", "field"] = "sample", + ) -> Self: + """ + Return a new data source with sample genotype data nested under a + single ``"samples"`` struct column. + + Parameters + ---------- + samples : "*", list[str], or None, optional [default: "*"] + Names of samples to include in the genotype output. ``"*"`` + includes all samples declared in the header. Pass a list to select + specific samples. ``None`` omits all sample genotype data. + genotype_fields : "*", list[str], or None, optional [default: "*"] + Genotype (aka FORMAT) fields to project for each sample. ``"*"`` + includes all FORMAT fields declared in the header. Pass a list to + select specific fields. ``None`` omits all genotype fields. 
+ group_by : Literal["sample", "field"], optional [default: "sample"] + Determines how genotype data is organized within the ``"samples"`` + struct. If ``"sample"``, each sample name is a sub-column with + nested genotype fields. If ``"field"``, each genotype field is a + sub-column with nested sample values. + + Returns + ------- + Self + A new data source with sample genotype data nested under a single + ``"samples"`` struct column. + """ + self._scanner_kwargs.update( + genotype_fields=genotype_fields, + genotype_by=group_by, + samples=samples, + samples_nested=True, + ) + return type(self)( + self._src, + regions=self._regions, + index=self._index_src, + batch_size=self._batch_size, + **self._scanner_kwargs, + ) + @property def chrom_names(self) -> list[str]: """List of reference sequence names declared in the header.""" @@ -100,12 +147,12 @@ def from_vcf( source: str | pathlib.Path | Callable[[], IO[bytes] | str], compression: Literal["infer", "bgzf", "gzip", None] = "infer", *, - fields: list[str] | None = None, - info_fields: list[str] | None = None, - samples: list[str] | None = None, - genotype_fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", + info_fields: Literal["*"] | list[str] | None = "*", + genotype_fields: Literal["*"] | list[str] | None = "*", genotype_by: Literal["sample", "field"] = "sample", - unnest_samples: bool = True, + samples: Literal["*"] | list[str] | None = None, + samples_nested: bool = False, regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -113,6 +160,13 @@ def from_vcf( """ Create a VCF file data source. + .. versionchanged:: 0.7.0 + The ``samples`` parameter now defaults to omitting sample genotype + data (``None``) instead of including all samples (``"*"``). 
To include + samples, pass a value to the ``samples`` parameter or use the + :meth:`~oxbow.core.VcfFile.with_samples()` method on the returned data + source. + Parameters ---------- source : str, pathlib.Path, or Callable @@ -125,25 +179,28 @@ def from_vcf( regular GZIP. If None, the source bytestream is assumed to be uncompressed. For more customized decoding, provide a callable ``source`` instead. - fields : list[str], optional - Specific fixed fields to project. By default, all fixed fields are - included. - info_fields : list[str], optional [default: None] - INFO fields to project. These will be nested under an "info" column. - If None, all INFO fields declared in the header are included. To omit - all INFO fields, set ``info_fields=[]``. - samples : list[str], optional [default: None] - A subset of samples to include in the genotype output. If None, all - samples declared in the header are included. To omit all sample - genotype data, set ``samples=[]``. - genotype_fields : list[str], optional [default: None] - Genotype (aka "FORMAT") fields to project for each sample. If None, all - FORMAT fields declared in the header are included. + fields : ``"*"``, list[str], or None, optional [default: ``"*"``] + Fixed fields to project. ``"*"`` includes all standard fields. Pass a + list to select specific fields. ``None`` omits all fixed fields. + info_fields : ``"*"``, list[str], or None, optional [default: ``"*"``] + INFO fields to project, nested under an ``"info"`` column. ``"*"`` + includes all INFO fields declared in the header. Pass a list to select + specific fields. ``None`` omits the info column entirely. + genotype_fields : ``"*"``, list[str], or None, optional [default: ``"*"``] + Genotype (aka "FORMAT") fields to project for each sample. ``"*"`` + includes all FORMAT fields declared in the header. Pass a list to select + specific fields. ``None`` omits the genotype fields. 
genotype_by : Literal["sample", "field"], optional [default: "sample"] Determines how genotype-specific data is organized. If "sample", each sample is provided as a separate column with nested FORMAT fields. If "field", each FORMAT field is provided as a separate column with nested sample name fields. + samples : ``"*"``, list[str], or None, optional [default: ``None``] + Samples to include in the genotype output. ``"*"`` includes all samples + declared in the header. Pass a list to select specific samples. ``None`` + omits all sample genotype data. + samples_nested : bool, optional [default: False] + Whether to nest sample data under a single structured column. regions : str | list[str], optional One or more genomic regions to query. Only applicable if an associated index file is available. @@ -178,10 +235,10 @@ def from_vcf( compressed=bgzf_compressed, fields=fields, info_fields=info_fields, - samples=samples, genotype_fields=genotype_fields, genotype_by=genotype_by, - unnest_samples=unnest_samples, + samples=samples, + samples_nested=samples_nested, regions=regions, index=index, batch_size=batch_size, @@ -192,12 +249,12 @@ def from_bcf( source: str | pathlib.Path | Callable[[], IO[bytes] | str], compression: Literal["bgzf", None] = "bgzf", *, - fields: list[str] | None = None, - info_fields: list[str] | None = None, - samples: list[str] | None = None, - genotype_fields: list[str] | None = None, + fields: Literal["*"] | list[str] | None = "*", + info_fields: Literal["*"] | list[str] | None = "*", + genotype_fields: Literal["*"] | list[str] | None = "*", genotype_by: Literal["sample", "field"] = "sample", - unnest_samples: bool = True, + samples: Literal["*"] | list[str] | None = None, + samples_nested: bool = False, regions: str | list[str] | None = None, index: str | pathlib.Path | Callable[[], IO[bytes] | str] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, @@ -205,6 +262,13 @@ def from_bcf( """ Create a BCF file data source. + .. 
versionchanged:: 0.7.0 + The ``samples`` parameter now defaults to omitting sample genotype + data (``None``) instead of including all samples (``"*"``). To include + samples, pass a value to the ``samples`` parameter or use the + :meth:`~oxbow.core.BcfFile.with_samples()` method on the returned data + source. + Parameters ---------- source : str, pathlib.Path, or Callable @@ -215,25 +279,28 @@ def from_bcf( assumed to be BGZF-compressed. If None, the source is assumed to be uncompressed. For more custom decoding, provide a callable ``source`` instead. - fields : list[str], optional - Specific fixed fields to project. By default, all fixed fields are - included. - info_fields : list[str], optional [default: None] - INFO fields to project. These will be nested under an "info" column. - If None, all INFO fields declared in the header are included. To omit - all INFO fields, set ``info_fields=[]``. - samples : list[str], optional [default: None] - A subset of samples to include in the genotype output. If None, all - samples declared in the header are included. To omit all sample - genotype data, set ``samples=[]``. - genotype_fields : list[str], optional [default: None] - Genotype (aka "FORMAT") fields to project for each sample. If None, all - FORMAT fields declared in the header are included. + fields : ``"*"``, list[str], or None, optional [default: ``"*"``] + Fixed fields to project. ``"*"`` includes all standard fields. Pass a + list to select specific fields. ``None`` omits all fixed fields. + info_fields : ``"*"``, list[str], or None, optional [default: ``"*"``] + INFO fields to project, nested under an ``"info"`` column. ``"*"`` + includes all INFO fields declared in the header. Pass a list to select + specific fields. ``None`` omits the info column entirely. + genotype_fields : ``"*"``, list[str], or None, optional [default: ``"*"``] + Genotype (aka "FORMAT") fields to project for each sample. ``"*"`` + includes all FORMAT fields declared in the header. 
Pass a list to select + specific fields. ``None`` omits the genotype fields. genotype_by : Literal["sample", "field"], optional [default: "sample"] Determines how genotype-specific data is organized. If "sample", each sample is provided as a separate column with nested FORMAT fields. If "field", each FORMAT field is provided as a separate column with nested sample name fields. + samples : ``"*"``, list[str], or None, optional [default: ``None``] + Samples to include in the genotype output. ``"*"`` includes all samples + declared in the header. Pass a list to select specific samples. ``None`` + omits all sample genotype data. + samples_nested : bool, optional [default: False] + Whether to nest sample data under a single structured column. regions : str | list[str], optional One or more genomic regions to query. Only applicable if an associated index file is available. @@ -268,10 +335,10 @@ def from_bcf( compressed=bgzf_compressed, fields=fields, info_fields=info_fields, - samples=samples, genotype_fields=genotype_fields, genotype_by=genotype_by, - unnest_samples=unnest_samples, + samples=samples, + samples_nested=samples_nested, regions=regions, index=index, batch_size=batch_size, diff --git a/py-oxbow/pyproject.toml b/py-oxbow/pyproject.toml index 9bde9bee..ce856226 100644 --- a/py-oxbow/pyproject.toml +++ b/py-oxbow/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["maturin>=1.0,<2.0"] +requires = ["maturin>=1.12,<2.0"] build-backend = "maturin" [project] @@ -20,7 +20,7 @@ classifiers = [ ] readme = "README.md" dependencies = [ - "arro3-core>=0.4.6", + "arro3-core>=0.5.1", "polars>=1.36.0", "pandas>=2.2.3", "pyarrow>=19.0.1", diff --git a/py-oxbow/src/alignment.rs b/py-oxbow/src/alignment.rs index 28e33eb3..6bd89c99 100644 --- a/py-oxbow/src/alignment.rs +++ b/py-oxbow/src/alignment.rs @@ -13,8 +13,8 @@ use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; use crate::util::{ - pyobject_to_bufreader, resolve_cram_index, 
resolve_fasta_repository, resolve_index, - PyVirtualPosition, Reader, + pyobject_to_bufreader, resolve_cram_index, resolve_fasta_repository, resolve_fields, + resolve_index, PyVirtualPosition, Reader, }; use oxbow::alignment::{BamScanner, CramScanner, SamScanner}; use oxbow::util::batches_to_ipc; @@ -28,10 +28,12 @@ use oxbow::util::index::IndexType; /// The path to the SAM file or a file-like object. /// compressed : bool, optional [default: False] /// Whether the source is BGZF-compressed. -/// fields : list[str], optional -/// Names of the standard SAM fields to include. -/// tag_defs : list[tuple[str, str]], optional -/// Tag definitions. None means no tags column. +/// fields : str or list[str] or None, optional [default: "*"] +/// Standard SAM fields to include. ``"*"`` for all, ``None`` to omit, +/// or a list of field names. +/// tag_defs : list[tuple[str, str]], optional [default: None] +/// Tag definitions for the ``"tags"`` struct column. ``None`` omits the +/// tags column. Use the ``tag_defs()`` method to discover definitions. #[pyclass(module = "oxbow.oxbow")] pub struct PySamScanner { src: Py, @@ -48,9 +50,10 @@ impl PySamScanner { py: Python, src: Py, compressed: bool, - fields: Option>, + fields: Option>, tag_defs: Option>, ) -> PyResult { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::sam::io::Reader::new(reader); let header = fmt_reader.read_header()?; @@ -108,7 +111,7 @@ impl PySamScanner { /// /// The reader stream is reset to its original position after scanning. /// - /// Parametersuv + /// Parameters /// ---------- /// scan_rows : int, optional [default: 1024] /// The number of records to scan. If None, all records are scanned. @@ -463,10 +466,12 @@ impl PySamScanner { /// The path to the BAM file or a file-like object. /// compressed : bool, optional [default: True] /// Whether the source is BGZF-compressed. 
-/// fields : list[str], optional -/// Names of the standard SAM fields to include. -/// tag_defs : list[tuple[str, str]], optional -/// Tag definitions. None means no tags column. +/// fields : str or list[str] or None, optional [default: "*"] +/// Standard SAM fields to include. ``"*"`` for all, ``None`` to omit, +/// or a list of field names. +/// tag_defs : list[tuple[str, str]], optional [default: None] +/// Tag definitions for the ``"tags"`` struct column. ``None`` omits the +/// tags column. Use the ``tag_defs()`` method to discover definitions. #[pyclass(module = "oxbow.oxbow")] pub struct PyBamScanner { src: Py, @@ -483,9 +488,10 @@ impl PyBamScanner { py: Python, src: Py, compressed: bool, - fields: Option>, + fields: Option>, tag_defs: Option>, ) -> PyResult { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::bam::io::Reader::from(reader); let header = fmt_reader.read_header()?; @@ -896,10 +902,12 @@ impl PyBamScanner { /// ---------- /// src : str or file-like /// The path to the CRAM file or a file-like object. -/// fields : list[str], optional -/// Names of the standard SAM fields to include. -/// tag_defs : list[tuple[str, str]], optional -/// Tag definitions. None means no tags column. +/// fields : str or list[str] or None, optional [default: "*"] +/// Standard SAM fields to include. ``"*"`` for all, ``None`` to omit, +/// or a list of field names. +/// tag_defs : list[tuple[str, str]], optional [default: None] +/// Tag definitions for the ``"tags"`` struct column. ``None`` omits the +/// tags column. Use the ``tag_defs()`` method to discover definitions. 
#[pyclass] pub struct PyCramScanner { src: Py, @@ -918,11 +926,12 @@ impl PyCramScanner { py: Python, src: Py, compressed: Option, - fields: Option>, + fields: Option>, tag_defs: Option>, reference: Option>, reference_index: Option>, ) -> PyResult { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let mut fmt_reader = noodles::cram::io::Reader::new(reader); let header = fmt_reader.read_header()?; @@ -1131,8 +1140,8 @@ impl PyCramScanner { /// ---------- /// src : str or file-like /// The path to the source file or a file-like object. -/// fields : list[str], optional -/// Names of the standard SAM fields to project. +/// fields : str or list[str] or None, optional +/// Standard SAM fields to project. /// tag_defs : list[tuple[str, str]], optional /// Tag definitions. None means no tags column. /// compressed : bool, optional [default: False] @@ -1149,10 +1158,11 @@ pub fn read_sam( src: Py, region: Option, index: Option>, - fields: Option>, + fields: Option>, tag_defs: Option>, compressed: bool, ) -> PyResult> { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::sam::io::Reader::new(reader); let header = fmt_reader.read_header()?; @@ -1202,8 +1212,8 @@ pub fn read_sam( /// ---------- /// src : str or file-like /// The path to the source file or a file-like object. -/// fields : list[str], optional -/// Names of the standard SAM fields to project. +/// fields : str or list[str] or None, optional +/// Standard SAM fields to project. /// tag_defs : list[tuple[str, str]], optional /// Tag definitions. None means no tags column. 
/// compressed : bool, optional [default: True] @@ -1220,10 +1230,11 @@ pub fn read_bam( src: Py, region: Option, index: Option>, - fields: Option>, + fields: Option>, tag_defs: Option>, compressed: bool, ) -> PyResult> { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::bam::io::Reader::from(reader); let header = fmt_reader.read_header()?; @@ -1273,8 +1284,8 @@ pub fn read_bam( /// ---------- /// src : str or file-like /// The path to the source file or a file-like object. -/// fields : list[str], optional -/// Names of the standard SAM fields to project. +/// fields : str or list[str] or None, optional +/// Standard SAM fields to project. /// tag_defs : list[tuple[str, str]], optional /// Tag definitions. None means no tags column. /// @@ -1292,9 +1303,10 @@ pub fn read_cram( index: Option>, reference: Option>, reference_index: Option>, - fields: Option>, + fields: Option>, tag_defs: Option>, ) -> PyResult> { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let mut fmt_reader = noodles::cram::io::Reader::new(reader); let header = fmt_reader.read_header()?; diff --git a/py-oxbow/src/bbi.rs b/py-oxbow/src/bbi.rs index 74fe2e40..9a7ac947 100644 --- a/py-oxbow/src/bbi.rs +++ b/py-oxbow/src/bbi.rs @@ -12,7 +12,7 @@ use bigtools::bed::autosql::parse::parse_autosql; use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; -use crate::util::{pyobject_to_bufreader, Reader}; +use crate::util::{pyobject_to_bufreader, resolve_fields, Reader}; use oxbow::bbi::model::base::field::FieldDef; use oxbow::bbi::{BBIReader, BBIZoomScanner, BedSchema, BigBedScanner, BigWigScanner}; use oxbow::util::batches_to_ipc; @@ -43,12 +43,12 @@ pub struct PyBigWigScanner { impl PyBigWigScanner { #[new] #[pyo3(signature = (src, fields=None))] - fn new(py: Python, src: Py, fields: Option>) -> PyResult { + fn new(py: Python, src: 
Py, fields: Option>) -> PyResult { let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let fmt_reader = bigtools::BigWigRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); let reader = fmt_reader.into_inner(); - let scanner = BigWigScanner::new(info, fields).map_err(to_py)?; + let scanner = BigWigScanner::new(info, resolve_fields(fields, py)?).map_err(to_py)?; Ok(Self { _src: src, reader, @@ -104,7 +104,7 @@ impl PyBigWigScanner { fn get_zoom( &mut self, zoom_level: u32, - fields: Option>, + fields: Option>, ) -> PyResult { Python::attach(|py| { PyBBIZoomScanner::new( @@ -231,7 +231,7 @@ impl PyBigBedScanner { py: Python, src: Py, schema: Option>, - fields: Option>, + fields: Option>, ) -> PyResult { let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let mut fmt_reader = bigtools::BigBedRead::open(reader).unwrap(); @@ -266,7 +266,8 @@ impl PyBigBedScanner { }; let info = fmt_reader.info().clone(); let reader = fmt_reader.into_inner(); - let scanner = BigBedScanner::new(bed_schema, info, fields).map_err(to_py)?; + let scanner = + BigBedScanner::new(bed_schema, info, resolve_fields(fields, py)?).map_err(to_py)?; let _schema: Option = schema.as_ref().and_then(|s| s.extract::(py).ok()); Ok(Self { _src: src, @@ -340,7 +341,7 @@ impl PyBigBedScanner { fn get_zoom( &mut self, zoom_level: u32, - fields: Option>, + fields: Option>, ) -> PyResult { Python::attach(|py| { PyBBIZoomScanner::new( @@ -468,7 +469,7 @@ impl PyBBIZoomScanner { src: Py, bbi_type: PyBBIFileType, zoom_level: u32, - fields: Option>, + fields: Option>, ) -> PyResult { let reader = pyobject_to_bufreader(py, src.clone_ref(py), false) .expect("Failed to convert Py to BufReader"); @@ -493,7 +494,9 @@ impl PyBBIZoomScanner { ))); } let reader = fmt_reader.into_inner(); - let scanner = BBIZoomScanner::new(ref_names, zoom_level, fields).map_err(to_py)?; + let scanner = + BBIZoomScanner::new(ref_names, zoom_level, resolve_fields(fields, py)?) 
+ .map_err(to_py)?; Ok(Self { src, reader, @@ -522,7 +525,9 @@ impl PyBBIZoomScanner { ))); } let reader = fmt_reader.into_inner(); - let scanner = BBIZoomScanner::new(ref_names, zoom_level, fields).map_err(to_py)?; + let scanner = + BBIZoomScanner::new(ref_names, zoom_level, resolve_fields(fields, py)?) + .map_err(to_py)?; Ok(Self { src, reader, @@ -684,7 +689,7 @@ pub fn read_bigwig( py: Python, src: Py, region: Option, - fields: Option>, + fields: Option>, ) -> PyResult> { let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; @@ -696,7 +701,7 @@ pub fn read_bigwig( let fmt_reader = bigtools::BigWigRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; let info = fmt_reader.info().clone(); - let scanner = BigWigScanner::new(info, fields).map_err(to_py)?; + let scanner = BigWigScanner::new(info, resolve_fields(fields, py)?).map_err(to_py)?; let batches = scanner .scan_query(fmt_reader, region, None, None, None) .map_err(to_py)?; @@ -705,7 +710,7 @@ pub fn read_bigwig( let fmt_reader = bigtools::BigWigRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; let info = fmt_reader.info().clone(); - let scanner = BigWigScanner::new(info, fields).map_err(to_py)?; + let scanner = BigWigScanner::new(info, resolve_fields(fields, py)?).map_err(to_py)?; let batches = scanner.scan(fmt_reader, None, None, None).map_err(to_py)?; batches_to_ipc(batches) }; @@ -735,7 +740,7 @@ pub fn read_bigbed( src: Py, bed_schema: &str, region: Option, - fields: Option>, + fields: Option>, ) -> PyResult> { let bed_schema: BedSchema = bed_schema.parse().map_err(to_py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; @@ -748,7 +753,8 @@ pub fn read_bigbed( let fmt_reader = bigtools::BigBedRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; let info = fmt_reader.info().clone(); - let scanner = BigBedScanner::new(bed_schema, info, fields).map_err(to_py)?; + let scanner = + BigBedScanner::new(bed_schema, info, resolve_fields(fields, 
py)?).map_err(to_py)?; let batches = scanner .scan_query(fmt_reader, region, None, None, None) .map_err(to_py)?; @@ -757,7 +763,8 @@ pub fn read_bigbed( let fmt_reader = bigtools::BigBedRead::open(reader) .map_err(|e| PyErr::new::(e.to_string()))?; let info = fmt_reader.info().clone(); - let scanner = BigBedScanner::new(bed_schema, info, fields).map_err(to_py)?; + let scanner = + BigBedScanner::new(bed_schema, info, resolve_fields(fields, py)?).map_err(to_py)?; let batches = scanner.scan(fmt_reader, None, None, None).map_err(to_py)?; batches_to_ipc(batches) }; diff --git a/py-oxbow/src/bed.rs b/py-oxbow/src/bed.rs index 33297963..a5a31e10 100644 --- a/py-oxbow/src/bed.rs +++ b/py-oxbow/src/bed.rs @@ -10,7 +10,9 @@ use pyo3_arrow::PySchema; use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; -use crate::util::{pyobject_to_bufreader, resolve_index, PyVirtualPosition, Reader}; +use crate::util::{ + pyobject_to_bufreader, resolve_fields, resolve_index, PyVirtualPosition, Reader, +}; use oxbow::bed::{BedScanner, BedSchema, FieldDef, FieldType}; use oxbow::util::batches_to_ipc; use oxbow::util::index::IndexType; @@ -106,11 +108,11 @@ impl PyBedScanner { src: Py, bed_schema: Py, compressed: bool, - fields: Option>, + fields: Option>, ) -> PyResult { let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let parsed_schema = resolve_bed_schema(py, &bed_schema)?; - let scanner = BedScanner::new(parsed_schema, fields).map_err(to_py)?; + let scanner = BedScanner::new(parsed_schema, resolve_fields(fields, py)?).map_err(to_py)?; Ok(Self { src, reader, @@ -392,12 +394,12 @@ pub fn read_bed( bed_schema: Py, region: Option, index: Option>, - fields: Option>, + fields: Option>, compressed: bool, ) -> PyResult> { let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let bed_schema = resolve_bed_schema(py, &bed_schema)?; - let scanner = BedScanner::new(bed_schema, fields).map_err(to_py)?; + let scanner = BedScanner::new(bed_schema, 
resolve_fields(fields, py)?).map_err(to_py)?; let ipc = if let Some(region) = region { let region = region diff --git a/py-oxbow/src/gxf.rs b/py-oxbow/src/gxf.rs index f906c661..48ded306 100644 --- a/py-oxbow/src/gxf.rs +++ b/py-oxbow/src/gxf.rs @@ -12,7 +12,9 @@ use pyo3_arrow::PySchema; use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; -use crate::util::{pyobject_to_bufreader, resolve_index, PyVirtualPosition, Reader}; +use crate::util::{ + pyobject_to_bufreader, resolve_fields, resolve_index, PyVirtualPosition, Reader, +}; use oxbow::gxf::{GffScanner, GtfScanner}; use oxbow::util::batches_to_ipc; use oxbow::util::index::IndexType; @@ -27,8 +29,9 @@ use oxbow::util::index::IndexType; /// Whether the source is BGZF-compressed. /// fields : list[str], optional /// Names of the fixed fields to project. -/// attribute_defs : list[tuple[str, str]], optional -/// Definitions of attribute fields to project. +/// attribute_defs : list[tuple[str, str]], optional [default: None] +/// Definitions for the ``"attributes"`` struct column. ``None`` omits the +/// attributes column. Use the ``attribute_defs()`` method to discover definitions. #[pyclass(module = "oxbow.oxbow")] pub struct PyGtfScanner { src: Py, @@ -45,9 +48,10 @@ impl PyGtfScanner { py: Python, src: Py, compressed: Option, - fields: Option>, + fields: Option>, attribute_defs: Option>, ) -> PyResult { + let fields = resolve_fields(fields, py)?; let compressed = compressed.unwrap_or(false); let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let scanner = GtfScanner::new(None, fields, attribute_defs).map_err(to_py)?; @@ -341,8 +345,9 @@ impl PyGtfScanner { /// Whether the source is BGZF-compressed. /// fields : list[str], optional /// Names of the fixed fields to project. -/// attribute_defs : list[tuple[str, str]], optional -/// Definitions of attribute fields to project. 
+/// attribute_defs : list[tuple[str, str]], optional [default: None] +/// Definitions for the ``"attributes"`` struct column. ``None`` omits the +/// attributes column. Use the ``attribute_defs()`` method to discover definitions. #[pyclass(module = "oxbow.oxbow")] pub struct PyGffScanner { src: Py, @@ -359,9 +364,10 @@ impl PyGffScanner { py: Python, src: Py, compressed: Option, - fields: Option>, + fields: Option>, attribute_defs: Option>, ) -> PyResult { + let fields = resolve_fields(fields, py)?; let compressed = compressed.unwrap_or(false); let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let scanner = GffScanner::new(None, fields, attribute_defs).map_err(to_py)?; @@ -668,10 +674,11 @@ pub fn read_gtf( src: Py, region: Option, index: Option>, - fields: Option>, + fields: Option>, attr_defs: Option>, compressed: bool, ) -> PyResult> { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let scanner = GtfScanner::new(None, fields, attr_defs).map_err(to_py)?; @@ -736,10 +743,11 @@ pub fn read_gff( src: Py, region: Option, index: Option>, - fields: Option>, + fields: Option>, attr_defs: Option>, compressed: bool, ) -> PyResult> { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let scanner = GffScanner::new(None, fields, attr_defs).map_err(to_py)?; diff --git a/py-oxbow/src/sequence.rs b/py-oxbow/src/sequence.rs index 38550767..ca77acc8 100644 --- a/py-oxbow/src/sequence.rs +++ b/py-oxbow/src/sequence.rs @@ -13,7 +13,9 @@ use noodles::bgzf::io::IndexedReader as IndexedBgzfReader; use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; -use crate::util::{pyobject_to_bufreader, resolve_faidx, PyVirtualPosition, Reader}; +use crate::util::{ + pyobject_to_bufreader, resolve_faidx, resolve_fields, PyVirtualPosition, Reader, +}; use oxbow::sequence::{FastaScanner, FastqScanner}; use 
oxbow::util::batches_to_ipc; @@ -43,8 +45,9 @@ impl PyFastqScanner { py: Python, src: Py, compressed: bool, - fields: Option>, + fields: Option>, ) -> PyResult { + let fields = resolve_fields(fields, py)?; let _src = src.clone_ref(py); let reader = pyobject_to_bufreader(py, src, false)?; let scanner = FastqScanner::new(fields).map_err(to_py)?; @@ -226,8 +229,9 @@ impl PyFastaScanner { py: Python, src: Py, compressed: bool, - fields: Option>, + fields: Option>, ) -> PyResult { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), false)?; let scanner = FastaScanner::new(fields).map_err(to_py)?; Ok(Self { @@ -396,9 +400,10 @@ impl PyFastaScanner { pub fn read_fastq( py: Python, src: Py, - fields: Option>, + fields: Option>, compressed: bool, ) -> PyResult> { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src, false)?; let scanner = FastqScanner::new(fields).map_err(to_py)?; @@ -445,9 +450,10 @@ pub fn read_fasta( regions: Option>, index: Option>, gzi: Option>, - fields: Option>, + fields: Option>, compressed: bool, ) -> PyResult> { + let fields = resolve_fields(fields, py)?; let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let scanner = FastaScanner::new(fields).map_err(to_py)?; diff --git a/py-oxbow/src/util.rs b/py-oxbow/src/util.rs index f999cb62..39845578 100644 --- a/py-oxbow/src/util.rs +++ b/py-oxbow/src/util.rs @@ -1,9 +1,12 @@ use std::io::BufReader; use std::io::{BufRead, Read, Seek}; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{PyAny, PyString}; +use oxbow::Select; + use noodles::bgzf::gzi::Index as GzIndex; use noodles::bgzf::io::Seek as BgzfSeek; use noodles::bgzf::VirtualPosition; @@ -375,3 +378,33 @@ pub fn partition_from_index( Ok(partition) } + +/// Convert a Python `fields` argument to `Select`. 
+/// +/// Accepts: +/// - `"*"` → `Select::All` +/// - `None` → `Select::Omit` +/// - `list[str]` → `Select::Some(vec)` +pub fn resolve_fields(fields: Option>, py: Python) -> PyResult> { + match fields { + None => Ok(Select::Omit), + Some(obj) => { + let obj = obj.bind(py); + if let Ok(s) = obj.extract::() { + if s == "*" { + return Ok(Select::All); + } + return Err(PyErr::new::(format!( + "Invalid fields specifier '{}'. Use '*' for all fields.", + s + ))); + } + if let Ok(list) = obj.extract::>() { + return Ok(Select::Some(list)); + } + Err(PyErr::new::( + "fields must be '*', None, or a list of field names", + )) + } + } +} diff --git a/py-oxbow/src/variant.rs b/py-oxbow/src/variant.rs index ced3c45a..10ee3ec8 100644 --- a/py-oxbow/src/variant.rs +++ b/py-oxbow/src/variant.rs @@ -10,7 +10,9 @@ use pyo3_arrow::PySchema; use noodles::core::Region; use crate::error::{err_on_unwind, to_py}; -use crate::util::{pyobject_to_bufreader, resolve_index, PyVirtualPosition, Reader}; +use crate::util::{ + pyobject_to_bufreader, resolve_fields, resolve_index, PyVirtualPosition, Reader, +}; use oxbow::util::batches_to_ipc; use oxbow::util::index::IndexType; use oxbow::variant::{BcfScanner, GenotypeBy, VcfScanner}; @@ -29,12 +31,17 @@ use oxbow::variant::{BcfScanner, GenotypeBy, VcfScanner}; /// Names of the INFO fields to project. /// genotype_fields : list[str], optional /// Names of the sample-specific genotype fields to project. -/// samples : list[str], optional -/// Names of the samples to include in the genotype fields. /// genotype_by : Literal["sample", "field"], optional [default: "sample"] /// How to project the genotype fields. If "sample", the columns /// correspond to the samples. If "field", the columns correspond to /// the genotype fields. +/// samples : list[str] or None, optional [default: None] +/// Names of the samples to include in the genotype output. 
``"*"`` for +/// all samples, a list to select specific samples, or ``None`` to omit +/// all sample genotype data. +/// samples_nested : bool, optional [default: False] +/// Whether to nest sample genotype data under a single ``"samples"`` struct +/// column. #[pyclass(module = "oxbow.oxbow")] pub struct PyVcfScanner { src: Py, @@ -46,18 +53,18 @@ pub struct PyVcfScanner { #[pymethods] impl PyVcfScanner { #[new] - #[pyo3(signature = (src, compressed=false, fields=None, info_fields=None, genotype_fields=None, samples=None, genotype_by=None, unnest_samples=true))] + #[pyo3(signature = (src, compressed=false, fields=None, info_fields=None, genotype_fields=None, genotype_by=None, samples=None,samples_nested=false))] #[allow(clippy::too_many_arguments)] fn new( py: Python, src: Py, compressed: bool, - fields: Option>, - info_fields: Option>, - genotype_fields: Option>, - samples: Option>, + fields: Option>, + info_fields: Option>, + genotype_fields: Option>, genotype_by: Option, - unnest_samples: bool, + samples: Option>, + samples_nested: bool, ) -> PyResult { let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::vcf::io::Reader::new(reader); @@ -66,12 +73,12 @@ impl PyVcfScanner { let gt_by = resolve_genotype_by(genotype_by)?; let scanner = VcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_fields(fields, py)?, + resolve_fields(info_fields, py)?, + resolve_fields(genotype_fields, py)?, gt_by, - Some(unnest_samples), + resolve_fields(samples, py)?, + Some(samples_nested), ) .map_err(to_py)?; Ok(Self { @@ -107,7 +114,7 @@ impl PyVcfScanner { GenotypeBy::Field => "field", }; kwargs.set_item("genotype_by", gt_by)?; - kwargs.set_item("unnest_samples", model.unnest_samples())?; + kwargs.set_item("samples_nested", model.samples_nested())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -392,12 +399,17 @@ impl PyVcfScanner { /// Names of the INFO fields to project. 
/// genotype_fields : list[str], optional /// Names of the sample-specific genotype fields to project. -/// samples : list[str], optional -/// Names of the samples to include in the genotype fields. /// genotype_by : Literal["sample", "field"], optional [default: "sample"] /// How to project the genotype fields. If "sample", the columns /// correspond to the samples. If "field", the columns correspond to /// the genotype fields. +/// samples : list[str] or None, optional [default: None] +/// Names of the samples to include in the genotype output. ``"*"`` for +/// all samples, a list to select specific samples, or ``None`` to omit +/// all sample genotype data. +/// samples_nested : bool, optional [default: False] +/// Whether to nest sample genotype data under a single ``"samples"`` struct +/// column. #[pyclass(module = "oxbow.oxbow")] pub struct PyBcfScanner { src: Py, @@ -409,18 +421,18 @@ pub struct PyBcfScanner { #[pymethods] impl PyBcfScanner { #[new] - #[pyo3(signature = (src, compressed=true, fields=None, info_fields=None, genotype_fields=None, samples=None, genotype_by=None, unnest_samples=true))] + #[pyo3(signature = (src, compressed=true, fields=None, info_fields=None, genotype_fields=None, genotype_by=None, samples=None, samples_nested=false))] #[allow(clippy::too_many_arguments)] fn new( py: Python, src: Py, compressed: bool, - fields: Option>, - info_fields: Option>, - genotype_fields: Option>, - samples: Option>, + fields: Option>, + info_fields: Option>, + genotype_fields: Option>, genotype_by: Option, - unnest_samples: bool, + samples: Option>, + samples_nested: bool, ) -> PyResult { let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; let mut fmt_reader = noodles::bcf::io::Reader::from(reader); @@ -429,12 +441,12 @@ impl PyBcfScanner { let gt_by = resolve_genotype_by(genotype_by)?; let scanner = BcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_fields(fields, py)?, + 
resolve_fields(info_fields, py)?, + resolve_fields(genotype_fields, py)?, gt_by, - Some(unnest_samples), + resolve_fields(samples, py)?, + Some(samples_nested), ) .map_err(to_py)?; Ok(Self { @@ -470,7 +482,7 @@ impl PyBcfScanner { GenotypeBy::Field => "field", }; kwargs.set_item("genotype_by", gt_by)?; - kwargs.set_item("unnest_samples", model.unnest_samples())?; + kwargs.set_item("samples_nested", model.samples_nested())?; Ok((args.into_py_any(py)?, kwargs.into_py_any(py)?)) } @@ -768,12 +780,14 @@ fn resolve_genotype_by(genotype_by: Option) -> PyResult) -> PyResult, region: Option, index: Option>, - fields: Option>, - info_fields: Option>, - genotype_fields: Option>, - samples: Option>, + fields: Option>, + info_fields: Option>, + genotype_fields: Option>, genotype_by: Option, + samples: Option>, + samples_nested: bool, compressed: bool, ) -> PyResult> { let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; @@ -803,12 +818,12 @@ pub fn read_vcf( let genotype_by = resolve_genotype_by(genotype_by)?; let scanner = VcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_fields(fields, py)?, + resolve_fields(info_fields, py)?, + resolve_fields(genotype_fields, py)?, genotype_by, - None, + resolve_fields(samples, py)?, + Some(samples_nested), ) .map_err(to_py)?; @@ -865,12 +880,14 @@ pub fn read_vcf( /// Names of the INFO fields to project. /// genotype_fields : list[str], optional /// Names of the sample-specific genotype fields to project. -/// samples : list[str], optional -/// Names of the samples to include in the genotype fields. /// genotype_by : Literal["sample", "field"], optional [default: "sample"] /// How to project the genotype fields. If "sample", the columns /// correspond to the samples. If "field", the columns correspond to /// the genotype fields. +/// samples : list[str], optional +/// Names of the samples to include in the genotype fields. 
+/// samples_nested : bool, optional [default: False] +/// Whether to nest the sample-specific genotype fields under a "samples" struct column. /// compressed : bool, optional [default: True] /// Whether the source is BGZF-compressed. /// @@ -879,18 +896,19 @@ pub fn read_vcf( /// bytes /// Arrow IPC #[pyfunction] -#[pyo3(signature = (src, region=None, index=None, fields=None, info_fields=None, genotype_fields=None, samples=None, genotype_by=None, compressed=true))] +#[pyo3(signature = (src, region=None, index=None, fields=None, info_fields=None, genotype_fields=None, genotype_by=None, samples=None, samples_nested=false, compressed=true))] #[allow(clippy::too_many_arguments)] pub fn read_bcf( py: Python, src: Py, region: Option, index: Option>, - fields: Option>, - info_fields: Option>, - genotype_fields: Option>, - samples: Option>, + fields: Option>, + info_fields: Option>, + genotype_fields: Option>, genotype_by: Option, + samples: Option>, + samples_nested: bool, compressed: bool, ) -> PyResult> { let reader = pyobject_to_bufreader(py, src.clone_ref(py), compressed)?; @@ -900,12 +918,12 @@ pub fn read_bcf( let genotype_by = resolve_genotype_by(genotype_by)?; let scanner = BcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_fields(fields, py)?, + resolve_fields(info_fields, py)?, + resolve_fields(genotype_fields, py)?, genotype_by, - None, + resolve_fields(samples, py)?, + Some(samples_nested), ) .map_err(to_py)?; diff --git a/py-oxbow/tests/manifests/test_alignment.TestBamFile.test_batches.yaml b/py-oxbow/tests/manifests/test_alignment.TestBamFile.test_batches.yaml index 4871e50e..6a7542d2 100644 --- a/py-oxbow/tests/manifests/test_alignment.TestBamFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_alignment.TestBamFile.test_batches.yaml @@ -1,258 +1,5 @@ fields=None: - batch-00: - cigar: - - 50M - - 75M - - 75M - - 75M - - 50M - - 75M - - 50M - - 50M - - 75M - - 75M - end: - - 10591 - - 10620 - - 946531 - - 1014134 
- - 197006 - - 197032 - - 197010 - - 503896 - - 586259 - - 587635 - flag: - - 163 - - 16 - - 16 - - 16 - - 83 - - 16 - - 163 - - 161 - - 16 - - 0 - mapq: - - 0 - - 16 - - 0 - - 37 - - 60 - - 37 - - 60 - - 0 - - 0 - - 0 - pnext: - - 10571 - - null - - null - - null - - 196008 - - null - - 319702 - - 185365552 - - null - - null - pos: - - 10542 - - 10546 - - 946457 - - 1014060 - - 196957 - - 196958 - - 196961 - - 503847 - - 586185 - - 587561 - qname: - - HWI-BRUNOP16X_0001:3:48:4861:11838#0 - - HWI-BRUNOP16X_0001:3:28:6650:168848#0 - - HWI-BRUNOP16X_0001:3:8:20066:88158#0 - - HWI-BRUNOP16X_0001:3:27:10302:58768#0 - - HWI-BRUNOP16X_0001:3:65:3144:143676#0 - - HWI-BRUNOP16X_0001:3:68:13088:156644#0 - - HWI-BRUNOP16X_0001:3:48:3417:101389#0 - - HWI-BRUNOP16X_0001:3:46:17583:95767#0 - - HWI-BRUNOP16X_0001:3:4:7989:14941#0 - - HWI-BRUNOP16X_0001:3:44:11450:50194#0 - qual: - - gggggggggggggggggggggggggeggggR\_[\ggggghggggggggg - - fggggggggdgdggcdfggggfgggggggggggggggggggggggggfggggggggggggggggggggggggggg - - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBd`aed``__U^__]_ggggcggggd]\\\[\]^]]gggggdfcbb - - BBBBBBBBBBBBBBBBcYRcffggfgf_gfg\deegfgfgfcggcggfggggcgggggcgcggfgggggggggeg - - BBBBBBBBBBBBBB_TTSSS[[Obbd`]e^STTTSZW`beTTTTTSSTTT - - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB^bg_`[^\]X`ZZcggggdfgggggggg - - gggggggggggggggggggggggfdaggggggdgggfgdhbe\T`BBBBB - - ddfdfd____dffff]__aeZ]\XZSPSNSSSSSSbbaabZ_``BBBBBB - - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - rname: - - chr1 - - chr1 - - chr1 - - chr1 - - chr3 - - chr3 - - chr3 - - chrX - - chrY - - chrY - rnext: - - chr1 - - null - - null - - null - - chr3 - - null - - chr3 - - chr4 - - null - - null - seq: - - CGAAATCTGTGCAGAGGAGAACGCAGCTCCGCCCTCGCGGTGCTCTCCGG - - ATCTGTGCAGAGGAGAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAGCTCCGCC - - TAGTCCGAGGTCTCCTGAACCTTCCCAAGCAGCTGCTGCACCTGCCGGCAGTAGTTGGCCACCTTGCACTCCCGG - 
- AGCTGAATGGGCAGGTCCCCCAGAAGATCGGCGTGCACGCCTTCCAGCAGCGTCTGGCTGTCCACCCGAGCGGTG - - GTAACGCTCCCGGACCCTGCGCGCCCCCGTCCCGGCTCCCGGCCGGCTCG - - GACCCCCCCGGCCCCCGGCGCCCCCCCGCCCCGCCCCCGGGCGGGCGGGGGGGAGAAGGCGCCCGAGGGGAGGCG - - GCTTACCGGACCCTGCGCGCCCCCGTCCCGGCTCCCGGCCGGCTCGGGGG - - TTTTATTTTTTTTTTTGAGATGGAGTCTCGCTCTTGTCACCGAGGCTGGA - - GTGCGATCTCGGTTCGCTGCAACCTCTGCTTCCCAGGTTCAAGTGATTCTCCGGCCTCAGCCTCCCAAGTAGCNN - - NNTGCAGTGAGCTGAGATTGTGCCACTGCACTCCAGCCTGGGTGACAGAGGTAGACTGTGTCTCAAAAAAAAAAA - tags: - - AM: 0 - MD: 18C31 - NM: 1 - RG: brain_50_fcb - SM: 0 - X0: 3 - X1: 8 - XA: null - XG: 0 - XM: 1 - XO: 0 - XT: '82' - - AM: null - MD: 14C52A7 - NM: 2 - RG: brain_75_fca - SM: null - X0: 1 - X1: 5 - XA: null - XG: 0 - XM: 2 - XO: 0 - XT: '85' - - AM: null - MD: 2T0G5T65 - NM: 3 - RG: brain_75_fca - SM: null - X0: 2 - X1: 0 - XA: 2,-131443143,75M,3; - XG: 0 - XM: 3 - XO: 0 - XT: '82' - - AM: null - MD: 7G1C4A2A57 - NM: 4 - RG: brain_75_fca - SM: null - X0: 1 - X1: 0 - XA: null - XG: 0 - XM: 4 - XO: 0 - XT: '85' - - AM: 37 - MD: 0C0A0G1G0C0T1A41 - NM: 7 - RG: brain_50_fcb - SM: 37 - X0: 1 - X1: 0 - XA: null - XG: 0 - XM: 7 - XO: 0 - XT: '85' - - AM: null - MD: 0A0G0A0G1T0T0A1C1G0A3T4G6T4G1T2C3C3T0C27 - NM: 19 - RG: brain_75_fca - SM: null - X0: 1 - X1: 0 - XA: null - XG: 0 - XM: 19 - XO: 0 - XT: '85' - - AM: 37 - MD: '50' - NM: 0 - RG: brain_50_fcb - SM: 37 - X0: 1 - X1: 0 - XA: null - XG: 0 - XM: 0 - XO: 0 - XT: '85' - - AM: 0 - MD: 4T36C8 - NM: 2 - RG: brain_50_fcb - SM: 0 - X0: 18 - X1: 174 - XA: null - XG: 0 - XM: 2 - XO: 0 - XT: '82' - - AM: null - MD: 4C10A10C16C29T0G0 - NM: 6 - RG: brain_75_fca - SM: null - X0: 2 - X1: 2 - XA: X,-586185,75M,6;3,+196723225,75M,7;19,+13666092,75M,7; - XG: 0 - XM: 6 - XO: 0 - XT: '82' - - AM: null - MD: 0G0G48T0G23 - NM: 4 - RG: brain_75_fca - SM: null - X0: 6 - X1: 54 - XA: null - XG: 0 - XM: 4 - XO: 0 - XT: '82' - tlen: - - 79 - - 0 - - 0 - - 0 - - -999 - - 0 - - 122791 - - 0 - - 0 - - 0 + batch-00: {} fields=['qname', 'rname', 'foo']: 
'Invalid field name: foo' fields=['qname', 'rname', 'mapq']: batch-00: @@ -289,124 +36,3 @@ fields=['qname', 'rname', 'mapq']: - chrX - chrY - chrY - tags: - - AM: 0 - MD: 18C31 - NM: 1 - RG: brain_50_fcb - SM: 0 - X0: 3 - X1: 8 - XA: null - XG: 0 - XM: 1 - XO: 0 - XT: '82' - - AM: null - MD: 14C52A7 - NM: 2 - RG: brain_75_fca - SM: null - X0: 1 - X1: 5 - XA: null - XG: 0 - XM: 2 - XO: 0 - XT: '85' - - AM: null - MD: 2T0G5T65 - NM: 3 - RG: brain_75_fca - SM: null - X0: 2 - X1: 0 - XA: 2,-131443143,75M,3; - XG: 0 - XM: 3 - XO: 0 - XT: '82' - - AM: null - MD: 7G1C4A2A57 - NM: 4 - RG: brain_75_fca - SM: null - X0: 1 - X1: 0 - XA: null - XG: 0 - XM: 4 - XO: 0 - XT: '85' - - AM: 37 - MD: 0C0A0G1G0C0T1A41 - NM: 7 - RG: brain_50_fcb - SM: 37 - X0: 1 - X1: 0 - XA: null - XG: 0 - XM: 7 - XO: 0 - XT: '85' - - AM: null - MD: 0A0G0A0G1T0T0A1C1G0A3T4G6T4G1T2C3C3T0C27 - NM: 19 - RG: brain_75_fca - SM: null - X0: 1 - X1: 0 - XA: null - XG: 0 - XM: 19 - XO: 0 - XT: '85' - - AM: 37 - MD: '50' - NM: 0 - RG: brain_50_fcb - SM: 37 - X0: 1 - X1: 0 - XA: null - XG: 0 - XM: 0 - XO: 0 - XT: '85' - - AM: 0 - MD: 4T36C8 - NM: 2 - RG: brain_50_fcb - SM: 0 - X0: 18 - X1: 174 - XA: null - XG: 0 - XM: 2 - XO: 0 - XT: '82' - - AM: null - MD: 4C10A10C16C29T0G0 - NM: 6 - RG: brain_75_fca - SM: null - X0: 2 - X1: 2 - XA: X,-586185,75M,6;3,+196723225,75M,7;19,+13666092,75M,7; - XG: 0 - XM: 6 - XO: 0 - XT: '82' - - AM: null - MD: 0G0G48T0G23 - NM: 4 - RG: brain_75_fca - SM: null - X0: 6 - X1: 54 - XA: null - XG: 0 - XM: 4 - XO: 0 - XT: '82' diff --git a/py-oxbow/tests/manifests/test_alignment.TestBamFile.test_init_callstack.yaml b/py-oxbow/tests/manifests/test_alignment.TestBamFile.test_init_callstack.yaml index f3d9dc27..431f214c 100644 --- a/py-oxbow/tests/manifests/test_alignment.TestBamFile.test_init_callstack.yaml +++ b/py-oxbow/tests/manifests/test_alignment.TestBamFile.test_init_callstack.yaml @@ -2,26 +2,14 @@ BamFile("data/does-not-exist.bam", compressed=True): |- -> 
oxbow._core.alignment.AlignmentFile.__init__("data/does-not-exist.bam", compressed=True) -> oxbow._core.base.DataSource.__init__("data/does-not-exist.bam", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/does-not-exist.bam' - -> oxbow._core.alignment.AlignmentFile._tag_discovery_kwargs() - <- {'compressed': 'True'} - !! FileNotFoundError("No such file or directory (os error 2)") + <- None BamFile("data/malformed.bam", compressed=True): |- -> oxbow._core.alignment.AlignmentFile.__init__("data/malformed.bam", compressed=True) -> oxbow._core.base.DataSource.__init__("data/malformed.bam", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/malformed.bam' - -> oxbow._core.alignment.AlignmentFile._tag_discovery_kwargs() - <- {'compressed': 'True'} - !! OSError("failed to fill whole buffer") + <- None BamFile("data/sample.bam", compressed=True): |- -> oxbow._core.alignment.AlignmentFile.__init__("data/sample.bam", compressed=True) -> oxbow._core.base.DataSource.__init__("data/sample.bam", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/sample.bam' - -> oxbow._core.alignment.AlignmentFile._tag_discovery_kwargs() - <- {'compressed': 'True'} <- None diff --git a/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_batches.yaml b/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_batches.yaml index fdc86749..c3b98e57 100644 --- a/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_batches.yaml @@ -1,57 +1,5 @@ fields=None: - batch-00: - cigar: - - 6M14N1I5M - - 6H5M - - 9M - end: - - 40 - - 33 - - 45 - flag: - - 0 - - 16 - - 83 - mapq: - - 30 - - 30 - - 30 - pnext: - - null - - null - - 7 - pos: - - 16 - - 29 - - 37 - qname: - - r004 - - r003 - - r001 - qual: - - null - - null - - null - rname: - - chr1 - - chr1 - - chr2 - rnext: - - null - - null - - chr2 - seq: - - ATAGCTCTCAGC - - TAGGC - - CAGCGCCAT - 
tags: - - cF: 3 - - cF: 3 - - cF: 3 - tlen: - - 0 - - 0 - - -39 + batch-00: {} fields=['qname', 'rname', 'foo']: 'Invalid field name: foo' fields=['qname', 'rname', 'mapq']: batch-00: @@ -67,7 +15,3 @@ fields=['qname', 'rname', 'mapq']: - chr1 - chr1 - chr2 - tags: - - cF: 3 - - cF: 3 - - cF: 3 diff --git a/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_init_callstack.yaml b/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_init_callstack.yaml index bd52f910..e54d7276 100644 --- a/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_init_callstack.yaml +++ b/py-oxbow/tests/manifests/test_alignment.TestCramFile.test_init_callstack.yaml @@ -1,33 +1,21 @@ CramFile("data/does-not-exist.cram"): |- -> oxbow._core.alignment.CramFile.__init__("data/does-not-exist.cram") - -> oxbow._core.alignment.AlignmentFile.__init__("data/does-not-exist.cram", compressed=False, fields=None, tag_defs=None, tag_scan_rows=1024, regions=None, index=None, batch_size=131072) + -> oxbow._core.alignment.AlignmentFile.__init__("data/does-not-exist.cram", compressed=False, fields="*", tag_defs=None, regions=None, index=None, batch_size=131072) -> oxbow._core.base.DataSource.__init__("data/does-not-exist.cram", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/does-not-exist.cram' - -> oxbow._core.alignment.CramFile._tag_discovery_kwargs() - <- {'compressed': 'False', 'reference': 'None', 'reference_index': 'None'} - !! FileNotFoundError("No such file or directory (os error 2)") - !! 
FileNotFoundError("No such file or directory (os error 2)") + <- None + <- None CramFile("data/malformed.cram"): |- -> oxbow._core.alignment.CramFile.__init__("data/malformed.cram") - -> oxbow._core.alignment.AlignmentFile.__init__("data/malformed.cram", compressed=False, fields=None, tag_defs=None, tag_scan_rows=1024, regions=None, index=None, batch_size=131072) + -> oxbow._core.alignment.AlignmentFile.__init__("data/malformed.cram", compressed=False, fields="*", tag_defs=None, regions=None, index=None, batch_size=131072) -> oxbow._core.base.DataSource.__init__("data/malformed.cram", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/malformed.cram' - -> oxbow._core.alignment.CramFile._tag_discovery_kwargs() - <- {'compressed': 'False', 'reference': 'None', 'reference_index': 'None'} <- None <- None CramFile("data/sample.cram"): |- -> oxbow._core.alignment.CramFile.__init__("data/sample.cram") - -> oxbow._core.alignment.AlignmentFile.__init__("data/sample.cram", compressed=False, fields=None, tag_defs=None, tag_scan_rows=1024, regions=None, index=None, batch_size=131072) + -> oxbow._core.alignment.AlignmentFile.__init__("data/sample.cram", compressed=False, fields="*", tag_defs=None, regions=None, index=None, batch_size=131072) -> oxbow._core.base.DataSource.__init__("data/sample.cram", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/sample.cram' - -> oxbow._core.alignment.CramFile._tag_discovery_kwargs() - <- {'compressed': 'False', 'reference': 'None', 'reference_index': 'None'} <- None <- None diff --git a/py-oxbow/tests/manifests/test_alignment.TestSamFile.test_batches.yaml b/py-oxbow/tests/manifests/test_alignment.TestSamFile.test_batches.yaml index e11c98e2..2613d884 100644 --- a/py-oxbow/tests/manifests/test_alignment.TestSamFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_alignment.TestSamFile.test_batches.yaml @@ -1,3 +1,53 @@ +fields=*: + batch-00: + cigar: + - 6M14N1I5M + - 6H5M + - 9M + end: + 
- 40 + - 33 + - 45 + flag: + - 0 + - 16 + - 83 + mapq: + - 30 + - 30 + - 30 + pnext: + - null + - null + - 7 + pos: + - 16 + - 29 + - 37 + qname: + - r004 + - r003 + - r001 + qual: + - null + - null + - null + rname: + - chr1 + - chr1 + - chr2 + rnext: + - null + - null + - chr2 + seq: + - ATAGCTCTCAGC + - TAGGC + - CAGCGCCAT + tlen: + - 0 + - 0 + - -39 fields=None: batch-00: cigar: diff --git a/py-oxbow/tests/manifests/test_alignment.TestSamFile.test_init_callstack.yaml b/py-oxbow/tests/manifests/test_alignment.TestSamFile.test_init_callstack.yaml index 0e3ed39c..3cbfd365 100644 --- a/py-oxbow/tests/manifests/test_alignment.TestSamFile.test_init_callstack.yaml +++ b/py-oxbow/tests/manifests/test_alignment.TestSamFile.test_init_callstack.yaml @@ -2,26 +2,14 @@ SamFile("data/does-not-exist.sam"): |- -> oxbow._core.alignment.AlignmentFile.__init__("data/does-not-exist.sam") -> oxbow._core.base.DataSource.__init__("data/does-not-exist.sam", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/does-not-exist.sam' - -> oxbow._core.alignment.AlignmentFile._tag_discovery_kwargs() - <- {'compressed': 'False'} - !! 
FileNotFoundError("No such file or directory (os error 2)") + <- None SamFile("data/malformed.sam"): |- -> oxbow._core.alignment.AlignmentFile.__init__("data/malformed.sam") -> oxbow._core.base.DataSource.__init__("data/malformed.sam", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/malformed.sam' - -> oxbow._core.alignment.AlignmentFile._tag_discovery_kwargs() - <- {'compressed': 'False'} <- None SamFile("data/sample.sam"): |- -> oxbow._core.alignment.AlignmentFile.__init__("data/sample.sam") -> oxbow._core.base.DataSource.__init__("data/sample.sam", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/sample.sam' - -> oxbow._core.alignment.AlignmentFile._tag_discovery_kwargs() - <- {'compressed': 'False'} <- None diff --git a/py-oxbow/tests/manifests/test_bbi.TestBigBedFile.test_batches.yaml b/py-oxbow/tests/manifests/test_bbi.TestBigBedFile.test_batches.yaml index e07042e6..66d1a11a 100644 --- a/py-oxbow/tests/manifests/test_bbi.TestBigBedFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_bbi.TestBigBedFile.test_batches.yaml @@ -305,7 +305,7 @@ fields=('chrom', 'start', 'end'): - 46715593 fields=('nonexistent-field',): 'Field ''nonexistent-field'' not in BED schema. Available: ["chrom", "start", "end", "rest"]' -fields=None: +fields=*: batch-00: chrom: - chr21 @@ -711,3 +711,5 @@ fields=None: - 46270819 - 46442909 - 46715593 +fields=None: + batch-00: {} diff --git a/py-oxbow/tests/manifests/test_bbi.TestBigWigFile.test_batches.yaml b/py-oxbow/tests/manifests/test_bbi.TestBigWigFile.test_batches.yaml index b6926b82..42527b47 100644 --- a/py-oxbow/tests/manifests/test_bbi.TestBigWigFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_bbi.TestBigWigFile.test_batches.yaml @@ -305,7 +305,7 @@ fields=('chrom', 'start', 'end'): - 47650460 fields=('nonexistent-field',): 'Field ''nonexistent-field'' not in BED schema. 
Available: ["chrom", "start", "end", "value"]' -fields=None: +fields=*: batch-00: chrom: - chr21 @@ -711,3 +711,5 @@ fields=None: - 60.0 - 40.0 - 20.0 +fields=None: + batch-00: {} diff --git a/py-oxbow/tests/manifests/test_bed.TestBedFile.test_batches.yaml b/py-oxbow/tests/manifests/test_bed.TestBedFile.test_batches.yaml index 8c4016d0..02dd3335 100644 --- a/py-oxbow/tests/manifests/test_bed.TestBedFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_bed.TestBedFile.test_batches.yaml @@ -203,7 +203,7 @@ fields=('chrom', 'start', 'end'): - 500001 fields=('nonexistent-field',): 'Field ''nonexistent-field'' not in BED schema. Available: ["chrom", "start", "end", "rest"]' -fields=None: +fields=*: batch-00: chrom: - chr1 @@ -473,3 +473,5 @@ fields=None: - 200001 - 350001 - 500001 +fields=None: + batch-00: {} diff --git a/py-oxbow/tests/manifests/test_gxf.TestGffFile.test_batches.yaml b/py-oxbow/tests/manifests/test_gxf.TestGffFile.test_batches.yaml index fc0da731..6ff2d139 100644 --- a/py-oxbow/tests/manifests/test_gxf.TestGffFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_gxf.TestGffFile.test_batches.yaml @@ -1,421 +1,6 @@ fields=('nonexistent-field',): 'Invalid field name: nonexistent-field' fields=('seqid', 'start', 'end'): batch-00: - attributes: - - ID: exon:ENST00000782961.1:2 - Parent: ENST00000782961.1 - ccdsid: null - exon_id: ENSE00004156517.1 - exon_number: '2' - gene_id: ENSG00000229309.3 - gene_name: ENSG00000229309 - gene_type: lncRNA - havana_gene: OTTHUMG00000017146.2 - havana_transcript: null - hgnc_id: null - level: '2' - protein_id: null - tag: - - basic - - Ensembl_canonical - - TAGENE - transcript_id: ENST00000782961.1 - transcript_name: ENST00000782961 - transcript_support_level: null - transcript_type: lncRNA - - ID: CDS:ENST00000498271.1 - Parent: ENST00000498271.1 - ccdsid: CCDS59005.1 - exon_id: ENSE00001878698.1 - exon_number: '40' - gene_id: ENSG00000244731.10 - gene_name: C4A - gene_type: protein_coding - havana_gene: 
OTTHUMG00000031186.6 - havana_transcript: OTTHUMT00000356896.1 - hgnc_id: HGNC:1323 - level: '2' - protein_id: ENSP00000420212.1 - tag: - - RNA_Seq_supported_only - - basic - - GENCODE_Primary - - CCDS - transcript_id: ENST00000498271.1 - transcript_name: C4A-246 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000334011.10:8 - Parent: ENST00000334011.10 - ccdsid: CCDS7318.1 - exon_id: ENSE00001170094.1 - exon_number: '8' - gene_id: ENSG00000138315.13 - gene_name: OIT3 - gene_type: protein_coding - havana_gene: OTTHUMG00000018444.2 - havana_transcript: OTTHUMT00000048596.2 - hgnc_id: HGNC:29953 - level: '2' - protein_id: ENSP00000333900.5 - tag: - - basic - - Ensembl_canonical - - GENCODE_Primary - - MANE_Select - - appris_principal_1 - - CCDS - transcript_id: ENST00000334011.10 - transcript_name: OIT3-201 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000641916.1:4 - Parent: ENST00000641916.1 - ccdsid: null - exon_id: ENSE00003812605.1 - exon_number: '4' - gene_id: ENSG00000290385.2 - gene_name: ENSG00000290385 - gene_type: lncRNA - havana_gene: null - havana_transcript: OTTHUMT00000493599.1 - hgnc_id: null - level: '2' - protein_id: null - tag: null - transcript_id: ENST00000641916.1 - transcript_name: ENST00000641916 - transcript_support_level: null - transcript_type: lncRNA - - ID: CDS:ENST00000629018.4 - Parent: ENST00000629018.4 - ccdsid: null - exon_id: ENSE00000938859.1 - exon_number: '28' - gene_id: ENSG00000172915.20 - gene_name: NBEA - gene_type: protein_coding - havana_gene: OTTHUMG00000016724.2 - havana_transcript: null - hgnc_id: HGNC:7648 - level: '2' - protein_id: ENSP00000486239.3 - tag: - - RNA_Seq_supported_only - - mRNA_start_NF - - cds_start_NF - transcript_id: ENST00000629018.4 - transcript_name: NBEA-207 - transcript_support_level: '5' - transcript_type: protein_coding - - ID: UTR5:ENST00000674645.1 - Parent: ENST00000674645.1 - ccdsid: CCDS56266.1 - exon_id: 
ENSE00000967215.3 - exon_number: '2' - gene_id: ENSG00000114354.15 - gene_name: TFG - gene_type: protein_coding - havana_gene: OTTHUMG00000159085.4 - havana_transcript: null - hgnc_id: HGNC:11758 - level: '2' - protein_id: ENSP00000501892.1 - tag: - - basic - - GENCODE_Primary - - appris_alternative_1 - - CCDS - transcript_id: ENST00000674645.1 - transcript_name: TFG-212 - transcript_support_level: null - transcript_type: protein_coding - - ID: exon:ENST00000497536.1:1 - Parent: ENST00000497536.1 - ccdsid: null - exon_id: ENSE00003569641.1 - exon_number: '1' - gene_id: ENSG00000130414.13 - gene_name: NDUFA10 - gene_type: protein_coding - havana_gene: OTTHUMG00000133350.5 - havana_transcript: OTTHUMT00000326145.1 - hgnc_id: HGNC:7684 - level: '2' - protein_id: null - tag: null - transcript_id: ENST00000497536.1 - transcript_name: NDUFA10-213 - transcript_support_level: '2' - transcript_type: retained_intron - - ID: ENST00000558100.1 - Parent: ENSG00000069956.14 - ccdsid: null - exon_id: null - exon_number: null - gene_id: ENSG00000069956.14 - gene_name: MAPK6 - gene_type: protein_coding - havana_gene: OTTHUMG00000131891.4 - havana_transcript: OTTHUMT00000419352.1 - hgnc_id: HGNC:6879 - level: '1' - protein_id: null - tag: - - exp_conf - transcript_id: ENST00000558100.1 - transcript_name: MAPK6-203 - transcript_support_level: '2' - transcript_type: protein_coding_CDS_not_defined - - ID: ENST00000728222.1 - Parent: ENSG00000259225.10 - ccdsid: null - exon_id: null - exon_number: null - gene_id: ENSG00000259225.10 - gene_name: LINC02345 - gene_type: lncRNA - havana_gene: OTTHUMG00000172364.6 - havana_transcript: null - hgnc_id: HGNC:53267 - level: '2' - protein_id: null - tag: - - TAGENE - transcript_id: ENST00000728222.1 - transcript_name: LINC02345-220 - transcript_support_level: null - transcript_type: lncRNA - - ID: CDS:ENST00000606065.3 - Parent: ENST00000606065.3 - ccdsid: null - exon_id: ENSE00003643788.1 - exon_number: '1' - gene_id: ENSG00000099804.9 - 
gene_name: CDC34 - gene_type: protein_coding - havana_gene: OTTHUMG00000180558.11 - havana_transcript: OTTHUMT00000471238.3 - hgnc_id: HGNC:1734 - level: '2' - protein_id: ENSP00000475610.1 - tag: - - mRNA_start_NF - - cds_start_NF - transcript_id: ENST00000606065.3 - transcript_name: CDC34-205 - transcript_support_level: '5' - transcript_type: protein_coding - - ID: exon:ENST00000398743.6:2 - Parent: ENST00000398743.6 - ccdsid: CCDS13801.1 - exon_id: ENSE00001534682.1 - exon_number: '2' - gene_id: ENSG00000185686.18 - gene_name: PRAME - gene_type: protein_coding - havana_gene: OTTHUMG00000151172.4 - havana_transcript: OTTHUMT00000321642.2 - hgnc_id: HGNC:9336 - level: '2' - protein_id: ENSP00000381728.2 - tag: - - basic - - appris_principal_1 - - CCDS - transcript_id: ENST00000398743.6 - transcript_name: PRAME-202 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000592304.1:2 - Parent: ENST00000592304.1 - ccdsid: null - exon_id: ENSE00002735216.1 - exon_number: '2' - gene_id: ENSG00000188554.15 - gene_name: NBR1 - gene_type: protein_coding - havana_gene: OTTHUMG00000180878.2 - havana_transcript: OTTHUMT00000453462.1 - hgnc_id: HGNC:6746 - level: '2' - protein_id: null - tag: null - transcript_id: ENST00000592304.1 - transcript_name: NBR1-207 - transcript_support_level: '2' - transcript_type: retained_intron - - ID: exon:ENST00000589170.5:5 - Parent: ENST00000589170.5 - ccdsid: null - exon_id: ENSE00002793715.1 - exon_number: '5' - gene_id: ENSG00000184828.10 - gene_name: ZBTB7C - gene_type: protein_coding - havana_gene: OTTHUMG00000180322.3 - havana_transcript: OTTHUMT00000450756.1 - hgnc_id: HGNC:31700 - level: '2' - protein_id: ENSP00000467338.1 - tag: - - alternative_5_UTR - - mRNA_end_NF - - cds_end_NF - transcript_id: ENST00000589170.5 - transcript_name: ZBTB7C-215 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000016913.8:3 - Parent: ENST00000016913.8 - ccdsid: CCDS7988.1 - exon_id: 
ENSE00000720002.1 - exon_number: '3' - gene_id: ENSG00000071203.9 - gene_name: MS4A12 - gene_type: protein_coding - havana_gene: OTTHUMG00000165364.2 - havana_transcript: OTTHUMT00000383627.1 - hgnc_id: HGNC:13370 - level: '2' - protein_id: ENSP00000016913.4 - tag: - - basic - - Ensembl_canonical - - GENCODE_Primary - - MANE_Select - - appris_principal_1 - - CCDS - transcript_id: ENST00000016913.8 - transcript_name: MS4A12-201 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000379937.6:17 - Parent: ENST00000379937.6 - ccdsid: CCDS55952.1 - exon_id: ENSE00001883333.1 - exon_number: '17' - gene_id: ENSG00000185024.18 - gene_name: BRF1 - gene_type: protein_coding - havana_gene: OTTHUMG00000029884.9 - havana_transcript: OTTHUMT00000408726.1 - hgnc_id: HGNC:11551 - level: '2' - protein_id: ENSP00000369269.2 - tag: - - basic - - GENCODE_Primary - - CCDS - transcript_id: ENST00000379937.6 - transcript_name: BRF1-202 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: UTR3:ENST00000589807.1 - Parent: ENST00000589807.1 - ccdsid: null - exon_id: ENSE00003603860.1 - exon_number: '9' - gene_id: ENSG00000267120.3 - gene_name: ENSG00000267120 - gene_type: protein_coding - havana_gene: OTTHUMG00000182065.2 - havana_transcript: OTTHUMT00000459084.2 - hgnc_id: null - level: '2' - protein_id: ENSP00000472696.1 - tag: - - basic - - Ensembl_canonical - - GENCODE_Primary - - appris_principal_1 - - readthrough_transcript - transcript_id: ENST00000589807.1 - transcript_name: ENST00000589807 - transcript_support_level: '2' - transcript_type: nonsense_mediated_decay - - ID: CDS:ENST00000420138.5 - Parent: ENST00000420138.5 - ccdsid: null - exon_id: ENSE00003542120.1 - exon_number: '7' - gene_id: ENSG00000063660.9 - gene_name: GPC1 - gene_type: protein_coding - havana_gene: OTTHUMG00000133349.7 - havana_transcript: OTTHUMT00000325970.2 - hgnc_id: HGNC:4449 - level: '2' - protein_id: ENSP00000415077.2 - tag: - - basic - 
transcript_id: ENST00000420138.5 - transcript_name: GPC1-202 - transcript_support_level: '5' - transcript_type: protein_coding - - ID: exon:ENST00000626609.2:1 - Parent: ENST00000626609.2 - ccdsid: null - exon_id: ENSE00001748881.1 - exon_number: '1' - gene_id: ENSG00000293616.2 - gene_name: PCBP1-AS1 - gene_type: lncRNA - havana_gene: OTTHUMG00000153728.26 - havana_transcript: OTTHUMT00000480846.2 - hgnc_id: HGNC:42948 - level: '2' - protein_id: null - tag: - - nested_454_RNA_Seq_supported - transcript_id: ENST00000626609.2 - transcript_name: PCBP1-AS1-307 - transcript_support_level: '5' - transcript_type: lncRNA - - ID: exon:ENST00000374315.1:4 - Parent: ENST00000374315.1 - ccdsid: CCDS41283.1 - exon_id: ENSE00001508527.1 - exon_number: '4' - gene_id: ENSG00000162430.18 - gene_name: SELENON - gene_type: protein_coding - havana_gene: OTTHUMG00000007375.5 - havana_transcript: OTTHUMT00000019315.2 - hgnc_id: HGNC:15999 - level: '2' - protein_id: ENSP00000363434.1 - tag: - - basic - - GENCODE_Primary - - appris_principal_1 - - CCDS - - seleno - transcript_id: ENST00000374315.1 - transcript_name: SELENON-203 - transcript_support_level: '5' - transcript_type: protein_coding - - ID: exon:ENST00000376991.6:2 - Parent: ENST00000376991.6 - ccdsid: null - exon_id: ENSE00001190268.1 - exon_number: '2' - gene_id: ENSG00000175550.8 - gene_name: DRAP1 - gene_type: protein_coding - havana_gene: OTTHUMG00000166723.5 - havana_transcript: OTTHUMT00000391203.1 - hgnc_id: HGNC:3019 - level: '2' - protein_id: ENSP00000366190.2 - tag: - - not_organism_supported - - basic - - GENCODE_Primary - transcript_id: ENST00000376991.6 - transcript_name: DRAP1-202 - transcript_support_level: '5' - transcript_type: protein_coding end: - 81326191 - 32002540 @@ -480,587 +65,4 @@ fields=('seqid', 'start', 'end'): - 25808580 - 65919780 fields=None: - batch-00: - attributes: - - ID: exon:ENST00000782961.1:2 - Parent: ENST00000782961.1 - ccdsid: null - exon_id: ENSE00004156517.1 - exon_number: '2' - 
gene_id: ENSG00000229309.3 - gene_name: ENSG00000229309 - gene_type: lncRNA - havana_gene: OTTHUMG00000017146.2 - havana_transcript: null - hgnc_id: null - level: '2' - protein_id: null - tag: - - basic - - Ensembl_canonical - - TAGENE - transcript_id: ENST00000782961.1 - transcript_name: ENST00000782961 - transcript_support_level: null - transcript_type: lncRNA - - ID: CDS:ENST00000498271.1 - Parent: ENST00000498271.1 - ccdsid: CCDS59005.1 - exon_id: ENSE00001878698.1 - exon_number: '40' - gene_id: ENSG00000244731.10 - gene_name: C4A - gene_type: protein_coding - havana_gene: OTTHUMG00000031186.6 - havana_transcript: OTTHUMT00000356896.1 - hgnc_id: HGNC:1323 - level: '2' - protein_id: ENSP00000420212.1 - tag: - - RNA_Seq_supported_only - - basic - - GENCODE_Primary - - CCDS - transcript_id: ENST00000498271.1 - transcript_name: C4A-246 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000334011.10:8 - Parent: ENST00000334011.10 - ccdsid: CCDS7318.1 - exon_id: ENSE00001170094.1 - exon_number: '8' - gene_id: ENSG00000138315.13 - gene_name: OIT3 - gene_type: protein_coding - havana_gene: OTTHUMG00000018444.2 - havana_transcript: OTTHUMT00000048596.2 - hgnc_id: HGNC:29953 - level: '2' - protein_id: ENSP00000333900.5 - tag: - - basic - - Ensembl_canonical - - GENCODE_Primary - - MANE_Select - - appris_principal_1 - - CCDS - transcript_id: ENST00000334011.10 - transcript_name: OIT3-201 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000641916.1:4 - Parent: ENST00000641916.1 - ccdsid: null - exon_id: ENSE00003812605.1 - exon_number: '4' - gene_id: ENSG00000290385.2 - gene_name: ENSG00000290385 - gene_type: lncRNA - havana_gene: null - havana_transcript: OTTHUMT00000493599.1 - hgnc_id: null - level: '2' - protein_id: null - tag: null - transcript_id: ENST00000641916.1 - transcript_name: ENST00000641916 - transcript_support_level: null - transcript_type: lncRNA - - ID: CDS:ENST00000629018.4 - Parent: 
ENST00000629018.4 - ccdsid: null - exon_id: ENSE00000938859.1 - exon_number: '28' - gene_id: ENSG00000172915.20 - gene_name: NBEA - gene_type: protein_coding - havana_gene: OTTHUMG00000016724.2 - havana_transcript: null - hgnc_id: HGNC:7648 - level: '2' - protein_id: ENSP00000486239.3 - tag: - - RNA_Seq_supported_only - - mRNA_start_NF - - cds_start_NF - transcript_id: ENST00000629018.4 - transcript_name: NBEA-207 - transcript_support_level: '5' - transcript_type: protein_coding - - ID: UTR5:ENST00000674645.1 - Parent: ENST00000674645.1 - ccdsid: CCDS56266.1 - exon_id: ENSE00000967215.3 - exon_number: '2' - gene_id: ENSG00000114354.15 - gene_name: TFG - gene_type: protein_coding - havana_gene: OTTHUMG00000159085.4 - havana_transcript: null - hgnc_id: HGNC:11758 - level: '2' - protein_id: ENSP00000501892.1 - tag: - - basic - - GENCODE_Primary - - appris_alternative_1 - - CCDS - transcript_id: ENST00000674645.1 - transcript_name: TFG-212 - transcript_support_level: null - transcript_type: protein_coding - - ID: exon:ENST00000497536.1:1 - Parent: ENST00000497536.1 - ccdsid: null - exon_id: ENSE00003569641.1 - exon_number: '1' - gene_id: ENSG00000130414.13 - gene_name: NDUFA10 - gene_type: protein_coding - havana_gene: OTTHUMG00000133350.5 - havana_transcript: OTTHUMT00000326145.1 - hgnc_id: HGNC:7684 - level: '2' - protein_id: null - tag: null - transcript_id: ENST00000497536.1 - transcript_name: NDUFA10-213 - transcript_support_level: '2' - transcript_type: retained_intron - - ID: ENST00000558100.1 - Parent: ENSG00000069956.14 - ccdsid: null - exon_id: null - exon_number: null - gene_id: ENSG00000069956.14 - gene_name: MAPK6 - gene_type: protein_coding - havana_gene: OTTHUMG00000131891.4 - havana_transcript: OTTHUMT00000419352.1 - hgnc_id: HGNC:6879 - level: '1' - protein_id: null - tag: - - exp_conf - transcript_id: ENST00000558100.1 - transcript_name: MAPK6-203 - transcript_support_level: '2' - transcript_type: protein_coding_CDS_not_defined - - ID: 
ENST00000728222.1 - Parent: ENSG00000259225.10 - ccdsid: null - exon_id: null - exon_number: null - gene_id: ENSG00000259225.10 - gene_name: LINC02345 - gene_type: lncRNA - havana_gene: OTTHUMG00000172364.6 - havana_transcript: null - hgnc_id: HGNC:53267 - level: '2' - protein_id: null - tag: - - TAGENE - transcript_id: ENST00000728222.1 - transcript_name: LINC02345-220 - transcript_support_level: null - transcript_type: lncRNA - - ID: CDS:ENST00000606065.3 - Parent: ENST00000606065.3 - ccdsid: null - exon_id: ENSE00003643788.1 - exon_number: '1' - gene_id: ENSG00000099804.9 - gene_name: CDC34 - gene_type: protein_coding - havana_gene: OTTHUMG00000180558.11 - havana_transcript: OTTHUMT00000471238.3 - hgnc_id: HGNC:1734 - level: '2' - protein_id: ENSP00000475610.1 - tag: - - mRNA_start_NF - - cds_start_NF - transcript_id: ENST00000606065.3 - transcript_name: CDC34-205 - transcript_support_level: '5' - transcript_type: protein_coding - - ID: exon:ENST00000398743.6:2 - Parent: ENST00000398743.6 - ccdsid: CCDS13801.1 - exon_id: ENSE00001534682.1 - exon_number: '2' - gene_id: ENSG00000185686.18 - gene_name: PRAME - gene_type: protein_coding - havana_gene: OTTHUMG00000151172.4 - havana_transcript: OTTHUMT00000321642.2 - hgnc_id: HGNC:9336 - level: '2' - protein_id: ENSP00000381728.2 - tag: - - basic - - appris_principal_1 - - CCDS - transcript_id: ENST00000398743.6 - transcript_name: PRAME-202 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000592304.1:2 - Parent: ENST00000592304.1 - ccdsid: null - exon_id: ENSE00002735216.1 - exon_number: '2' - gene_id: ENSG00000188554.15 - gene_name: NBR1 - gene_type: protein_coding - havana_gene: OTTHUMG00000180878.2 - havana_transcript: OTTHUMT00000453462.1 - hgnc_id: HGNC:6746 - level: '2' - protein_id: null - tag: null - transcript_id: ENST00000592304.1 - transcript_name: NBR1-207 - transcript_support_level: '2' - transcript_type: retained_intron - - ID: exon:ENST00000589170.5:5 - Parent: 
ENST00000589170.5 - ccdsid: null - exon_id: ENSE00002793715.1 - exon_number: '5' - gene_id: ENSG00000184828.10 - gene_name: ZBTB7C - gene_type: protein_coding - havana_gene: OTTHUMG00000180322.3 - havana_transcript: OTTHUMT00000450756.1 - hgnc_id: HGNC:31700 - level: '2' - protein_id: ENSP00000467338.1 - tag: - - alternative_5_UTR - - mRNA_end_NF - - cds_end_NF - transcript_id: ENST00000589170.5 - transcript_name: ZBTB7C-215 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000016913.8:3 - Parent: ENST00000016913.8 - ccdsid: CCDS7988.1 - exon_id: ENSE00000720002.1 - exon_number: '3' - gene_id: ENSG00000071203.9 - gene_name: MS4A12 - gene_type: protein_coding - havana_gene: OTTHUMG00000165364.2 - havana_transcript: OTTHUMT00000383627.1 - hgnc_id: HGNC:13370 - level: '2' - protein_id: ENSP00000016913.4 - tag: - - basic - - Ensembl_canonical - - GENCODE_Primary - - MANE_Select - - appris_principal_1 - - CCDS - transcript_id: ENST00000016913.8 - transcript_name: MS4A12-201 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: exon:ENST00000379937.6:17 - Parent: ENST00000379937.6 - ccdsid: CCDS55952.1 - exon_id: ENSE00001883333.1 - exon_number: '17' - gene_id: ENSG00000185024.18 - gene_name: BRF1 - gene_type: protein_coding - havana_gene: OTTHUMG00000029884.9 - havana_transcript: OTTHUMT00000408726.1 - hgnc_id: HGNC:11551 - level: '2' - protein_id: ENSP00000369269.2 - tag: - - basic - - GENCODE_Primary - - CCDS - transcript_id: ENST00000379937.6 - transcript_name: BRF1-202 - transcript_support_level: '1' - transcript_type: protein_coding - - ID: UTR3:ENST00000589807.1 - Parent: ENST00000589807.1 - ccdsid: null - exon_id: ENSE00003603860.1 - exon_number: '9' - gene_id: ENSG00000267120.3 - gene_name: ENSG00000267120 - gene_type: protein_coding - havana_gene: OTTHUMG00000182065.2 - havana_transcript: OTTHUMT00000459084.2 - hgnc_id: null - level: '2' - protein_id: ENSP00000472696.1 - tag: - - basic - - Ensembl_canonical 
- - GENCODE_Primary - - appris_principal_1 - - readthrough_transcript - transcript_id: ENST00000589807.1 - transcript_name: ENST00000589807 - transcript_support_level: '2' - transcript_type: nonsense_mediated_decay - - ID: CDS:ENST00000420138.5 - Parent: ENST00000420138.5 - ccdsid: null - exon_id: ENSE00003542120.1 - exon_number: '7' - gene_id: ENSG00000063660.9 - gene_name: GPC1 - gene_type: protein_coding - havana_gene: OTTHUMG00000133349.7 - havana_transcript: OTTHUMT00000325970.2 - hgnc_id: HGNC:4449 - level: '2' - protein_id: ENSP00000415077.2 - tag: - - basic - transcript_id: ENST00000420138.5 - transcript_name: GPC1-202 - transcript_support_level: '5' - transcript_type: protein_coding - - ID: exon:ENST00000626609.2:1 - Parent: ENST00000626609.2 - ccdsid: null - exon_id: ENSE00001748881.1 - exon_number: '1' - gene_id: ENSG00000293616.2 - gene_name: PCBP1-AS1 - gene_type: lncRNA - havana_gene: OTTHUMG00000153728.26 - havana_transcript: OTTHUMT00000480846.2 - hgnc_id: HGNC:42948 - level: '2' - protein_id: null - tag: - - nested_454_RNA_Seq_supported - transcript_id: ENST00000626609.2 - transcript_name: PCBP1-AS1-307 - transcript_support_level: '5' - transcript_type: lncRNA - - ID: exon:ENST00000374315.1:4 - Parent: ENST00000374315.1 - ccdsid: CCDS41283.1 - exon_id: ENSE00001508527.1 - exon_number: '4' - gene_id: ENSG00000162430.18 - gene_name: SELENON - gene_type: protein_coding - havana_gene: OTTHUMG00000007375.5 - havana_transcript: OTTHUMT00000019315.2 - hgnc_id: HGNC:15999 - level: '2' - protein_id: ENSP00000363434.1 - tag: - - basic - - GENCODE_Primary - - appris_principal_1 - - CCDS - - seleno - transcript_id: ENST00000374315.1 - transcript_name: SELENON-203 - transcript_support_level: '5' - transcript_type: protein_coding - - ID: exon:ENST00000376991.6:2 - Parent: ENST00000376991.6 - ccdsid: null - exon_id: ENSE00001190268.1 - exon_number: '2' - gene_id: ENSG00000175550.8 - gene_name: DRAP1 - gene_type: protein_coding - havana_gene: OTTHUMG00000166723.5 
- havana_transcript: OTTHUMT00000391203.1 - hgnc_id: HGNC:3019 - level: '2' - protein_id: ENSP00000366190.2 - tag: - - not_organism_supported - - basic - - GENCODE_Primary - transcript_id: ENST00000376991.6 - transcript_name: DRAP1-202 - transcript_support_level: '5' - transcript_type: protein_coding - end: - - 81326191 - - 32002540 - - 72930637 - - 497299 - - 35655749 - - 100713685 - - 240025324 - - 52046124 - - 38043209 - - 535923 - - 22557580 - - 43176676 - - 48338398 - - 60501182 - - 105210588 - - 35740564 - - 240465210 - - 70087015 - - 25808789 - - 65919852 - frame: - - null - - 1 - - null - - null - - 2 - - null - - null - - null - - null - - 0 - - null - - null - - null - - null - - null - - null - - 0 - - null - - null - - null - score: - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - - null - seqid: - - chr13 - - chr6 - - chr10 - - chr1 - - chr13 - - chr3 - - chr2 - - chr15 - - chr15 - - chr19 - - chr22 - - chr17 - - chr18 - - chr11 - - chr14 - - chr19 - - chr2 - - chr2 - - chr1 - - chr11 - source: - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - - HAVANA - start: - - 81326030 - - 32002399 - - 72930538 - - 497210 - - 35655579 - - 100713643 - - 240025227 - - 52019238 - - 38042948 - - 535837 - - 22557545 - - 43175791 - - 48338174 - - 60501045 - - 105209286 - - 35740380 - - 240465077 - - 70086978 - - 25808580 - - 65919780 - strand: - - + - - + - - + - - '-' - - + - - + - - '-' - - + - - + - - + - - '-' - - + - - '-' - - + - - '-' - - '-' - - + - - '-' - - + - - + - type: - - exon - - CDS - - exon - - exon - - CDS - - five_prime_UTR - - exon - - transcript - - transcript - - CDS - - exon - - exon - - exon - - exon - - exon - - three_prime_UTR - - CDS - - exon - - exon - - exon + 
batch-00: {} diff --git a/py-oxbow/tests/manifests/test_gxf.TestGffFile.test_init_callstack.yaml b/py-oxbow/tests/manifests/test_gxf.TestGffFile.test_init_callstack.yaml index 02b7783d..8eb9501a 100644 --- a/py-oxbow/tests/manifests/test_gxf.TestGffFile.test_init_callstack.yaml +++ b/py-oxbow/tests/manifests/test_gxf.TestGffFile.test_init_callstack.yaml @@ -2,20 +2,14 @@ GffFile("data/does-not-exist.gff"): |- -> oxbow._core.gxf.GxfFile.__init__("data/does-not-exist.gff") -> oxbow._core.base.DataSource.__init__("data/does-not-exist.gff", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/does-not-exist.gff' - !! FileNotFoundError("No such file or directory (os error 2)") + <- None GffFile("data/malformed.gff"): |- -> oxbow._core.gxf.GxfFile.__init__("data/malformed.gff") -> oxbow._core.base.DataSource.__init__("data/malformed.gff", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/malformed.gff' - !! OSError("unexpected end of file") + <- None GffFile("data/sample.gff"): |- -> oxbow._core.gxf.GxfFile.__init__("data/sample.gff") -> oxbow._core.base.DataSource.__init__("data/sample.gff", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/sample.gff' <- None diff --git a/py-oxbow/tests/manifests/test_gxf.TestGtfFile.test_batches.yaml b/py-oxbow/tests/manifests/test_gxf.TestGtfFile.test_batches.yaml index 20b7f687..818a3de6 100644 --- a/py-oxbow/tests/manifests/test_gxf.TestGtfFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_gxf.TestGtfFile.test_batches.yaml @@ -1,361 +1,6 @@ fields=('nonexistent-field',): 'Invalid field name: nonexistent-field' fields=('seqid', 'start', 'end'): batch-00: - attributes: - - ccdsid: null - exon_id: ENSE00000688402.1 - exon_number: '2' - gene_id: ENSG00000137177.20 - gene_name: KIF13A - gene_type: protein_coding - havana_gene: OTTHUMG00000014313.9 - havana_transcript: OTTHUMT00000039956.2 - hgnc_id: HGNC:14566 - level: '2' - protein_id: ENSP00000351150.6 - tag: 
- - mRNA_start_NF - - cds_start_NF - transcript_id: ENST00000358380.10 - transcript_name: KIF13A-202 - transcript_support_level: '1' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00003554096.1 - exon_number: '4' - gene_id: ENSG00000028203.19 - gene_name: VEZT - gene_type: protein_coding - havana_gene: OTTHUMG00000170182.4 - havana_transcript: OTTHUMT00000407816.2 - hgnc_id: HGNC:18258 - level: '2' - protein_id: ENSP00000380894.4 - tag: - - mRNA_end_NF - - cds_end_NF - transcript_id: ENST00000397792.8 - transcript_name: VEZT-202 - transcript_support_level: '1' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00001350331.1 - exon_number: '4' - gene_id: ENSG00000135541.22 - gene_name: AHI1 - gene_type: protein_coding - havana_gene: OTTHUMG00000015631.9 - havana_transcript: null - hgnc_id: HGNC:21575 - level: '2' - protein_id: ENSP00000505809.1 - tag: - - alternative_3_UTR - - inferred_transcript_model - transcript_id: ENST00000680840.1 - transcript_name: AHI1-242 - transcript_support_level: null - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: ENSE00003612242.1 - exon_number: '2' - gene_id: ENSG00000153292.16 - gene_name: ADGRF1 - gene_type: protein_coding - havana_gene: OTTHUMG00000014795.3 - havana_transcript: null - hgnc_id: HGNC:18990 - level: '3' - protein_id: ENSP00000283297.5 - tag: - - basic - transcript_id: ENST00000283297.5 - transcript_name: ADGRF1-201 - transcript_support_level: '1' - transcript_type: protein_coding - - ccdsid: null - exon_id: null - exon_number: null - gene_id: ENSG00000248323.9 - gene_name: LUCAT1 - gene_type: lncRNA - havana_gene: OTTHUMG00000162611.16 - havana_transcript: null - hgnc_id: HGNC:48498 - level: '2' - protein_id: null - tag: - - basic - - TAGENE - transcript_id: ENST00000730025.1 - transcript_name: LUCAT1-267 - transcript_support_level: null - transcript_type: lncRNA - - ccdsid: null - exon_id: null - exon_number: null - gene_id: ENSG00000304011.1 - gene_name: 
ENSG00000304011 - gene_type: lncRNA - havana_gene: null - havana_transcript: null - hgnc_id: null - level: '2' - protein_id: null - tag: - - TAGENE - transcript_id: ENST00000798830.1 - transcript_name: ENST00000798830 - transcript_support_level: null - transcript_type: lncRNA - - ccdsid: null - exon_id: ENSE00003729115.1 - exon_number: '12' - gene_id: ENSG00000131044.19 - gene_name: TTLL9 - gene_type: protein_coding - havana_gene: OTTHUMG00000186843.3 - havana_transcript: OTTHUMT00000473883.1 - hgnc_id: HGNC:16118 - level: '2' - protein_id: ENSP00000365086.3 - tag: null - transcript_id: ENST00000375921.6 - transcript_name: TTLL9-202 - transcript_support_level: '1' - transcript_type: nonsense_mediated_decay - - ccdsid: CCDS5467.1 - exon_id: ENSE00003556589.1 - exon_number: '1' - gene_id: ENSG00000256646.8 - gene_name: ENSG00000256646 - gene_type: protein_coding - havana_gene: OTTHUMG00000168259.1 - havana_transcript: OTTHUMT00000398997.1 - hgnc_id: null - level: '2' - protein_id: ENSP00000455744.1 - tag: - - not_organism_supported - - readthrough_transcript - - CCDS - transcript_id: ENST00000442788.5 - transcript_name: ENST00000442788 - transcript_support_level: '5' - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: ENSE00002791736.1 - exon_number: '2' - gene_id: ENSG00000160439.16 - gene_name: RDH13 - gene_type: protein_coding - havana_gene: OTTHUMG00000180478.4 - havana_transcript: OTTHUMT00000451484.2 - hgnc_id: HGNC:19978 - level: '2' - protein_id: null - tag: null - transcript_id: ENST00000593134.1 - transcript_name: RDH13-222 - transcript_support_level: '4' - transcript_type: protein_coding_CDS_not_defined - - ccdsid: null - exon_id: ENSE00004130809.1 - exon_number: '1' - gene_id: ENSG00000226067.8 - gene_name: LINC00623 - gene_type: lncRNA - havana_gene: OTTHUMG00000185016.4 - havana_transcript: null - hgnc_id: HGNC:44252 - level: '2' - protein_id: null - tag: - - TAGENE - transcript_id: ENST00000769594.1 - transcript_name: LINC00623-343 - 
transcript_support_level: null - transcript_type: lncRNA - - ccdsid: CCDS94631.1 - exon_id: ENSE00003612797.1 - exon_number: '5' - gene_id: ENSG00000147099.21 - gene_name: HDAC8 - gene_type: protein_coding - havana_gene: OTTHUMG00000021814.18 - havana_transcript: OTTHUMT00000057204.3 - hgnc_id: HGNC:13315 - level: '2' - protein_id: ENSP00000362685.2 - tag: - - basic - - GENCODE_Primary - - CCDS - transcript_id: ENST00000373583.6 - transcript_name: HDAC8-208 - transcript_support_level: '5' - transcript_type: protein_coding - - ccdsid: CCDS11685.1 - exon_id: ENSE00001373572.1 - exon_number: '8' - gene_id: ENSG00000154265.16 - gene_name: ABCA5 - gene_type: protein_coding - havana_gene: OTTHUMG00000180303.4 - havana_transcript: OTTHUMT00000450654.3 - hgnc_id: HGNC:35 - level: '2' - protein_id: ENSP00000376443.2 - tag: - - basic - - Ensembl_canonical - - GENCODE_Primary - - MANE_Select - - appris_principal_1 - - CCDS - transcript_id: ENST00000392676.8 - transcript_name: ABCA5-201 - transcript_support_level: '1' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00003673013.1 - exon_number: '19' - gene_id: ENSG00000049759.20 - gene_name: NEDD4L - gene_type: protein_coding - havana_gene: OTTHUMG00000179875.12 - havana_transcript: OTTHUMT00000448908.1 - hgnc_id: HGNC:7728 - level: '2' - protein_id: ENSP00000502309.1 - tag: - - RNA_Seq_supported_only - transcript_id: ENST00000674845.1 - transcript_name: NEDD4L-242 - transcript_support_level: null - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: ENSE00003966028.1 - exon_number: '5' - gene_id: ENSG00000171793.17 - gene_name: CTPS1 - gene_type: protein_coding - havana_gene: OTTHUMG00000005712.13 - havana_transcript: null - hgnc_id: HGNC:2519 - level: '2' - protein_id: ENSP00000512402.1 - tag: - - RNA_Seq_supported_only - transcript_id: ENST00000696108.1 - transcript_name: CTPS1-221 - transcript_support_level: null - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: 
ENSE00002070419.1 - exon_number: '1' - gene_id: ENSG00000158987.22 - gene_name: RAPGEF6 - gene_type: protein_coding - havana_gene: OTTHUMG00000162683.7 - havana_transcript: OTTHUMT00000370060.1 - hgnc_id: HGNC:20655 - level: '2' - protein_id: ENSP00000425772.1 - tag: null - transcript_id: ENST00000515170.5 - transcript_name: RAPGEF6-214 - transcript_support_level: '2' - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: ENSE00003980557.1 - exon_number: '7' - gene_id: ENSG00000052126.17 - gene_name: PLEKHA5 - gene_type: protein_coding - havana_gene: OTTHUMG00000167921.3 - havana_transcript: null - hgnc_id: HGNC:30036 - level: '2' - protein_id: null - tag: - - RNA_Seq_supported_only - transcript_id: ENST00000706617.1 - transcript_name: PLEKHA5-236 - transcript_support_level: null - transcript_type: retained_intron - - ccdsid: null - exon_id: ENSE00004218036.1 - exon_number: '1' - gene_id: ENSG00000272690.7 - gene_name: LINC02018 - gene_type: lncRNA - havana_gene: OTTHUMG00000185852.36 - havana_transcript: null - hgnc_id: HGNC:52853 - level: '2' - protein_id: null - tag: - - TAGENE - transcript_id: ENST00000815791.1 - transcript_name: LINC02018-313 - transcript_support_level: null - transcript_type: lncRNA - - ccdsid: CCDS43874.1 - exon_id: ENSE00003692738.1 - exon_number: '14' - gene_id: ENSG00000056586.16 - gene_name: RC3H2 - gene_type: protein_coding - havana_gene: OTTHUMG00000020632.4 - havana_transcript: OTTHUMT00000053966.1 - hgnc_id: HGNC:21461 - level: '2' - protein_id: ENSP00000362774.1 - tag: - - basic - - appris_principal_1 - - CCDS - transcript_id: ENST00000373670.5 - transcript_name: RC3H2-203 - transcript_support_level: '5' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00003527710.1 - exon_number: '8' - gene_id: ENSG00000096093.16 - gene_name: EFHC1 - gene_type: protein_coding - havana_gene: OTTHUMG00000014848.14 - havana_transcript: OTTHUMT00000489944.1 - hgnc_id: HGNC:16406 - level: '2' - protein_id: 
ENSP00000489854.1 - tag: - - RNA_Seq_supported_only - - basic - transcript_id: ENST00000637089.1 - transcript_name: EFHC1-227 - transcript_support_level: '5' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00002899123.1 - exon_number: '5' - gene_id: ENSG00000072958.9 - gene_name: AP1M1 - gene_type: protein_coding - havana_gene: OTTHUMG00000182323.4 - havana_transcript: OTTHUMT00000460505.1 - hgnc_id: HGNC:13667 - level: '2' - protein_id: ENSP00000468015.1 - tag: - - mRNA_start_NF - - cds_start_NF - transcript_id: ENST00000586543.1 - transcript_name: AP1M1-205 - transcript_support_level: '5' - transcript_type: protein_coding end: - 17808930 - 95257239 @@ -419,363 +64,8 @@ fields=('seqid', 'start', 'end'): - 122855184 - 52479037 - 16235231 -fields=None: +fields=*: batch-00: - attributes: - - ccdsid: null - exon_id: ENSE00000688402.1 - exon_number: '2' - gene_id: ENSG00000137177.20 - gene_name: KIF13A - gene_type: protein_coding - havana_gene: OTTHUMG00000014313.9 - havana_transcript: OTTHUMT00000039956.2 - hgnc_id: HGNC:14566 - level: '2' - protein_id: ENSP00000351150.6 - tag: - - mRNA_start_NF - - cds_start_NF - transcript_id: ENST00000358380.10 - transcript_name: KIF13A-202 - transcript_support_level: '1' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00003554096.1 - exon_number: '4' - gene_id: ENSG00000028203.19 - gene_name: VEZT - gene_type: protein_coding - havana_gene: OTTHUMG00000170182.4 - havana_transcript: OTTHUMT00000407816.2 - hgnc_id: HGNC:18258 - level: '2' - protein_id: ENSP00000380894.4 - tag: - - mRNA_end_NF - - cds_end_NF - transcript_id: ENST00000397792.8 - transcript_name: VEZT-202 - transcript_support_level: '1' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00001350331.1 - exon_number: '4' - gene_id: ENSG00000135541.22 - gene_name: AHI1 - gene_type: protein_coding - havana_gene: OTTHUMG00000015631.9 - havana_transcript: null - hgnc_id: HGNC:21575 - level: '2' - protein_id: ENSP00000505809.1 
- tag: - - alternative_3_UTR - - inferred_transcript_model - transcript_id: ENST00000680840.1 - transcript_name: AHI1-242 - transcript_support_level: null - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: ENSE00003612242.1 - exon_number: '2' - gene_id: ENSG00000153292.16 - gene_name: ADGRF1 - gene_type: protein_coding - havana_gene: OTTHUMG00000014795.3 - havana_transcript: null - hgnc_id: HGNC:18990 - level: '3' - protein_id: ENSP00000283297.5 - tag: - - basic - transcript_id: ENST00000283297.5 - transcript_name: ADGRF1-201 - transcript_support_level: '1' - transcript_type: protein_coding - - ccdsid: null - exon_id: null - exon_number: null - gene_id: ENSG00000248323.9 - gene_name: LUCAT1 - gene_type: lncRNA - havana_gene: OTTHUMG00000162611.16 - havana_transcript: null - hgnc_id: HGNC:48498 - level: '2' - protein_id: null - tag: - - basic - - TAGENE - transcript_id: ENST00000730025.1 - transcript_name: LUCAT1-267 - transcript_support_level: null - transcript_type: lncRNA - - ccdsid: null - exon_id: null - exon_number: null - gene_id: ENSG00000304011.1 - gene_name: ENSG00000304011 - gene_type: lncRNA - havana_gene: null - havana_transcript: null - hgnc_id: null - level: '2' - protein_id: null - tag: - - TAGENE - transcript_id: ENST00000798830.1 - transcript_name: ENST00000798830 - transcript_support_level: null - transcript_type: lncRNA - - ccdsid: null - exon_id: ENSE00003729115.1 - exon_number: '12' - gene_id: ENSG00000131044.19 - gene_name: TTLL9 - gene_type: protein_coding - havana_gene: OTTHUMG00000186843.3 - havana_transcript: OTTHUMT00000473883.1 - hgnc_id: HGNC:16118 - level: '2' - protein_id: ENSP00000365086.3 - tag: null - transcript_id: ENST00000375921.6 - transcript_name: TTLL9-202 - transcript_support_level: '1' - transcript_type: nonsense_mediated_decay - - ccdsid: CCDS5467.1 - exon_id: ENSE00003556589.1 - exon_number: '1' - gene_id: ENSG00000256646.8 - gene_name: ENSG00000256646 - gene_type: protein_coding - havana_gene: 
OTTHUMG00000168259.1 - havana_transcript: OTTHUMT00000398997.1 - hgnc_id: null - level: '2' - protein_id: ENSP00000455744.1 - tag: - - not_organism_supported - - readthrough_transcript - - CCDS - transcript_id: ENST00000442788.5 - transcript_name: ENST00000442788 - transcript_support_level: '5' - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: ENSE00002791736.1 - exon_number: '2' - gene_id: ENSG00000160439.16 - gene_name: RDH13 - gene_type: protein_coding - havana_gene: OTTHUMG00000180478.4 - havana_transcript: OTTHUMT00000451484.2 - hgnc_id: HGNC:19978 - level: '2' - protein_id: null - tag: null - transcript_id: ENST00000593134.1 - transcript_name: RDH13-222 - transcript_support_level: '4' - transcript_type: protein_coding_CDS_not_defined - - ccdsid: null - exon_id: ENSE00004130809.1 - exon_number: '1' - gene_id: ENSG00000226067.8 - gene_name: LINC00623 - gene_type: lncRNA - havana_gene: OTTHUMG00000185016.4 - havana_transcript: null - hgnc_id: HGNC:44252 - level: '2' - protein_id: null - tag: - - TAGENE - transcript_id: ENST00000769594.1 - transcript_name: LINC00623-343 - transcript_support_level: null - transcript_type: lncRNA - - ccdsid: CCDS94631.1 - exon_id: ENSE00003612797.1 - exon_number: '5' - gene_id: ENSG00000147099.21 - gene_name: HDAC8 - gene_type: protein_coding - havana_gene: OTTHUMG00000021814.18 - havana_transcript: OTTHUMT00000057204.3 - hgnc_id: HGNC:13315 - level: '2' - protein_id: ENSP00000362685.2 - tag: - - basic - - GENCODE_Primary - - CCDS - transcript_id: ENST00000373583.6 - transcript_name: HDAC8-208 - transcript_support_level: '5' - transcript_type: protein_coding - - ccdsid: CCDS11685.1 - exon_id: ENSE00001373572.1 - exon_number: '8' - gene_id: ENSG00000154265.16 - gene_name: ABCA5 - gene_type: protein_coding - havana_gene: OTTHUMG00000180303.4 - havana_transcript: OTTHUMT00000450654.3 - hgnc_id: HGNC:35 - level: '2' - protein_id: ENSP00000376443.2 - tag: - - basic - - Ensembl_canonical - - GENCODE_Primary - - 
MANE_Select - - appris_principal_1 - - CCDS - transcript_id: ENST00000392676.8 - transcript_name: ABCA5-201 - transcript_support_level: '1' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00003673013.1 - exon_number: '19' - gene_id: ENSG00000049759.20 - gene_name: NEDD4L - gene_type: protein_coding - havana_gene: OTTHUMG00000179875.12 - havana_transcript: OTTHUMT00000448908.1 - hgnc_id: HGNC:7728 - level: '2' - protein_id: ENSP00000502309.1 - tag: - - RNA_Seq_supported_only - transcript_id: ENST00000674845.1 - transcript_name: NEDD4L-242 - transcript_support_level: null - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: ENSE00003966028.1 - exon_number: '5' - gene_id: ENSG00000171793.17 - gene_name: CTPS1 - gene_type: protein_coding - havana_gene: OTTHUMG00000005712.13 - havana_transcript: null - hgnc_id: HGNC:2519 - level: '2' - protein_id: ENSP00000512402.1 - tag: - - RNA_Seq_supported_only - transcript_id: ENST00000696108.1 - transcript_name: CTPS1-221 - transcript_support_level: null - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: ENSE00002070419.1 - exon_number: '1' - gene_id: ENSG00000158987.22 - gene_name: RAPGEF6 - gene_type: protein_coding - havana_gene: OTTHUMG00000162683.7 - havana_transcript: OTTHUMT00000370060.1 - hgnc_id: HGNC:20655 - level: '2' - protein_id: ENSP00000425772.1 - tag: null - transcript_id: ENST00000515170.5 - transcript_name: RAPGEF6-214 - transcript_support_level: '2' - transcript_type: nonsense_mediated_decay - - ccdsid: null - exon_id: ENSE00003980557.1 - exon_number: '7' - gene_id: ENSG00000052126.17 - gene_name: PLEKHA5 - gene_type: protein_coding - havana_gene: OTTHUMG00000167921.3 - havana_transcript: null - hgnc_id: HGNC:30036 - level: '2' - protein_id: null - tag: - - RNA_Seq_supported_only - transcript_id: ENST00000706617.1 - transcript_name: PLEKHA5-236 - transcript_support_level: null - transcript_type: retained_intron - - ccdsid: null - exon_id: ENSE00004218036.1 
- exon_number: '1' - gene_id: ENSG00000272690.7 - gene_name: LINC02018 - gene_type: lncRNA - havana_gene: OTTHUMG00000185852.36 - havana_transcript: null - hgnc_id: HGNC:52853 - level: '2' - protein_id: null - tag: - - TAGENE - transcript_id: ENST00000815791.1 - transcript_name: LINC02018-313 - transcript_support_level: null - transcript_type: lncRNA - - ccdsid: CCDS43874.1 - exon_id: ENSE00003692738.1 - exon_number: '14' - gene_id: ENSG00000056586.16 - gene_name: RC3H2 - gene_type: protein_coding - havana_gene: OTTHUMG00000020632.4 - havana_transcript: OTTHUMT00000053966.1 - hgnc_id: HGNC:21461 - level: '2' - protein_id: ENSP00000362774.1 - tag: - - basic - - appris_principal_1 - - CCDS - transcript_id: ENST00000373670.5 - transcript_name: RC3H2-203 - transcript_support_level: '5' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00003527710.1 - exon_number: '8' - gene_id: ENSG00000096093.16 - gene_name: EFHC1 - gene_type: protein_coding - havana_gene: OTTHUMG00000014848.14 - havana_transcript: OTTHUMT00000489944.1 - hgnc_id: HGNC:16406 - level: '2' - protein_id: ENSP00000489854.1 - tag: - - RNA_Seq_supported_only - - basic - transcript_id: ENST00000637089.1 - transcript_name: EFHC1-227 - transcript_support_level: '5' - transcript_type: protein_coding - - ccdsid: null - exon_id: ENSE00002899123.1 - exon_number: '5' - gene_id: ENSG00000072958.9 - gene_name: AP1M1 - gene_type: protein_coding - havana_gene: OTTHUMG00000182323.4 - havana_transcript: OTTHUMT00000460505.1 - hgnc_id: HGNC:13667 - level: '2' - protein_id: ENSP00000468015.1 - tag: - - mRNA_start_NF - - cds_start_NF - transcript_id: ENST00000586543.1 - transcript_name: AP1M1-205 - transcript_support_level: '5' - transcript_type: protein_coding end: - 17808930 - 95257239 @@ -944,3 +234,5 @@ fields=None: - CDS - exon - UTR +fields=None: + batch-00: {} diff --git a/py-oxbow/tests/manifests/test_gxf.TestGtfFile.test_init_callstack.yaml 
b/py-oxbow/tests/manifests/test_gxf.TestGtfFile.test_init_callstack.yaml index 7581c6ff..f82fbc6b 100644 --- a/py-oxbow/tests/manifests/test_gxf.TestGtfFile.test_init_callstack.yaml +++ b/py-oxbow/tests/manifests/test_gxf.TestGtfFile.test_init_callstack.yaml @@ -2,20 +2,14 @@ GtfFile("data/does-not-exist.gtf"): |- -> oxbow._core.gxf.GxfFile.__init__("data/does-not-exist.gtf") -> oxbow._core.base.DataSource.__init__("data/does-not-exist.gtf", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/does-not-exist.gtf' - !! FileNotFoundError("No such file or directory (os error 2)") + <- None GtfFile("data/malformed.gtf"): |- -> oxbow._core.gxf.GxfFile.__init__("data/malformed.gtf") -> oxbow._core.base.DataSource.__init__("data/malformed.gtf", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/malformed.gtf' <- None GtfFile("data/sample.gtf"): |- -> oxbow._core.gxf.GxfFile.__init__("data/sample.gtf") -> oxbow._core.base.DataSource.__init__("data/sample.gtf", None, 131072) <- None - -> oxbow._core.base.DataSource._source - <- 'data/sample.gtf' <- None diff --git a/py-oxbow/tests/manifests/test_scanners.TestPyBedScanner.test_scan_invalid_field.yaml b/py-oxbow/tests/manifests/test_scanners.TestPyBedScanner.test_scan_invalid_field.yaml index 814008dd..fe7aa5fc 100644 --- a/py-oxbow/tests/manifests/test_scanners.TestPyBedScanner.test_scan_invalid_field.yaml +++ b/py-oxbow/tests/manifests/test_scanners.TestPyBedScanner.test_scan_invalid_field.yaml @@ -1,2 +1 @@ -'Unknown columns: ["nonexistent-field"]. Available: ["chrom", "start", "end", "name", - "score", "strand", "thickStart", "thickEnd", "itemRgb"]' +'Unknown columns: ["nonexistent-field"]. 
Available: []' diff --git a/py-oxbow/tests/manifests/test_scanners.TestPyBigBedScanner.test_scan_with_autosql.yaml b/py-oxbow/tests/manifests/test_scanners.TestPyBigBedScanner.test_scan_with_autosql.yaml index b826308b..e0fe5af9 100644 --- a/py-oxbow/tests/manifests/test_scanners.TestPyBigBedScanner.test_scan_with_autosql.yaml +++ b/py-oxbow/tests/manifests/test_scanners.TestPyBigBedScanner.test_scan_with_autosql.yaml @@ -1,432 +1,9 @@ -batch_size=1: - blockCount: - - 3 - blockSizes: - - - 359 - - 109 - - 1189 - cdsEndStat: - - none - cdsStartStat: - - none - chrom: - - chr1 - chromStarts: - - - 0 - - 744 - - 1352 - end: - - 14409 - exonFrames: - - - -1 - - -1 - - -1 - geneName: - - ENST00000456328.2 - geneName2: - - DDX11L1 - geneType: - - none - name: - - ENST00000456328.2 - name2: - - DDX11L1 - reserved: - - null - score: - - 1000 - start: - - 11868 - strand: - - + - thickEnd: - - 11868 - thickStart: - - 11868 - type: - - none -batch_size=2: - blockCount: - - 3 - - 11 - blockSizes: - - - 359 - - 109 - - 1189 - - - 98 - - 34 - - 152 - - 159 - - 198 - - 136 - - 137 - - 147 - - 99 - - 154 - - 37 - cdsEndStat: - - none - - none - cdsStartStat: - - none - - none - chrom: - - chr1 - - chr1 - chromStarts: - - - 0 - - 744 - - 1352 - - - 0 - - 601 - - 1392 - - 2203 - - 2454 - - 2829 - - 3202 - - 3511 - - 3864 - - 10334 - - 15130 - end: - - 14409 - - 29570 - exonFrames: - - - -1 - - -1 - - -1 - - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - geneName: - - ENST00000456328.2 - - ENST00000488147.1 - geneName2: - - DDX11L1 - - WASH7P - geneType: - - none - - none - name: - - ENST00000456328.2 - - ENST00000488147.1 - name2: - - DDX11L1 - - WASH7P - reserved: - - null - - null - score: - - 1000 - - 1000 - start: - - 11868 - - 14403 - strand: - - + - - '-' - thickEnd: - - 11868 - - 14403 - thickStart: - - 11868 - - 14403 - type: - - none - - none -batch_size=3: - blockCount: - - 3 - - 11 - - 1 - blockSizes: - - - 359 - - 109 - - 1189 - - - 98 - - 
34 - - 152 - - 159 - - 198 - - 136 - - 137 - - 147 - - 99 - - 154 - - 37 - - - 68 - cdsEndStat: - - none - - none - - none - cdsStartStat: - - none - - none - - none - chrom: - - chr1 - - chr1 - - chr1 - chromStarts: - - - 0 - - 744 - - 1352 - - - 0 - - 601 - - 1392 - - 2203 - - 2454 - - 2829 - - 3202 - - 3511 - - 3864 - - 10334 - - 15130 - - - 0 - end: - - 14409 - - 29570 - - 17436 - exonFrames: - - - -1 - - -1 - - -1 - - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - - -1 - geneName: - - ENST00000456328.2 - - ENST00000488147.1 - - ENST00000619216.1 - geneName2: - - DDX11L1 - - WASH7P - - MIR6859-2 - geneType: - - none - - none - - none - name: - - ENST00000456328.2 - - ENST00000488147.1 - - ENST00000619216.1 - name2: - - DDX11L1 - - WASH7P - - MIR6859-2 - reserved: - - null - - null - - null - score: - - 1000 - - 1000 - - 1000 - start: - - 11868 - - 14403 - - 17368 - strand: - - + - - '-' - - '-' - thickEnd: - - 11868 - - 14403 - - 17368 - thickStart: - - 11868 - - 14403 - - 17368 - type: - - none - - none - - none -batch_size=4: - blockCount: - - 3 - - 11 - - 1 - - 3 - blockSizes: - - - 359 - - 109 - - 1189 - - - 98 - - 34 - - 152 - - 159 - - 198 - - 136 - - 137 - - 147 - - 99 - - 154 - - 37 - - - 68 - - - 486 - - 104 - - 122 - cdsEndStat: - - none - - none - - none - - none - cdsStartStat: - - none - - none - - none - - none - chrom: - - chr1 - - chr1 - - chr1 - - chr1 - chromStarts: - - - 0 - - 744 - - 1352 - - - 0 - - 601 - - 1392 - - 2203 - - 2454 - - 2829 - - 3202 - - 3511 - - 3864 - - 10334 - - 15130 - - - 0 - - - 0 - - 1010 - - 1422 - end: - - 14409 - - 29570 - - 17436 - - 31097 - exonFrames: - - - -1 - - -1 - - -1 - - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - -1 - - - -1 - - - -1 - - -1 - - -1 - geneName: - - ENST00000456328.2 - - ENST00000488147.1 - - ENST00000619216.1 - - ENST00000473358.1 - geneName2: - - DDX11L1 - - WASH7P - - MIR6859-2 - - MIR1302-11 - geneType: - - none - - none - - none 
- - none - name: - - ENST00000456328.2 - - ENST00000488147.1 - - ENST00000619216.1 - - ENST00000473358.1 - name2: - - DDX11L1 - - WASH7P - - MIR6859-2 - - MIR1302-11 - reserved: - - null - - null - - null - - null - score: - - 1000 - - 1000 - - 1000 - - 1000 - start: - - 11868 - - 14403 - - 17368 - - 29553 - strand: - - + - - '-' - - '-' - - + - thickEnd: - - 11868 - - 14403 - - 17368 - - 29553 - thickStart: - - 11868 - - 14403 - - 17368 - - 29553 - type: - - none - - none - - none - - none -columns=("chrom", "start", "end", "chromStarts"), batch_size=2: - chrom: - - chr1 - - chr1 - chromStarts: - - - 0 - - 744 - - 1352 - - - 0 - - 601 - - 1392 - - 2203 - - 2454 - - 2829 - - 3202 - - 3511 - - 3864 - - 10334 - - 15130 - end: - - 14409 - - 29570 - start: - - 11868 - - 14403 +batch_size=1: {} +batch_size=2: {} +batch_size=3: {} +batch_size=4: {} +columns=("chrom", "start", "end", "chromStarts"), batch_size=2: 'Unknown columns: + ["chrom", "start", "end", "chromStarts"]. Available: []' fields=("chrom", "start", "end", "chromStarts"), batch_size=2: chrom: - chr1 diff --git a/py-oxbow/tests/manifests/test_sequence.TestFastaFile.test_batches.yaml b/py-oxbow/tests/manifests/test_sequence.TestFastaFile.test_batches.yaml index 3e893712..55da8fc9 100644 --- a/py-oxbow/tests/manifests/test_sequence.TestFastaFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_sequence.TestFastaFile.test_batches.yaml @@ -100,7 +100,7 @@ fields=('name', 'sequence'): sequence: - TACGTACGTACGTACGTACGTACGTACGTACGTACGTAC fields=('nonexistent-field',): 'Invalid field name: nonexistent-field' -fields=None: +fields=*: batch-00: description: - null @@ -241,3 +241,24 @@ fields=None: - seq20 sequence: - TACGTACGTACGTACGTACGTACGTACGTACGTACGTAC +fields=None: + batch-00: {} + batch-01: {} + batch-02: {} + batch-03: {} + batch-04: {} + batch-05: {} + batch-06: {} + batch-07: {} + batch-08: {} + batch-09: {} + batch-10: {} + batch-11: {} + batch-12: {} + batch-13: {} + batch-14: {} + batch-15: {} + 
batch-16: {} + batch-17: {} + batch-18: {} + batch-19: {} diff --git a/py-oxbow/tests/manifests/test_sequence.TestFastaFile.test_init_callstack.yaml b/py-oxbow/tests/manifests/test_sequence.TestFastaFile.test_init_callstack.yaml index 4eefc516..f48fbdb0 100644 --- a/py-oxbow/tests/manifests/test_sequence.TestFastaFile.test_init_callstack.yaml +++ b/py-oxbow/tests/manifests/test_sequence.TestFastaFile.test_init_callstack.yaml @@ -1,20 +1,20 @@ FastaFile("data/does-not-exist.fasta"): |- -> oxbow._core.sequence.FastaFile.__init__("data/does-not-exist.fasta") - -> oxbow._core.sequence.SequenceFile.__init__(source="data/does-not-exist.fasta", compressed=False, fields=None, regions=None, index=None, gzi=None, batch_size=1) + -> oxbow._core.sequence.SequenceFile.__init__(source="data/does-not-exist.fasta", compressed=False, fields="*", regions=None, index=None, gzi=None, batch_size=1) -> oxbow._core.base.DataSource.__init__("data/does-not-exist.fasta", None, 1) <- None <- None <- None FastaFile("data/malformed.fasta"): |- -> oxbow._core.sequence.FastaFile.__init__("data/malformed.fasta") - -> oxbow._core.sequence.SequenceFile.__init__(source="data/malformed.fasta", compressed=False, fields=None, regions=None, index=None, gzi=None, batch_size=1) + -> oxbow._core.sequence.SequenceFile.__init__(source="data/malformed.fasta", compressed=False, fields="*", regions=None, index=None, gzi=None, batch_size=1) -> oxbow._core.base.DataSource.__init__("data/malformed.fasta", None, 1) <- None <- None <- None FastaFile("data/sample.fasta"): |- -> oxbow._core.sequence.FastaFile.__init__("data/sample.fasta") - -> oxbow._core.sequence.SequenceFile.__init__(source="data/sample.fasta", compressed=False, fields=None, regions=None, index=None, gzi=None, batch_size=1) + -> oxbow._core.sequence.SequenceFile.__init__(source="data/sample.fasta", compressed=False, fields="*", regions=None, index=None, gzi=None, batch_size=1) -> oxbow._core.base.DataSource.__init__("data/sample.fasta", None, 1) <- 
None <- None diff --git a/py-oxbow/tests/manifests/test_sequence.TestFastqFile.test_batches.yaml b/py-oxbow/tests/manifests/test_sequence.TestFastqFile.test_batches.yaml index 7395467b..2008aad2 100644 --- a/py-oxbow/tests/manifests/test_sequence.TestFastqFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_sequence.TestFastqFile.test_batches.yaml @@ -43,7 +43,7 @@ fields=('name', 'sequence'): - GATCGATCGATCGATCGATCGATCGATCGATCGATCGAT - TACGTACGTACGTACGTACGTACGTACGTACGTACGTAC fields=('nonexistent-field',): 'Invalid field name: nonexistent-field' -fields=None: +fields=*: batch-00: description: - '' @@ -129,3 +129,5 @@ fields=None: - CTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTA - GATCGATCGATCGATCGATCGATCGATCGATCGATCGAT - TACGTACGTACGTACGTACGTACGTACGTACGTACGTAC +fields=None: + batch-00: {} diff --git a/py-oxbow/tests/manifests/test_sequence.TestFastqFile.test_init_callstack.yaml b/py-oxbow/tests/manifests/test_sequence.TestFastqFile.test_init_callstack.yaml index 60d41913..a4ace11a 100644 --- a/py-oxbow/tests/manifests/test_sequence.TestFastqFile.test_init_callstack.yaml +++ b/py-oxbow/tests/manifests/test_sequence.TestFastqFile.test_init_callstack.yaml @@ -1,20 +1,20 @@ FastqFile("data/does-not-exist.fastq"): |- -> oxbow._core.sequence.FastqFile.__init__("data/does-not-exist.fastq") - -> oxbow._core.sequence.SequenceFile.__init__(source="data/does-not-exist.fastq", compressed=False, fields=None, regions=None, index=None, gzi=None, batch_size=131072) + -> oxbow._core.sequence.SequenceFile.__init__(source="data/does-not-exist.fastq", compressed=False, fields="*", regions=None, index=None, gzi=None, batch_size=131072) -> oxbow._core.base.DataSource.__init__("data/does-not-exist.fastq", None, 131072) <- None <- None <- None FastqFile("data/malformed.fastq"): |- -> oxbow._core.sequence.FastqFile.__init__("data/malformed.fastq") - -> oxbow._core.sequence.SequenceFile.__init__(source="data/malformed.fastq", compressed=False, fields=None, regions=None, index=None, gzi=None, 
batch_size=131072) + -> oxbow._core.sequence.SequenceFile.__init__(source="data/malformed.fastq", compressed=False, fields="*", regions=None, index=None, gzi=None, batch_size=131072) -> oxbow._core.base.DataSource.__init__("data/malformed.fastq", None, 131072) <- None <- None <- None FastqFile("data/sample.fastq"): |- -> oxbow._core.sequence.FastqFile.__init__("data/sample.fastq") - -> oxbow._core.sequence.SequenceFile.__init__(source="data/sample.fastq", compressed=False, fields=None, regions=None, index=None, gzi=None, batch_size=131072) + -> oxbow._core.sequence.SequenceFile.__init__(source="data/sample.fastq", compressed=False, fields="*", regions=None, index=None, gzi=None, batch_size=131072) -> oxbow._core.base.DataSource.__init__("data/sample.fastq", None, 131072) <- None <- None diff --git a/py-oxbow/tests/manifests/test_variant.TestBcfFile.test_batches.yaml b/py-oxbow/tests/manifests/test_variant.TestBcfFile.test_batches.yaml index ec9a9e29..932ccdb0 100644 --- a/py-oxbow/tests/manifests/test_variant.TestBcfFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_variant.TestBcfFile.test_batches.yaml @@ -1,3 +1,4 @@ fields=('nonexistent-field',): 'Invalid field name: nonexistent-field' fields=('pos', 'qual'): 1 +fields=*: 1 fields=None: 1 diff --git a/py-oxbow/tests/manifests/test_variant.TestVcfFile.test_batches.yaml b/py-oxbow/tests/manifests/test_variant.TestVcfFile.test_batches.yaml index 321e9fc7..932ccdb0 100644 --- a/py-oxbow/tests/manifests/test_variant.TestVcfFile.test_batches.yaml +++ b/py-oxbow/tests/manifests/test_variant.TestVcfFile.test_batches.yaml @@ -1,3 +1,4 @@ fields=('nonexistent-field',): 'Invalid field name: nonexistent-field' -fields=('pos', 'qual'): 'External error: Sample not found: HG00096' -fields=None: 'External error: Sample not found: HG00096' +fields=('pos', 'qual'): 1 +fields=*: 1 +fields=None: 1 diff --git a/py-oxbow/tests/test_alignment.py b/py-oxbow/tests/test_alignment.py index f21d78d1..83545988 100644 --- 
a/py-oxbow/tests/test_alignment.py +++ b/py-oxbow/tests/test_alignment.py @@ -48,7 +48,7 @@ def test_serialized_fragments(self): @pytest.mark.parametrize( "fields", [ - None, + "*", ["qname", "rname", "mapq"], ["qname", "rname", "foo"], ], @@ -169,6 +169,10 @@ def test_serialized_fragments(self): assert [f.count_rows() for f in fragments] == [4] + def test_malformed(self): + with pytest.raises(OSError): + next(ox.BamFile("data/malformed.bam", compressed=True).batches()) + def test_input_encodings(self): file = ox.BamFile("data/sample.bam", compressed=True, batch_size=3) assert next((file.batches())).num_rows <= 3 @@ -310,6 +314,21 @@ def test_with_reference(self): seqs = batch.column("seq").to_pylist() assert all("N" not in s for s in seqs if s is not None) + def test_with_tags(self): + file = ox.CramFile("data/sample.cram").with_tags() + assert len(file.tag_defs) > 0 + batch = pa.record_batch(next(file.batches())) + assert "tags" in batch.schema.names + + def test_with_tags_with_reference(self): + # Verifies that _tag_discovery_kwargs passes reference to the scanner + file = ox.CramFile( + "data/sample-ref.cram", + reference="data/sample-ref.fa", + reference_index="data/sample-ref.fa.fai", + ).with_tags() + assert isinstance(file.tag_defs, list) + def test_with_reference_and_regions(self): file = ox.CramFile( "data/sample-ref.cram", diff --git a/py-oxbow/tests/test_bbi.py b/py-oxbow/tests/test_bbi.py index e7b526d0..bbb6dde9 100644 --- a/py-oxbow/tests/test_bbi.py +++ b/py-oxbow/tests/test_bbi.py @@ -47,6 +47,7 @@ def test_serialized_fragments(self): "fields", [ None, + "*", ("chrom", "start", "end"), ("nonexistent-field",), ], @@ -150,6 +151,7 @@ def test_serialized_fragments(self): "fields", [ None, + "*", ("chrom", "start", "end"), ("nonexistent-field",), ], diff --git a/py-oxbow/tests/test_bed.py b/py-oxbow/tests/test_bed.py index 442751bb..9081b4bd 100644 --- a/py-oxbow/tests/test_bed.py +++ b/py-oxbow/tests/test_bed.py @@ -49,6 +49,7 @@ def 
test_serialized_fragments(self): "fields", [ None, + "*", ("chrom", "start", "end"), ("nonexistent-field",), ], diff --git a/py-oxbow/tests/test_gxf.py b/py-oxbow/tests/test_gxf.py index 2c775871..9c9cbf61 100644 --- a/py-oxbow/tests/test_gxf.py +++ b/py-oxbow/tests/test_gxf.py @@ -51,6 +51,7 @@ def test_serialized_fragments(self): "fields", [ None, + "*", ("seqid", "start", "end"), ("nonexistent-field",), ], diff --git a/py-oxbow/tests/test_scanners.py b/py-oxbow/tests/test_scanners.py index a083ecfb..488294e4 100644 --- a/py-oxbow/tests/test_scanners.py +++ b/py-oxbow/tests/test_scanners.py @@ -21,7 +21,11 @@ class TestPySamScanner: ) def test_scan(self, input, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(pickle.loads(pickle.dumps(ox.PySamScanner("data/sample.sam")))) + pickle.dumps( + pickle.loads( + pickle.dumps(ox.PySamScanner("data/sample.sam", fields="*")) + ) + ) ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) @@ -31,7 +35,11 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): def test_scan_invalid_field(self, manifest): input = Input(columns=["qname", "rname", "foo"]) scanner = pickle.loads( - pickle.dumps(pickle.loads(pickle.dumps(ox.PySamScanner("data/sample.sam")))) + pickle.dumps( + pickle.loads( + pickle.dumps(ox.PySamScanner("data/sample.sam", fields="*")) + ) + ) ) error = None try: @@ -42,11 +50,13 @@ def test_scan_invalid_field(self, manifest): assert manifest == error def test_pickle(self): - scanner = pickle.loads(pickle.dumps(ox.PySamScanner("data/sample.sam"))) + scanner = pickle.loads( + pickle.dumps(ox.PySamScanner("data/sample.sam", fields="*")) + ) assert isinstance(scanner, ox.PySamScanner) def test_scan_byte_ranges(self): - scanner = ox.PySamScanner("data/sample.sam") + scanner = ox.PySamScanner("data/sample.sam", fields="*") schema = scanner.schema() stream = scanner.scan_byte_ranges([(36, 123)]) reader = pa.RecordBatchReader.from_stream(data=stream, 
schema=pa.schema(schema)) @@ -54,7 +64,7 @@ def test_scan_byte_ranges(self): assert batch.num_rows == 2 def test_scan_virtual_ranges(self): - scanner = ox.PySamScanner("data/sample.sam.gz", compressed=True) + scanner = ox.PySamScanner("data/sample.sam.gz", compressed=True, fields="*") schema = scanner.schema() # unpacked virtual positions stream = scanner.scan_virtual_ranges([((53, 0), (53, 87))]) @@ -81,7 +91,11 @@ class TestPyBamScanner: ) def test_scan(self, input, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(pickle.loads(pickle.dumps(ox.PyBamScanner("data/sample.bam")))) + pickle.dumps( + pickle.loads( + pickle.dumps(ox.PyBamScanner("data/sample.bam", fields="*")) + ) + ) ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) @@ -91,7 +105,11 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): def test_scan_invalid_field(self, manifest): input = Input(columns=["qname", "rname", "foo"]) scanner = pickle.loads( - pickle.dumps(pickle.loads(pickle.dumps(ox.PyBamScanner("data/sample.bam")))) + pickle.dumps( + pickle.loads( + pickle.dumps(ox.PyBamScanner("data/sample.bam", fields="*")) + ) + ) ) error = None try: @@ -102,11 +120,13 @@ def test_scan_invalid_field(self, manifest): assert manifest == error def test_pickle(self): - scanner = pickle.loads(pickle.dumps(ox.PyBamScanner("data/sample.bam"))) + scanner = pickle.loads( + pickle.dumps(ox.PyBamScanner("data/sample.bam", fields="*")) + ) assert isinstance(scanner, ox.PyBamScanner) def test_scan_byte_ranges(self): - scanner = ox.PyBamScanner("data/sample.ubam", compressed=False) + scanner = ox.PyBamScanner("data/sample.ubam", compressed=False, fields="*") schema = scanner.schema() stream = scanner.scan_byte_ranges([(130, 339)]) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -114,7 +134,7 @@ def test_scan_byte_ranges(self): assert batch.num_rows == 2 def test_scan_virtual_ranges(self): - scanner = 
ox.PyBamScanner("data/sample.bam") + scanner = ox.PyBamScanner("data/sample.bam", fields="*") schema = scanner.schema() # unpacked virtual positions stream = scanner.scan_virtual_ranges([((643, 977), (643, 1693))]) @@ -130,23 +150,73 @@ def test_scan_virtual_ranges(self): class TestPyBcfScanner: def test_chrom_names(self, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyBcfScanner("data/sample.bcf"))) + scanner = pickle.loads( + pickle.dumps( + ox.PyBcfScanner( + "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) + ) assert manifest == scanner.chrom_names() def test_chrom_sizes(self, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyBcfScanner("data/sample.bcf"))) + scanner = pickle.loads( + pickle.dumps( + ox.PyBcfScanner( + "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) + ) assert manifest == scanner.chrom_sizes() def test_info_field_defs(self, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyBcfScanner("data/sample.bcf"))) + scanner = pickle.loads( + pickle.dumps( + ox.PyBcfScanner( + "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) + ) assert manifest == scanner.info_field_defs() def test_genotype_field_defs(self, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyBcfScanner("data/sample.bcf"))) + scanner = pickle.loads( + pickle.dumps( + ox.PyBcfScanner( + "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) + ) assert manifest == scanner.genotype_field_defs() def test_sample_names(self): - scanner = pickle.loads(pickle.dumps(ox.PyBcfScanner("data/sample.bcf"))) + scanner = pickle.loads( + pickle.dumps( + ox.PyBcfScanner( + "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) + ) assert 1233 == 
len(scanner.sample_names()) @pytest.mark.parametrize( @@ -158,6 +228,9 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): pickle.dumps( ox.PyBcfScanner( "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", samples=["HG00096", "HG00101", "HG00103"], ) ) @@ -169,7 +242,17 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): def test_scan_invalid_field(self, manifest): input = Input(columns=["name", "sequence", "foo"]) - scanner = pickle.loads(pickle.dumps(ox.PyBcfScanner("data/sample.bcf"))) + scanner = pickle.loads( + pickle.dumps( + ox.PyBcfScanner( + "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) + ) error = None try: scanner.scan(*input.args, **input.kwargs) @@ -191,6 +274,9 @@ def test_scan_query(self, input, manifest: pytest_manifest.Manifest): pickle.dumps( ox.PyBcfScanner( "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", samples=["HG00096", "HG00101", "HG00103"], ) ) @@ -201,11 +287,28 @@ def test_scan_query(self, input, manifest: pytest_manifest.Manifest): assert manifest[str(input)] == reader.read_next_batch().to_pydict() def test_pickle(self): - scanner = pickle.loads(pickle.dumps(ox.PyBcfScanner("data/sample.bcf"))) + scanner = pickle.loads( + pickle.dumps( + ox.PyBcfScanner( + "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) + ) assert isinstance(scanner, ox.PyBcfScanner) def test_scan_byte_ranges(self): - scanner = ox.PyBcfScanner("data/sample.ubcf", compressed=False) + scanner = ox.PyBcfScanner( + "data/sample.ubcf", + compressed=False, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) schema = scanner.schema() stream = scanner.scan_byte_ranges([(100242, 102922)]) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -213,7 +316,13 @@ def test_scan_byte_ranges(self): assert batch.num_rows == 2 def 
test_scan_virtual_ranges(self): - scanner = ox.PyBcfScanner("data/sample.bcf", samples=[]) + scanner = ox.PyBcfScanner( + "data/sample.bcf", + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) schema = scanner.schema() # unpacked virtual positions stream = scanner.scan_virtual_ranges([((4713, 1341), (7244, 436))]) @@ -226,89 +335,286 @@ def test_scan_virtual_ranges(self): batch2 = reader.read_next_batch() assert batch.to_pydict() == batch2.to_pydict() + def _scan_batch(self, **kwargs): + scanner = ox.PyBcfScanner("data/sample.bcf", fields="*", **kwargs) + schema = scanner.schema() + stream = scanner.scan() + return pa.RecordBatchReader.from_stream( + data=stream, schema=pa.schema(schema) + ).read_next_batch() + + def test_info_omit_no_column(self): + # info_fields=None → no "info" column + batch = self._scan_batch(info_fields=None) + assert "info" not in batch.schema.names + + def test_info_empty_select_empty_struct(self): + # info_fields=[] → "info" column present as empty struct + batch = self._scan_batch(info_fields=[]) + assert "info" in batch.schema.names + assert pa.types.is_struct(batch.schema.field("info").type) + assert batch.schema.field("info").type.num_fields == 0 + + def test_genotype_omit_no_columns(self): + # genotype_fields=None with samples → no genotype columns + batch = self._scan_batch( + genotype_fields=None, + samples=["HG00096", "HG00101"], + ) + assert "HG00096" not in batch.schema.names + assert "HG00101" not in batch.schema.names + assert "samples" not in batch.schema.names + + def test_genotype_empty_by_sample_empty_struct_columns(self): + # genotype_fields=[] with samples, genotype_by="sample" → per-sample columns with empty struct + batch = self._scan_batch( + genotype_fields=[], + genotype_by="sample", + samples=["HG00096", "HG00101"], + ) + assert "HG00096" in batch.schema.names + assert pa.types.is_struct(batch.schema.field("HG00096").type) + assert batch.schema.field("HG00096").type.num_fields == 0 + + def 
test_genotype_empty_by_field_no_columns(self): + # genotype_fields=[] with samples, genotype_by="field" → no genotype columns + batch = self._scan_batch( + genotype_fields=[], + genotype_by="field", + samples=["HG00096", "HG00101"], + ) + assert "HG00096" not in batch.schema.names + assert "GT" not in batch.schema.names + + def test_samples_empty_nested_empty_struct(self): + # samples=[] with genotype_fields, samples_nested=True → "samples" column with empty struct + batch = self._scan_batch( + genotype_fields=["GT"], + samples=[], + samples_nested=True, + ) + assert "samples" in batch.schema.names + assert pa.types.is_struct(batch.schema.field("samples").type) + assert batch.schema.field("samples").type.num_fields == 0 + + def test_samples_omit_no_nested_column(self): + # samples=None with genotype_fields, samples_nested=True → no "samples" column + batch = self._scan_batch( + genotype_fields=["GT"], + samples=None, + samples_nested=True, + ) + assert "samples" not in batch.schema.names + class TestPyVcfScanner: def test_chrom_names(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf", compressed=False)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf", + compressed=False, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.chrom_names() def test_chrom_names_compressed(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf.gz", compressed=True)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf.gz", + compressed=True, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.chrom_names() def test_chrom_sizes(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf", compressed=False)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf", + compressed=False, + 
fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.chrom_sizes() def test_chrom_sizes_compressed(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf.gz", compressed=True)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf.gz", + compressed=True, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.chrom_sizes() def test_info_field_names(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf", compressed=False)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf", + compressed=False, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.info_field_names() def test_info_field_names_compressed(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf.gz", compressed=True)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf.gz", + compressed=True, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.info_field_names() def test_info_field_defs(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf", compressed=False)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf", + compressed=False, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.info_field_defs() def test_info_field_defs_compressed(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf.gz", compressed=True)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf.gz", + compressed=True, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.info_field_defs() def 
test_genotype_field_names(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf", compressed=False)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf", + compressed=False, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.genotype_field_names() def test_genotype_field_names_compressed(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf.gz", compressed=True)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf.gz", + compressed=True, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.genotype_field_names() def test_genotype_field_defs(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf", compressed=False)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf", + compressed=False, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.genotype_field_defs() def test_genotype_field_defs_compressed(self, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf.gz", compressed=True)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf.gz", + compressed=True, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert manifest == scanner.genotype_field_defs() def test_sample_names(self): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf", compressed=False)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf", + compressed=False, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert 3 == len(scanner.sample_names()) def test_sample_names_compressed(self): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf.gz", compressed=True)) + pickle.dumps( + 
ox.PyVcfScanner( + "data/sample.vcf.gz", + compressed=True, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert 3 == len(scanner.sample_names()) @@ -321,6 +627,9 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): pickle.dumps( ox.PyVcfScanner( "data/sample.vcf", + fields="*", + info_fields="*", + genotype_fields="*", compressed=False, samples=["NA12878i", "NA12891", "NA12892"], ) @@ -341,6 +650,9 @@ def test_scan_compressed(self, input, manifest: pytest_manifest.Manifest): ox.PyVcfScanner( "data/sample.vcf.gz", compressed=True, + fields="*", + info_fields="*", + genotype_fields="*", samples=["NA12878i", "NA12891", "NA12892"], ) ) @@ -352,7 +664,17 @@ def test_scan_compressed(self, input, manifest: pytest_manifest.Manifest): def test_scan_invalid_field(self, manifest): input = Input(columns=["name", "sequence", "foo"]) - scanner = pickle.loads(pickle.dumps(ox.PyVcfScanner("data/sample.vcf"))) + scanner = pickle.loads( + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf", + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) + ) error = None try: scanner.scan(*input.args, **input.kwargs) @@ -374,6 +696,9 @@ def test_scan_query(self, input, manifest: pytest_manifest.Manifest): pickle.dumps( ox.PyVcfScanner( "data/sample.vcf.gz", + fields="*", + info_fields="*", + genotype_fields="*", compressed=True, samples=["NA12878i", "NA12891", "NA12892"], ) @@ -386,12 +711,28 @@ def test_scan_query(self, input, manifest: pytest_manifest.Manifest): def test_pickle(self): scanner = pickle.loads( - pickle.dumps(ox.PyVcfScanner("data/sample.vcf", compressed=False)) + pickle.dumps( + ox.PyVcfScanner( + "data/sample.vcf", + compressed=False, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) + ) ) assert isinstance(scanner, ox.PyVcfScanner) def test_scan_byte_ranges(self): - scanner = ox.PyVcfScanner("data/sample.vcf") + scanner = ox.PyVcfScanner( + "data/sample.vcf", + 
compressed=False, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) schema = scanner.schema() stream = scanner.scan_byte_ranges([(24785, 55935)]) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -399,7 +740,14 @@ def test_scan_byte_ranges(self): assert batch.num_rows == 23 def test_scan_virtual_ranges(self): - scanner = ox.PyVcfScanner("data/sample.vcf.gz", compressed=True) + scanner = ox.PyVcfScanner( + "data/sample.vcf.gz", + compressed=True, + fields="*", + info_fields="*", + genotype_fields="*", + samples="*", + ) schema = scanner.schema() # unpacked virtual positions stream = scanner.scan_virtual_ranges([((6516, 2243), (6516, 4502))]) @@ -412,6 +760,79 @@ def test_scan_virtual_ranges(self): batch2 = reader.read_next_batch() assert batch.to_pydict() == batch2.to_pydict() + def _scan_batch(self, **kwargs): + scanner = ox.PyVcfScanner( + "data/sample.vcf", compressed=False, fields="*", **kwargs + ) + schema = scanner.schema() + stream = scanner.scan() + return pa.RecordBatchReader.from_stream( + data=stream, schema=pa.schema(schema) + ).read_next_batch() + + def test_info_omit_no_column(self): + # info_fields=None → no "info" column + batch = self._scan_batch(info_fields=None) + assert "info" not in batch.schema.names + + def test_info_empty_select_empty_struct(self): + # info_fields=[] → "info" column present as empty struct + batch = self._scan_batch(info_fields=[]) + assert "info" in batch.schema.names + assert pa.types.is_struct(batch.schema.field("info").type) + assert batch.schema.field("info").type.num_fields == 0 + + def test_genotype_omit_no_columns(self): + # genotype_fields=None with samples → no genotype columns + batch = self._scan_batch( + genotype_fields=None, + samples=["NA12878i", "NA12891"], + ) + assert "NA12878i" not in batch.schema.names + assert "NA12891" not in batch.schema.names + assert "samples" not in batch.schema.names + + def 
test_genotype_empty_by_sample_empty_struct_columns(self): + # genotype_fields=[] with samples, genotype_by="sample" → per-sample columns with empty struct + batch = self._scan_batch( + genotype_fields=[], + genotype_by="sample", + samples=["NA12878i", "NA12891"], + ) + assert "NA12878i" in batch.schema.names + assert pa.types.is_struct(batch.schema.field("NA12878i").type) + assert batch.schema.field("NA12878i").type.num_fields == 0 + + def test_genotype_empty_by_field_no_columns(self): + # genotype_fields=[] with samples, genotype_by="field" → no genotype columns + batch = self._scan_batch( + genotype_fields=[], + genotype_by="field", + samples=["NA12878i", "NA12891"], + ) + assert "NA12878i" not in batch.schema.names + assert "GT" not in batch.schema.names + + def test_samples_empty_nested_empty_struct(self): + # samples=[] with genotype_fields, samples_nested=True → "samples" column with empty struct + batch = self._scan_batch( + genotype_fields=["GT"], + samples=[], + samples_nested=True, + ) + assert "samples" in batch.schema.names + assert pa.types.is_struct(batch.schema.field("samples").type) + assert batch.schema.field("samples").type.num_fields == 0 + + def test_samples_omit_no_nested_column(self): + # samples=None with genotype_fields, samples_nested=True → no "samples" column + batch = self._scan_batch( + genotype_fields=["GT"], + samples=None, + samples_nested=True, + ) + assert "samples" not in batch.schema.names + class TestPyFastaScanner: @pytest.mark.parametrize( @@ -425,7 +846,9 @@ class TestPyFastaScanner: ], ) def test_scan(self, input, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyFastaScanner("data/sample.fasta"))) + scanner = pickle.loads( + pickle.dumps(ox.PyFastaScanner("data/sample.fasta", fields="*")) + ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -442,7 +865,9 @@ def test_scan(self, input, 
manifest: pytest_manifest.Manifest): ], ) def test_scan_query(self, input, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyFastaScanner("data/sample.fasta"))) + scanner = pickle.loads( + pickle.dumps(ox.PyFastaScanner("data/sample.fasta", fields="*")) + ) schema = scanner.schema() stream = scanner.scan_query(*input.args, **input.kwargs) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -450,7 +875,9 @@ def test_scan_query(self, input, manifest: pytest_manifest.Manifest): def test_scan_invalid_field(self, manifest): input = Input(columns=["name", "sequence", "foo"]) - scanner = pickle.loads(pickle.dumps(ox.PyFastaScanner("data/sample.fasta"))) + scanner = pickle.loads( + pickle.dumps(ox.PyFastaScanner("data/sample.fasta", fields="*")) + ) error = None try: scanner.scan(*input.args, **input.kwargs) @@ -460,7 +887,9 @@ def test_scan_invalid_field(self, manifest): assert manifest == error def test_pickle(self): - scanner = pickle.loads(pickle.dumps(ox.PyFastaScanner("data/sample.fasta"))) + scanner = pickle.loads( + pickle.dumps(ox.PyFastaScanner("data/sample.fasta", fields="*")) + ) assert isinstance(scanner, ox.PyFastaScanner) @@ -476,7 +905,9 @@ class TestPyFastqScanner: ], ) def test_scan(self, input, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyFastqScanner("data/sample.fastq"))) + scanner = pickle.loads( + pickle.dumps(ox.PyFastqScanner("data/sample.fastq", fields="*")) + ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -484,7 +915,9 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): def test_scan_invalid_field(self, manifest): input = Input(columns=["name", "sequence", "foo"]) - scanner = pickle.loads(pickle.dumps(ox.PyFastqScanner("data/sample.fastq"))) + scanner = pickle.loads( + pickle.dumps(ox.PyFastqScanner("data/sample.fastq", 
fields="*")) + ) error = None try: scanner.scan(*input.args, **input.kwargs) @@ -494,11 +927,13 @@ def test_scan_invalid_field(self, manifest): assert manifest == error def test_pickle(self): - scanner = pickle.loads(pickle.dumps(ox.PyFastqScanner("data/sample.fastq"))) + scanner = pickle.loads( + pickle.dumps(ox.PyFastqScanner("data/sample.fastq", fields="*")) + ) assert isinstance(scanner, ox.PyFastqScanner) def test_scan_byte_ranges(self): - scanner = ox.PyFastqScanner("data/sample.fastq") + scanner = ox.PyFastqScanner("data/sample.fastq", fields="*") schema = scanner.schema() stream = scanner.scan_byte_ranges([(90, 270)]) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -506,7 +941,9 @@ def test_scan_byte_ranges(self): assert batch.num_rows == 2 def test_scan_virtual_ranges(self): - scanner = ox.PyFastqScanner("data/sample.fastq.bgz", compressed=True) + scanner = ox.PyFastqScanner( + "data/sample.fastq.bgz", compressed=True, fields="*" + ) schema = scanner.schema() # unpacked virtual positions stream = scanner.scan_virtual_ranges([((37, 84), (37, 264))]) @@ -548,7 +985,9 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): scanner = pickle.loads( pickle.dumps( ox.PyBedScanner( - "data/sample.bed", bed_schema=input.kwargs.pop("bed_schema") + "data/sample.bed", + bed_schema=input.kwargs.pop("bed_schema"), + fields="*", ) ) ) @@ -577,17 +1016,21 @@ def test_scan_invalid_field(self, manifest): def test_pickle(self): scanner = pickle.loads( - pickle.dumps(ox.PyBedScanner("data/sample.bed", bed_schema="bed9")) + pickle.dumps( + ox.PyBedScanner("data/sample.bed", bed_schema="bed9", fields="*") + ) ) assert isinstance(scanner, ox.PyBedScanner) def test_project_rest(self): for bed_schema in ["bed6", "bed6+3", "bed9"]: - scanner = ox.PyBedScanner("data/sample.bed", bed_schema=bed_schema) + scanner = ox.PyBedScanner( + "data/sample.bed", bed_schema=bed_schema, fields="*" + ) schema = scanner.schema() assert "rest" not in 
[field.name for field in schema] - scanner = ox.PyBedScanner("data/sample.bed", bed_schema="bed6+") + scanner = ox.PyBedScanner("data/sample.bed", bed_schema="bed6+", fields="*") schema = scanner.schema() assert "rest" in [field.name for field in schema] @@ -596,7 +1039,7 @@ def test_project_rest(self): assert "rest" in batch.schema.names def test_scan_byte_ranges(self): - scanner = ox.PyBedScanner("data/sample.bed", bed_schema="bed9") + scanner = ox.PyBedScanner("data/sample.bed", bed_schema="bed9", fields="*") schema = scanner.schema() stream = scanner.scan_byte_ranges([(108, 211)]) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -605,7 +1048,7 @@ def test_scan_byte_ranges(self): def test_scan_virtual_ranges(self): scanner = ox.PyBedScanner( - "data/sample.bed.gz", bed_schema="bed9", compressed=True + "data/sample.bed.gz", bed_schema="bed9", compressed=True, fields="*" ) schema = scanner.schema() # unpacked virtual positions @@ -642,7 +1085,9 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): scanner = pickle.loads( pickle.dumps( ox.PyBigBedScanner( - "data/sample.bb", schema=input.kwargs.pop("bed_schema") + "data/sample.bb", + schema=input.kwargs.pop("bed_schema"), + fields="*", ) ) ) @@ -691,7 +1136,9 @@ def test_scan_invalid_field(self, manifest): input = Input(columns=("nonexistent-field",)) error = None try: - scanner = pickle.loads(pickle.dumps(ox.PyBigBedScanner("data/sample.bb"))) + scanner = pickle.loads( + pickle.dumps(ox.PyBigBedScanner("data/sample.bb", fields="*")) + ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) reader = pa.RecordBatchReader.from_stream( @@ -720,7 +1167,9 @@ class TestPyBigWigScanner: ], ) def test_scan(self, input, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyBigWigScanner("data/sample.bw"))) + scanner = pickle.loads( + pickle.dumps(ox.PyBigWigScanner("data/sample.bw", fields="*")) + ) schema = scanner.schema() 
stream = scanner.scan(*input.args, **input.kwargs) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -730,7 +1179,9 @@ def test_scan_invalid_field(self, manifest): input = Input(columns=("nonexistent-field",)) error = None try: - scanner = pickle.loads(pickle.dumps(ox.PyBigWigScanner("data/sample.bw"))) + scanner = pickle.loads( + pickle.dumps(ox.PyBigWigScanner("data/sample.bw", fields="*")) + ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) reader = pa.RecordBatchReader.from_stream( @@ -743,7 +1194,9 @@ def test_scan_invalid_field(self, manifest): assert manifest == str(error) def test_pickle(self): - scanner = pickle.loads(pickle.dumps(ox.PyBigWigScanner("data/sample.bw"))) + scanner = pickle.loads( + pickle.dumps(ox.PyBigWigScanner("data/sample.bw", fields="*")) + ) assert isinstance(scanner, ox.PyBigWigScanner) @@ -759,7 +1212,9 @@ class TestPyGffScanner: ], ) def test_scan(self, input, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyGffScanner("data/sample.gff"))) + scanner = pickle.loads( + pickle.dumps(ox.PyGffScanner("data/sample.gff", fields="*")) + ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -778,7 +1233,9 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): def test_scan_with_attributes(self, input, manifest: pytest_manifest.Manifest): attr_defs = ox.PyGffScanner("data/sample.gff").attribute_defs(1024) scanner = pickle.loads( - pickle.dumps(ox.PyGffScanner("data/sample.gff", attribute_defs=attr_defs)) + pickle.dumps( + ox.PyGffScanner("data/sample.gff", fields="*", attribute_defs=attr_defs) + ) ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) @@ -796,7 +1253,9 @@ def test_scan_with_attributes(self, input, manifest: pytest_manifest.Manifest): ], ) def test_scan_sorted(self, input, manifest: 
pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyGffScanner("data/sample.sorted.gff"))) + scanner = pickle.loads( + pickle.dumps(ox.PyGffScanner("data/sample.sorted.gff", fields="*")) + ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -814,7 +1273,11 @@ def test_scan_sorted(self, input, manifest: pytest_manifest.Manifest): ) def test_scan_sorted_compressed(self, input, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyGffScanner("data/sample.sorted.gff.gz", compressed=True)) + pickle.dumps( + ox.PyGffScanner( + "data/sample.sorted.gff.gz", fields="*", compressed=True + ) + ) ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) @@ -838,7 +1301,9 @@ def test_scan_query_sorted_compressed( try: scanner = pickle.loads( pickle.dumps( - ox.PyGffScanner("data/sample.sorted.gff.gz", compressed=True) + ox.PyGffScanner( + "data/sample.sorted.gff.gz", fields="*", compressed=True + ) ) ) schema = scanner.schema() @@ -895,7 +1360,9 @@ class TestPyGtfScanner: ], ) def test_scan(self, input, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyGtfScanner("data/sample.gtf"))) + scanner = pickle.loads( + pickle.dumps(ox.PyGtfScanner("data/sample.gtf", fields="*")) + ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) reader = pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -912,7 +1379,9 @@ def test_scan(self, input, manifest: pytest_manifest.Manifest): ], ) def test_scan_sorted(self, input, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyGtfScanner("data/sample.sorted.gtf"))) + scanner = pickle.loads( + pickle.dumps(ox.PyGtfScanner("data/sample.sorted.gtf", fields="*")) + ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) reader = 
pa.RecordBatchReader.from_stream(data=stream, schema=pa.schema(schema)) @@ -930,7 +1399,11 @@ def test_scan_sorted(self, input, manifest: pytest_manifest.Manifest): ) def test_scan_sorted_compressed(self, input, manifest: pytest_manifest.Manifest): scanner = pickle.loads( - pickle.dumps(ox.PyGtfScanner("data/sample.sorted.gtf.gz", compressed=True)) + pickle.dumps( + ox.PyGtfScanner( + "data/sample.sorted.gtf.gz", fields="*", compressed=True + ) + ) ) schema = scanner.schema() stream = scanner.scan(*input.args, **input.kwargs) @@ -954,7 +1427,9 @@ def test_scan_query_sorted_compressed( try: scanner = pickle.loads( pickle.dumps( - ox.PyGtfScanner("data/sample.sorted.gtf.gz", compressed=True) + ox.PyGtfScanner( + "data/sample.sorted.gtf.gz", fields="*", compressed=True + ) ) ) schema = scanner.schema() @@ -973,7 +1448,9 @@ def test_scan_query_sorted_compressed( assert manifest[str(input)] == result def test_schema(self, manifest: pytest_manifest.Manifest): - scanner = pickle.loads(pickle.dumps(ox.PyGtfScanner("data/sample.gtf"))) + scanner = pickle.loads( + pickle.dumps(ox.PyGtfScanner("data/sample.gtf", fields="*")) + ) schema = scanner.schema() assert manifest == schema.names diff --git a/py-oxbow/tests/test_sequence.py b/py-oxbow/tests/test_sequence.py index 5b286ae9..816136d8 100644 --- a/py-oxbow/tests/test_sequence.py +++ b/py-oxbow/tests/test_sequence.py @@ -52,6 +52,7 @@ def test_serialized_fragments(self): "fields", [ None, + "*", ("name", "sequence"), ("nonexistent-field",), ], @@ -213,6 +214,7 @@ def test_fragments(self): "fields", [ None, + "*", ("name", "sequence"), ("nonexistent-field",), ], diff --git a/py-oxbow/tests/test_variant.py b/py-oxbow/tests/test_variant.py index 69e0f714..5d9e9879 100644 --- a/py-oxbow/tests/test_variant.py +++ b/py-oxbow/tests/test_variant.py @@ -1,5 +1,6 @@ import cloudpickle import fsspec +import pyarrow as pa import pytest from pytest_manifest import Manifest @@ -53,6 +54,7 @@ def test_serialized_fragments(self): "fields", 
[ None, + "*", ("pos", "qual"), ("nonexistent-field",), ], @@ -63,7 +65,7 @@ def test_batches(self, fields, manifest: Manifest): fields=fields, genotype_fields=("GT",), info_fields=("DP",), - samples=("HG00096",), + samples=("NA12878i",), ) batches = ox.VcfFile(*input.args, **input.kwargs).batches() try: @@ -128,6 +130,55 @@ def test_input_with_regions(self, regions): ) file.pl() + @pytest.mark.parametrize("genotype_by", ["sample", "field"]) + def test_samples_nested_false(self, genotype_by): + # Default: genotype columns are top-level (one column per sample or per field) + file = ox.VcfFile( + "data/sample.vcf", + samples=["NA12878i", "NA12891"], + genotype_fields=["GT"], + genotype_by=genotype_by, + samples_nested=False, + ) + batch = pa.record_batch(next(file.batches())) + assert "samples" not in batch.schema.names + if genotype_by == "sample": + assert "NA12878i" in batch.schema.names + assert "NA12891" in batch.schema.names + else: + assert "GT" in batch.schema.names + + @pytest.mark.parametrize("genotype_by", ["sample", "field"]) + def test_samples_nested_true(self, genotype_by): + # Nested: all genotype data wrapped under a single "samples" struct column + file = ox.VcfFile( + "data/sample.vcf", + samples=["NA12878i", "NA12891"], + genotype_fields=["GT"], + genotype_by=genotype_by, + samples_nested=True, + ) + batch = pa.record_batch(next(file.batches())) + assert "samples" in batch.schema.names + samples_type = batch.schema.field("samples").type + assert pa.types.is_struct(samples_type) + if genotype_by == "sample": + assert samples_type.get_field_index("NA12878i") >= 0 + assert samples_type.get_field_index("NA12891") >= 0 + else: + assert samples_type.get_field_index("GT") >= 0 + + def test_samples_nested_pickle_roundtrip(self): + file = ox.VcfFile( + "data/sample.vcf", + samples=["NA12878i"], + genotype_fields=["GT"], + samples_nested=True, + ) + file2 = cloudpickle.loads(cloudpickle.dumps(file)) + batch = pa.record_batch(next(file2.batches())) + assert 
"samples" in batch.schema.names + class TestBcfFile: @pytest.mark.parametrize( @@ -179,6 +230,7 @@ def test_serialized_fragments(self): "fields", [ None, + "*", ("pos", "qual"), ("nonexistent-field",), ], @@ -245,3 +297,53 @@ def test_input_with_regions(self, regions): regions=regions, ) file.pl() + + @pytest.mark.parametrize("genotype_by", ["sample", "field"]) + def test_samples_nested_false(self, genotype_by): + file = ox.BcfFile( + "data/sample.bcf", + compressed=True, + samples=["HG00096", "HG00101"], + genotype_fields=["GT"], + genotype_by=genotype_by, + samples_nested=False, + ) + batch = pa.record_batch(next(file.batches())) + assert "samples" not in batch.schema.names + if genotype_by == "sample": + assert "HG00096" in batch.schema.names + assert "HG00101" in batch.schema.names + else: + assert "GT" in batch.schema.names + + @pytest.mark.parametrize("genotype_by", ["sample", "field"]) + def test_samples_nested_true(self, genotype_by): + file = ox.BcfFile( + "data/sample.bcf", + compressed=True, + samples=["HG00096", "HG00101"], + genotype_fields=["GT"], + genotype_by=genotype_by, + samples_nested=True, + ) + batch = pa.record_batch(next(file.batches())) + assert "samples" in batch.schema.names + samples_type = batch.schema.field("samples").type + assert pa.types.is_struct(samples_type) + if genotype_by == "sample": + assert samples_type.get_field_index("HG00096") >= 0 + assert samples_type.get_field_index("HG00101") >= 0 + else: + assert samples_type.get_field_index("GT") >= 0 + + def test_samples_nested_pickle_roundtrip(self): + file = ox.BcfFile( + "data/sample.bcf", + compressed=True, + samples=["HG00096"], + genotype_fields=["GT"], + samples_nested=True, + ) + file2 = cloudpickle.loads(cloudpickle.dumps(file)) + batch = pa.record_batch(next(file2.batches())) + assert "samples" in batch.schema.names diff --git a/py-oxbow/uv.lock b/py-oxbow/uv.lock index 4e8fb5ef..59d9b52a 100644 --- a/py-oxbow/uv.lock +++ b/py-oxbow/uv.lock @@ -324,86 +324,91 @@ wheels = 
[ [[package]] name = "arro3-core" -version = "0.4.6" +version = "0.8.0" source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/85/e7/28f27fccbdd0b8f9b87bcb65fd539f538f54f034257ded55e947ae9ea178/arro3_core-0.4.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:d742956e0ce809705aaab582a3f92886fdaeb5d8eba1f659ed8b128b15ff8393", size = 2313992, upload-time = "2025-03-10T17:26:35.248Z" }, - { url = "https://files.pythonhosted.org/packages/e5/dc/f85e3da479703688ca981bd297aa4a63fba60fe5c844e60df8476d286796/arro3_core-0.4.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:25f8d1b269a08063a45753999b87bb0ba167b7f07258fdaeffcbd96ec8a9b3d0", size = 2031297, upload-time = "2025-03-10T17:26:36.768Z" }, - { url = "https://files.pythonhosted.org/packages/82/98/fc596ab7cd23357b4dbef7a3e41c57d095b57271a2d610d5c0cf8d33f6ae/arro3_core-0.4.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f1ce415289c3aa92539b0e05f06f1522a48b217598b1cc9487ed19ea5c61786b", size = 2495227, upload-time = "2025-03-10T17:26:38.119Z" }, - { url = "https://files.pythonhosted.org/packages/24/49/28e38a759e11ad3bd0e6b1e3234c56e9992c68d139ddb0b6f9be01f37a11/arro3_core-0.4.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7166c5d30db3856c0fc255697d7ff6ab37bcf0e0e8b8f0eedb41d2f63d1c6872", size = 2413316, upload-time = "2025-03-10T17:26:39.24Z" }, - { url = "https://files.pythonhosted.org/packages/77/70/5339a87f163fbdde6a0e5a04bccf710d12ce84c26338b575dc2a24b3181f/arro3_core-0.4.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50275220ef52d4692bb30a9c640c8ddf8a06d99ddec41309aa7bafeec3351071", size = 3321622, upload-time = "2025-03-10T17:26:40.413Z" }, - { url = "https://files.pythonhosted.org/packages/58/73/9cc5dd54778e191683a5a8ca6da5bb124fb5adac56e3a54c2da3ad44a19d/arro3_core-0.4.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:58955a2c64aa9fdaf735711bf99eb6a0f1a5aac87f304f9445c51eeb578d6c02", size = 2408942, upload-time = "2025-03-10T17:26:42.882Z" }, - { url = "https://files.pythonhosted.org/packages/7e/e7/ec8a150a6b6b0cd2105be38535e67a2c7a2502f34474c1f94512e411491f/arro3_core-0.4.6-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:f9c4e8883a9b14f8d40062dec84dafcf1a8d19a00392b7f200f06be3e84fbcc5", size = 2169821, upload-time = "2025-03-10T17:26:46.276Z" }, - { url = "https://files.pythonhosted.org/packages/f0/a2/04e5b33f5e3fe06dd6b2347457ba0c0c2515d0e42f4f38e3dc7aac73f713/arro3_core-0.4.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ff0518986d6f215a51478d12ec48deaf670967ec0d9d160e2149a12a5ee0b118", size = 2591771, upload-time = "2025-03-10T17:26:49.541Z" }, - { url = "https://files.pythonhosted.org/packages/24/9b/9fe01d85a39afa8418b2dacc3496494b288ee613e1fdfe2d9385996b945b/arro3_core-0.4.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e1483783882689ae5683ee0186024372c51b8f9b80a4392cc0777ffb579107bf", size = 2311987, upload-time = "2025-03-10T17:26:52.996Z" }, - { url = "https://files.pythonhosted.org/packages/08/37/d4bb242e7ae0aec4b15b6817fc4813ea396b21c3976a5c19d31198def50e/arro3_core-0.4.6-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:5e48c937cbeed3895b43429ddd1821684073bc21ccb066d0c9de98acc12d8650", size = 2756198, upload-time = "2025-03-10T17:26:56.897Z" }, - { url = "https://files.pythonhosted.org/packages/c0/e5/573af99fb52c892d3d8d35ca1312334018fe17e77d33d47f5d40f02aa6cf/arro3_core-0.4.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5b38d56d0168a4378fe071af0fd70c24e83d5756881d2d090516fdbdc42b8db6", size = 2655993, upload-time = "2025-03-10T17:27:00.301Z" }, - { url = "https://files.pythonhosted.org/packages/24/4f/cbb260f40d1a3d643ed1387d18130042dbdb4c625900e3768710376156dd/arro3_core-0.4.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:34a2d922ee8d291a3364894f282ae5110089b8551edf97944c7cf24e4a979816", size = 
2576821, upload-time = "2025-03-10T17:27:03.6Z" }, - { url = "https://files.pythonhosted.org/packages/d1/d7/f5326da29cf1b35522ea47835d230874045709df3c58898387a35a02f06f/arro3_core-0.4.6-cp310-cp310-win_amd64.whl", hash = "sha256:dce9807f827f9cf13303260712f2e837fd67fe83a0ff94bfcb70274043e347b0", size = 2459384, upload-time = "2025-03-10T17:27:07.146Z" }, - { url = "https://files.pythonhosted.org/packages/d8/49/ad211d1ab6f7a96aa5690ceab3c66d66ad832b33f55a4bafa96598701947/arro3_core-0.4.6-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6675a3ef15afc8615595d02013b544f2ca19bf91ff1043882d87bc4fc225cddb", size = 2314439, upload-time = "2025-03-10T17:27:10.587Z" }, - { url = "https://files.pythonhosted.org/packages/26/62/0c6b94c73c48104131b80b3f7cf6818c994f33a1c9d217f9056527860196/arro3_core-0.4.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfcc8a06b8ee16aa2de291e5d3c2edb47b991ae36afa944e076d1945daa1016c", size = 2031925, upload-time = "2025-03-10T17:27:13.794Z" }, - { url = "https://files.pythonhosted.org/packages/bb/56/3ce0cbd6381c62247f9389c35b5331c4d52eb20c571afd8fec85370178ab/arro3_core-0.4.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee932a9d8798a1434d428d7e3d1d0f2f53572f2bfe8ff59fdb32938aa0bee095", size = 2495766, upload-time = "2025-03-10T17:27:16.886Z" }, - { url = "https://files.pythonhosted.org/packages/ee/3a/56dd7a8320fe3f181fefd5c9ca376c070ad6d6d209b0162fec8f73ea1334/arro3_core-0.4.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2e9e23d4408c32dbcc5a0974866448bf35707ef0fcfa2d10e20cab678287d1f", size = 2413679, upload-time = "2025-03-10T17:27:20.075Z" }, - { url = "https://files.pythonhosted.org/packages/bb/a1/b62a1e18dd1f0fbb1ad4822c764e83f70f567c32986b2375ab86c9d33d71/arro3_core-0.4.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:809a8d17ae2e5606ca220e5665ef7c5928994c23d9837f470bdf667f9c1a5092", size = 3312049, upload-time = "2025-03-10T17:27:23.298Z" }, - 
{ url = "https://files.pythonhosted.org/packages/dd/1a/b6c7e38b36c892600d881eeeb174deaa595df31e845e9553b90aad51d356/arro3_core-0.4.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d89e7597fe3abdb1c90742e29feb0311a53cefe41e3732c422b311c451a6173", size = 2409350, upload-time = "2025-03-10T17:27:26.666Z" }, - { url = "https://files.pythonhosted.org/packages/68/02/d17c291a15472e1b521128ca1360d4b3aff98f0906890334db1dd6c75d5b/arro3_core-0.4.6-cp311-cp311-manylinux_2_24_aarch64.whl", hash = "sha256:3d70b13683aeaa829ab0687cad69f13cdbc9d766daffadffef95390dc33062ee", size = 2169964, upload-time = "2025-03-10T17:27:31.839Z" }, - { url = "https://files.pythonhosted.org/packages/33/66/f23e945acd76e9c3d6446ed03b989fc6be57d0fcc866f080294ac270921b/arro3_core-0.4.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c28a2b91865f4d3a0370a385f29fcb7d9e8cd8bfc5615c84e728cc7f5b2c78c3", size = 2591988, upload-time = "2025-03-10T17:27:34.815Z" }, - { url = "https://files.pythonhosted.org/packages/11/45/37fc865bee4953f3072fb4ddd8a8336ad1effda0a5a3fa96b0b3325d59b3/arro3_core-0.4.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c257ed771b0c943d62225faf5bdb035a34c3be2ca499bac168d5d23fc34cab02", size = 2312133, upload-time = "2025-03-10T17:27:38.133Z" }, - { url = "https://files.pythonhosted.org/packages/76/52/0efa431a9ba2541a2da6cc7a710199b8f9c3b49d9fe08e0a1f18e8d99049/arro3_core-0.4.6-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:bfa4d282eee6af6f69c7483d25fbcec0bed42ec0f760772890fc861c4ac1e428", size = 2756649, upload-time = "2025-03-10T17:27:41.591Z" }, - { url = "https://files.pythonhosted.org/packages/64/33/620208bdd2e90c0a6c96d63465d86ba61cad57f646b0855a3c67eff15512/arro3_core-0.4.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ce0b74f548e0856b7c6b02229698f23269b39e16231df354c69d16846c0b7140", size = 2656310, upload-time = "2025-03-10T17:27:45.006Z" }, - { url = 
"https://files.pythonhosted.org/packages/36/4a/d76a0718cd7db5f40dcd248e8edac7a701dc0f42c522df2a4c41832173f9/arro3_core-0.4.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0ec3316736eeae4b64906a5ef7d50413f9b7e4e91a7d884d1885a4a4eacc8faf", size = 2576800, upload-time = "2025-03-10T17:27:48.455Z" }, - { url = "https://files.pythonhosted.org/packages/e8/da/d00c7079f32f8e2b4c52eb5e9d38eeccb19516978ec14da212c433f1bb59/arro3_core-0.4.6-cp311-cp311-win_amd64.whl", hash = "sha256:c9645a9dfa02376cfb183847e99c68959b9dd8250a5bf53f9b64ace9f1bf6ff2", size = 2459619, upload-time = "2025-03-10T17:27:52.2Z" }, - { url = "https://files.pythonhosted.org/packages/f9/43/8311326212b35c3f0b9ecee5a91aebdf80e18373f681678c7180108702c1/arro3_core-0.4.6-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:97affacc9b038178c640ec5f853102252215a6195f1b33cc3eef15482c932452", size = 2305502, upload-time = "2025-03-10T17:27:56.213Z" }, - { url = "https://files.pythonhosted.org/packages/5f/06/f6792fcadb910b8a674fe4ef10a32c0628228681566ba12b123a07358b9e/arro3_core-0.4.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5a24cd950871d01f7ef35f90fb5b3b81d6ea1b7bab6d054bab847778289be9f8", size = 2023040, upload-time = "2025-03-10T17:27:59.728Z" }, - { url = "https://files.pythonhosted.org/packages/74/a6/730d19346f9daa3b7b9df8c8f6a5aec95f9e97024cb3fe1be64e6156373a/arro3_core-0.4.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3f42db997815e583b9610be02cbd967f9b6cca3e39857b7f68e6bddf2ddc71bb", size = 2487674, upload-time = "2025-03-10T17:28:03.514Z" }, - { url = "https://files.pythonhosted.org/packages/0f/ae/44fe7f8e79638d8670341c3bed60346d2334aaf82bca9a1b94f4fd7e4068/arro3_core-0.4.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4327bb25bce7fb885ab89b5ce37527a2038a24cec91feef57f2f5bfb1e378944", size = 2405470, upload-time = "2025-03-10T17:28:06.742Z" }, - { url = 
"https://files.pythonhosted.org/packages/3c/df/51589a2ee34aef32835e6c9c02137d129d03903be8ce1320fa102a5a8891/arro3_core-0.4.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c415104ed761a48ac901a7af01f712cb8a888087227b99aaf53532ac42e1a3fe", size = 3336672, upload-time = "2025-03-10T17:28:10.151Z" }, - { url = "https://files.pythonhosted.org/packages/b9/2d/82442dac4c43abff91106a6de6f3ea16833b1d076deb68852914455c5833/arro3_core-0.4.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cce0f0c2858a8aacbfbcf49013335b7a287e97ec6d7e951a9e6a6a50c110f9aa", size = 2407915, upload-time = "2025-03-10T17:28:13.539Z" }, - { url = "https://files.pythonhosted.org/packages/65/38/5534fe23b947fdf022f0407bebefb89335374bef248d605706353247dc06/arro3_core-0.4.6-cp312-cp312-manylinux_2_24_aarch64.whl", hash = "sha256:462f0f96bb4db3d938e6352a56b568ebbd2645fa146a395f1bbe60c7d8a18d97", size = 2167090, upload-time = "2025-03-10T17:28:17.851Z" }, - { url = "https://files.pythonhosted.org/packages/9a/49/6b7882afb3e9e9d40758c11c7e949816f731d38193265559d3c49bfc6405/arro3_core-0.4.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9ae177b7b871da4d8e81b49d8c956738a7090b1ebd1fb75923a0104507385f49", size = 2584825, upload-time = "2025-03-10T17:28:21.004Z" }, - { url = "https://files.pythonhosted.org/packages/be/a5/137f42559f1a12ddff33b5838e423ce49d8a8ffaabe6a987b9fd1c6967b7/arro3_core-0.4.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8045272f40725cec96720426318e80598e1f229d5d0caa662753a5106e67acfe", size = 2309470, upload-time = "2025-03-10T17:28:24.269Z" }, - { url = "https://files.pythonhosted.org/packages/f0/e3/ff210d5936bb5c2e8961c997a83d3bdb9959d3eeb6dceb355b45bf5e299e/arro3_core-0.4.6-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:4f3c759d8dbdd68a52d8f723fb4f9a10e439e25af470dde71bce9c33521ef0ec", size = 2748426, upload-time = "2025-03-10T17:28:27.658Z" }, - { url = 
"https://files.pythonhosted.org/packages/da/16/3fa01e44da91c34a619a997d0bcbb8cfbb91761f46651ab6b8e57959646e/arro3_core-0.4.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:05bced8ed3b360a51a1ab552a911510e57fb4166102b07865cf0ab6da80b4e16", size = 2652472, upload-time = "2025-03-10T17:28:32.182Z" }, - { url = "https://files.pythonhosted.org/packages/19/8f/25b04b14b04c5f1e7e8d9a95fe2ecf3730f3f8d6a15c61d15ce00c4dcdde/arro3_core-0.4.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2e96293d0def7c3debb791bdcef4dfabc67a2874a52fa64ed739dfe9a0a5a516", size = 2575049, upload-time = "2025-03-10T17:28:35.818Z" }, - { url = "https://files.pythonhosted.org/packages/f2/0b/804bce1b81485b774d3028b0ed11bf66f943433b4bc81dd41636ae930d2e/arro3_core-0.4.6-cp312-cp312-win_amd64.whl", hash = "sha256:bc0444060399bc19c89079e44ce5a2ca7b3cd70124bc5ae03eee1d03b2702767", size = 2460621, upload-time = "2025-03-10T17:28:39.1Z" }, - { url = "https://files.pythonhosted.org/packages/8a/67/b5a5f06035972e4613dfe5433e4d11f6bd0f0a1665ad88fb70e4b5ed1ef5/arro3_core-0.4.6-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:ec9b1969858257bef233516a626b06596524121057523a2cc3412366d7f69309", size = 2305374, upload-time = "2025-03-10T17:28:42.767Z" }, - { url = "https://files.pythonhosted.org/packages/d5/03/81a30777c55a6eb080af669e28ac59872d1dcce0ea9cc44bca206ddc1daa/arro3_core-0.4.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7365673fbec5a07d5dddf93304369555e3bef5679db9dde1031291d3d1a8a705", size = 2022986, upload-time = "2025-03-10T17:28:46.362Z" }, - { url = "https://files.pythonhosted.org/packages/15/cf/e3d31ee19a1e9b77f39e5dcc15c0f2d2dc0880009dd427a50760a5118a85/arro3_core-0.4.6-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1ad36deb9679b14e18f89f7baeab84a1b849705981a246c851a2eec1f51bc4e", size = 2486829, upload-time = "2025-03-10T17:28:49.69Z" }, - { url = 
"https://files.pythonhosted.org/packages/af/11/b314d459a37808e739f6fda7b43191e32903ace512104aa63d62dbd7c42b/arro3_core-0.4.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a49b83ea12551439d849ccc96f389c04353ab7c11343fa2c83a775b59551a73c", size = 2405204, upload-time = "2025-03-10T17:28:53.389Z" }, - { url = "https://files.pythonhosted.org/packages/fe/a0/22bfe24742acfdd3b37cc68f4e0015e6d4e35e24d6b39fdcf08bc3f77295/arro3_core-0.4.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6ecc1292ceb721e9fb6d50be16f556cc61db4050ec26f3d10f58d4b9a7cca50d", size = 3336385, upload-time = "2025-03-10T17:28:57.049Z" }, - { url = "https://files.pythonhosted.org/packages/ea/d4/4c35efcb9ae64ce224f82ccaf6e513aeedb6c43a1f0b0ba2f2cd81b98e40/arro3_core-0.4.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c07689409a5950794a724ce315e27e0ff7e5d21235a7bc84271601b96febbf07", size = 2407329, upload-time = "2025-03-10T17:29:00.503Z" }, - { url = "https://files.pythonhosted.org/packages/88/7e/3077a07587a8655c642c7c215dc98d0c5dc6e74c2ebf67fc756e2cc1a2ec/arro3_core-0.4.6-cp313-cp313-manylinux_2_24_aarch64.whl", hash = "sha256:d3d3938342949d05e079b8f8fcf26e0c77ed674a40df4eee40e20dcdef6da67e", size = 2166951, upload-time = "2025-03-10T17:29:03.588Z" }, - { url = "https://files.pythonhosted.org/packages/61/04/33797dae797045ae70230d4bc56fd65d10ff17357ac7067ea4fb7e3be712/arro3_core-0.4.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eaf92f8418e670ab32dbb6decd5490d49cdac9ab763737dc63551dfb751d9ee3", size = 2584396, upload-time = "2025-03-10T17:29:06.761Z" }, - { url = "https://files.pythonhosted.org/packages/94/5e/b53ba2277e458f7f20e67879a7b2aef616dd5d80fd62376d6c8baef52d59/arro3_core-0.4.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a0425774da6aea06ad8fef05778d9ac3c0e1c99bb0f5c3f466f70345aca05750", size = 2309280, upload-time = "2025-03-10T17:29:10.453Z" }, - { url = 
"https://files.pythonhosted.org/packages/72/0a/5e85d21d756c7e93cd0e91b477f0458a06e200938c9d50536f2528acc48a/arro3_core-0.4.6-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:5d0f2efb57540222abd22a1929caf44975bee02d47595a6b8b77a3a20e1e2e47", size = 2747742, upload-time = "2025-03-10T17:29:14.178Z" }, - { url = "https://files.pythonhosted.org/packages/70/53/25884a19251a4b025443b30dd21b95162dfc326557cd518837f71de5655b/arro3_core-0.4.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e37fb193ac79eeb6aac3071264da7679e27eb817267681ef3c2774160f37fbb1", size = 2652235, upload-time = "2025-03-10T17:29:17.679Z" }, - { url = "https://files.pythonhosted.org/packages/5e/5c/267ef3c8c1ffa84e1e551a2868eb59b93321cc149199133e648249a7f3c3/arro3_core-0.4.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d510c9fa65a473c2eb61bcae6b14fea90fbe23b242cdd51ba348eef24f2a8d0f", size = 2574505, upload-time = "2025-03-10T17:29:21.351Z" }, - { url = "https://files.pythonhosted.org/packages/16/01/4b6ea2bfebc0c3969ee1d67d25d4361ddac6c40f1b6a1ab93a5b92fa2b47/arro3_core-0.4.6-cp313-cp313-win_amd64.whl", hash = "sha256:e6fa834dcd30b140cdfd6b19975a01b2e36f506464e569ca4bf368e13f6550bb", size = 2460117, upload-time = "2025-03-10T17:29:24.782Z" }, - { url = "https://files.pythonhosted.org/packages/20/70/c198dd2a633d04d4282b41d61f259ee554b5bf7e9dbb627d044f32aec409/arro3_core-0.4.6-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5e1f2e8b1aba783b6602296a324846ad3df6526f304a8a071b94ebf788a78e16", size = 2316495, upload-time = "2025-03-10T17:29:28.071Z" }, - { url = "https://files.pythonhosted.org/packages/6e/aa/4644cf4e5a21a210d06300a40e331bab4d085d08ff21dbc0443402f5970b/arro3_core-0.4.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2f61856bfcbfda2e99b1a119295e4e1ebc7b2016437e2b81cc4bd5457c3f57c3", size = 2033187, upload-time = "2025-03-10T17:29:31.327Z" }, - { url = 
"https://files.pythonhosted.org/packages/84/e3/2bc4007f6eb0231931c69679851d53a2b6321fe6091a7c7863ef917572c1/arro3_core-0.4.6-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c22226c0ccb48606eb38e268b1ee25d165c861caab18eb38f27a28117908508a", size = 2496516, upload-time = "2025-03-10T17:29:34.557Z" }, - { url = "https://files.pythonhosted.org/packages/4b/53/46685d3e11d9be8bb97d8c77c9bdcf495b62c8b9bea3d5c785f7e85b68b8/arro3_core-0.4.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c9f92b8831194c9eba4946fee0fa6c764d41f34b51cdd702b8585d0fbf1a187", size = 2415126, upload-time = "2025-03-10T17:29:37.985Z" }, - { url = "https://files.pythonhosted.org/packages/ea/3b/684b72ddbfc4cf847f39712df46249becd5d41d40ddd94c80340a0f07b65/arro3_core-0.4.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7056e7666a9345561b7ca6c50a2514d3e27c79088be08122ddf2d581916a9a67", size = 3324822, upload-time = "2025-03-10T17:29:41.272Z" }, - { url = "https://files.pythonhosted.org/packages/a0/c8/e593e9904d22ead32c3f098122161c588a674d11e33a1007a0a358edc841/arro3_core-0.4.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8e54d6b4e7a65b64aac93283d33cd3993c97ccb86fc6a453526a29fecd04db0", size = 2410702, upload-time = "2025-03-10T17:29:44.599Z" }, - { url = "https://files.pythonhosted.org/packages/c6/8f/605bccb2b5350a5bdc8cda042ebd21d5b6b4139d862cb768ad775a051dbe/arro3_core-0.4.6-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:db3f0d2d9bc84fa84a1a9349f58796a82b62115e6f0ef4e2cd49922d04748ac9", size = 2170959, upload-time = "2025-03-10T17:29:48.112Z" }, - { url = "https://files.pythonhosted.org/packages/53/b3/edbd25ee0a92f0cfcc9650d8e4d1c39376fb11f6f9eec5197b2c88da00ad/arro3_core-0.4.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2ebf77cf541a06dc9350d2404225c043a360c3ee60fe850cb587f3cce96c2640", size = 2593127, upload-time = "2025-03-10T17:29:51.437Z" }, - { url = 
"https://files.pythonhosted.org/packages/ef/40/b5ab851ab8db0e5b3e1633861da76a788f4c578448a9c869cd03b4ee83a6/arro3_core-0.4.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a479ef3a777ffeb74ec74b8d171c9120bb5bfc6ab807277e73be36c155320bc7", size = 2314178, upload-time = "2025-03-10T17:29:55.257Z" }, - { url = "https://files.pythonhosted.org/packages/08/de/5e7e2c5abf4bd8889328e8f66d68fcb6c05a83b6d59e26cbe5f5c810e461/arro3_core-0.4.6-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:892e5148a563dfab4136a7148cb1e26975b06e8bc90b61dcd6ded4baaf608922", size = 2757568, upload-time = "2025-03-10T17:29:58.502Z" }, - { url = "https://files.pythonhosted.org/packages/16/00/03f7969d58d721c0ff19af98c558c12b295af4381ebfcf37d389e517963b/arro3_core-0.4.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8c92a9c9545d9876e458a6a59d98f5dc3f081195ba283357c42a23eb7554bf82", size = 2656913, upload-time = "2025-03-10T17:30:01.838Z" }, - { url = "https://files.pythonhosted.org/packages/5c/ef/371c1df195351cf78a60b43584e01abe5a1a24890dc74266f5438d4d7a7f/arro3_core-0.4.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b875e3216598cdad4f9ea347b935b5cfc8476cb0e528875e3d8a99a4aa58fb92", size = 2578647, upload-time = "2025-03-10T17:30:05.867Z" }, - { url = "https://files.pythonhosted.org/packages/bf/94/4fa30d3d0f71cd8db910dd4f35361af5a72ab2db50a5f02ec2d376ba1ca3/arro3_core-0.4.6-cp39-cp39-win_amd64.whl", hash = "sha256:bc1a7bd457b93d533090b97ed8314090ee90351e11a328af3dae8b89dff09a28", size = 2460349, upload-time = "2025-03-10T17:30:09.1Z" }, - { url = "https://files.pythonhosted.org/packages/78/0f/9fc952987f5364bf165bc7bcfa17f4cc36ca3f624a59604219954a3ec593/arro3_core-0.4.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:95c93208049dba4f857113b875ce052814b5b7bab35bf49213ec1dde1057e376", size = 2313058, upload-time = "2025-03-10T17:30:12.677Z" }, - { url = 
"https://files.pythonhosted.org/packages/d9/9c/57a4e205287739437dca1d4f95075a040500aca90fb3a2ed914d4b565aa8/arro3_core-0.4.6-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b6d9eed939cca052bbfcd8c4b11122875400fe0dffb49d1d7f6b7d0ac770fb2a", size = 2030809, upload-time = "2025-03-10T17:30:16.237Z" }, - { url = "https://files.pythonhosted.org/packages/bf/25/fa3dffc31c0393ac0c6f1b2c2dc7337d9cfa63210d3633f3b45b73e68723/arro3_core-0.4.6-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ae4cbdaffbb8eec63540cefec5b125b4fb49933c8cc1ce09f5dfd7b1fddf7fe5", size = 2494695, upload-time = "2025-03-10T17:30:19.671Z" }, - { url = "https://files.pythonhosted.org/packages/f4/0c/fb2a5ffe8795c896b404991f54201ae4ce99e848d76694313fce3bd07f51/arro3_core-0.4.6-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dbf47ae175c45f3db077279b2fc0428822736d356d604be7815ba2054fc87279", size = 2412527, upload-time = "2025-03-10T17:30:22.921Z" }, - { url = "https://files.pythonhosted.org/packages/34/ac/8258c3ff7052ddae517ba3fbf7a2421d7c7469278b46d381024357b7ffc6/arro3_core-0.4.6-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab93f872c8585012830f8f804a5073ae23fb8d7e5656715ca6097429ebbc77ca", size = 3320601, upload-time = "2025-03-10T17:30:26.258Z" }, - { url = "https://files.pythonhosted.org/packages/95/50/d10164ed9a137d21c75aafb050eb3888008618a0506907fc7c6093ec23b6/arro3_core-0.4.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94c01adcff5f4ff7610689163d7a42fa8ebd138aa5d502535a94f881ffb01b7d", size = 2408571, upload-time = "2025-03-10T17:30:29.768Z" }, - { url = "https://files.pythonhosted.org/packages/9c/fd/2099987b064366634cb9337a46d71a1401fa3403c75f4e4298c26ee3ca94/arro3_core-0.4.6-pp310-pypy310_pp73-manylinux_2_24_aarch64.whl", hash = "sha256:d0b4928fe59b6d1139013463babd0961b83d2189782df0617e8607e783317f23", size = 2169277, upload-time = 
"2025-03-10T17:30:34.104Z" }, - { url = "https://files.pythonhosted.org/packages/f9/ce/46376494b93a9734ce5dad6fda897283b5f6dd5f69732e0e2f4d380ca8b9/arro3_core-0.4.6-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:434eaab117e83ef4295e607613a8091c63aa83216c6b983053b56a94d474c007", size = 2591181, upload-time = "2025-03-10T17:30:38.282Z" }, - { url = "https://files.pythonhosted.org/packages/c4/03/d55cbe38dd4534f3f6cf70d15b46c3cb851f6e0eb86aa89639b15e3c9db2/arro3_core-0.4.6-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:8bd3975595c7f5d110c56024a0dd1b87bd9aa3962296867aafcd389ffd2a2351", size = 2311938, upload-time = "2025-03-10T17:30:41.32Z" }, - { url = "https://files.pythonhosted.org/packages/1f/53/11e69b2297454d7a7623a815382a24017ad1e27be63505938597cb5c774f/arro3_core-0.4.6-pp310-pypy310_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:e3af28607bc897be053abebc5e0a9009046297ea58206b4cdef9fc91f7ed9b06", size = 2755644, upload-time = "2025-03-10T17:30:44.77Z" }, - { url = "https://files.pythonhosted.org/packages/f4/80/b23f50d5834bcb0f6ca37e7a9a69e492f19164d0ddffef6559f865c9cd0b/arro3_core-0.4.6-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:77e621a03918faea2a5c21636b519b346421806084131001428f3f59775dbab9", size = 2656018, upload-time = "2025-03-10T17:30:48.128Z" }, - { url = "https://files.pythonhosted.org/packages/8d/a5/7ba168683f7efa55ae7673fdf8e1fdee8cb0e74d9177d0c04345d4e56fd2/arro3_core-0.4.6-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:957162e769eed29f39d645102ae3b1646b968fdd55f62831ddeb91d8a72b0a41", size = 2576173, upload-time = "2025-03-10T17:30:52.763Z" }, +dependencies = [ + { name = "typing-extensions", version = "4.13.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or python_full_version == '3.11.*'" }, + { name = "typing-extensions", version = "4.15.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == 
'3.10.*'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a5/e7/d84370ea85be641a8c57f4f8296e8465d30e46938cc9480d384a3ee0084c/arro3_core-0.8.0.tar.gz", hash = "sha256:b75d8281b87a87d3b66836bab89951ae06421970e5f880717723a93e38743f40", size = 93557, upload-time = "2026-02-23T15:12:20.622Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/f8/a3b022a4e8d8f4a7d28ab379105df756be1351c88576f0ab6a47cbfdc2ee/arro3_core-0.8.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ac43f746d5331bb37ed1b0ade18a12707b64cb85b3eb5cc1d5d7b5029f1f2c12", size = 2894496, upload-time = "2026-02-23T15:09:28.302Z" }, + { url = "https://files.pythonhosted.org/packages/57/f6/a92704f33af317ce33c2bbda4a63f902f088d24b92a89fb5cdc52148e7cb/arro3_core-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29b3d1cbd2c4bac787f473d071e1eb02b71b2701a7118bb5d0a274ffbd26b16c", size = 2629080, upload-time = "2026-02-23T15:09:29.932Z" }, + { url = "https://files.pythonhosted.org/packages/b5/41/082dac085cde3e4adfd3c09b57a265fb6fb6ff2595a02ac06efa80e2a65f/arro3_core-0.8.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ea28c1f0d7bd327b0116557e041151da7eca3362e1ffe8cc9f53832c808a75f8", size = 3105123, upload-time = "2026-02-23T14:48:28.73Z" }, + { url = "https://files.pythonhosted.org/packages/7f/b4/dd6353739155e2013a1cec77092951d92e086c2ac7bb44dd3c2c0f908cab/arro3_core-0.8.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1244885a5b3aebebcfedac2c30a83a635f15d65bc9079e32c16cae07ec3b4db2", size = 3214068, upload-time = "2026-02-23T14:49:10.93Z" }, + { url = "https://files.pythonhosted.org/packages/99/d4/3fad1e5559ca26f3d9a1235405c0529df19bed11952dc6d672c9898ea341/arro3_core-0.8.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6147bc60d36bce9241e5972dc344fb144eabe8cef4e2c0812eb58cdfadebeacb", size = 3423152, upload-time = "2026-02-23T14:50:46.919Z" }, + { url = 
"https://files.pythonhosted.org/packages/28/0f/ae28551a2cc20c87a0cd435045824501258a8564f12981802e5ad68a54c9/arro3_core-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:524e2ce13ea3d2739df4d52ea03977a53d103c1fd73f0fcc6a713903ea6ad4fa", size = 2992192, upload-time = "2026-02-23T14:51:01.83Z" }, + { url = "https://files.pythonhosted.org/packages/74/ea/a754cf21dd367dd59f91f84bac3a0a2043dc9883c2b963edac191f1f3a51/arro3_core-0.8.0-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:12a7eddb4b406a4d9343bd9d42d2bb40de0bdc4ad5f50bfe10b0836e98ac2285", size = 2774834, upload-time = "2026-02-23T14:47:44.872Z" }, + { url = "https://files.pythonhosted.org/packages/e8/92/6ec0384dbdad44baf1933449cfa7a04575111df2fd5e873c8e97f5d2fc51/arro3_core-0.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c8f36b7a064c0080d2db903d52ffe0682602f26c1d7ccc9347f846b4bfe2cad8", size = 3200721, upload-time = "2026-02-23T14:51:14.655Z" }, + { url = "https://files.pythonhosted.org/packages/0f/79/d9b65859bdbdcce43c30c92f4a157e78e919241e996f19636a62594ef149/arro3_core-0.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e8e1230c94f55d5a9ffcd99580a0d38e9299cdf743e9d5ad7595be862b5dc21c", size = 2950513, upload-time = "2026-02-23T15:09:41.907Z" }, + { url = "https://files.pythonhosted.org/packages/ce/07/41d033d5faf544e6515faa8feb2df83c34a3b4682b5c1ae2d4e2826a6f1f/arro3_core-0.8.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:d30a515f1f52afb54b7fe1306179a8a18c9e8c0ef6631eeec82ebab21cc07a8a", size = 3382707, upload-time = "2026-02-23T15:09:45.891Z" }, + { url = "https://files.pythonhosted.org/packages/f9/ab/2bba797c3041a50770e81e29e508bb5c5079566ae47d13c63e2a61886061/arro3_core-0.8.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0a60a568ab9d96eb0f52670da8a0b356369d32460d30857fd60cc5c7b74e1d02", size = 3309259, upload-time = "2026-02-23T15:09:48.924Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/5c/cb1cfc4da613901bde7a2ef70dad67f873afb8842d8cece4d789c970568d/arro3_core-0.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bf2aefe40ca6e374fad1c94200f586bd22917717f275d48968846b8e5c698d5e", size = 3210182, upload-time = "2026-02-23T15:09:53.025Z" }, + { url = "https://files.pythonhosted.org/packages/24/4d/d9d0be057c2fb806c21e1781f192081954f79155c8f9be7f3ee8d4d92d34/arro3_core-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:a928179451fe32564b39989ad737d769c2d0343ee71e8b3a4ebd3dd8c9d2c8f7", size = 3165683, upload-time = "2026-02-23T15:09:56.972Z" }, + { url = "https://files.pythonhosted.org/packages/77/61/a6a33a24bc4eccfbf168d7765d96488193789b48d8a916d8d42aae3a8e75/arro3_core-0.8.0-cp311-abi3-macosx_10_12_x86_64.whl", hash = "sha256:051b1c46b424c207b7ee2f5ae50f8f88cb79d167c3e4000adf59a0e3e3994331", size = 2901125, upload-time = "2026-02-23T15:10:00.796Z" }, + { url = "https://files.pythonhosted.org/packages/d4/60/cfe8b327ea30d8183e9b9eaca9668a8e6ce7c6e187701dc83a0820ddc0fb/arro3_core-0.8.0-cp311-abi3-macosx_11_0_arm64.whl", hash = "sha256:c6b0e0b8914e634096fb377046bfcd21420b50141394e8cc1b12d43a98df1a43", size = 2632882, upload-time = "2026-02-23T15:10:04.335Z" }, + { url = "https://files.pythonhosted.org/packages/c0/99/71d9e31022d68c8cf104ed9c744291657c6a5fe94348869edfdaf1e8dab2/arro3_core-0.8.0-cp311-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e4c20b6a55016ecd3f37f7dadf4d13d5a03dd51b7385e8f4130931447d110700", size = 3108341, upload-time = "2026-02-23T14:48:30.745Z" }, + { url = "https://files.pythonhosted.org/packages/39/1f/c067cc12b306b8a0dbec1e24a9c9e32dc5b5f3f9179466873d5c5666f124/arro3_core-0.8.0-cp311-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90dbbde6294d7349b2713e308cd3ef284de75003e8b5ad927f1716e7062525ce", size = 3216570, upload-time = "2026-02-23T14:49:12.829Z" }, + { url = 
"https://files.pythonhosted.org/packages/1b/9b/f253dd3281e2d980c81e1526f9386b24c6a55e9bd152dd259032f94aceee/arro3_core-0.8.0-cp311-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ee6693d496ab733fce43b2e83f9f7b5147db6906b3fbeba3b2d4108ffae5fbec", size = 3422198, upload-time = "2026-02-23T14:50:50.472Z" }, + { url = "https://files.pythonhosted.org/packages/2e/66/70786ee1cfdd03d36d456c4ef02a35506b7ae256c70a74bd7abf135daba0/arro3_core-0.8.0-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d882481e2f739fe449ca9bf724f4b33185fc48ba87dd82a26a64e6a23f5ed2f8", size = 2996395, upload-time = "2026-02-23T14:51:03.946Z" }, + { url = "https://files.pythonhosted.org/packages/f3/b6/adf08e655df3ea07c460f3e441736face4de29277fdd753d5ba1fd89a43e/arro3_core-0.8.0-cp311-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:d56d08a3e08864512d343a4d75e468beba743abc3a9d139e14bf3e81d0d8d79b", size = 2777566, upload-time = "2026-02-23T14:47:46.817Z" }, + { url = "https://files.pythonhosted.org/packages/07/9b/3d0b811a143372398b4c31eb58a9011774f20d184a1ba3d6dff99023205d/arro3_core-0.8.0-cp311-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:02c47e2d87f063e04c12c074f4cc66efd65fc9c6b14db7f80934827ec46c589d", size = 3203472, upload-time = "2026-02-23T14:51:16.938Z" }, + { url = "https://files.pythonhosted.org/packages/77/88/987517aa8902f93e6395bafa1ade91fadae3aef49474199de5e1f75e42c7/arro3_core-0.8.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:72fa13702df4698884900e60824fecda434f61ffecb5ff0d914bf9f0afa79fe9", size = 2950379, upload-time = "2026-02-23T15:10:17.001Z" }, + { url = "https://files.pythonhosted.org/packages/6a/3a/e059061b6ace4090b8ec4f9170811a3fdcca3181ff126c6714c382b144ed/arro3_core-0.8.0-cp311-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:8ab0bc6ad9b449b8a939e13ce94f6cacfea1d21953d437a8aa2ff8b4622512e0", size = 3386585, upload-time = "2026-02-23T15:10:18.51Z" }, + { url = 
"https://files.pythonhosted.org/packages/f8/80/7161d0d0326597775784db854e58b88d748127df7e072a099ec36c1fb355/arro3_core-0.8.0-cp311-abi3-musllinux_1_2_i686.whl", hash = "sha256:975a3e3dea90789608d40c54b4176b9b72c9664a4cd2c842914ac62c489b1f06", size = 3313967, upload-time = "2026-02-23T15:10:20.993Z" }, + { url = "https://files.pythonhosted.org/packages/3b/62/13fbb9fdfae011513f944e45804e528a041c0e35efab9363ccdd716cde65/arro3_core-0.8.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7c3658fda04e0816333c8dda702c777d305b581876cd4176b15877726231b350", size = 3215978, upload-time = "2026-02-23T15:10:24.593Z" }, + { url = "https://files.pythonhosted.org/packages/bf/81/c0983e56969d8039116ffcf1bb3eafc17f8f34b2b63229970562bba6b52c/arro3_core-0.8.0-cp311-abi3-win_amd64.whl", hash = "sha256:a988c6cb74f97df4d276d5496f8667b6d5d95311d453ef32b28fb933b5ae96c4", size = 3176374, upload-time = "2026-02-23T15:10:27.902Z" }, + { url = "https://files.pythonhosted.org/packages/b8/b6/08f088efd3737bcdaed98057b51c9d20d622e62e5b7dd626c6d60e67bd93/arro3_core-0.8.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:3cfa6b5c3981711a602c357afae1f16a6daa380cac8365100365560852e51d4a", size = 2890907, upload-time = "2026-02-23T15:10:32.408Z" }, + { url = "https://files.pythonhosted.org/packages/0f/a4/2f1e20b879587a0419699a50e60aed9d2802423f8e5df844f31fa81f64d6/arro3_core-0.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4438167e4c357bafe66e8716adf5a55d73d79cf31bd4f7db465491605ee4afbc", size = 2625446, upload-time = "2026-02-23T15:10:36.324Z" }, + { url = "https://files.pythonhosted.org/packages/9c/e7/92dbdf38de67435f04b5e2d013460e5a12ccac8edabd6a47a159c2f8acf7/arro3_core-0.8.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5ddc9a49b04ff179e1f6281164ee88008e73a0a72a931449c24ad0f8897be220", size = 3108513, upload-time = "2026-02-23T14:48:32.841Z" }, + { url = 
"https://files.pythonhosted.org/packages/16/a8/b8e7c8b64f0df4fd9c0f0e2faa2753658664d2dec9109d4e2ae2d470fb14/arro3_core-0.8.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:85dfb4df87cd7e9adc17798e4468d5ea4f3e5dbd7845abebe1c85bba2a092ba3", size = 3211045, upload-time = "2026-02-23T14:49:14.962Z" }, + { url = "https://files.pythonhosted.org/packages/0f/e8/657194c4cfc8516984ec560cd326c1b6ab8e83becc6bdb761508019704b1/arro3_core-0.8.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0d4abad932811cadc1ae3e4976c4bb797e025c2451ae551edc60cf34a807edcf", size = 3424840, upload-time = "2026-02-23T14:50:52.742Z" }, + { url = "https://files.pythonhosted.org/packages/26/d6/0ceb8490347f3317cee4a902d3999a1d729cf9a074310d89a046fd93fb18/arro3_core-0.8.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c8a80c8ece04cb45328eba5667dacdef553dbe379443920f98b25d8ce3db761", size = 2994109, upload-time = "2026-02-23T14:51:05.837Z" }, + { url = "https://files.pythonhosted.org/packages/a5/82/1ef508fd796d341898a55f9c86f48ffa5d74a658159faad096d03929b419/arro3_core-0.8.0-cp313-cp313t-manylinux_2_24_aarch64.whl", hash = "sha256:12fc8c7133102c77661051a5e55c331a84dc58a3a8fe58fd18c38fcb61fa80d8", size = 2775585, upload-time = "2026-02-23T14:47:49.084Z" }, + { url = "https://files.pythonhosted.org/packages/d0/ac/7e23539e5ba39a6534eb374a3a0e0178d25e8278cdf3d531bca89bd2bd82/arro3_core-0.8.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:278f2d35b4144ef7c77a037fd68dccacd651eda462cf2e739a85043109749cd3", size = 3204688, upload-time = "2026-02-23T14:51:18.986Z" }, + { url = "https://files.pythonhosted.org/packages/f0/cc/e2788c16f383a82d75a273bfe6a741e647d5ba4615c884c462e0e8a7d53e/arro3_core-0.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b7173b44e8809eb772a8bdb51dd866edb32682aac0c80055ea8e3c79077ad8c5", size = 2950218, upload-time = "2026-02-23T15:10:48.828Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/7d/ba5ad9dcd69f8465011eef8558b7536eeb90384fa6f054874e2252d5a707/arro3_core-0.8.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:bc69ca8cbd02a2a0d63d8278182380ba79d62c798ada8768fd700e8e5168b4c1", size = 3386355, upload-time = "2026-02-23T15:10:51.527Z" }, + { url = "https://files.pythonhosted.org/packages/58/59/5369b3575af4093633f894206d94f3102a19b6e7f07c17f1c8035c78542e/arro3_core-0.8.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:bc02ce82e8681d87c1d9fa27c0bc8322c982d93ba12a317dce33756cee79f285", size = 3312564, upload-time = "2026-02-23T15:10:54.502Z" }, + { url = "https://files.pythonhosted.org/packages/08/d3/d3da1020627d6d9408979e4dd7f466a66cc08e41a1f2b778d8cdaf7725df/arro3_core-0.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3e66450987724a1f71bdfa1f721486af09bd07cb86227f575805e6f94f764b4f", size = 3213371, upload-time = "2026-02-23T15:10:56.666Z" }, + { url = "https://files.pythonhosted.org/packages/c9/47/dddb6852b57403a306a477d64befb2c0d0536baba8700581d785f0fef6e7/arro3_core-0.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:03fc7a1348a9d42f48061d45825e823985ee10c80aa509bafc0e84b10e7ecbb4", size = 3164236, upload-time = "2026-02-23T15:11:00.222Z" }, + { url = "https://files.pythonhosted.org/packages/68/3f/c15e183e63504c86e81d28c3672a9c3d01f48b7f9691a78c0e47cab831d3/arro3_core-0.8.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:be7dd0088bbab7b528d8d754b0fa05506e26da62f4a5d2f741fe94d7548e724e", size = 2890665, upload-time = "2026-02-23T15:11:04.753Z" }, + { url = "https://files.pythonhosted.org/packages/a1/45/b808cd7b1ba7afe6de4223414ca8191c030266d437ee69cce269b76e8a23/arro3_core-0.8.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:396496e96e4b86ac73aef32263c607c2161b878f334cf6ef954aaa74c8f1267f", size = 2625876, upload-time = "2026-02-23T15:11:08.236Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/63/cbb9f41624b6301dac4540e6fd5b6d18e6fe16c47bda0534330e6b22999e/arro3_core-0.8.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:828032a416906af1d84702301885098ab0bc2aa9f956f677b676161aeabeb06d", size = 3108175, upload-time = "2026-02-23T14:48:34.654Z" }, + { url = "https://files.pythonhosted.org/packages/75/f3/b9cf731acb9a910091518da1234d51904a1d0b615f16a13fc883331c627d/arro3_core-0.8.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87d56b263bbc747691d08b3902a5f0d77adfb180d0544f9c52d622b2b79cd21f", size = 3211409, upload-time = "2026-02-23T14:49:17.204Z" }, + { url = "https://files.pythonhosted.org/packages/24/f8/30992bf19380285a9bc1a0c52aae26802679911c3787e804952505e7c4e5/arro3_core-0.8.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7f08c07be0ff8d893d756ba20381b4fcbdf50af3c2bcec677529664920c07cf5", size = 3425205, upload-time = "2026-02-23T14:50:55.802Z" }, + { url = "https://files.pythonhosted.org/packages/04/51/44de5c60e3058947d8733cae3c916e33f96b875b05ac795188def5542680/arro3_core-0.8.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34b280c70fe6bd6ca4c236f552d09b51ac551dc1c24793c9142ce89087346371", size = 2994668, upload-time = "2026-02-23T14:51:07.771Z" }, + { url = "https://files.pythonhosted.org/packages/1e/79/447e62f939183216361c6bfc8e3445e21835c2ae1a31e4ab817eb5d7cdc4/arro3_core-0.8.0-cp314-cp314t-manylinux_2_24_aarch64.whl", hash = "sha256:37202b826dd9695fc775064806bc07897c04caacef9403ea9d6706635f95ebdd", size = 2775761, upload-time = "2026-02-23T14:47:50.944Z" }, + { url = "https://files.pythonhosted.org/packages/58/d7/aa6572d46908e2986968887cec55d6c771ceea6a0ab14c7d219365a4ee09/arro3_core-0.8.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b70530b95d36e1409023f7bde3e9aeb75e3048852beb44263d98685c9f0d8f37", size = 3204821, upload-time = "2026-02-23T14:51:21.002Z" }, + { 
url = "https://files.pythonhosted.org/packages/41/f2/3c14108c13872b4143ffec3cddde56921caab04e45bf3a473769e8ff5b59/arro3_core-0.8.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:faf03d46e0a1817bf3959c21f2ca4d2bd2d61277b5319439df3044082e10effa", size = 2950512, upload-time = "2026-02-23T15:11:20.941Z" }, + { url = "https://files.pythonhosted.org/packages/75/fc/b4e1b9f90543eb560683f05520abced6ca9b236f12b147490da538d6028f/arro3_core-0.8.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:7a120ee05477c7e28565ce0b7572413a093745bb314195c4206c0ef578abea1b", size = 3386434, upload-time = "2026-02-23T15:11:23.584Z" }, + { url = "https://files.pythonhosted.org/packages/f1/55/4c7fc0e9f4e816c49ba3b520d87478b4900db3ae3e5186d0d333300918cc/arro3_core-0.8.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:a19842cfa196f07c7fd7398d08eec5bdeed331b522dcbbf9d53830180f8d6d66", size = 3312814, upload-time = "2026-02-23T15:11:26.247Z" }, + { url = "https://files.pythonhosted.org/packages/e7/fc/a4209e468b87bec36ee41afe9a01848f6ac2855055fcefad57da04c8896a/arro3_core-0.8.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d6ceab802cc609498e47dc214967d282af8c3104c7a83aff008739192cf821e8", size = 3213623, upload-time = "2026-02-23T15:11:29.263Z" }, + { url = "https://files.pythonhosted.org/packages/c6/84/61882d6491f38d9362d9382a914a47fd3992c57ee76b35646ea01d65b0bb/arro3_core-0.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:355e22a8845cbc6379e705f71a08c9cdaab6a7facc63a863e43ee5dc56ed7976", size = 3163287, upload-time = "2026-02-23T15:11:31.69Z" }, + { url = "https://files.pythonhosted.org/packages/66/a7/4f2a5e2ef035fbfa7d61b4711c51714634cbbe90783b671c950523c17ef2/arro3_core-0.8.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:8e44f93f3a4295b33fcde8c8e7dd65cfab5f3a6996f6f8f76bcaba6097a72eb0", size = 2897301, upload-time = "2026-02-23T15:11:33.828Z" }, + { url = 
"https://files.pythonhosted.org/packages/bc/93/5d8372a6bf904ce0ad8fbfc486e716b5840540cd6bff28ab3a8bd5bc9e45/arro3_core-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8b90edee9e259245cdd18bf38ebc7bec70c55f609722ba0f2faaa73bac47f21d", size = 2632791, upload-time = "2026-02-23T15:11:37.571Z" }, + { url = "https://files.pythonhosted.org/packages/d7/c9/c7ae7b30baba05a71a55d92d4e9a735e9aea988fd48c07e90374d88f83e2/arro3_core-0.8.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:136a51f721fbb98cf27f40d99634a91b38bf34faf424ed0f1f9f3f0fba5073ea", size = 3107494, upload-time = "2026-02-23T14:48:36.491Z" }, + { url = "https://files.pythonhosted.org/packages/0f/98/5eb301502fc67ce130028a8a459031c04a9b775fc4c27e98243dd4a310dc/arro3_core-0.8.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:223fc71ff6a7146babefe341a4b77aa122e2f594742f68b7c81fdef3938d24f3", size = 3216789, upload-time = "2026-02-23T14:49:19.526Z" }, + { url = "https://files.pythonhosted.org/packages/e2/b0/3039da0f7e33d26bba22781a6ae7998883736fd76e30d47bffbc93b72f04/arro3_core-0.8.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8626e3e4d2009836f0e9ef6b61713f383fb1a13494cbe48719ccc1e8a99048de", size = 3424886, upload-time = "2026-02-23T14:50:57.709Z" }, + { url = "https://files.pythonhosted.org/packages/73/28/004ec37c50f73ce15e4997f8008efbdf70a96663ff704603544510f075d1/arro3_core-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6914887962eee77e2776290910afe8041b1d78f472a4e14024b79064a6dbcd1", size = 2996090, upload-time = "2026-02-23T14:51:09.841Z" }, + { url = "https://files.pythonhosted.org/packages/bc/c2/4fb2a97efe750ae8b0dc149914e3a062c2222258b68c430523331e74daad/arro3_core-0.8.0-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:484a8457240cc8ef48b0c457ab4d65c8543e1f2a9710356c7d93500919dc2666", size = 2780708, upload-time = "2026-02-23T14:47:52.724Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/a2/4c97cf459f837aff7409314e61e8f3b1e9445bb22ffe82b8bfb263159c6a/arro3_core-0.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:043afdc505fd3b27b69a3fc241fe0fd7f14c3850e3106e1e86627bb05cbc1e66", size = 3203987, upload-time = "2026-02-23T14:51:23.097Z" }, + { url = "https://files.pythonhosted.org/packages/e5/43/26c07e862213a0680972363778d4b7577a7f77afae9a9ba92134979ced4c/arro3_core-0.8.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0b5c1c44c11893a8c9e565aacd3ff3c5f508993f83f0b17ec9a6b7d5050aaf10", size = 2953856, upload-time = "2026-02-23T15:11:51.499Z" }, + { url = "https://files.pythonhosted.org/packages/d8/c0/a0b0ca18490707606a68c7e45a3f77df235e65e6b2c3229365636b84ca63/arro3_core-0.8.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:083135d8a67d28d8caa4928c2919030ea9f4dbd973886445c64f2f975bea550a", size = 3385886, upload-time = "2026-02-23T15:11:55.2Z" }, + { url = "https://files.pythonhosted.org/packages/56/7f/fb1ab44804b3be21ac5251d4534e5b8bc459714072ac3ec298c402b72088/arro3_core-0.8.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:bd53bea121a482efa8fb3a5abae0e434e10568618cc7f1ceb5c2c96b19078a29", size = 3312186, upload-time = "2026-02-23T15:11:56.865Z" }, + { url = "https://files.pythonhosted.org/packages/9c/ce/7e4824565464f9b7e40ee797b304f6d572484279fa7e0a86d745f12abfc5/arro3_core-0.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:53eebfd0eb4becde8c446a70cd744c8a9cc3eab7f3fffb17a54171dac201b35d", size = 3215660, upload-time = "2026-02-23T15:11:58.153Z" }, + { url = "https://files.pythonhosted.org/packages/a2/99/b2f3f56606813ad241bb9aca1fb569c7cf8df18e7fada828aab39bf092f5/arro3_core-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:26d9eeef6a464e40283674152f11e9b43e9fd6f1508462647cb2de07a1e0a7fb", size = 3170235, upload-time = "2026-02-23T15:11:59.447Z" }, + { url = 
"https://files.pythonhosted.org/packages/f1/a9/03d96a36be26aaa896c317ba8ccd6b678202a2d8b936c6467011ed57e4c7/arro3_core-0.8.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a4b89836e3e761d6e74437e3c40b26b0b83b9be1ca4c9c15d049cd6c4791cbc9", size = 2893299, upload-time = "2026-02-23T15:12:00.765Z" }, + { url = "https://files.pythonhosted.org/packages/bc/4d/6950d7779ca191fa9a546462cc37cdd67e28a419de24196561f7517cd434/arro3_core-0.8.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:df944e458b10262e548ca7d5b1c079238955d11ae294ae4258e73dabe494e2c8", size = 2628653, upload-time = "2026-02-23T15:12:02.017Z" }, + { url = "https://files.pythonhosted.org/packages/c3/38/28e697f2003f65356831b89f970eea57a09bea21a065c03dd482b6f935d9/arro3_core-0.8.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:282ec1edea03818186978ee01568e8e6d2f92bd4ef9e94c7923873e0a442aa99", size = 3104767, upload-time = "2026-02-23T14:48:38.294Z" }, + { url = "https://files.pythonhosted.org/packages/22/ec/c47529f387161fc8f19e277835151fa7b2631943fdff48e1ce0a04d464dd/arro3_core-0.8.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7eb254cf4fd8b10681479df88f303ec03d1f54e4689479c77bbf81df841a4bb4", size = 3212133, upload-time = "2026-02-23T14:49:21.952Z" }, + { url = "https://files.pythonhosted.org/packages/09/7b/5ae7753bf0bbd8301c71b53b3437904c4c6792351065328ee93494ca0bde/arro3_core-0.8.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7699397b5892294af5a0ff3165b1aa4339bcebdbb1a51fc38bd9ffc9e283f3d0", size = 3423229, upload-time = "2026-02-23T14:50:59.701Z" }, + { url = "https://files.pythonhosted.org/packages/52/dd/ca58a929c0e49a18e31394d32cc2db280978bc769267839d3142b75a1e4f/arro3_core-0.8.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64ac061557bd150a37bb5bc4fbd46c162db5254acd6338f800e907ddc93f5422", size = 2990806, upload-time = 
"2026-02-23T14:51:12.725Z" }, + { url = "https://files.pythonhosted.org/packages/87/d9/b525f754b8d7a42c69705cd4b940d2d4e47512bc2396747ba77fb8528869/arro3_core-0.8.0-pp311-pypy311_pp73-manylinux_2_24_aarch64.whl", hash = "sha256:a6a4212ac0555e195d7617488c030b85aa9acd0d4e0ad8da3bf18c3572f2d60a", size = 2775013, upload-time = "2026-02-23T14:47:54.45Z" }, + { url = "https://files.pythonhosted.org/packages/3d/78/11269c9c9c0dc53e6df46fa7c2968e0ad30c3f48d7ab7bf9fb54166fd12c/arro3_core-0.8.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9b5f016f6af7531afb3d22f20a56adcf68073348c37c9cc196e96740f7e95a70", size = 3199670, upload-time = "2026-02-23T14:51:25.327Z" }, + { url = "https://files.pythonhosted.org/packages/8d/0d/a9b1e492004f9372d52bf901b57dd4d8934ee56f4d5e54c0ff0cfd75d08e/arro3_core-0.8.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:e981a204d9b829eab1fbe1a04f0fa53f06dc4a6c9695e978d9ca0eed32925d2f", size = 2949791, upload-time = "2026-02-23T15:12:10.85Z" }, + { url = "https://files.pythonhosted.org/packages/42/8b/2496b369a5f33b8bd7dea09e21578dc697b74db3ad5bdb83f7324ef5369c/arro3_core-0.8.0-pp311-pypy311_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:593c2ca6f47de78fd92abf1809b625be9c20e36b1d3160a5d79713ec7d04819f", size = 3382665, upload-time = "2026-02-23T15:12:14.583Z" }, + { url = "https://files.pythonhosted.org/packages/b4/48/04b27a7d217f823324b4eb7bcb626c29ce100fdb7c54430a4101e5b851c0/arro3_core-0.8.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:57ff2c2761b4451c1a693f7a63d26ed1067e9d64e3670411e45998989859f3e5", size = 3307751, upload-time = "2026-02-23T15:12:17.124Z" }, + { url = "https://files.pythonhosted.org/packages/12/03/8653a2dce9f3908fe01fb5dc5aaaabd89dcd9f22bbdaa50745aad7c47a7a/arro3_core-0.8.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:d285aab000ef4ad4d91597e9662298ad3ac774939e8accea96a6522815331896", size = 3209809, upload-time = "2026-02-23T15:12:19.359Z" }, ] [[package]] @@ 
-2276,7 +2281,7 @@ notebooks = [ [package.metadata] requires-dist = [ - { name = "arro3-core", specifier = ">=0.4.6" }, + { name = "arro3-core", specifier = ">=0.5.1" }, { name = "fsspec", specifier = ">=2025.5.1" }, { name = "pandas", specifier = ">=2.2.3" }, { name = "polars", specifier = ">=1.36.0" }, diff --git a/r-oxbow/R/api.R b/r-oxbow/R/api.R index c0cac272..915be138 100644 --- a/r-oxbow/R/api.R +++ b/r-oxbow/R/api.R @@ -1,71 +1,71 @@ #' Return Arrow IPC format from a FASTQ file. #' @export -read_fastq <- function(path, fields = NULL) { - read_fast_impl(path, fields) +read_fastq <- function(path, fields = "*") { + read_fastq_impl(path, fields) } #' Return Arrow IPC format from a FASTA file. #' @export -read_fasta <- function(path, regions = NULL, index = NULL, gzi = NULL, fields = NULL) { +read_fasta <- function(path, regions = NULL, index = NULL, gzi = NULL, fields = "*") { read_fasta_impl(path, regions, index, gzi, fields) } #' Return Arrow IPC format from a SAM file. #' @export -read_sam <- function(path, region = NULL, index = NULL, fields = NULL, scan_rows = NULL) { +read_sam <- function(path, region = NULL, index = NULL, fields = "*", scan_rows = NULL) { read_sam_impl(path, region, index, fields, scan_rows) } #' Return Arrow IPC format from a BAM file. #' @export -read_bam <- function(path, region = NULL, index = NULL, fields = NULL, scan_rows = NULL) { +read_bam <- function(path, region = NULL, index = NULL, fields = "*", scan_rows = NULL) { read_bam_impl(path, region, index, fields, scan_rows) } #' Return Arrow IPC format from a CRAM file. #' @export -read_cram <- function(path, reference = NULL, reference_index = NULL, region = NULL, index = NULL, fields = NULL, scan_rows = NULL) { +read_cram <- function(path, reference = NULL, reference_index = NULL, region = NULL, index = NULL, fields = "*", scan_rows = NULL) { read_cram_impl(path, reference, reference_index, region, index, fields, scan_rows) } #' Return Arrow IPC format from a VCF file. 
#' @export -read_vcf <- function(path, region = NULL, index = NULL, fields = NULL, info_field = NULL, genotype_fields = NULL, genotype_by = "sample") { - read_vcf_impl(path, region, index, fields, scan_rows) +read_vcf <- function(path, region = NULL, index = NULL, fields = "*", info_fields = "*", genotype_fields = "*", genotype_by = "sample", samples = NULL, samples_nested = FALSE) { + read_vcf_impl(path, region, index, fields, info_fields, genotype_fields, genotype_by, samples, samples_nested) } #' Return Arrow IPC format from a BCF file. #' @export -read_bcf <- function(path, region = NULL, index = NULL, fields = NULL, info_fields = NULL, genotype_fields = NULL, genotype_by = "sample") { - read_bcf_impl(path, region, index, fields, scan_rows) +read_bcf <- function(path, region = NULL, index = NULL, fields = "*", info_fields = "*", genotype_fields = "*", genotype_by = "sample", samples = NULL, samples_nested = FALSE) { + read_bcf_impl(path, region, index, fields, info_fields, genotype_fields, genotype_by, samples, samples_nested) } #' Return Arrow IPC format from a GTF file. #' @export -read_gtf <- function(path, region = NULL, index = NULL, fields = NULL, scan_rows = NULL) { +read_gtf <- function(path, region = NULL, index = NULL, fields = "*", scan_rows = NULL) { read_gtf_impl(path, region, index, fields, scan_rows) } #' Return Arrow IPC format from a GFF file. #' @export -read_gff <- function(path, region = NULL, index = NULL, fields = NULL, scan_rows = NULL) { +read_gff <- function(path, region = NULL, index = NULL, fields = "*", scan_rows = NULL) { read_gff_impl(path, region, index, fields, scan_rows) } #' Return Arrow IPC format from a BED file. #' @export -read_bed <- function(path, bed_schema, region = NULL, index = NULL, fields = NULL) { +read_bed <- function(path, bed_schema = "bed3+", region = NULL, index = NULL, fields = "*") { read_bed_impl(path, bed_schema, region, index, fields) } #' Return Arrow IPC format from a BigWig file. 
#' @export -read_bigwig <- function(path, region = NULL, fields = NULL) { +read_bigwig <- function(path, region = NULL, fields = "*") { read_bigwig_impl(path, region, fields) } #' Return Arrow IPC format from a BigBed file. #' @export -read_bigbed <- function(path, bed_schema = "bed3+", region = NULL, fields = NULL) { +read_bigbed <- function(path, bed_schema = "bed3+", region = NULL, fields = "*") { read_bigbed_impl(path, bed_schema, region, fields) } diff --git a/r-oxbow/R/extendr-wrappers.R b/r-oxbow/R/extendr-wrappers.R index 188f0b44..b3351d7e 100644 --- a/r-oxbow/R/extendr-wrappers.R +++ b/r-oxbow/R/extendr-wrappers.R @@ -26,10 +26,10 @@ read_bam_impl <- function(path, region, index, fields, scan_rows) .Call(wrap__re read_cram_impl <- function(path, reference, reference_index, region, index, fields, scan_rows) .Call(wrap__read_cram_impl, path, reference, reference_index, region, index, fields, scan_rows) #' Return Arrow IPC format from a VCF file. -read_vcf_impl <- function(path, region, index, fields, info_fields, genotype_fields, samples, genotype_by) .Call(wrap__read_vcf_impl, path, region, index, fields, info_fields, genotype_fields, samples, genotype_by) +read_vcf_impl <- function(path, region, index, fields, info_fields, genotype_fields, genotype_by, samples, samples_nested) .Call(wrap__read_vcf_impl, path, region, index, fields, info_fields, genotype_fields, genotype_by, samples, samples_nested) #' Return Arrow IPC format from a BCF file. -read_bcf_impl <- function(path, region, index, fields, info_fields, genotype_fields, samples, genotype_by) .Call(wrap__read_bcf_impl, path, region, index, fields, info_fields, genotype_fields, samples, genotype_by) +read_bcf_impl <- function(path, region, index, fields, info_fields, genotype_fields, genotype_by, samples, samples_nested) .Call(wrap__read_bcf_impl, path, region, index, fields, info_fields, genotype_fields, genotype_by, samples, samples_nested) #' Return Arrow IPC format from a GTF file. 
read_gtf_impl <- function(path, region, index, fields, scan_rows) .Call(wrap__read_gtf_impl, path, region, index, fields, scan_rows) diff --git a/r-oxbow/src/rust/src/lib.rs b/r-oxbow/src/rust/src/lib.rs index 3eda6b8b..e92e353f 100644 --- a/r-oxbow/src/rust/src/lib.rs +++ b/r-oxbow/src/rust/src/lib.rs @@ -13,9 +13,23 @@ use oxbow::gxf::{GffScanner, GtfScanner}; use oxbow::sequence::{FastaScanner, FastqScanner}; use oxbow::util::batches_to_ipc; use oxbow::variant::{BcfScanner, GenotypeBy, VcfScanner}; +use oxbow::Select; pub const BUFFER_SIZE_BYTES: usize = const { 1024 * 1024 }; +/// Convert an R character vector (or NULL) to a `Select`. +/// +/// - `NULL` (`None`) → `Select::Omit` (exclude the column group) +/// - `"*"` (single star) → `Select::All` (explicit wildcard, mirrors Python) +/// - any other vector → `Select::Some` (include only named items) +fn resolve_r_fields(fields: Option>) -> Select { + match fields { + None => Select::Omit, + Some(v) if v.len() == 1 && v[0] == "*" => Select::All, + Some(v) => Select::Some(v), + } +} + /// Return Arrow IPC format from a FASTQ file. 
#[extendr] fn read_fastq_impl(path: &str, fields: Option>) -> Vec { @@ -23,7 +37,7 @@ fn read_fastq_impl(path: &str, fields: Option>) -> Vec { let reader = std::fs::File::open(path) .map(|f| BufReader::with_capacity(BUFFER_SIZE_BYTES, f)) .unwrap(); - let scanner = FastqScanner::new(fields).unwrap(); + let scanner = FastqScanner::new(resolve_r_fields(fields)).unwrap(); let ipc = if compressed { let gz_reader = std::io::BufReader::new(MultiGzDecoder::new(reader)); @@ -52,7 +66,7 @@ fn read_fasta_impl( let reader = std::fs::File::open(path) .map(|f| BufReader::with_capacity(BUFFER_SIZE_BYTES, f)) .unwrap(); - let scanner = FastaScanner::new(fields).unwrap(); + let scanner = FastaScanner::new(resolve_r_fields(fields)).unwrap(); let ipc = if let Some(regions) = regions { let index_path = index.unwrap_or(format!("{}.fai", path)); @@ -111,7 +125,7 @@ pub fn read_sam_impl( let mut fmt_reader = noodles::sam::io::Reader::new(bgzf_reader); let header = fmt_reader.read_header().unwrap(); let tag_defs = SamScanner::tag_defs(&mut fmt_reader, scan_rows).unwrap(); - let scanner = SamScanner::new(header, fields, Some(tag_defs)).unwrap(); + let scanner = SamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); let batches = scanner .scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -123,7 +137,7 @@ pub fn read_sam_impl( let pos = fmt_reader.get_mut().virtual_position(); let tag_defs = SamScanner::tag_defs(&mut fmt_reader, scan_rows).unwrap(); fmt_reader.get_mut().seek(pos).unwrap(); - let scanner = SamScanner::new(header, fields, Some(tag_defs)).unwrap(); + let scanner = SamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) } else { @@ -135,7 +149,7 @@ pub fn read_sam_impl( .get_mut() .seek(std::io::SeekFrom::Start(pos)) .unwrap(); - let scanner = SamScanner::new(header, fields, Some(tag_defs)).unwrap(); + let scanner = 
SamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -167,7 +181,7 @@ pub fn read_bam_impl( let mut fmt_reader = noodles::bam::io::Reader::from(bgzf_reader); let header = fmt_reader.read_header().unwrap(); let tag_defs = BamScanner::tag_defs(&mut fmt_reader, scan_rows).unwrap(); - let scanner = BamScanner::new(header, fields, Some(tag_defs)).unwrap(); + let scanner = BamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); let batches = scanner .scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -179,7 +193,7 @@ pub fn read_bam_impl( let pos = fmt_reader.get_mut().virtual_position(); let tag_defs = BamScanner::tag_defs(&mut fmt_reader, scan_rows).unwrap(); fmt_reader.get_mut().seek(pos).unwrap(); - let scanner = BamScanner::new(header, fields, Some(tag_defs)).unwrap(); + let scanner = BamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) } else { @@ -191,7 +205,7 @@ pub fn read_bam_impl( .get_mut() .seek(std::io::SeekFrom::Start(pos)) .unwrap(); - let scanner = BamScanner::new(header, fields, Some(tag_defs)).unwrap(); + let scanner = BamScanner::new(header, resolve_r_fields(fields), Some(tag_defs)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -242,7 +256,8 @@ pub fn read_cram_impl( .build_from_reader(reader); let header = fmt_reader.read_header().unwrap(); let tag_defs = CramScanner::tag_defs(&mut fmt_reader, &header, scan_rows).unwrap(); - let scanner = CramScanner::new(header, fields, Some(tag_defs), repo).unwrap(); + let scanner = + CramScanner::new(header, resolve_r_fields(fields), Some(tag_defs), repo).unwrap(); let batches = scanner .scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -255,7 +270,8 @@ pub fn 
read_cram_impl( let pos = fmt_reader.position().unwrap(); let tag_defs = CramScanner::tag_defs(&mut fmt_reader, &header, scan_rows).unwrap(); fmt_reader.seek(std::io::SeekFrom::Start(pos)).unwrap(); - let scanner = CramScanner::new(header, fields, Some(tag_defs), repo).unwrap(); + let scanner = + CramScanner::new(header, resolve_r_fields(fields), Some(tag_defs), repo).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -273,8 +289,9 @@ pub fn read_vcf_impl( fields: Option>, info_fields: Option>, genotype_fields: Option>, - samples: Option>, genotype_by: Option, + samples: Option>, + samples_nested: bool, ) -> Vec { let compressed = path.ends_with(".gz"); let reader = std::fs::File::open(path) @@ -295,12 +312,12 @@ pub fn read_vcf_impl( let header = fmt_reader.read_header().unwrap(); let scanner = VcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_r_fields(fields), + resolve_r_fields(info_fields), + resolve_r_fields(genotype_fields), Some(genotype_by), - None, + resolve_r_fields(samples), + Some(samples_nested), ) .unwrap(); let batches = scanner @@ -313,12 +330,12 @@ pub fn read_vcf_impl( let header = fmt_reader.read_header().unwrap(); let scanner = VcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_r_fields(fields), + resolve_r_fields(info_fields), + resolve_r_fields(genotype_fields), Some(genotype_by), - None, + resolve_r_fields(samples), + Some(samples_nested), ) .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); @@ -328,12 +345,12 @@ pub fn read_vcf_impl( let header = fmt_reader.read_header().unwrap(); let scanner = VcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_r_fields(fields), + resolve_r_fields(info_fields), + resolve_r_fields(genotype_fields), Some(genotype_by), - None, + resolve_r_fields(samples), + Some(samples_nested), ) .unwrap(); let batches = 
scanner.scan(fmt_reader, None, None, None).unwrap(); @@ -353,8 +370,9 @@ pub fn read_bcf_impl( fields: Option>, info_fields: Option>, genotype_fields: Option>, - samples: Option>, genotype_by: Option, + samples: Option>, + samples_nested: bool, ) -> Vec { let compressed = true; let reader = std::fs::File::open(path) @@ -375,12 +393,12 @@ pub fn read_bcf_impl( let header = fmt_reader.read_header().unwrap(); let scanner = BcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_r_fields(fields), + resolve_r_fields(info_fields), + resolve_r_fields(genotype_fields), Some(genotype_by), - None, + resolve_r_fields(samples), + Some(samples_nested), ) .unwrap(); let batches = scanner @@ -393,12 +411,12 @@ pub fn read_bcf_impl( let header = fmt_reader.read_header().unwrap(); let scanner = BcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_r_fields(fields), + resolve_r_fields(info_fields), + resolve_r_fields(genotype_fields), Some(genotype_by), - None, + resolve_r_fields(samples), + Some(samples_nested), ) .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); @@ -408,12 +426,12 @@ pub fn read_bcf_impl( let header = fmt_reader.read_header().unwrap(); let scanner = BcfScanner::new( header, - fields, - info_fields, - genotype_fields, - samples, + resolve_r_fields(fields), + resolve_r_fields(info_fields), + resolve_r_fields(genotype_fields), Some(genotype_by), - None, + resolve_r_fields(samples), + Some(samples_nested), ) .unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); @@ -445,7 +463,7 @@ pub fn read_gtf_impl( let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::gtf::io::Reader::new(bgzf_reader); let attr_defs = GtfScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); - let scanner = GtfScanner::new(None, fields, Some(attr_defs)).unwrap(); + let scanner = GtfScanner::new(None, resolve_r_fields(fields), 
Some(attr_defs)).unwrap(); let batches = scanner .scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -456,7 +474,7 @@ pub fn read_gtf_impl( let pos = fmt_reader.get_mut().virtual_position(); let attr_defs = GtfScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); fmt_reader.get_mut().seek(pos).unwrap(); - let scanner = GtfScanner::new(None, fields, Some(attr_defs)).unwrap(); + let scanner = GtfScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) } else { @@ -467,7 +485,7 @@ pub fn read_gtf_impl( .get_mut() .seek(std::io::SeekFrom::Start(pos)) .unwrap(); - let scanner = GtfScanner::new(None, fields, Some(attr_defs)).unwrap(); + let scanner = GtfScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -497,7 +515,7 @@ pub fn read_gff_impl( let bgzf_reader = noodles::bgzf::io::Reader::new(reader); let mut fmt_reader = noodles::gff::io::Reader::new(bgzf_reader); let attr_defs = GffScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); - let scanner = GffScanner::new(None, fields, Some(attr_defs)).unwrap(); + let scanner = GffScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); let batches = scanner .scan_query(fmt_reader, region, index, None, None, None) .unwrap(); @@ -508,7 +526,7 @@ pub fn read_gff_impl( let pos = fmt_reader.get_mut().virtual_position(); let attr_defs = GffScanner::attribute_defs(&mut fmt_reader, scan_rows).unwrap(); fmt_reader.get_mut().seek(pos).unwrap(); - let scanner = GffScanner::new(None, fields, Some(attr_defs)).unwrap(); + let scanner = GffScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) } else { @@ -519,7 +537,7 @@ pub fn read_gff_impl( .get_mut() 
.seek(std::io::SeekFrom::Start(pos)) .unwrap(); - let scanner = GffScanner::new(None, fields, Some(attr_defs)).unwrap(); + let scanner = GffScanner::new(None, resolve_r_fields(fields), Some(attr_defs)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -541,7 +559,7 @@ pub fn read_bed_impl( let reader = std::fs::File::open(path) .map(|f| BufReader::with_capacity(BUFFER_SIZE_BYTES, f)) .unwrap(); - let scanner = BedScanner::new(bed_schema, fields).unwrap(); + let scanner = BedScanner::new(bed_schema, resolve_r_fields(fields)).unwrap(); let ipc = if let Some(region) = region { let index_path = index.unwrap_or(format!("{}.tbi", path)); @@ -582,7 +600,7 @@ pub fn read_bigwig_impl( let region = region.parse::().unwrap(); let fmt_reader = bigtools::BigWigRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); - let scanner = BigWigScanner::new(info, fields).unwrap(); + let scanner = BigWigScanner::new(info, resolve_r_fields(fields)).unwrap(); let batches = scanner .scan_query(fmt_reader, region, None, None, None) .unwrap(); @@ -590,7 +608,7 @@ pub fn read_bigwig_impl( } else { let fmt_reader = bigtools::BigWigRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); - let scanner = BigWigScanner::new(info, fields).unwrap(); + let scanner = BigWigScanner::new(info, resolve_r_fields(fields)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) }; @@ -615,7 +633,7 @@ pub fn read_bigbed_impl( let region = region.parse::().unwrap(); let fmt_reader = bigtools::BigBedRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); - let scanner = BigBedScanner::new(bed_schema, info, fields).unwrap(); + let scanner = BigBedScanner::new(bed_schema, info, resolve_r_fields(fields)).unwrap(); let batches = scanner .scan_query(fmt_reader, region, None, None, None) .unwrap(); @@ -623,7 +641,7 @@ pub fn read_bigbed_impl( } else { let fmt_reader = 
bigtools::BigBedRead::open(reader).unwrap(); let info = fmt_reader.info().clone(); - let scanner = BigBedScanner::new(bed_schema, info, fields).unwrap(); + let scanner = BigBedScanner::new(bed_schema, info, resolve_r_fields(fields)).unwrap(); let batches = scanner.scan(fmt_reader, None, None, None).unwrap(); batches_to_ipc(batches) };