Skip to content

Assertion failure #132

@drtconway

Description

@drtconway

Hi Oxbow.

Thanks for your work on this project!

Some of my VCF inputs cause an assertion failure. The VCFs may not conform to the specification, but it would be nice to receive an error rather than a panic from a failed assertion.

To reproduce:

With the files below, I get the following behaviour:

$ ./target/debug/oxbow-read-vcf 

thread 'main' panicked at /Users/tom.conway/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/arrow-array-54.3.1/src/builder/fixed_size_list_builder.rs:174:9:
assertion `left == right` failed: Length of the child array (6) must be the multiple of the value length (2) and the array length (4).
  left: 6
 right: 8
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace

Cargo.toml:

[package]
name = "oxbow-read-vcf"
version = "0.1.0"
edition = "2024"

[dependencies]
oxbow = "0.4.0"
noodles = { version = "0.90.0", features = ["core", "vcf"] }

main.rs:

use std::fs::File;
use std::io::BufReader;
use oxbow::variant::format::vcf::Scanner;
/// Reproducer: scans `x.vcf` with oxbow's VCF `Scanner` and prints the schema
/// of every batch to stderr.
///
/// Every fallible step is unwrapped, so any error terminates the program with
/// a panic instead of being handled.
fn main() {
    // Open the input file and wrap it in a buffered reader.
    let file = File::open("x.vcf").unwrap();
    let mut vcf_reader = noodles::vcf::io::Reader::new(BufReader::new(file));

    // The header is read first; it is then handed to the scanner.
    let vcf_header = vcf_reader.read_header().unwrap();
    let scanner = Scanner::new(vcf_header);

    // All optional scan arguments are `None` except the final one, `Some(1000)`.
    let record_batches = scanner
        .scan(vcf_reader, None, None, None, None, None, None, Some(1000))
        .unwrap();

    for result in record_batches {
        let record_batch = result.unwrap();
        let schema = record_batch.schema();
        eprintln!("{:?}", schema.as_ref());
    }
}

x.vcf:

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GP,Number=G,Type=Float,Description="genotype posterior in Phred Scale">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PG,Number=G,Type=Float,Description="genotype priors in Phred Scale">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phasing set (typically the position of the first variant in the set)">
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DRAGstrInfo,Number=2,Type=Integer,Description="Indicates the period and repeat count">
##INFO=<ID=DRAGstrParams,Number=3,Type=Float,Description="Parameters used (GOP, GCP, API)">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RAW_MQandDP,Number=2,Type=Integer,Description="Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation.">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
##contig=<ID=chrY,length=57227415>
##contig=<ID=chrM,length=16569>
##INFO=<ID=ANN,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE|ENSP|REFSEQ_MATCH|REFSEQ_OFFSET|GIVEN_REF|USED_REF|BAM_EDIT|SIFT|PolyPhen|HGVS_OFFSET|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|MAX_AF|MAX_AF_POPS|CLIN_SIG|SOMATIC|PHENO|PUBMED">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA24143	NA24149	NA24385
chr1	10421	.	ACCCTAACCCTAACCCTAAC	A	13.58	.	AC=1;AF=0.167;AN=6;BaseQRankSum=0.366;DP=141;DRAGstrInfo=1;DRAGstrParams=36;ExcessHet=0;FS=6.419;MLEAC=1;MLEAF=0.167;MQ=20.57;MQRankSum=-1.534;QD=1.7;ReadPosRankSum=0.366;SOR=2.833;ANN=-|downstream_gene_variant|MODIFIER|WASH7P|653635|Transcript|NR_024540.1|transcribed_pseudogene||||||||||rs1557426865|1|3922|-1||deletion|EntrezGene||YES|||||CCCTAACCCTAACCCTAAC|CCCTAACCCTAACCCTAAC|OK|||||||||||||||||||||||||,-|upstream_gene_variant|MODIFIER|DDX11L1|100287102|Transcript|NR_046018.2|transcribed_pseudogene||||||||||rs1557426865|1|1434|1||deletion|EntrezGene||YES|||||CCCTAACCCTAACCCTAAC|CCCTAACCCTAACCCTAAC||||||||||||||||||||||||||	GT:AD:DP:GQ:PGT:PID:PL:PS	0/1:7,1:8:21:.:.:21,0,34:.	0/0:14,0:14:0:.:.:0,0,248:.	0|0:9,0:11:53:0|1:10418_CT_C:0,53,93:10418
chr1	10439	rs112766696	AC	A,*	30.89	.	AC=2,1;AF=0.333,0.167;AN=6;BaseQRankSum=1.09;DB;DP=153;DRAGstrInfo=1;DRAGstrParams=32;ExcessHet=3.9794;FS=0;MLEAC=2,1;MLEAF=0.333,0.167;MQ=24.89;MQRankSum=-1.068;QD=1.72;ReadPosRankSum=0.536;SOR=1.075;ANN=-|downstream_gene_variant|MODIFIER|WASH7P|653635|Transcript|NR_024540.1|transcribed_pseudogene||||||||||rs112766696|1|3922|-1||sequence_alteration|EntrezGene||YES|||||C|C|OK|||||||||||||||||||||||||,-|upstream_gene_variant|MODIFIER|DDX11L1|100287102|Transcript|NR_046018.2|transcribed_pseudogene||||||||||rs112766696|1|1434|1||sequence_alteration|EntrezGene||YES|||||C|C||||||||||||||||||||||||||	GT:AD:DP:GQ:PL	0/2:7,0,1:8:21:21,42,76,0,206,34	0/1:6,2,0:8:34:34,0,40,52,179,92	0/1:7,2,1:10:7:7,0,40,30,212,70
chr1	10492	rs55998931	C	T	101.58	.	AC=2;AF=0.333;AN=6;BaseQRankSum=-0.319;DB;DP=47;ExcessHet=1.7609;FS=7.16;MLEAC=2;MLEAF=0.333;MQ=31.71;MQRankSum=0;QD=7.26;ReadPosRankSum=-0.414;SOR=3.737;ANN=T|downstream_gene_variant|MODIFIER|WASH7P|653635|Transcript|NR_024540.1|transcribed_pseudogene||||||||||rs55998931|1|3870|-1||SNV|EntrezGene||YES|||||C|C|OK|||||||||||||||||||||||||,T|upstream_gene_variant|MODIFIER|DDX11L1|100287102|Transcript|NR_046018.2|transcribed_pseudogene||||||||||rs55998931|1|1382|1||SNV|EntrezGene||YES|||||C|C||||||||||||||||||||||||||	GT:AD:DP:GQ:PL	0/1:2,4:6:30:57,0,30	0/0:6,0:6:0:0,0,149	0/1:5,3:8:40:53,0,40
chr1	13273	.	G	C	91.88	.	AC=2;AF=0.5;AN=4;DP=38;ExcessHet=0;FS=0;MLEAC=2;MLEAF=0.5;MQ=5.35;QD=5.1;SOR=1.179;ANN=C|downstream_gene_variant|MODIFIER|WASH7P|653635|Transcript|NR_024540.1|transcribed_pseudogene||||||||||rs531730856|1|1089|-1||SNV|EntrezGene||YES|||||G|G|OK||||0.0204|0.1455|0.0625|0.1471|0.1401||||||||||||0.1471|EUR||||,C|non_coding_transcript_exon_variant|MODIFIER|DDX11L1|100287102|Transcript|NR_046018.2|transcribed_pseudogene|3/3||NR_046018.2:n.516G>C||516|||||rs531730856|1||1||SNV|EntrezGene||YES|||||G|G|||||0.0204|0.1455|0.0625|0.1471|0.1401||||||||||||0.1471|EUR||||,C|downstream_gene_variant|MODIFIER|MIR6859-1|102466751|Transcript|NR_106918.1|miRNA||||||||||rs531730856|1|4096|-1||SNV|EntrezGene||YES|||||G|G|||||0.0204|0.1455|0.0625|0.1471|0.1401||||||||||||0.1471|EUR||||	GT:AD:DP:GQ:PL	1/1:0,18:18:54:106,54,0	0/0:17,0:17:51:0,51,540	./.:0,0:0:0:0,0,0

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions