fg-labs · tfenne · Apr 15, 2026 · Apr 15, 2026
diff --git a/src/commands/simulate.rs b/src/commands/simulate.rs
@@ -152,6 +152,26 @@ impl Simulate {
         if self.compression > 12 {
             bail!("--compression must be between 0 and 12");
         }
+
+        // --sample without --vcf is nonsensical.
+        if self.vcf.sample.is_some() && self.vcf.vcf.is_none() {
+            bail!("--sample requires --vcf");
+        }
+
+        // Validate VCF sample configuration upfront so the user gets a clear
+        // error before the simulation loop starts.
+        if let Some(vcf_path) = &self.vcf.vcf {
+            crate::vcf::validate_vcf_sample(vcf_path, self.vcf.sample.as_deref())?;
+        }
+
+        // Validate output parent directory exists.
+        if let Some(parent) = self.output.output.parent()
+            && !parent.as_os_str().is_empty()
+            && !parent.exists()
+        {
+            bail!("Output directory does not exist: {}", parent.display());
+        }
+
         Ok(())
     }
 

diff --git a/src/vcf/mod.rs b/src/vcf/mod.rs
@@ -89,6 +89,27 @@ pub fn load_variants_for_contig(
     Ok(variants)
 }
 
+/// Validate that a VCF file has the expected sample configuration.
+///
+/// Opens the VCF, reads the header, and checks:
+/// - If `sample_name` is `Some`, that the named sample exists.
+/// - If `sample_name` is `None`, that the VCF has exactly one sample.
+///
+/// Intended for argument validation, before the per-contig simulation loop.
+///
+/// # Errors
+/// Fails if the VCF cannot be opened, its header cannot be read, or the sample is invalid.
+pub(crate) fn validate_vcf_sample(path: &Path, sample_name: Option<&str>) -> Result<()> {
+    let mut reader = vcf::io::reader::Builder::default()
+        .build_from_path(path)
+        .with_context(|| format!("Failed to open VCF: {}", path.display()))?;
+    let header = reader
+        .read_header()
+        .with_context(|| format!("Failed to read VCF header: {}", path.display()))?;
+    resolve_sample_index(&header, sample_name)?;
+    Ok(())
+}
+
 /// Resolve the sample index from a VCF header.
 ///
 /// If `sample_name` is `Some`, looks up that sample. If `None` and the VCF has
@@ -99,10 +120,18 @@ fn resolve_sample_index(header: &vcf::Header, sample_name: Option<&str>) -> Resu
 
     match sample_name {
         Some(name) => {
-            let idx = sample_names
-                .iter()
-                .position(|s| s == name)
-                .ok_or_else(|| anyhow::anyhow!("Sample '{name}' not found in VCF header"))?;
+            let idx = sample_names.iter().position(|s| s == name).ok_or_else(|| {
+                if sample_names.is_empty() {
+                    anyhow::anyhow!("Sample '{name}' not found in VCF (VCF has no sample columns)")
+                } else {
+                    let available: Vec<&str> =
+                        sample_names.iter().map(String::as_str).take(10).collect();
+                    anyhow::anyhow!(
+                        "Sample '{name}' not found in VCF. Available samples: {}",
+                        available.join(", ")
+                    )
+                }
+            })?;
             Ok(idx)
         }
         None => {
@@ -115,7 +144,7 @@ fn resolve_sample_index(header: &vcf::Header, sample_name: Option<&str>) -> Resu
                     "VCF has {} samples but no --sample was specified. \
                      Available samples: {}",
                     sample_names.len(),
-                    sample_names.iter().take(5).cloned().collect::<Vec<_>>().join(", ")
+                    sample_names.iter().take(10).cloned().collect::<Vec<_>>().join(", ")
                 )
             }
         }

diff --git a/tests/helpers/mod.rs b/tests/helpers/mod.rs
@@ -145,6 +145,37 @@ impl TestEnv {
         path
     }
 
+    /// Create `samples.vcf` in the test directory containing only a VCF header.
+    ///
+    /// Emits one `##contig` line per `contig_lengths` entry, a `GT` FORMAT
+    /// definition, and a `#CHROM` line with one trailing column per entry of
+    /// `sample_names`; no variant records are written.  Returns the file's path.
+    ///
+    /// # Panics
+    /// Panics on I/O errors.
+    #[must_use]
+    pub fn write_vcf_header_only(
+        &self,
+        sample_names: &[&str],
+        contig_lengths: &[(&str, usize)],
+    ) -> PathBuf {
+        let mut columns = String::from("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
+        for sample in sample_names {
+            columns.push('\t');
+            columns.push_str(sample);
+        }
+        let path = self.dir.path().join("samples.vcf");
+        let mut out = std::fs::File::create(&path).unwrap();
+        writeln!(out, "##fileformat=VCFv4.3").unwrap();
+        for (name, len) in contig_lengths {
+            writeln!(out, "##contig=<ID={name},length={len}>").unwrap();
+        }
+        writeln!(out, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">").unwrap();
+        writeln!(out, "{columns}").unwrap();
+        out.flush().unwrap();
+        path
+    }
+
     /// Write a BED file with the given intervals and return the path.
     ///
     /// Each entry is `(contig, start, end)` in 0-based half-open coordinates.

diff --git a/tests/test_simulate.rs b/tests/test_simulate.rs
@@ -2970,3 +2970,142 @@ fn test_simulate_then_eval() {
     }
     panic!("ALL row not found in eval output");
 }
+
+// ── VCF and CLI validation tests ──────────────────────────────────────────
+
+/// `--sample` with no `--vcf` must be rejected with a message naming both flags.
+#[test]
+fn test_sample_without_vcf_fails() {
+    let ctx = simple_env();
+    let prefix_path = ctx.output_prefix();
+    let fasta = ctx.fasta_path.to_str().unwrap();
+    let prefix = prefix_path.to_str().unwrap();
+    let (passed, _stdout, stderr) = run_simulate(&[
+        "simulate",
+        "-r",
+        fasta,
+        "-o",
+        prefix,
+        "--sample",
+        "NA12878",
+        "--coverage",
+        "1",
+    ]);
+    assert!(!passed, "should fail when --sample is given without --vcf");
+    assert!(
+        stderr.contains("--sample requires --vcf"),
+        "error should mention --sample requires --vcf, got: {stderr}"
+    );
+}
+
+/// An unknown `--sample` must fail, naming the requested sample and listing
+/// the sample that the VCF header actually declares.
+#[test]
+fn test_wrong_sample_name_fails() {
+    let ctx = simple_env();
+    let prefix_path = ctx.output_prefix();
+    let fasta = ctx.fasta_path.to_str().unwrap();
+    let prefix = prefix_path.to_str().unwrap();
+    let vcf_file = ctx.write_vcf_header_only(&["SampleA"], &[("chr1", 1000)]);
+    let vcf = vcf_file.to_str().unwrap();
+    let (passed, _stdout, stderr) = run_simulate(&[
+        "simulate",
+        "-r",
+        fasta,
+        "-v",
+        vcf,
+        "--sample",
+        "NoSuchSample",
+        "-o",
+        prefix,
+        "--coverage",
+        "1",
+    ]);
+    assert!(!passed, "should fail when sample name is not in VCF");
+    assert!(
+        stderr.contains("NoSuchSample") && stderr.contains("not found"),
+        "error should name the missing sample, got: {stderr}"
+    );
+    assert!(stderr.contains("SampleA"), "error should list available samples, got: {stderr}");
+}
+
+/// A multi-sample VCF without `--sample` must fail, reporting the sample
+/// count, the `--sample` flag, and all three sample names from the header.
+#[test]
+fn test_multi_sample_vcf_without_sample_flag_fails() {
+    let ctx = simple_env();
+    let prefix_path = ctx.output_prefix();
+    let fasta = ctx.fasta_path.to_str().unwrap();
+    let prefix = prefix_path.to_str().unwrap();
+    let vcf_file = ctx.write_vcf_header_only(&["SampleA", "SampleB", "SampleC"], &[("chr1", 1000)]);
+    let vcf = vcf_file.to_str().unwrap();
+    let (passed, _stdout, stderr) =
+        run_simulate(&["simulate", "-r", fasta, "-v", vcf, "-o", prefix, "--coverage", "1"]);
+    assert!(!passed, "should fail when multi-sample VCF has no --sample");
+    assert!(
+        stderr.contains("3 samples") && stderr.contains("--sample"),
+        "error should mention sample count and --sample flag, got: {stderr}"
+    );
+    assert!(
+        stderr.contains("SampleA") && stderr.contains("SampleB") && stderr.contains("SampleC"),
+        "error should list available samples, got: {stderr}"
+    );
+}
+
+/// A VCF declaring exactly one sample needs no `--sample` flag: the run is
+/// expected to succeed.
+#[test]
+fn test_single_sample_vcf_without_sample_flag_works() {
+    let ctx = simple_env();
+    let prefix_path = ctx.output_prefix();
+    let fasta = ctx.fasta_path.to_str().unwrap();
+    let prefix = prefix_path.to_str().unwrap();
+    let vcf_file = ctx.write_vcf_header_only(&["OnlySample"], &[("chr1", 1000)]);
+    let vcf = vcf_file.to_str().unwrap();
+    let (passed, _stdout, stderr) =
+        run_simulate(&["simulate", "-r", fasta, "-v", vcf, "-o", prefix, "--coverage", "1"]);
+    assert!(passed, "single-sample VCF without --sample should succeed: {stderr}");
+}
+
+/// Selecting, via `--sample`, a sample that a multi-sample VCF declares is
+/// expected to succeed.
+#[test]
+fn test_multi_sample_vcf_with_correct_sample_works() {
+    let ctx = simple_env();
+    let prefix_path = ctx.output_prefix();
+    let fasta = ctx.fasta_path.to_str().unwrap();
+    let prefix = prefix_path.to_str().unwrap();
+    let vcf_file = ctx.write_vcf_header_only(&["SampleA", "SampleB"], &[("chr1", 1000)]);
+    let vcf = vcf_file.to_str().unwrap();
+    let (passed, _stdout, stderr) = run_simulate(&[
+        "simulate",
+        "-r",
+        fasta,
+        "-v",
+        vcf,
+        "--sample",
+        "SampleB",
+        "-o",
+        prefix,
+        "--coverage",
+        "1",
+    ]);
+    assert!(passed, "multi-sample VCF with correct --sample should succeed: {stderr}");
+}
+
+/// An output prefix whose parent directory does not exist must be rejected
+/// with an error that says the output directory is missing.
+#[test]
+fn test_output_directory_does_not_exist_fails() {
+    let ctx = simple_env();
+    let missing = ctx.dir.path().join("no_such_dir").join("output");
+    let fasta = ctx.fasta_path.to_str().unwrap();
+    let prefix = missing.to_str().unwrap();
+    let (passed, _stdout, stderr) =
+        run_simulate(&["simulate", "-r", fasta, "-o", prefix, "--coverage", "1"]);
+    assert!(!passed, "should fail when output directory doesn't exist");
+    assert!(
+        stderr.contains("Output directory does not exist"),
+        "error should mention missing output directory, got: {stderr}"
+    );
+}