diff --git a/CHANGELOG.md b/CHANGELOG.md index 067cf36..d20aedb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## Unreleased +### Added +- Support for renaming samples while copying (#6) + + ## [0.1.3] 2025-12-03 ### Added - Option to add source and destination paths to the copy log with a `--verbose` flag (#5) diff --git a/ezfastq/api.py b/ezfastq/api.py index e587618..d3ea708 100644 --- a/ezfastq/api.py +++ b/ezfastq/api.py @@ -13,7 +13,7 @@ def copy( - sample_names, + sample_name_map, seq_path, pair_mode=PairMode.Unspecified, prefix="", @@ -21,7 +21,7 @@ def copy( subdir="seq", verbose=False, ): - copier = FastqCopier.from_dir(sample_names, seq_path, prefix=prefix, pair_mode=pair_mode) + copier = FastqCopier.from_dir(sample_name_map, seq_path, prefix=prefix, pair_mode=pair_mode) copier.copy_files(workdir / subdir) copier.print_copy_log() nlogs = len(list((workdir / subdir).glob("copy-log-*.toml"))) diff --git a/ezfastq/cli.py b/ezfastq/cli.py index 243a5d2..b3ec2b3 100644 --- a/ezfastq/cli.py +++ b/ezfastq/cli.py @@ -8,11 +8,14 @@ # ------------------------------------------------------------------------------------------------- from .api import copy +from .namemap import NameMap from .pair import PairMode from argparse import ArgumentParser from importlib.metadata import version from pathlib import Path -from rich_argparse import RichHelpFormatter +from rich.text import Text +from rich_argparse import RawDescriptionRichHelpFormatter +from shutil import get_terminal_size def main(arglist=None): @@ -35,15 +38,27 @@ def parse_args(arglist=None): samples_file = Path(args.samples[0]) samples_file_exists = samples_file.is_file() or samples_file.is_fifo() if len(args.samples) == 1 and samples_file_exists: - 
args.samples = samples_file.read_text().strip().split("\n") + args.samples = NameMap.from_file(samples_file) + else: + args.samples = NameMap.from_arglist(args.samples) args.pair_mode = PairMode(args.pair_mode) return args def get_parser(): + epilog = """ +[bold cyan]Examples:[/bold cyan] + [dim]ezfastq /path/to/fastqs/ sample1 sample2 sample3 + ezfastq /path/to/fastqs/ s1:Sample1 s2:Sample2 s3:Sample3 + ezfastq /path/to/fastqs/ samplenames.txt + ezfastq /path/to/fastqs/ samplenames.txt --workdir /path/to/projectdir/ --subdir seq/Run01/ + ezfastq /path/to/fastqs/ samplenames.txt --pair-mode 2[/dim] +""" + width = min(99, get_terminal_size().columns - 2) parser = ArgumentParser( description="Copy FASTQ files and use sample names to make filenames consistent", - formatter_class=RichHelpFormatter, + formatter_class=lambda prog: RawDescriptionRichHelpFormatter(prog, width=width), + epilog=epilog, ) parser.add_argument( "seq_path", @@ -52,7 +67,7 @@ def get_parser(): parser.add_argument( "samples", nargs="+", - help="name of one or more samples to process; can be provided as command-line arguments or as a file with one sample name per line", + help="name of one or more samples to process; samples can optionally be renamed by appending a colon and new name to each sample name; alternatively, sample names can be provided as a file with one sample name per line, or two tab-separated values to rename samples", ) parser.add_argument( "-v", diff --git a/ezfastq/copier.py b/ezfastq/copier.py index 0af1e3d..a2573ee 100644 --- a/ezfastq/copier.py +++ b/ezfastq/copier.py @@ -9,6 +9,7 @@ from .fastq import FastqFile from .map import SampleFastqMap +from .namemap import NameMap from .pair import PairMode from dataclasses import dataclass from io import StringIO @@ -35,18 +36,18 @@ class FastqCopier: FASTQ file names are streamlined in the process, and read pairing status is validated. 
""" - sample_names: List + sample_name_map: NameMap copied_files: List skipped_files: List file_map: SampleFastqMap prefix: str = "" @classmethod - def from_dir(cls, sample_names, data_path, prefix="", pair_mode=PairMode.Unspecified): + def from_dir(cls, sample_name_map, data_path, prefix="", pair_mode=PairMode.Unspecified): copied_files = list() skipped_files = list() - file_map = SampleFastqMap.new(sample_names, data_path, pair_mode=pair_mode) - copier = cls(sorted(sample_names), copied_files, skipped_files, file_map, prefix) + file_map = SampleFastqMap.new(sample_name_map.keys(), data_path, pair_mode=pair_mode) + copier = cls(sample_name_map, copied_files, skipped_files, file_map, prefix) return copier def copy_files(self, destination): @@ -95,7 +96,7 @@ def print_copy_log(self, outstream=sys.stderr): @property def length_longest_sample_name(self): - return max(len(sample) for sample in self.sample_names) + return max(len(sample) for sample in self.sample_name_map.keys()) def __len__(self): return sum(len(fqfiles) for fqfiles in self.file_map.values()) @@ -105,7 +106,8 @@ def __iter__(self): for n, fqfile in enumerate(fqfiles, 1): source_path = Path(fqfile).absolute() read = 0 if len(fqfiles) == 1 else n - yield FastqFile(source_path, sample_name, read, self.prefix) + new_name = self.sample_name_map[sample_name] + yield FastqFile(source_path, new_name, read, self.prefix) def __str__(self): output = StringIO() diff --git a/ezfastq/namemap.py b/ezfastq/namemap.py new file mode 100644 index 0000000..672f1da --- /dev/null +++ b/ezfastq/namemap.py @@ -0,0 +1,46 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2025, DHS. This file is part of ezfastq: https://github.com/bioforensics/ezfastq. 
#
# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National
# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the
# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and
# Development Center.
# -------------------------------------------------------------------------------------------------


class NameMap(dict):
    """Mapping from each original sample name to its new name.

    A sample that is not renamed maps to itself, so every requested sample appears as a key.
    """

    @classmethod
    def from_arglist(cls, arg_list):
        """Build a name map from a list of strings.

        Each string is either a bare sample name ("s1") or an old/new pair joined by a colon
        ("s1:Sample1"). An empty list yields an empty map. If the same old name occurs more than
        once, the last occurrence wins.

        Raises SampleNameError if any string is malformed (see parse_name).
        """
        return cls(cls.parse_name(argument, sep=":") for argument in arg_list)

    @classmethod
    def from_file(cls, path):
        """Build a name map from a text file with one sample per line.

        Each non-blank line holds either a single sample name or two tab-separated values
        (old name, new name). Blank and whitespace-only lines are ignored.

        Raises ValueError if the file contains no sample names, and SampleNameError if a line
        is malformed (see parse_name).
        """
        name_map = cls()
        with open(path, "r") as fh:
            for line in fh:
                if not line.strip():
                    # A trailing newline or spacer line carries no sample name; skip it so it
                    # does not become an empty-string entry in the map.
                    continue
                old_name, new_name = cls.parse_name(line, sep="\t")
                name_map[old_name] = new_name
        if not name_map:
            raise ValueError(f'sample name file "{path}" is empty')
        return name_map

    @staticmethod
    def parse_name(name_string, sep=":"):
        """Split one sample specification into an (old_name, new_name) pair.

        Surrounding whitespace is stripped first. A string without the separator names a sample
        that keeps its name, so both elements of the pair are equal.

        Raises SampleNameError if the string holds more than two separated values, or if the old
        or new name is empty (e.g. "", "s1:" or ":Sample1").
        """
        name_string = name_string.strip()
        num_values = name_string.count(sep) + 1
        if num_values > 2:
            message = f'expected 1 or 2 values in sample name, not {num_values}: "{name_string}"'
            raise SampleNameError(message)
        old_name, _, new_name = name_string.partition(sep)
        if num_values == 1:
            new_name = old_name
        if not old_name or not new_name:
            # An empty name cannot match any input file and would produce output file names
            # with no sample component, so fail early with a clear message.
            raise SampleNameError(f'empty sample name in "{name_string}"')
        return old_name, new_name


class SampleNameError(ValueError):
    """Raised when a sample name specification cannot be parsed."""
sample_names = NameMap.from_arglist(["test1", "test2"]) copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) observed = [fqfile.source_path.name for fqfile in copier] expected = [ @@ -36,7 +37,7 @@ def test_copier_basic(): def test_copier_copy(tmp_path): - sample_names = ["test1", "test2"] + sample_names = NameMap.from_arglist(["test1", "test2"]) # First pass: copy all 4 copier1 = FastqCopier.from_dir(sample_names, SEQ_PATH_1) copier1.copy_files(tmp_path) @@ -58,14 +59,14 @@ def test_copier_copy(tmp_path): def test_copier_prefix(tmp_path): - sample_names = ["test2", "test3"] + sample_names = NameMap.from_arglist(["test2", "test3"]) copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1, prefix="abc_") copier.copy_files(tmp_path) assert len(list(tmp_path.glob("abc_*.fastq.gz"))) == 4 def test_copier_str_basic(tmp_path): - sample_names = ["test1", "test2", "test3"] + sample_names = NameMap.from_arglist(["test1", "test2", "test3"]) copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) copier.copy_files(tmp_path) observed = str(copier) @@ -81,14 +82,14 @@ def test_copier_str_basic(tmp_path): assert observed.strip() == expected.strip() -def test_copier_str_noop(tmp_path): - sample_names = ["test1", "test2", "test3"] +def test_copier_str_noop(): + sample_names = NameMap.from_arglist(["test1", "test2", "test3"]) copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) assert str(copier) == "" def test_copier_str_allskip(tmp_path): - sample_names = ["test1"] + sample_names = NameMap.from_arglist(["test1"]) copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) (tmp_path / "test1_R1.fastq.gz").touch() (tmp_path / "test1_R2.fastq.gz").touch() @@ -105,7 +106,7 @@ def test_copier_str_allskip(tmp_path): def test_copier_str_mixed(tmp_path): - sample_names = ["test1", "test2", "test3"] + sample_names = NameMap.from_arglist(["test1", "test2", "test3"]) copier = FastqCopier.from_dir(sample_names, SEQ_PATH_2) (tmp_path / "test2_R1.fastq.gz").touch() (tmp_path / 
def test_copier_with_renaming(tmp_path):
    """Copy three samples under new names; check the files on disk and the copy log."""
    renames = ["test1:TestSampleA", "test2:TestSampleB", "test3:TestSampleC"]
    copier = FastqCopier.from_dir(NameMap.from_arglist(renames), SEQ_PATH_1)
    copier.copy_files(tmp_path)
    observed = str(copier)
    # Every renamed sample should have both an R1 and an R2 file in the destination.
    for letter, read in product("ABC", "12"):
        assert (tmp_path / f"TestSample{letter}_R{read}.fastq.gz").is_file()
    print(observed)
    expected = """
[CopiedFiles]
"test1_S1_L001_R1_001.fastq.gz" = "TestSampleA_R1.fastq.gz"
"test1_S1_L001_R2_001.fastq.gz" = "TestSampleA_R2.fastq.gz"
"test2_R1.fq.gz" = "TestSampleB_R1.fastq.gz"
"test2_R2.fq.gz" = "TestSampleB_R2.fastq.gz"
"test3-reads-r1.fastq" = "TestSampleC_R1.fastq.gz"
"test3-reads-r2.fastq" = "TestSampleC_R2.fastq.gz"
"""
    assert observed.strip() == expected.strip()
#
# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National
# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the
# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and
# Development Center.
# -------------------------------------------------------------------------------------------------

from ezfastq.namemap import NameMap, SampleNameError
import pytest


# (argument list, expected mapping) pairs: plain names, renamed names, and a mix of both.
ARGLIST_CASES = [
    (["s1", "s2", "s3"], {"s1": "s1", "s2": "s2", "s3": "s3"}),
    (["s1:Sample1", "s2:Sample2"], {"s1": "Sample1", "s2": "Sample2"}),
    (
        ["1-1:99-12-005-1-1", "99-12-005-1-2", "1-3:99-12-005-1-3"],
        {"1-1": "99-12-005-1-1", "99-12-005-1-2": "99-12-005-1-2", "1-3": "99-12-005-1-3"},
    ),
]

# (file contents, expected mapping) pairs: one name per line, and tab-separated old/new names.
FILE_CASES = [
    ("s1\ns2\ns3", {"s1": "s1", "s2": "s2", "s3": "s3"}),
    (
        "1-1\t99-12-005-1-1\n1-2\t99-12-005-1-2\n1-3\t99-12-005-1-3",
        {"1-1": "99-12-005-1-1", "1-2": "99-12-005-1-2", "1-3": "99-12-005-1-3"},
    ),
]


@pytest.mark.parametrize(["arglist", "expected"], ARGLIST_CASES)
def test_name_map_from_arglist(arglist, expected):
    """Names given as arguments map to themselves unless a colon supplies a new name."""
    assert NameMap.from_arglist(arglist) == expected


def test_name_map_bad_arglist():
    """An argument with more than one colon is rejected, reporting the first offender."""
    bad_args = ["s1:Sample:1", "s2:Sample:2", "s3:Sample:3"]
    with pytest.raises(
        SampleNameError,
        match='expected 1 or 2 values in sample name, not 3: "s1:Sample:1"',
    ):
        NameMap.from_arglist(bad_args)


def test_name_map_from_empty_arglist():
    """An empty argument list produces an empty map rather than an error."""
    empty_map = NameMap.from_arglist([])
    assert len(empty_map) == 0


@pytest.mark.parametrize(["contents", "expected"], FILE_CASES)
def test_name_map_from_file(contents, expected, tmp_path):
    """A sample name file is parsed line by line into the expected mapping."""
    names_path = tmp_path / "map_file.txt"
    names_path.write_text(contents)
    assert NameMap.from_file(names_path) == expected


def test_name_map_from_empty_file(tmp_path):
    """A zero-byte sample name file is reported as empty."""
    names_path = tmp_path / "map_file.txt"
    names_path.touch()
    with pytest.raises(ValueError, match=r"sample name file .* is empty"):
        NameMap.from_file(names_path)