Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## Unreleased
### Added
- Support for renaming samples while copying (#6)


## [0.1.3] 2025-12-03
### Added
- Option to add source and destination paths to the copy log with a `--verbose` flag (#5)
Expand Down
4 changes: 2 additions & 2 deletions ezfastq/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@


def copy(
sample_names,
sample_name_map,
seq_path,
pair_mode=PairMode.Unspecified,
prefix="",
workdir=Path("."),
subdir="seq",
verbose=False,
):
copier = FastqCopier.from_dir(sample_names, seq_path, prefix=prefix, pair_mode=pair_mode)
copier = FastqCopier.from_dir(sample_name_map, seq_path, prefix=prefix, pair_mode=pair_mode)
copier.copy_files(workdir / subdir)
copier.print_copy_log()
nlogs = len(list((workdir / subdir).glob("copy-log-*.toml")))
Expand Down
23 changes: 19 additions & 4 deletions ezfastq/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@
# -------------------------------------------------------------------------------------------------

from .api import copy
from .namemap import NameMap
from .pair import PairMode
from argparse import ArgumentParser
from importlib.metadata import version
from pathlib import Path
from rich_argparse import RichHelpFormatter
from rich.text import Text
from rich_argparse import RawDescriptionRichHelpFormatter
from shutil import get_terminal_size


def main(arglist=None):
Expand All @@ -35,15 +38,27 @@ def parse_args(arglist=None):
samples_file = Path(args.samples[0])
samples_file_exists = samples_file.is_file() or samples_file.is_fifo()
if len(args.samples) == 1 and samples_file_exists:
args.samples = samples_file.read_text().strip().split("\n")
args.samples = NameMap.from_file(samples_file)
else:
args.samples = NameMap.from_arglist(args.samples)
args.pair_mode = PairMode(args.pair_mode)
return args


def get_parser():
epilog = """
[bold cyan]Examples:[/bold cyan]
[dim]ezfastq /path/to/fastqs/ sample1 sample2 sample3
ezfastq /path/to/fastqs/ s1:Sample1 s2:Sample2 s3:Sample3
ezfastq /path/to/fastqs/ samplenames.txt
ezfastq /path/to/fastqs/ samplenames.txt --workdir /path/to/projectdir/ --subdir seq/Run01/
ezfastq /path/to/fastqs/ samplenames.txt --pair-mode 2[/dim]
"""
width = min(99, get_terminal_size().columns - 2)
parser = ArgumentParser(
description="Copy FASTQ files and use sample names to make filenames consistent",
formatter_class=RichHelpFormatter,
formatter_class=lambda prog: RawDescriptionRichHelpFormatter(prog, width=width),
epilog=epilog,
)
parser.add_argument(
"seq_path",
Expand All @@ -52,7 +67,7 @@ def get_parser():
parser.add_argument(
"samples",
nargs="+",
help="name of one or more samples to process; can be provided as command-line arguments or as a file with one sample name per line",
help="name of one or more samples to process; samples can optionally be renamed by appending a colon and new name to each sample name; alternatively, sample names can be provided as a file with one sample name per line, or two tab-separated values to rename samples",
)
parser.add_argument(
"-v",
Expand Down
14 changes: 8 additions & 6 deletions ezfastq/copier.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from .fastq import FastqFile
from .map import SampleFastqMap
from .namemap import NameMap
from .pair import PairMode
from dataclasses import dataclass
from io import StringIO
Expand All @@ -35,18 +36,18 @@ class FastqCopier:
FASTQ file names are streamlined in the process, and read pairing status is validated.
"""

sample_names: List
sample_name_map: NameMap
copied_files: List
skipped_files: List
file_map: SampleFastqMap
prefix: str = ""

@classmethod
def from_dir(cls, sample_names, data_path, prefix="", pair_mode=PairMode.Unspecified):
def from_dir(cls, sample_name_map, data_path, prefix="", pair_mode=PairMode.Unspecified):
copied_files = list()
skipped_files = list()
file_map = SampleFastqMap.new(sample_names, data_path, pair_mode=pair_mode)
copier = cls(sorted(sample_names), copied_files, skipped_files, file_map, prefix)
file_map = SampleFastqMap.new(sample_name_map.keys(), data_path, pair_mode=pair_mode)
copier = cls(sample_name_map, copied_files, skipped_files, file_map, prefix)
return copier

def copy_files(self, destination):
Expand Down Expand Up @@ -95,7 +96,7 @@ def print_copy_log(self, outstream=sys.stderr):

@property
def length_longest_sample_name(self):
return max(len(sample) for sample in self.sample_names)
return max(len(sample) for sample in self.sample_name_map.keys())

def __len__(self):
return sum(len(fqfiles) for fqfiles in self.file_map.values())
Expand All @@ -105,7 +106,8 @@ def __iter__(self):
for n, fqfile in enumerate(fqfiles, 1):
source_path = Path(fqfile).absolute()
read = 0 if len(fqfiles) == 1 else n
yield FastqFile(source_path, sample_name, read, self.prefix)
new_name = self.sample_name_map[sample_name]
yield FastqFile(source_path, new_name, read, self.prefix)
Comment on lines -108 to +110
Copy link
Member Author

@standage standage Dec 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is really the only place where the new sample name is used. All the other changes in this file are variable name changes that don't really affect how the code works.


def __str__(self):
output = StringIO()
Expand Down
46 changes: 46 additions & 0 deletions ezfastq/namemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -------------------------------------------------------------------------------------------------
# Copyright (c) 2025, DHS. This file is part of ezfastq: https://github.com/bioforensics/ezfastq.
#
# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National
# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the
# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and
# Development Center.
# -------------------------------------------------------------------------------------------------


class NameMap(dict):
    """Mapping of original sample names (keys) to the names they are given on copy (values).

    This class handles parsing sample names from argument lists or files. A sample that is not
    being renamed maps to itself.
    """

    @classmethod
    def from_arglist(cls, arg_list):
        """Build a name map from a list of sample name arguments.

        Each argument is either a bare sample name or `old_name:new_name`.

        Raises SampleNameError if an argument has more than two colon-separated values or
        contains an empty name.
        """
        name_map = cls()
        for argument in arg_list:
            old_name, new_name = cls.parse_name(argument, sep=":")
            name_map[old_name] = new_name
        return name_map

    @classmethod
    def from_file(cls, path):
        """Build a name map from a file with one sample per line.

        Each line holds either a single sample name or two tab-separated values (old name and
        new name). Blank lines are skipped.

        Raises ValueError if the file contains no sample names, and SampleNameError if a line
        has more than two tab-separated values or contains an empty name.
        """
        name_map = cls()
        with open(path, "r") as fh:
            for line in fh:
                # A blank line is not a sample; without this check it would be stored as an
                # empty sample name and hide the "file is empty" condition below.
                if not line.strip():
                    continue
                old_name, new_name = cls.parse_name(line, sep="\t")
                name_map[old_name] = new_name
        if not name_map:
            raise ValueError(f'sample name file "{path}" is empty')
        return name_map

    @staticmethod
    def parse_name(name_string, sep=":"):
        """Split a sample name specification into an (old_name, new_name) pair.

        A specification with a single value maps the name to itself. Whitespace surrounding
        the specification and each of its values is discarded.

        Raises SampleNameError if the specification does not contain exactly 1 or 2 values,
        or if any value is empty.
        """
        name_string = name_string.strip()
        values = [value.strip() for value in name_string.split(sep)]
        num_values = len(values)
        if num_values not in (1, 2):
            message = f'expected 1 or 2 values in sample name, not {num_values}: "{name_string}"'
            raise SampleNameError(message)
        if not all(values):
            # An empty old or new name (e.g. "s1:" or ":new") cannot identify a sample.
            raise SampleNameError(f'sample names cannot be empty: "{name_string}"')
        # With a single value, the first and last element are the same name.
        return values[0], values[-1]


class SampleNameError(ValueError):
    """Raised for a malformed sample name specification."""
44 changes: 34 additions & 10 deletions ezfastq/tests/test_copier.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
# -------------------------------------------------------------------------------------------------

from ezfastq.copier import FastqCopier
from ezfastq.namemap import NameMap
from importlib.resources import files
import pytest
from itertools import product

try:
import tomllib
Expand All @@ -22,7 +23,7 @@


def test_copier_basic():
sample_names = ["test1", "test2"]
sample_names = NameMap.from_arglist(["test1", "test2"])
copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1)
observed = [fqfile.source_path.name for fqfile in copier]
expected = [
Expand All @@ -36,7 +37,7 @@ def test_copier_basic():


def test_copier_copy(tmp_path):
sample_names = ["test1", "test2"]
sample_names = NameMap.from_arglist(["test1", "test2"])
# First pass: copy all 4
copier1 = FastqCopier.from_dir(sample_names, SEQ_PATH_1)
copier1.copy_files(tmp_path)
Expand All @@ -58,14 +59,14 @@ def test_copier_copy(tmp_path):


def test_copier_prefix(tmp_path):
sample_names = ["test2", "test3"]
sample_names = NameMap.from_arglist(["test2", "test3"])
copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1, prefix="abc_")
copier.copy_files(tmp_path)
assert len(list(tmp_path.glob("abc_*.fastq.gz"))) == 4


def test_copier_str_basic(tmp_path):
sample_names = ["test1", "test2", "test3"]
sample_names = NameMap.from_arglist(["test1", "test2", "test3"])
copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1)
copier.copy_files(tmp_path)
observed = str(copier)
Expand All @@ -81,14 +82,14 @@ def test_copier_str_basic(tmp_path):
assert observed.strip() == expected.strip()


def test_copier_str_noop(tmp_path):
sample_names = ["test1", "test2", "test3"]
def test_copier_str_noop():
sample_names = NameMap.from_arglist(["test1", "test2", "test3"])
copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1)
assert str(copier) == ""


def test_copier_str_allskip(tmp_path):
sample_names = ["test1"]
sample_names = NameMap.from_arglist(["test1"])
copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1)
(tmp_path / "test1_R1.fastq.gz").touch()
(tmp_path / "test1_R2.fastq.gz").touch()
Expand All @@ -105,7 +106,7 @@ def test_copier_str_allskip(tmp_path):


def test_copier_str_mixed(tmp_path):
sample_names = ["test1", "test2", "test3"]
sample_names = NameMap.from_arglist(["test1", "test2", "test3"])
copier = FastqCopier.from_dir(sample_names, SEQ_PATH_2)
(tmp_path / "test2_R1.fastq.gz").touch()
(tmp_path / "test2_R2.fastq.gz").touch()
Expand All @@ -128,7 +129,7 @@ def test_copier_str_mixed(tmp_path):


def test_copier_str_roundtrip(tmp_path):
sample_names = ["test1", "test2", "test3"]
sample_names = NameMap.from_arglist(["test1", "test2", "test3"])
copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1)
(tmp_path / "test2_R1.fastq.gz").touch()
(tmp_path / "test2_R2.fastq.gz").touch()
Expand All @@ -143,3 +144,26 @@ def test_copier_str_roundtrip(tmp_path):
observed = copy_data["SkippedFiles"]["already_copied"]
expected = ["test2_R1.fq.gz", "test2_R2.fq.gz"]
assert observed == expected


def test_copier_with_renaming(tmp_path):
sample_names = NameMap.from_arglist(
["test1:TestSampleA", "test2:TestSampleB", "test3:TestSampleC"]
)
Comment on lines +149 to +152
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also added a FastqCopier test to make sure files are renamed correctly on copy.

copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1)
copier.copy_files(tmp_path)
observed = str(copier)
for sample, end in product("ABC", "12"):
fastq = tmp_path / f"TestSample{sample}_R{end}.fastq.gz"
assert fastq.is_file()
print(observed)
expected = """
[CopiedFiles]
"test1_S1_L001_R1_001.fastq.gz" = "TestSampleA_R1.fastq.gz"
"test1_S1_L001_R2_001.fastq.gz" = "TestSampleA_R2.fastq.gz"
"test2_R1.fq.gz" = "TestSampleB_R1.fastq.gz"
"test2_R2.fq.gz" = "TestSampleB_R2.fastq.gz"
"test3-reads-r1.fastq" = "TestSampleC_R1.fastq.gz"
"test3-reads-r2.fastq" = "TestSampleC_R2.fastq.gz"
"""
assert observed.strip() == expected.strip()
63 changes: 63 additions & 0 deletions ezfastq/tests/test_namemap.py
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file has unit tests for the new NameMap class.

Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# -------------------------------------------------------------------------------------------------
# Copyright (c) 2025, DHS. This file is part of ezfastq: https://github.com/bioforensics/ezfastq.
#
# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National
# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the
# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and
# Development Center.
# -------------------------------------------------------------------------------------------------

from ezfastq.namemap import NameMap, SampleNameError
import pytest


@pytest.mark.parametrize(
    ("arglist", "expected"),
    [
        # Bare names map to themselves
        (["s1", "s2", "s3"], {"s1": "s1", "s2": "s2", "s3": "s3"}),
        # Colon-separated pairs rename the sample
        (["s1:Sample1", "s2:Sample2"], {"s1": "Sample1", "s2": "Sample2"}),
        # Renamed and bare names can be mixed in one argument list
        (
            ["1-1:99-12-005-1-1", "99-12-005-1-2", "1-3:99-12-005-1-3"],
            {"1-1": "99-12-005-1-1", "99-12-005-1-2": "99-12-005-1-2", "1-3": "99-12-005-1-3"},
        ),
    ],
)
def test_name_map_from_arglist(arglist, expected):
    """Check that an argument list is parsed into the expected old-to-new name mapping."""
    assert NameMap.from_arglist(arglist) == expected


def test_name_map_bad_arglist():
    """Check that an argument with three colon-separated values raises SampleNameError."""
    expected_error = 'expected 1 or 2 values in sample name, not 3: "s1:Sample:1"'
    too_many_values = ["s1:Sample:1", "s2:Sample:2", "s3:Sample:3"]
    with pytest.raises(SampleNameError, match=expected_error):
        NameMap.from_arglist(too_many_values)


def test_name_map_from_empty_arglist():
    """Check that an empty argument list produces an empty name map."""
    assert len(NameMap.from_arglist([])) == 0


@pytest.mark.parametrize(
    ("contents", "expected"),
    [
        # One sample name per line: names map to themselves
        ("s1\ns2\ns3", {"s1": "s1", "s2": "s2", "s3": "s3"}),
        # Two tab-separated values per line: first value is renamed to the second
        (
            "1-1\t99-12-005-1-1\n1-2\t99-12-005-1-2\n1-3\t99-12-005-1-3",
            {"1-1": "99-12-005-1-1", "1-2": "99-12-005-1-2", "1-3": "99-12-005-1-3"},
        ),
    ],
)
def test_name_map_from_file(contents, expected, tmp_path):
    """Check that a sample name file is parsed into the expected old-to-new name mapping."""
    map_path = tmp_path / "map_file.txt"
    map_path.write_text(contents)
    assert NameMap.from_file(map_path) == expected


def test_name_map_from_empty_file(tmp_path):
    """Check that a file with no contents raises a ValueError reporting it as empty."""
    empty_path = tmp_path / "map_file.txt"
    empty_path.touch()
    with pytest.raises(ValueError, match=r"sample name file .* is empty"):
        NameMap.from_file(empty_path)