-
Notifications
You must be signed in to change notification settings - Fork 0
Feature: rename samples #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| # ------------------------------------------------------------------------------------------------- | ||
| # Copyright (c) 2025, DHS. This file is part of ezfastq: https://github.com/bioforensics/ezfastq. | ||
| # | ||
| # This software was prepared for the Department of Homeland Security (DHS) by the Battelle National | ||
| # Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the | ||
| # National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and | ||
| # Development Center. | ||
| # ------------------------------------------------------------------------------------------------- | ||
|
|
||
|
|
||
class NameMap(dict):
    """Mapping of original sample names to the names they are renamed to.

    This class handles parsing sample names from argument lists or files.
    Every entry is either a bare name, which maps to itself, or an
    ``old<sep>new`` pair. If the same original name occurs more than once,
    the last occurrence wins.
    """

    @classmethod
    def from_arglist(cls, arg_list):
        """Build a name map from strings of the form ``name`` or ``old:new``.

        Raises:
            SampleNameError: if an argument holds more than two
                colon-separated values.
        """
        name_map = cls()
        for argument in arg_list:
            old_name, new_name = cls.parse_name(argument, sep=":")
            name_map[old_name] = new_name
        return name_map

    @classmethod
    def from_file(cls, path):
        """Build a name map from a text file with one sample per line.

        Each line holds either a single name or an old name and a new name
        separated by a tab. Blank (whitespace-only) lines are ignored.

        Raises:
            SampleNameError: if a line holds more than two tab-separated
                values.
            ValueError: if the file contains no sample names at all.
        """
        name_map = cls()
        with open(path, "r") as fh:
            for line in fh:
                # A blank line would otherwise be parsed as an empty sample
                # name and registered as a bogus "" -> "" entry.
                if not line.strip():
                    continue
                old_name, new_name = cls.parse_name(line, sep="\t")
                name_map[old_name] = new_name
        if not name_map:
            raise ValueError(f'sample name file "{path}" is empty')
        return name_map

    @staticmethod
    def parse_name(name_string, sep=":"):
        """Split one sample specification into ``(old_name, new_name)``.

        Surrounding whitespace is stripped first. A specification without
        the separator maps the name to itself.

        Raises:
            SampleNameError: if the specification holds more than two
                values.
        """
        name_string = name_string.strip()
        num_values = name_string.count(sep) + 1
        if num_values not in (1, 2):
            message = f'expected 1 or 2 values in sample name, not {num_values}: "{name_string}"'
            raise SampleNameError(message)
        if num_values == 1:
            return name_string, name_string
        old_name, new_name = name_string.split(sep)
        return old_name, new_name
|
|
||
|
|
||
class SampleNameError(ValueError):
    """Raised when a sample name specification cannot be parsed."""
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,8 +8,9 @@ | |
| # ------------------------------------------------------------------------------------------------- | ||
|
|
||
| from ezfastq.copier import FastqCopier | ||
| from ezfastq.namemap import NameMap | ||
| from importlib.resources import files | ||
| import pytest | ||
| from itertools import product | ||
|
|
||
| try: | ||
| import tomllib | ||
|
|
@@ -22,7 +23,7 @@ | |
|
|
||
|
|
||
| def test_copier_basic(): | ||
| sample_names = ["test1", "test2"] | ||
| sample_names = NameMap.from_arglist(["test1", "test2"]) | ||
| copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) | ||
| observed = [fqfile.source_path.name for fqfile in copier] | ||
| expected = [ | ||
|
|
@@ -36,7 +37,7 @@ def test_copier_basic(): | |
|
|
||
|
|
||
| def test_copier_copy(tmp_path): | ||
| sample_names = ["test1", "test2"] | ||
| sample_names = NameMap.from_arglist(["test1", "test2"]) | ||
| # First pass: copy all 4 | ||
| copier1 = FastqCopier.from_dir(sample_names, SEQ_PATH_1) | ||
| copier1.copy_files(tmp_path) | ||
|
|
@@ -58,14 +59,14 @@ def test_copier_copy(tmp_path): | |
|
|
||
|
|
||
| def test_copier_prefix(tmp_path): | ||
| sample_names = ["test2", "test3"] | ||
| sample_names = NameMap.from_arglist(["test2", "test3"]) | ||
| copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1, prefix="abc_") | ||
| copier.copy_files(tmp_path) | ||
| assert len(list(tmp_path.glob("abc_*.fastq.gz"))) == 4 | ||
|
|
||
|
|
||
| def test_copier_str_basic(tmp_path): | ||
| sample_names = ["test1", "test2", "test3"] | ||
| sample_names = NameMap.from_arglist(["test1", "test2", "test3"]) | ||
| copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) | ||
| copier.copy_files(tmp_path) | ||
| observed = str(copier) | ||
|
|
@@ -81,14 +82,14 @@ def test_copier_str_basic(tmp_path): | |
| assert observed.strip() == expected.strip() | ||
|
|
||
|
|
||
| def test_copier_str_noop(tmp_path): | ||
| sample_names = ["test1", "test2", "test3"] | ||
| def test_copier_str_noop(): | ||
| sample_names = NameMap.from_arglist(["test1", "test2", "test3"]) | ||
| copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) | ||
| assert str(copier) == "" | ||
|
|
||
|
|
||
| def test_copier_str_allskip(tmp_path): | ||
| sample_names = ["test1"] | ||
| sample_names = NameMap.from_arglist(["test1"]) | ||
| copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) | ||
| (tmp_path / "test1_R1.fastq.gz").touch() | ||
| (tmp_path / "test1_R2.fastq.gz").touch() | ||
|
|
@@ -105,7 +106,7 @@ def test_copier_str_allskip(tmp_path): | |
|
|
||
|
|
||
| def test_copier_str_mixed(tmp_path): | ||
| sample_names = ["test1", "test2", "test3"] | ||
| sample_names = NameMap.from_arglist(["test1", "test2", "test3"]) | ||
| copier = FastqCopier.from_dir(sample_names, SEQ_PATH_2) | ||
| (tmp_path / "test2_R1.fastq.gz").touch() | ||
| (tmp_path / "test2_R2.fastq.gz").touch() | ||
|
|
@@ -128,7 +129,7 @@ def test_copier_str_mixed(tmp_path): | |
|
|
||
|
|
||
| def test_copier_str_roundtrip(tmp_path): | ||
| sample_names = ["test1", "test2", "test3"] | ||
| sample_names = NameMap.from_arglist(["test1", "test2", "test3"]) | ||
| copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) | ||
| (tmp_path / "test2_R1.fastq.gz").touch() | ||
| (tmp_path / "test2_R2.fastq.gz").touch() | ||
|
|
@@ -143,3 +144,26 @@ def test_copier_str_roundtrip(tmp_path): | |
| observed = copy_data["SkippedFiles"]["already_copied"] | ||
| expected = ["test2_R1.fq.gz", "test2_R2.fq.gz"] | ||
| assert observed == expected | ||
|
|
||
|
|
||
| def test_copier_with_renaming(tmp_path): | ||
| sample_names = NameMap.from_arglist( | ||
| ["test1:TestSampleA", "test2:TestSampleB", "test3:TestSampleC"] | ||
| ) | ||
|
Comment on lines
+149
to
+152
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also added a test for the sample renaming feature.
||
| copier = FastqCopier.from_dir(sample_names, SEQ_PATH_1) | ||
| copier.copy_files(tmp_path) | ||
| observed = str(copier) | ||
| for sample, end in product("ABC", "12"): | ||
| fastq = tmp_path / f"TestSample{sample}_R{end}.fastq.gz" | ||
| assert fastq.is_file() | ||
| print(observed) | ||
| expected = """ | ||
| [CopiedFiles] | ||
| "test1_S1_L001_R1_001.fastq.gz" = "TestSampleA_R1.fastq.gz" | ||
| "test1_S1_L001_R2_001.fastq.gz" = "TestSampleA_R2.fastq.gz" | ||
| "test2_R1.fq.gz" = "TestSampleB_R1.fastq.gz" | ||
| "test2_R2.fq.gz" = "TestSampleB_R2.fastq.gz" | ||
| "test3-reads-r1.fastq" = "TestSampleC_R1.fastq.gz" | ||
| "test3-reads-r2.fastq" = "TestSampleC_R2.fastq.gz" | ||
| """ | ||
| assert observed.strip() == expected.strip() | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This file has unit tests for the new `NameMap` class.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| # ------------------------------------------------------------------------------------------------- | ||
| # Copyright (c) 2025, DHS. This file is part of ezfastq: https://github.com/bioforensics/ezfastq. | ||
| # | ||
| # This software was prepared for the Department of Homeland Security (DHS) by the Battelle National | ||
| # Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the | ||
| # National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and | ||
| # Development Center. | ||
| # ------------------------------------------------------------------------------------------------- | ||
|
|
||
| from ezfastq.namemap import NameMap, SampleNameError | ||
| import pytest | ||
|
|
||
|
|
||
@pytest.mark.parametrize(
    ("arglist", "expected"),
    [
        (["s1", "s2", "s3"], {"s1": "s1", "s2": "s2", "s3": "s3"}),
        (["s1:Sample1", "s2:Sample2"], {"s1": "Sample1", "s2": "Sample2"}),
        (
            ["1-1:99-12-005-1-1", "99-12-005-1-2", "1-3:99-12-005-1-3"],
            {"1-1": "99-12-005-1-1", "99-12-005-1-2": "99-12-005-1-2", "1-3": "99-12-005-1-3"},
        ),
    ],
)
def test_name_map_from_arglist(arglist, expected):
    """Bare names map to themselves; ``old:new`` arguments map old to new."""
    assert NameMap.from_arglist(arglist) == expected
|
|
||
|
|
||
def test_name_map_bad_arglist():
    """An argument with three colon-separated values is rejected."""
    bad_args = ["s1:Sample:1", "s2:Sample:2", "s3:Sample:3"]
    # Only the first offending argument is reported, since parsing stops there.
    expected_message = 'expected 1 or 2 values in sample name, not 3: "s1:Sample:1"'
    with pytest.raises(SampleNameError, match=expected_message):
        NameMap.from_arglist(bad_args)
|
|
||
|
|
||
def test_name_map_from_empty_arglist():
    """An empty argument list yields an empty map rather than an error."""
    assert len(NameMap.from_arglist([])) == 0
|
|
||
|
|
||
@pytest.mark.parametrize(
    ("contents", "expected"),
    [
        ("s1\ns2\ns3", {"s1": "s1", "s2": "s2", "s3": "s3"}),
        (
            "1-1\t99-12-005-1-1\n1-2\t99-12-005-1-2\n1-3\t99-12-005-1-3",
            {"1-1": "99-12-005-1-1", "1-2": "99-12-005-1-2", "1-3": "99-12-005-1-3"},
        ),
    ],
)
def test_name_map_from_file(contents, expected, tmp_path):
    """Single-column and tab-separated two-column files are both parsed."""
    map_path = tmp_path / "map_file.txt"
    map_path.write_text(contents)
    assert NameMap.from_file(map_path) == expected
|
|
||
|
|
||
def test_name_map_from_empty_file(tmp_path):
    """A file with no content is reported as empty via ValueError."""
    empty_path = tmp_path / "map_file.txt"
    empty_path.touch()
    with pytest.raises(ValueError, match=r"sample name file .* is empty"):
        NameMap.from_file(empty_path)
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is really the only place where the new sample name is used. All the other changes in this file are variable name changes that don't really affect how the code works.