diff --git a/lib/kb_bedtools/kb_bedtoolsImpl.py b/lib/kb_bedtools/kb_bedtoolsImpl.py index eb5b813..570788e 100644 --- a/lib/kb_bedtools/kb_bedtoolsImpl.py +++ b/lib/kb_bedtools/kb_bedtoolsImpl.py @@ -9,8 +9,7 @@ from installed_clients.ReadsUtilsClient import ReadsUtils from base import Core -from kb_bedtools.utils import Intersection -from kb_bedtools.utils import BamConversion +from kb_bedtools.utils import BamConversion, Intersection #END_HEADER diff --git a/lib/kb_bedtools/utils.py b/lib/kb_bedtools/utils.py index 94d9dc0..16738fb 100644 --- a/lib/kb_bedtools/utils.py +++ b/lib/kb_bedtools/utils.py @@ -3,6 +3,8 @@ import logging import os import subprocess +import uuid +import shutil from collections import Counter from shutil import copyfile @@ -45,7 +47,12 @@ def do_analysis(self, params: dict): wsname = params['workspace_name'] sequencing_tech = 'Illumina' interleaved = params['interleaved'] - fastq_path = self.bam_to_fastq(staging_path, shared_folder=self.shared_folder) + if params.get('paired_end'): + fastq_path = self.bam_to_fastq_paired(staging_path, shared_folder=self.shared_folder) + else: + fastq_path = self.bam_to_fastq(staging_path, shared_folder=self.shared_folder) + + reads_result = self.upload_reads(output_name, fastq_path, wsname, sequencing_tech, interleaved) @@ -69,27 +76,63 @@ def do_analysis(self, params: dict): @classmethod def bam_to_fastq(cls, bam_file, shared_folder=""): # add a dict parameter so those parameter could be use - with open(bam_file, 'rb') as file: - bam_data = file.read().decode('utf-8', 'ignore') - # best to use logging here so that messages are more visible + if not os.path.isfile(bam_file): + raise FileNotFoundError(f"{bam_file} not found") + + unique_id = str(uuid.uuid4())[:8] + temp_fastq = f"filename_end1_{unique_id}.fq" + output_path = os.path.join(shared_folder, f"output_{unique_id}.fq") + logging.warning(f'{">"*20}{os.getcwd()}') + with subprocess.Popen([ - 'bedtools', 'bamtofastq', '-i', bam_file, '-fq', 
'filename_end1.fq' + 'bedtools', 'bamtofastq', '-i', bam_file, '-fq', temp_fastq ]) as proc: proc.wait() - if not os.path.exists("filename_end1.fq"): - raise FileNotFoundError("bedtools did not create FASTQ file") - if os.path.getsize("filename_end1.fq") < 100: + if not os.path.exists(temp_fastq): + raise FileNotFoundError("bedtools did not create FASTQ file") + + if os.path.getsize(temp_fastq) < 100: raise ValueError("Generated FASTQ file is unexpectedly small — check input BAM or bedtools error") - output_path = os.path.join(shared_folder, 'output.fq') - copyfile('filename_end1.fq', output_path) - + shutil.copyfile(temp_fastq, output_path) return output_path - - def upload_reads(self, name, reads_path, workspace_name, sequencing_tech, interleaved): + @classmethod + def bam_to_fastq_paired(cls, bam_file, shared_folder=""): + if not os.path.isfile(bam_file): + raise FileNotFoundError(f"{bam_file} not found") + + unique_id = str(uuid.uuid4())[:8] + read1_file = f"filename_end1_{unique_id}.fq" + read2_file = f"filename_end2_{unique_id}.fq" + + + logging.warning(f"{'>'*20}{os.getcwd()}") + + with subprocess.Popen([ + 'bedtools', 'bamtofastq', + '-i', bam_file, + '-fq', read1_file, + '-fq2', read2_file + ]) as proc: + proc.wait() + + if not os.path.exists(read1_file) or not os.path.exists(read2_file): + raise FileNotFoundError("Paired-end FASTQ files were not created") + + + output1 = os.path.join(shared_folder, f"read1_{unique_id}.fq") + output2 = os.path.join(shared_folder, f"read2_{unique_id}.fq") + + shutil.copyfile(read1_file, output1) + shutil.copyfile(read2_file, output2) + + return {"read1": output1, "read2": output2} + + def upload_reads(self, name, reads_path, workspace_name, + sequencing_tech=None, interleaved=None): """ Upload reads back to the KBase Workspace. This method only uses the minimal parameters necessary to provide a demonstration. 
There are many diff --git a/test/kb_bedtools_server_test.py b/test/kb_bedtools_server_test.py index 30fc07c..3434376 100644 --- a/test/kb_bedtools_server_test.py +++ b/test/kb_bedtools_server_test.py @@ -95,6 +95,7 @@ def test_intersect(self): "output_name": "intersectOutput", }) + def test_bamtofastq(self): params = { "workspace_name": self.wsName, "reads_ref": "70257/2/1", diff --git a/test/minimal.bam b/test/minimal.bam index 90a0dcd..8d7e3cf 100644 Binary files a/test/minimal.bam and b/test/minimal.bam differ diff --git a/test/unit_tests/test_kb_bedtools_utils.py b/test/unit_tests/test_kb_bedtools_utils.py index 78327d0..f367054 100644 --- a/test/unit_tests/test_kb_bedtools_utils.py +++ b/test/unit_tests/test_kb_bedtools_utils.py @@ -1,7 +1,8 @@ import logging import subprocess - +import os import pytest +from kb_bedtools.utils import BamConversion, Intersection @pytest.fixture @@ -15,3 +16,71 @@ def process(): def test_process(process): logging.info("Running a test") assert 1 == 1 + +def test_bam_to_fastq_creates_output(tmp_path): + output_dir = tmp_path / "output" + output_dir.mkdir() + + bam_path = os.path.join(os.path.dirname(__file__), "..", "minimal.bam") + + result_path = BamConversion.bam_to_fastq(bam_path, str(output_dir)) + + assert os.path.exists(result_path), "FASTQ output file was not created" + with open(result_path) as f: + contents = f.read() + assert contents.startswith("@"), "FASTQ output does not appear valid" + +def test_bam_to_fastq_invalid_input_raises(): + with pytest.raises(Exception): + BamConversion.bam_to_fastq("/invalid/path/to/file.bam") + +def test_bam_to_fastq_empty_file_raises(tmp_path): + empty_bam = tmp_path / "empty.bam" + empty_bam.touch() + + with pytest.raises(Exception): + BamConversion.bam_to_fastq(str(empty_bam), str(tmp_path)) + + +def test_bam_to_fastq_multiple_outputs(tmp_path): + bam_path = os.path.join(os.path.dirname(__file__), "..", "minimal.bam") + + result1 = BamConversion.bam_to_fastq(bam_path, tmp_path) + 
result2 = BamConversion.bam_to_fastq(bam_path, tmp_path) + + assert result1 != result2, "Multiple conversions should create unique output files" + +def test_bam_to_fastq_paired_end_output(tmp_path): + bam_path = os.path.join(os.path.dirname(__file__), "..", "minimal.bam") + + result_paths = BamConversion.bam_to_fastq_paired( + bam_file=bam_path, + shared_folder=str(tmp_path) + ) + + try: + + assert isinstance(result_paths, dict), "Output should be a dictionary" + assert "read1" in result_paths and "read2" in result_paths, "Missing paired-end keys" + assert os.path.exists(result_paths["read1"]), "FASTQ for read 1 not created" + assert os.path.exists(result_paths["read2"]), "FASTQ for read 2 not created" + + + with open(result_paths["read1"]) as f1, open(result_paths["read2"]) as f2: + content1 = f1.read(1) + content2 = f2.read(1) + + if content1: + assert content1 == "@", "Read 1 output is not a valid FASTQ file" + else: + print("Warning: Read 1 FASTQ file is empty") + + if content2: + assert content2 == "@", "Read 2 output is not a valid FASTQ file" + else: + print ("Warning: Read 2 FASTQ file is empty") + + finally: + for path in result_paths.values(): + if os.path.exists(path): + os.remove(path) diff --git a/ui/narrative/methods/run_kb_bedtools/display.yaml b/ui/narrative/methods/run_kb_bedtools/display.yaml index 1220db5..22a41e9 100644 --- a/ui/narrative/methods/run_kb_bedtools/display.yaml +++ b/ui/narrative/methods/run_kb_bedtools/display.yaml @@ -59,5 +59,6 @@ parameters : long-hint : | Select if reads file is interleaved + description : | -
Takes a BAM file and returns a FASTQ file.
\ No newline at end of file +Takes a BAM file and returns a FASTQ file.