From b55ae00d78908753812dbc262fd45ef4f82dd7b6 Mon Sep 17 00:00:00 2001
From: Mat Crocker
Date: Mon, 3 Feb 2025 12:46:23 -0600
Subject: [PATCH 1/3] Implement unified Python runner with gzip handling fix

- Add unified Python runner (laava_run.py) that orchestrates:
  * Vector type analysis
  * Reference name generation
  * Read mapping
  * Alignment analysis
- Add gzipped file handling in summarize_alignment.py
- Add minimal conda environment
- Update documentation with Python runner usage and examples
---
 README.md                   |  29 ++++-
 laava_minimal.conda_env.yml |  13 +++
 pyproject.toml              |  24 ++++
 src/__init__.py             |   1 +
 src/laava_run.py            | 215 ++++++++++++++++++++++++++++++++++++
 src/map_reads.sh            |  18 +--
 src/summarize_alignment.py  |  13 ++-
 7 files changed, 296 insertions(+), 17 deletions(-)
 create mode 100644 laava_minimal.conda_env.yml
 create mode 100644 src/laava_run.py

diff --git a/README.md b/README.md
index ff5072c..a64a4bc 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,28 @@ Then run it interactively in your current directory:
 docker run -v $(pwd):$(pwd) -w $(pwd) -it ghcr.io/formbio/laava_dev:latest bash
 ```
 
+### Python Runner
+
+LAAVA now includes a unified Python-based runner that orchestrates the entire analysis pipeline:
+
+```bash
+python src/laava_run.py --input-bam <reads.bam> --vector-fasta <vector.fasta> \
+    --annotation-bed <annotation.bed> --itr-label ITR
+```
+
+This runner coordinates vector type analysis, read mapping, and alignment analysis in a single command. A minimal conda environment (`laava_minimal.conda_env.yml`) is also available for running the core analysis components.
+
+Example using the test data:
+```bash
+# Using the included test samples
+python src/laava_run.py \
+    --input-bam test/samples/ss.subsample005.bam \
+    --vector-fasta test/samples/ss.construct.fasta \
+    --annotation-bed test/samples/ss.annotation.bed \
+    --itr-label ITR \
+    --output-dir test_runner_results/ss \
+    --sample-id ss
+```
 
 ## For developers
 
@@ -85,9 +107,10 @@ This opens a Bash shell with the scripts in the PATH, and the original working directory
 
 ### Option 2: Conda
 
 The conda (or mamba) channels and dependencies are in the configuration files
-`laava.conda_env.yml` and `laava_dev.conda_env.yml`. These two environments are similar,
-and both will work for running LAAVA itself, but `_dev` includes some additional
-developer tools.
+`laava.conda_env.yml`, `laava_dev.conda_env.yml`, and `laava_minimal.conda_env.yml`.
+The first two environments are similar and both will work for running LAAVA itself
+(with `_dev` including additional developer tools), while `_minimal` provides just
+the core components needed for the Python runner.
 
 First, install conda via Miniconda or Anaconda.
Then, for example, suppose you have [anaconda](https://docs.anaconda.com/anaconda/install/linux/) installed and the binary diff --git a/laava_minimal.conda_env.yml b/laava_minimal.conda_env.yml new file mode 100644 index 0000000..563ca82 --- /dev/null +++ b/laava_minimal.conda_env.yml @@ -0,0 +1,13 @@ +name: laava_minimal +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python>=3.7.6 + - biopython + - pandas + - pysam + - pytest + - minimap2 + - samtools diff --git a/pyproject.toml b/pyproject.toml index 1dccde4..0e919d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,27 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "laava" +version = "0.1.0" +description = "Long-read AAV Analysis Pipeline" +readme = "README.md" +requires-python = ">=3.9" +license = { file = "LICENSE.md" } +authors = [ + { name = "Form Biosciences" } +] +dependencies = [ + "numpy", + "pandas", + "pysam", + "rmarkdown", +] + +[project.scripts] +laava-run = "src.laava_run:main" + [tool.ruff] # Minimum version to assume for error checking target-version = "py39" diff --git a/src/__init__.py b/src/__init__.py index e69de29..8540692 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -0,0 +1 @@ +"""LAAVA - Long-read AAV Analysis Pipeline.""" diff --git a/src/laava_run.py b/src/laava_run.py new file mode 100644 index 0000000..dc69675 --- /dev/null +++ b/src/laava_run.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +""" +LAAVA Runner Script - Unified command line interface for LAAVA analysis pipeline. +""" +import argparse +import logging +import os +import sys +from pathlib import Path +from typing import Optional +import subprocess + +# Add src directory to Python path +src_dir = str(Path(__file__).parent.parent) +if src_dir not in sys.path: + sys.path.insert(0, src_dir) + +# Import the modules +import importlib.util +get_reference_names = importlib.import_module("src.get_reference_names") +guess_vector_type_length = importlib.import_module("src.guess_vector_type_length") +summarize_alignment = importlib.import_module("src.summarize_alignment") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="LAAVA Analysis Pipeline Runner", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + # Required arguments + parser.add_argument( + "--input-bam", + required=True, + help="Input BAM file containing sequencing reads" + ) + parser.add_argument( + "--vector-fasta", + required=True, + help="Vector reference sequence in FASTA format" + ) + parser.add_argument( + "--annotation-bed", + required=True, + help="Vector annotation in BED format" + ) + parser.add_argument( + "--itr-label", + required=True, + help="Label used for ITR in annotation" + ) + + # Optional arguments + parser.add_argument( + "--packaging-fasta", + help="Packaging sequence reference in FASTA format" + ) + parser.add_argument( + "--host-fasta", + help="Host genome reference in FASTA format" + ) + parser.add_argument( + "--output-dir", + default="./laava_results", + help="Output directory for analysis results" + ) + parser.add_argument( + "--sample-id", + default="sample", + help="Sample identifier" + ) + parser.add_argument( + "--sample-name", + help="Human-readable sample name" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable 
verbose logging" + ) + + return parser.parse_args() + +def setup_output_directory(output_dir: str) -> Path: + """Create and validate output directory.""" + out_path = Path(output_dir) + out_path.mkdir(parents=True, exist_ok=True) + return out_path + +def run_pipeline(args: argparse.Namespace) -> int: + """Execute the LAAVA analysis pipeline.""" + try: + # Setup output directory + output_dir = setup_output_directory(args.output_dir) + + # Step 1: Analyze vector type + logger.info("Analyzing vector type...") + cassette_len = guess_vector_type_length.length_from_annotation( + args.annotation_bed, + [args.itr_label] + ) + logger.info(f"Expression cassette length is {cassette_len}") + vector_type = "sc" if cassette_len <= guess_vector_type_length.SC_MAX_THRESHOLD else "ss" + logger.info(f"Determined vector type: {vector_type}") + + # Step 2: Generate reference names + logger.info("Generating reference names...") + ref_names_file = output_dir / f"{args.sample_id}.reference_names.tsv" + ref_names_args = argparse.Namespace( + vector=args.vector_fasta, + packaging=args.packaging_fasta, + host=args.host_fasta, + output=str(ref_names_file), + repcap_name=None, + helper_name=None, + lambda_name=None + ) + get_reference_names._main(ref_names_args) + + # Step 3: Map reads + logger.info("Mapping reads...") + + # Set up environment for the shell script + env = os.environ.copy() + env["PATH"] = f"{Path(__file__).parent}:{env['PATH']}" # Add src dir to PATH for Python scripts + + # Get number of CPUs for parallel processing + try: + import multiprocessing + threads = str(multiprocessing.cpu_count()) + except: + threads = "1" + env["THREADS"] = threads + + # Prepare command + map_script = Path(__file__).parent / "map_reads.sh" + if not map_script.exists(): + raise FileNotFoundError(f"Map reads script not found: {map_script}") + + # Build command with proper working directory + cmd = [ + "bash", + str(map_script), + args.sample_id, + str(Path(args.input_bam).absolute()), + str(Path(args.vector_fasta).absolute()) + ] + if args.packaging_fasta: + cmd.append(str(Path(args.packaging_fasta).absolute())) + if args.host_fasta: + cmd.append(str(Path(args.host_fasta).absolute())) + + # Run mapping command in output directory + logger.debug(f"Running command: {' '.join(cmd)}") + try: + subprocess.run( + cmd, + check=True, + cwd=output_dir, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True + ) + except subprocess.CalledProcessError as e: + logger.error("Map reads command failed:") + logger.error(f"stdout:\n{e.stdout}") + logger.error(f"stderr:\n{e.stderr}") + raise + + # Verify output file exists + mapped_reads = output_dir / f"{args.sample_id}.sort_by_name.sam" + if not mapped_reads.exists(): + raise FileNotFoundError(f"Expected mapped reads file not found: {mapped_reads}") + + # Step 4: Analyze alignments and generate report + logger.info("Analyzing alignments and generating report...") + analysis_args = argparse.Namespace( + sam_filename=str(mapped_reads), + annotation_bed=args.annotation_bed, + reference_names=str(ref_names_file), + itr_labels=[args.itr_label], + output_prefix=str(output_dir / args.sample_id), + sample_id=args.sample_id, + vector_type=vector_type, + cpus=1, + debug=args.verbose + ) + summarize_alignment.main(analysis_args) + + logger.info(f"Analysis complete. 
Results available in: {output_dir}")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Pipeline failed: {str(e)}")
+        if args.verbose:
+            logger.exception("Detailed error traceback:")
+        return 1
+
+def main():
+    """Main entry point for the LAAVA runner script."""
+    args = parse_args()
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    sys.exit(run_pipeline(args))
+
+if __name__ == "__main__":
+    main()
diff --git a/src/map_reads.sh b/src/map_reads.sh
index 19a3758..ea84503 100755
--- a/src/map_reads.sh
+++ b/src/map_reads.sh
@@ -15,13 +15,7 @@ lambda_name=$8
 
 ls -Alh
 
-get_reference_names.py "${vector_fa}" \
-    ${packaging_fa:+--packaging "$packaging_fa"} \
-    ${host_fa:+--host "$host_fa"} \
-    ${repcap_name:+--repcap-name "$repcap_name"} \
-    ${helper_name:+--helper-name "$helper_name"} \
-    ${lambda_name:+--lambda-name "$lambda_name"} \
-    -o "${sample_id}.reference_names.tsv"
+# Skip reference names generation since it's handled by the Python runner
 
 ls -Alh
 
@@ -49,17 +43,17 @@ else
     reads_fn="$reads"
 fi
 
-threads=$(nproc)
-minimap2 --eqx -a --secondary=no -t $threads all_refs.fa "$reads_fn" > tmp.mapped.sam
+threads=${THREADS:-1}  # Use the THREADS env var set by the Python runner; default to 1
+minimap2 --eqx -a --secondary=no -t "$threads" all_refs.fa "$reads_fn" > tmp.mapped.sam
 
 # Sort the mapped reads by name
 name_sam="$sample_id.sort_by_name.sam"
-samtools sort -@ $threads -n -O SAM -o "$name_sam" tmp.mapped.sam
+samtools sort -@ "$threads" -n -O SAM -o "$name_sam" tmp.mapped.sam
 
 # Make a position-sorted BAM output file for other downstream consumers
 pos_bam="$sample_id.sort_by_pos.bam"
 # Drop unmapped reads
-samtools view -@ $threads --fast -o tmp.sorted.bam tmp.mapped.sam
-samtools sort -@ $threads -o "$pos_bam" tmp.sorted.bam
+samtools view -@ "$threads" --fast -o tmp.sorted.bam tmp.mapped.sam
+samtools sort -@ "$threads" -o "$pos_bam" tmp.sorted.bam
 samtools index "$pos_bam"
 
 # Logging
diff --git a/src/summarize_alignment.py b/src/summarize_alignment.py
index 7729c64..8264903 100755
--- a/src/summarize_alignment.py
+++ b/src/summarize_alignment.py
@@ -50,8 +50,17 @@ def subset_sam_by_readname_list(
     exclude_type=False,
 ):
     qname_lookup = {}  # qname --> (a_type, a_subtype)
-    with gzip.open(per_read_tsv, "rt") as per_read_f:
-        for row in csv.DictReader(per_read_f, delimiter="\t"):
+    # Try to open as gzipped first; if that fails, open as a regular file
+    try:
+        f = gzip.open(per_read_tsv, "rt")
+        # Test if it's actually gzipped by reading a bit
+        f.read(1)
+        f.seek(0)
+    except (OSError, gzip.BadGzipFile):
+        f = open(per_read_tsv, "rt")
+
+    with f:
+        for row in csv.DictReader(f, delimiter="\t"):
             # pdb.set_trace()
             if (
                 wanted_types is None

From 343b66944b4a808a668d36ad758e65a5b8e75dfa Mon Sep 17 00:00:00 2001
From: Eric Talevich <52723+etal@users.noreply.github.com>
Date: Mon, 3 Feb 2025 14:31:52 -0800
Subject: [PATCH 2/3] laava_run: Applied ruff check and ruff format

---
 src/laava_run.py | 118 +++++++++++++++++++++--------------------------
 1 file changed, 53 insertions(+), 65 deletions(-)
 mode change 100644 => 100755 src/laava_run.py

diff --git a/src/laava_run.py b/src/laava_run.py
old mode 100644
new mode 100755
index dc69675..815635e
--- a/src/laava_run.py
+++ b/src/laava_run.py
@@ -1,14 +1,12 @@
 #!/usr/bin/env python3
-"""
-LAAVA Runner Script - Unified command line interface for LAAVA analysis pipeline.
-""" +"""LAAVA Runner Script - Unified command line interface for LAAVA analysis pipeline.""" + import argparse import logging import os +import subprocess import sys from pathlib import Path -from typing import Optional -import subprocess # Add src directory to Python path src_dir = str(Path(__file__).parent.parent) @@ -17,98 +15,81 @@ # Import the modules import importlib.util + get_reference_names = importlib.import_module("src.get_reference_names") guess_vector_type_length = importlib.import_module("src.guess_vector_type_length") summarize_alignment = importlib.import_module("src.summarize_alignment") logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s" + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) logger = logging.getLogger(__name__) + def parse_args() -> argparse.Namespace: """Parse command line arguments.""" parser = argparse.ArgumentParser( description="LAAVA Analysis Pipeline Runner", - formatter_class=argparse.ArgumentDefaultsHelpFormatter + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - + # Required arguments parser.add_argument( - "--input-bam", - required=True, - help="Input BAM file containing sequencing reads" + "--input-bam", required=True, help="Input BAM file containing sequencing reads" ) parser.add_argument( "--vector-fasta", required=True, - help="Vector reference sequence in FASTA format" + help="Vector reference sequence in FASTA format", ) parser.add_argument( - "--annotation-bed", - required=True, - help="Vector annotation in BED format" + "--annotation-bed", required=True, help="Vector annotation in BED format" ) parser.add_argument( - "--itr-label", - required=True, - help="Label used for ITR in annotation" + "--itr-label", required=True, help="Label used for ITR in annotation" ) - + # Optional arguments parser.add_argument( - "--packaging-fasta", - help="Packaging sequence reference in FASTA format" - ) - parser.add_argument( - "--host-fasta", - help="Host genome reference in FASTA format" + "--packaging-fasta", help="Packaging sequence reference in FASTA format" ) + parser.add_argument("--host-fasta", help="Host genome reference in FASTA format") parser.add_argument( "--output-dir", default="./laava_results", - help="Output directory for analysis results" - ) - parser.add_argument( - "--sample-id", - default="sample", - help="Sample identifier" - ) - parser.add_argument( - "--sample-name", - help="Human-readable sample name" - ) - parser.add_argument( - "--verbose", - action="store_true", - help="Enable verbose logging" + help="Output directory for analysis results", ) - + parser.add_argument("--sample-id", default="sample", help="Sample identifier") + parser.add_argument("--sample-name", help="Human-readable sample name") + parser.add_argument("--verbose", action="store_true", help="Enable verbose logging") + return parser.parse_args() + def setup_output_directory(output_dir: str) -> Path: """Create and validate output directory.""" out_path = Path(output_dir) out_path.mkdir(parents=True, exist_ok=True) return out_path + def run_pipeline(args: argparse.Namespace) -> int: """Execute the LAAVA analysis pipeline.""" try: # Setup output directory output_dir = setup_output_directory(args.output_dir) - + # Step 1: Analyze vector type logger.info("Analyzing vector type...") cassette_len = guess_vector_type_length.length_from_annotation( - args.annotation_bed, - [args.itr_label] + args.annotation_bed, [args.itr_label] ) logger.info(f"Expression cassette length is {cassette_len}") - 
vector_type = "sc" if cassette_len <= guess_vector_type_length.SC_MAX_THRESHOLD else "ss" + vector_type = ( + "sc" if cassette_len <= guess_vector_type_length.SC_MAX_THRESHOLD else "ss" + ) logger.info(f"Determined vector type: {vector_type}") - + # Step 2: Generate reference names logger.info("Generating reference names...") ref_names_file = output_dir / f"{args.sample_id}.reference_names.tsv" @@ -119,43 +100,46 @@ def run_pipeline(args: argparse.Namespace) -> int: output=str(ref_names_file), repcap_name=None, helper_name=None, - lambda_name=None + lambda_name=None, ) get_reference_names._main(ref_names_args) - + # Step 3: Map reads logger.info("Mapping reads...") - + # Set up environment for the shell script env = os.environ.copy() - env["PATH"] = f"{Path(__file__).parent}:{env['PATH']}" # Add src dir to PATH for Python scripts - + env["PATH"] = ( + f"{Path(__file__).parent}:{env['PATH']}" # Add src dir to PATH for Python scripts + ) + # Get number of CPUs for parallel processing try: import multiprocessing + threads = str(multiprocessing.cpu_count()) except: threads = "1" env["THREADS"] = threads - + # Prepare command map_script = Path(__file__).parent / "map_reads.sh" if not map_script.exists(): raise FileNotFoundError(f"Map reads script not found: {map_script}") - + # Build command with proper working directory cmd = [ "bash", str(map_script), args.sample_id, str(Path(args.input_bam).absolute()), - str(Path(args.vector_fasta).absolute()) + str(Path(args.vector_fasta).absolute()), ] if args.packaging_fasta: cmd.append(str(Path(args.packaging_fasta).absolute())) if args.host_fasta: cmd.append(str(Path(args.host_fasta).absolute())) - + # Run mapping command in output directory logger.debug(f"Running command: {' '.join(cmd)}") try: @@ -166,19 +150,21 @@ def run_pipeline(args: argparse.Namespace) -> int: env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - universal_newlines=True + text=True, ) except subprocess.CalledProcessError as e: logger.error("Map reads command failed:") logger.error(f"stdout:\n{e.stdout}") logger.error(f"stderr:\n{e.stderr}") raise - + # Verify output file exists mapped_reads = output_dir / f"{args.sample_id}.sort_by_name.sam" if not mapped_reads.exists(): - raise FileNotFoundError(f"Expected mapped reads file not found: {mapped_reads}") - + raise FileNotFoundError( + f"Expected mapped reads file not found: {mapped_reads}" + ) + # Step 4: Analyze alignments and generate report logger.info("Analyzing alignments and generating report...") analysis_args = argparse.Namespace( @@ -190,26 +176,28 @@ def run_pipeline(args: argparse.Namespace) -> int: sample_id=args.sample_id, vector_type=vector_type, cpus=1, - debug=args.verbose + debug=args.verbose, ) summarize_alignment.main(analysis_args) - + logger.info(f"Analysis complete. 
Results available in: {output_dir}") return 0 - + except Exception as e: - logger.error(f"Pipeline failed: {str(e)}") + logger.error(f"Pipeline failed: {e!s}") if args.verbose: logger.exception("Detailed error traceback:") return 1 + def main(): """Main entry point for the LAAVA runner script.""" args = parse_args() if args.verbose: logging.getLogger().setLevel(logging.DEBUG) - + sys.exit(run_pipeline(args)) + if __name__ == "__main__": main() From b46f202ddd620983e0b63c53d9d0cea23c6d7348 Mon Sep 17 00:00:00 2001 From: Eric Talevich <52723+etal@users.noreply.github.com> Date: Mon, 3 Feb 2025 17:24:57 -0800 Subject: [PATCH 3/3] laava_run: Address more linting issues --- src/laava_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/laava_run.py b/src/laava_run.py index 815635e..9783f53 100755 --- a/src/laava_run.py +++ b/src/laava_run.py @@ -14,7 +14,7 @@ sys.path.insert(0, src_dir) # Import the modules -import importlib.util +import importlib.util # noqa: E402 get_reference_names = importlib.import_module("src.get_reference_names") guess_vector_type_length = importlib.import_module("src.guess_vector_type_length") @@ -118,7 +118,7 @@ def run_pipeline(args: argparse.Namespace) -> int: import multiprocessing threads = str(multiprocessing.cpu_count()) - except: + except (NotImplementedError, ValueError): threads = "1" env["THREADS"] = threads
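
To try this patch series locally, the pieces above compose into a short session. A sketch, assuming the test files ship at the paths referenced in the README example and using the environment name from the YAML's `name:` field:

```bash
# Create and activate the minimal environment added in this series
conda env create -f laava_minimal.conda_env.yml
conda activate laava_minimal

# Run the unified runner on the bundled single-stranded test sample
python src/laava_run.py \
    --input-bam test/samples/ss.subsample005.bam \
    --vector-fasta test/samples/ss.construct.fasta \
    --annotation-bed test/samples/ss.annotation.bed \
    --itr-label ITR \
    --output-dir test_runner_results/ss \
    --sample-id ss \
    --verbose
```

The `--verbose` flag switches the runner's root logger to DEBUG level, per the `main()` entry point above.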
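The vector-type step relies on `guess_vector_type_length.length_from_annotation()` and `SC_MAX_THRESHOLD`, neither of which appears in this diff: the runner derives the expression cassette length from the BED annotations (presumably the ITR-delimited span) and calls the vector self-complementary ("sc") when that length is at or below the threshold, single-stranded ("ss") otherwise. A rough illustration of that idea follows; the function body and the 2300 bp cutoff are stand-ins for this sketch, not LAAVA's actual implementation:

```python
import csv

SC_MAX_THRESHOLD = 2300  # illustrative cutoff in bp; LAAVA defines its own value


def cassette_length_from_bed(bed_path, itr_labels):
    """Span from the start of the first ITR to the end of the last ITR."""
    starts, ends = [], []
    with open(bed_path) as bed:
        for row in csv.reader(bed, delimiter="\t"):
            # BED columns: chrom, start, end, name, ...
            if len(row) >= 4 and row[3] in itr_labels:
                starts.append(int(row[1]))
                ends.append(int(row[2]))
    if not starts:
        raise ValueError(f"No ITR annotations labeled {itr_labels} in {bed_path}")
    return max(ends) - min(starts)


cassette_len = cassette_length_from_bed("test/samples/ss.annotation.bed", {"ITR"})
vector_type = "sc" if cassette_len <= SC_MAX_THRESHOLD else "ss"
```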
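On the gzip change in `summarize_alignment.py`: `gzip.BadGzipFile` only exists on Python 3.8+, while `laava_minimal.conda_env.yml` allows `python>=3.7.6` (and `pyproject.toml` in this same series requires >=3.9), so those pins are worth reconciling. An alternative that sidesteps the exception handling is to sniff the two-byte gzip magic number before choosing an opener. A minimal sketch; the helper name `open_maybe_gzip` is this note's invention, not part of LAAVA:

```python
import gzip

GZIP_MAGIC = b"\x1f\x8b"  # the first two bytes of every gzip stream


def open_maybe_gzip(path):
    """Open a text file for reading, transparently handling gzip compression."""
    with open(path, "rb") as probe:
        is_gzipped = probe.read(2) == GZIP_MAGIC
    if is_gzipped:
        return gzip.open(path, "rt")
    return open(path, "rt")
```

This variant also guarantees the probe handle is closed, whereas the try/except version leaves the half-opened `gzip` handle unclosed when it falls back to plain `open()`.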