diff --git a/README.md b/README.md
index 8e2c895..d44d911 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,28 @@ Then run it interactively in your current directory:
 docker run -v $(pwd):/data -w /data -it ghcr.io/formbio/laava:latest bash
 ```
 
+### Python Runner
+
+LAAVA now includes a unified Python-based runner that orchestrates the entire analysis pipeline:
+
+```bash
+python src/laava_run.py --input-bam <reads.bam> --vector-fasta <vector.fasta> \
+  --annotation-bed <annotation.bed> --itr-label ITR
+```
+
+This runner coordinates vector type analysis, read mapping, and alignment analysis in a single command. A minimal conda environment (`laava_minimal.conda_env.yml`) is also available for running the core analysis components.
+
+Example using the test data:
+```bash
+# Using the included test samples
+python src/laava_run.py \
+  --input-bam test/samples/ss.subsample005.bam \
+  --vector-fasta test/samples/ss.construct.fasta \
+  --annotation-bed test/samples/ss.annotation.bed \
+  --itr-label ITR \
+  --output-dir test_runner_results/ss \
+  --sample-id ss
+```
 
 ## For developers
 
@@ -85,12 +107,17 @@ This opens a Bash shell with the scripts in the PATH, and the original working d
 ### Option 2: Conda
 
-The conda channels and dependencies are in the configuration file
-`conda_env.yml`. With this environment and a LaTeX installation (via e.g. apt),
+The conda (or mamba) channels and dependencies are in the configuration files
+`laava.conda_env.yml`, `laava_dev.conda_env.yml`, and `laava_minimal.conda_env.yml`.
+The first two environments are similar, and either will work for running LAAVA itself
+(with `_dev` including additional developer tools), while `_minimal` provides just
+the core components needed for the Python runner.
+
+With one of these environments and a LaTeX installation (via e.g. apt),
 you'll have all the dependencies you need to run LAAVA scripts directly on
 Linux, and *nearly* everything you need on Mac. The same environment is also
 used by the Docker container images internally.
 
 First, install conda via [Miniconda or Anaconda](https://www.anaconda.com/download/success).
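For context, here is a sketch of how the minimal environment pairs with the runner. The environment name comes from `laava_minimal.conda_env.yml` (added below), and the test file paths are the same ones used in the README example above; the conda commands are standard.

```bash
# Sketch: create and activate the minimal environment, then run the bundled
# single-stranded test sample.
conda env create -f laava_minimal.conda_env.yml
conda activate laava_minimal
python src/laava_run.py \
  --input-bam test/samples/ss.subsample005.bam \
  --vector-fasta test/samples/ss.construct.fasta \
  --annotation-bed test/samples/ss.annotation.bed \
  --itr-label ITR
```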
diff --git a/laava_minimal.conda_env.yml b/laava_minimal.conda_env.yml
new file mode 100644
index 0000000..563ca82
--- /dev/null
+++ b/laava_minimal.conda_env.yml
@@ -0,0 +1,13 @@
+name: laava_minimal
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - python>=3.9  # matches requires-python in pyproject.toml; gzip.BadGzipFile needs 3.8+
+  - biopython
+  - pandas
+  - pysam
+  - pytest
+  - minimap2
+  - samtools
diff --git a/pyproject.toml b/pyproject.toml
index 1dccde4..0e919d2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,27 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "laava"
+version = "0.1.0"
+description = "Long-read AAV Analysis Pipeline"
+readme = "README.md"
+requires-python = ">=3.9"
+license = { file = "LICENSE.md" }
+authors = [
+    { name = "Form Biosciences" }
+]
+dependencies = [
+    "numpy",
+    "pandas",
+    "pysam",
+    # Report rendering uses R's rmarkdown (installed via conda), not a PyPI package
+]
+
+[project.scripts]
+laava-run = "src.laava_run:main"
+
 [tool.ruff]
 # Minimum version to assume for error checking
 target-version = "py39"
diff --git a/src/__init__.py b/src/__init__.py
index e69de29..8540692 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -0,0 +1 @@
+"""LAAVA - Long-read AAV Analysis Pipeline."""
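If the hatchling packaging above installs as intended (untested here; it depends on the code shipping as an importable `src` package, which `src/__init__.py` makes possible), the `laava-run` entry point would mirror the script invocation:

```bash
# Assumes a working editable install of the pyproject.toml above
pip install -e .
laava-run --help  # equivalent to: python src/laava_run.py --help
```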
diff --git a/src/laava_run.py b/src/laava_run.py
new file mode 100755
index 0000000..9783f53
--- /dev/null
+++ b/src/laava_run.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""LAAVA Runner Script - Unified command line interface for LAAVA analysis pipeline."""
+
+import argparse
+import logging
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+# Add the repository root to the Python path so `src.*` modules resolve
+src_dir = str(Path(__file__).parent.parent)
+if src_dir not in sys.path:
+    sys.path.insert(0, src_dir)
+
+# Import the pipeline modules (deferred so the path tweak above applies)
+import importlib  # noqa: E402
+
+get_reference_names = importlib.import_module("src.get_reference_names")
+guess_vector_type_length = importlib.import_module("src.guess_vector_type_length")
+summarize_alignment = importlib.import_module("src.summarize_alignment")
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="LAAVA Analysis Pipeline Runner",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # Required arguments
+    parser.add_argument(
+        "--input-bam", required=True, help="Input BAM file containing sequencing reads"
+    )
+    parser.add_argument(
+        "--vector-fasta",
+        required=True,
+        help="Vector reference sequence in FASTA format",
+    )
+    parser.add_argument(
+        "--annotation-bed", required=True, help="Vector annotation in BED format"
+    )
+    parser.add_argument(
+        "--itr-label", required=True, help="Label used for ITR in annotation"
+    )
+
+    # Optional arguments
+    parser.add_argument(
+        "--packaging-fasta", help="Packaging sequence reference in FASTA format"
+    )
+    parser.add_argument("--host-fasta", help="Host genome reference in FASTA format")
+    parser.add_argument(
+        "--output-dir",
+        default="./laava_results",
+        help="Output directory for analysis results",
+    )
+    parser.add_argument("--sample-id", default="sample", help="Sample identifier")
+    parser.add_argument("--sample-name", help="Human-readable sample name")
+    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+
+    return parser.parse_args()
+
+
+def setup_output_directory(output_dir: str) -> Path:
+    """Create and validate output directory."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+    return out_path
+
+
+def run_pipeline(args: argparse.Namespace) -> int:
+    """Execute the LAAVA analysis pipeline."""
+    try:
+        # Setup output directory
+        output_dir = setup_output_directory(args.output_dir)
+
+        # Step 1: Analyze vector type
+        logger.info("Analyzing vector type...")
+        cassette_len = guess_vector_type_length.length_from_annotation(
+            args.annotation_bed, [args.itr_label]
+        )
+        logger.info(f"Expression cassette length is {cassette_len}")
+        vector_type = (
+            "sc" if cassette_len <= guess_vector_type_length.SC_MAX_THRESHOLD else "ss"
+        )
+        logger.info(f"Determined vector type: {vector_type}")
+
+        # Step 2: Generate reference names
+        logger.info("Generating reference names...")
+        ref_names_file = output_dir / f"{args.sample_id}.reference_names.tsv"
+        ref_names_args = argparse.Namespace(
+            vector=args.vector_fasta,
+            packaging=args.packaging_fasta,
+            host=args.host_fasta,
+            output=str(ref_names_file),
+            repcap_name=None,
+            helper_name=None,
+            lambda_name=None,
+        )
+        get_reference_names._main(ref_names_args)
+
+        # Step 3: Map reads
+        logger.info("Mapping reads...")
+
+        # Set up environment for the shell script
+        env = os.environ.copy()
+        env["PATH"] = (
+            f"{Path(__file__).parent}:{env['PATH']}"  # add src dir to PATH for the helper scripts
+        )
+
+        # Get number of CPUs for parallel processing
+        try:
+            import multiprocessing
+
+            threads = str(multiprocessing.cpu_count())
+        except (NotImplementedError, ValueError):
+            threads = "1"
+        env["LAAVA_THREADS"] = threads  # consumed by map_reads.sh
+
+        # Prepare command
+        map_script = Path(__file__).parent / "map_reads.sh"
+        if not map_script.exists():
+            raise FileNotFoundError(f"Map reads script not found: {map_script}")
+
+        # Build command with proper working directory
+        cmd = [
+            "bash",
+            str(map_script),
+            args.sample_id,
+            str(Path(args.input_bam).absolute()),
+            str(Path(args.vector_fasta).absolute()),
+        ]
+        if args.packaging_fasta:
+            cmd.append(str(Path(args.packaging_fasta).absolute()))
+        if args.host_fasta:
+            cmd.append(str(Path(args.host_fasta).absolute()))
+
+        # Run mapping command in output directory
+        logger.debug(f"Running command: {' '.join(cmd)}")
+        try:
+            subprocess.run(
+                cmd,
+                check=True,
+                cwd=output_dir,
+                env=env,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+        except subprocess.CalledProcessError as e:
+            logger.error("Map reads command failed:")
+            logger.error(f"stdout:\n{e.stdout}")
+            logger.error(f"stderr:\n{e.stderr}")
+            raise
+
+        # Verify output file exists
+        mapped_reads = output_dir / f"{args.sample_id}.sort_by_name.sam"
+        if not mapped_reads.exists():
+            raise FileNotFoundError(
+                f"Expected mapped reads file not found: {mapped_reads}"
+            )
+
+        # Step 4: Analyze alignments and generate report
+        logger.info("Analyzing alignments and generating report...")
+        analysis_args = argparse.Namespace(
+            sam_filename=str(mapped_reads),
+            annotation_bed=args.annotation_bed,
+            reference_names=str(ref_names_file),
+            itr_labels=[args.itr_label],
+            output_prefix=str(output_dir / args.sample_id),
+            sample_id=args.sample_id,
+            vector_type=vector_type,
+            cpus=1,
+            debug=args.verbose,
+        )
+        summarize_alignment.main(analysis_args)
+
+        logger.info(f"Analysis complete. Results available in: {output_dir}")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Pipeline failed: {e!s}")
+        if args.verbose:
+            logger.exception("Detailed error traceback:")
+        return 1
+
+
+def main():
+    """Main entry point for the LAAVA runner script."""
+    args = parse_args()
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    sys.exit(run_pipeline(args))
+
+
+if __name__ == "__main__":
+    main()
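To make the four pipeline steps concrete, these are the per-sample files the runner writes or checks along the way. A sketch only: names are taken from `laava_run.py` above and `map_reads.sh` below, and the final report outputs depend on `summarize_alignment.py`, which is only partially shown in this diff.

```bash
# After: python src/laava_run.py ... --output-dir results/s1 --sample-id s1
ls results/s1/
# s1.reference_names.tsv    written in step 2 (get_reference_names)
# s1.sort_by_name.sam       written by map_reads.sh; existence checked in step 3
# s1.sort_by_name.bam       name-sorted BAM, also from map_reads.sh
# s1.sort_by_pos.bam(.bai)  position-sorted BAM for other downstream consumers
# s1.*                      summarize_alignment outputs under the s1 prefix (step 4)
```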
diff --git a/src/map_reads.sh b/src/map_reads.sh
index b04813c..342ca9a 100755
--- a/src/map_reads.sh
+++ b/src/map_reads.sh
@@ -15,13 +15,7 @@
 lambda_name=$8
 
 ls -Alh
 
-get_reference_names.py "${vector_fa}" \
-    ${packaging_fa:+--packaging "$packaging_fa"} \
-    ${host_fa:+--host "$host_fa"} \
-    ${repcap_name:+--repcap-name "$repcap_name"} \
-    ${helper_name:+--helper-name "$helper_name"} \
-    ${lambda_name:+--lambda-name "$lambda_name"} \
-    -o "${sample_id}.reference_names.tsv"
+# Reference name generation now happens upstream, in the Python runner (laava_run.py)
 
 ls -Alh
 
@@ -49,17 +43,22 @@ else
     reads_fn="$reads"
 fi
 
-threads=$(nproc)
-minimap2 --eqx -a --secondary=no -t $threads all_refs.fa "$reads_fn" > tmp.mapped.sam
+threads=${LAAVA_THREADS:-$(nproc)}  # Use LAAVA_THREADS env var if set, otherwise default to nproc
+minimap2 --eqx -a --secondary=no -t "$threads" all_refs.fa "$reads_fn" > tmp.mapped.sam
 
 # Sort the mapped reads by name
+
+name_sam="$sample_id.sort_by_name.sam"
+samtools sort -@ "$threads" -n -O SAM -o "$name_sam" tmp.mapped.sam
+
 name_bam="$sample_id.sort_by_name.bam"
 samtools sort -@ $threads -n -O BAM -o "$name_bam" tmp.mapped.sam
+
 # Make a position-sorted BAM output file for other downstream consumers
 pos_bam="$sample_id.sort_by_pos.bam"
 # Drop unmapped reads
-samtools view -@ $threads --fast -o tmp.sorted.bam tmp.mapped.sam
-samtools sort -@ $threads -o "$pos_bam" tmp.sorted.bam
+samtools view -@ "$threads" --fast -o tmp.sorted.bam tmp.mapped.sam
+samtools sort -@ "$threads" -o "$pos_bam" tmp.sorted.bam
 samtools index "$pos_bam"
 # Logging
diff --git a/src/summarize_alignment.py b/src/summarize_alignment.py
index e03a8ae..912a147 100755
--- a/src/summarize_alignment.py
+++ b/src/summarize_alignment.py
@@ -52,8 +52,17 @@ def subset_sam_by_readname_list(
     exclude_type=False,
 ):
     qname_lookup = {}  # qname --> (a_type, a_subtype)
-    with gzip.open(per_read_tsv, "rt") as per_read_f:
-        for row in csv.DictReader(per_read_f, delimiter="\t"):
+    # Try to open as gzip first; if that fails, fall back to plain text
+    try:
+        f = gzip.open(per_read_tsv, "rt")
+        # Reading a byte forces gzip to validate the header
+        f.read(1)
+        f.seek(0)
+    except (OSError, gzip.BadGzipFile):
+        f = open(per_read_tsv, "rt")
+
+    with f:
+        for row in csv.DictReader(f, delimiter="\t"):
             # pdb.set_trace()
             if (
                 wanted_types is None
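With the `map_reads.sh` change above, thread count is controlled by the `LAAVA_THREADS` environment variable instead of being hard-coded to `nproc`. A sketch of a standalone invocation, with positional arguments mirroring what `laava_run.py` passes (sample ID, reads BAM, vector FASTA); the file names here are illustrative:

```bash
# Cap minimap2/samtools at 8 threads for a direct run of the mapping script
LAAVA_THREADS=8 bash src/map_reads.sh s1 reads.bam vector.fa
# The runner itself exports LAAVA_THREADS=<detected CPU count> before calling it
```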