30 changes: 30 additions & 0 deletions README.md
@@ -49,6 +49,28 @@ Then run it interactively in your current directory:
docker run -v $(pwd):/data -w /data -it ghcr.io/formbio/laava:latest bash
```

### Python Runner

LAAVA now includes a unified Python-based runner that orchestrates the entire analysis pipeline:

```bash
python src/laava_run.py --input-bam <input.bam> --vector-fasta <vector.fasta> \
--annotation-bed <annotation.bed> --itr-label ITR
```

This runner coordinates vector type analysis, read mapping, and alignment analysis in a single command. A minimal conda environment (`laava_minimal.conda_env.yml`) is also available for running the core analysis components.

Example using the test data:
```bash
# Using the included test samples
python src/laava_run.py \
--input-bam test/samples/ss.subsample005.bam \
--vector-fasta test/samples/ss.construct.fasta \
--annotation-bed test/samples/ss.annotation.bed \
--itr-label ITR \
--output-dir test_runner_results/ss \
--sample-id ss
```
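
Based on the runner code in this change, the output directory collects the per-sample intermediates from each step. A hypothetical listing for the test invocation above (file names are derived from `--sample-id`; the exact set of files may vary):

```bash
# Hypothetical listing; names follow the --sample-id argument ("ss" here)
ls test_runner_results/ss/
# ss.reference_names.tsv  ss.sort_by_name.sam  ss.sort_by_pos.bam  ...
```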

## For developers

@@ -85,12 +107,20 @@ This opens a Bash shell with the scripts in the PATH, and the original working d

### Option 2: Conda


The conda (or mamba) channels and dependencies are in the configuration files
`laava.conda_env.yml`, `laava_dev.conda_env.yml`, and `laava_minimal.conda_env.yml`.
The first two environments are similar and both will work for running LAAVA itself
(with `_dev` including additional developer tools), while `_minimal` provides just
the core components needed for the Python runner.
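
For example, a minimal sketch of creating and activating the `_minimal` environment (the environment name `laava_minimal` comes from that YAML file; `mamba` can be used in place of `conda`):

```bash
# Create the minimal environment and activate it
conda env create -f laava_minimal.conda_env.yml
conda activate laava_minimal
```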

The conda channels and dependencies are in the configuration file `conda_env.yml`.
With this environment and a LaTeX installation (via e.g. apt), you'll have all the
dependencies you need to run LAAVA scripts directly on Linux, and *nearly* everything
you need on Mac.
The same environment is also used by the Docker container images internally.


First, install conda via [Miniconda or
Anaconda](https://www.anaconda.com/download/success).

13 changes: 13 additions & 0 deletions laava_minimal.conda_env.yml
@@ -0,0 +1,13 @@
name: laava_minimal
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python>=3.7.6
- biopython
- pandas
- pysam
- pytest
- minimap2
- samtools
24 changes: 24 additions & 0 deletions pyproject.toml
@@ -1,3 +1,27 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "laava"
version = "0.1.0"
description = "Long-read AAV Analysis Pipeline"
readme = "README.md"
requires-python = ">=3.9"
license = { file = "LICENSE.md" }
authors = [
{ name = "Form Biosciences" }
]
dependencies = [
"numpy",
"pandas",
"pysam",
"rmarkdown",
]

[project.scripts]
laava-run = "src.laava_run:main"

[tool.ruff]
# Minimum version to assume for error checking
target-version = "py39"
1 change: 1 addition & 0 deletions src/__init__.py
@@ -0,0 +1 @@
"""LAAVA - Long-read AAV Analysis Pipeline."""
203 changes: 203 additions & 0 deletions src/laava_run.py
@@ -0,0 +1,203 @@
#!/usr/bin/env python3
"""LAAVA Runner Script - Unified command line interface for LAAVA analysis pipeline."""

import argparse
import logging
import os
import subprocess
import sys
from pathlib import Path

# Add the repository root to sys.path so the `src` package is importable
repo_root = str(Path(__file__).parent.parent)
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

# Import the pipeline step modules dynamically (after the sys.path setup above)
import importlib  # noqa: E402

get_reference_names = importlib.import_module("src.get_reference_names")
guess_vector_type_length = importlib.import_module("src.guess_vector_type_length")
summarize_alignment = importlib.import_module("src.summarize_alignment")

logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def parse_args() -> argparse.Namespace:
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="LAAVA Analysis Pipeline Runner",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# Required arguments
parser.add_argument(
"--input-bam", required=True, help="Input BAM file containing sequencing reads"
)
parser.add_argument(
"--vector-fasta",
required=True,
help="Vector reference sequence in FASTA format",
)
parser.add_argument(
"--annotation-bed", required=True, help="Vector annotation in BED format"
)
parser.add_argument(
"--itr-label", required=True, help="Label used for ITR in annotation"
)

# Optional arguments
parser.add_argument(
"--packaging-fasta", help="Packaging sequence reference in FASTA format"
)
parser.add_argument("--host-fasta", help="Host genome reference in FASTA format")
parser.add_argument(
"--output-dir",
default="./laava_results",
help="Output directory for analysis results",
)
parser.add_argument("--sample-id", default="sample", help="Sample identifier")
parser.add_argument("--sample-name", help="Human-readable sample name")
parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

return parser.parse_args()


def setup_output_directory(output_dir: str) -> Path:
"""Create and validate output directory."""
out_path = Path(output_dir)
out_path.mkdir(parents=True, exist_ok=True)
return out_path


def run_pipeline(args: argparse.Namespace) -> int:
"""Execute the LAAVA analysis pipeline."""
try:
# Setup output directory
output_dir = setup_output_directory(args.output_dir)

# Step 1: Analyze vector type
logger.info("Analyzing vector type...")
cassette_len = guess_vector_type_length.length_from_annotation(
args.annotation_bed, [args.itr_label]
)
logger.info(f"Expression cassette length is {cassette_len}")
vector_type = (
"sc" if cassette_len <= guess_vector_type_length.SC_MAX_THRESHOLD else "ss"
)
logger.info(f"Determined vector type: {vector_type}")

# Step 2: Generate reference names
logger.info("Generating reference names...")
ref_names_file = output_dir / f"{args.sample_id}.reference_names.tsv"
ref_names_args = argparse.Namespace(
vector=args.vector_fasta,
packaging=args.packaging_fasta,
host=args.host_fasta,
output=str(ref_names_file),
repcap_name=None,
helper_name=None,
lambda_name=None,
)
get_reference_names._main(ref_names_args)

# Step 3: Map reads
logger.info("Mapping reads...")

# Set up environment for the shell script
env = os.environ.copy()
env["PATH"] = (
f"{Path(__file__).parent}:{env['PATH']}" # Add src dir to PATH for Python scripts
)

# Get number of CPUs for parallel processing
try:
import multiprocessing

threads = str(multiprocessing.cpu_count())
except (NotImplementedError, ValueError):
threads = "1"
env["THREADS"] = threads

# Prepare command
map_script = Path(__file__).parent / "map_reads.sh"
if not map_script.exists():
raise FileNotFoundError(f"Map reads script not found: {map_script}")

# Build command with proper working directory
cmd = [
"bash",
str(map_script),
args.sample_id,
str(Path(args.input_bam).absolute()),
str(Path(args.vector_fasta).absolute()),
]
if args.packaging_fasta:
cmd.append(str(Path(args.packaging_fasta).absolute()))
if args.host_fasta:
cmd.append(str(Path(args.host_fasta).absolute()))

# Run mapping command in output directory
logger.debug(f"Running command: {' '.join(cmd)}")
try:
subprocess.run(
cmd,
check=True,
cwd=output_dir,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
except subprocess.CalledProcessError as e:
logger.error("Map reads command failed:")
logger.error(f"stdout:\n{e.stdout}")
logger.error(f"stderr:\n{e.stderr}")
raise

# Verify output file exists
mapped_reads = output_dir / f"{args.sample_id}.sort_by_name.sam"
if not mapped_reads.exists():
raise FileNotFoundError(
f"Expected mapped reads file not found: {mapped_reads}"
)

# Step 4: Analyze alignments and generate report
logger.info("Analyzing alignments and generating report...")
analysis_args = argparse.Namespace(
sam_filename=str(mapped_reads),
annotation_bed=args.annotation_bed,
reference_names=str(ref_names_file),
itr_labels=[args.itr_label],
output_prefix=str(output_dir / args.sample_id),
sample_id=args.sample_id,
vector_type=vector_type,
cpus=1,
debug=args.verbose,
)
summarize_alignment.main(analysis_args)

logger.info(f"Analysis complete. Results available in: {output_dir}")
return 0

except Exception as e:
logger.error(f"Pipeline failed: {e!s}")
if args.verbose:
logger.exception("Detailed error traceback:")
return 1


def main():
"""Main entry point for the LAAVA runner script."""
args = parse_args()
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)

sys.exit(run_pipeline(args))


if __name__ == "__main__":
main()
21 changes: 10 additions & 11 deletions src/map_reads.sh
@@ -15,13 +15,7 @@ lambda_name=$8

ls -Alh

get_reference_names.py "${vector_fa}" \
${packaging_fa:+--packaging "$packaging_fa"} \
${host_fa:+--host "$host_fa"} \
${repcap_name:+--repcap-name "$repcap_name"} \
${helper_name:+--helper-name "$helper_name"} \
${lambda_name:+--lambda-name "$lambda_name"} \
-o "${sample_id}.reference_names.tsv"
# Skip reference names generation since it's handled by the Python runner


ls -Alh
@@ -49,17 +43,22 @@ else
reads_fn="$reads"
fi

threads=$(nproc)
minimap2 --eqx -a --secondary=no -t $threads all_refs.fa "$reads_fn" > tmp.mapped.sam
threads=${LAAVA_THREADS:-1} # Use the LAAVA_THREADS env var if set, otherwise default to 1
minimap2 --eqx -a --secondary=no -t "$threads" all_refs.fa "$reads_fn" > tmp.mapped.sam
# Sort the mapped reads by name

name_sam="$sample_id.sort_by_name.sam"
samtools sort -@ "$threads" -n -O SAM -o "$name_sam" tmp.mapped.sam

name_bam="$sample_id.sort_by_name.bam"
samtools sort -@ $threads -n -O BAM -o "$name_bam" tmp.mapped.sam


# Make a position-sorted BAM output file for other downstream consumers
pos_bam="$sample_id.sort_by_pos.bam"
# Drop unmapped reads
samtools view -@ $threads --fast -o tmp.sorted.bam tmp.mapped.sam
samtools sort -@ $threads -o "$pos_bam" tmp.sorted.bam
samtools view -@ "$threads" --fast -o tmp.sorted.bam tmp.mapped.sam
samtools sort -@ "$threads" -o "$pos_bam" tmp.sorted.bam
samtools index "$pos_bam"

# Logging
13 changes: 11 additions & 2 deletions src/summarize_alignment.py
@@ -52,8 +52,17 @@ def subset_sam_by_readname_list(
exclude_type=False,
):
qname_lookup = {} # qname --> (a_type, a_subtype)
with gzip.open(per_read_tsv, "rt") as per_read_f:
for row in csv.DictReader(per_read_f, delimiter="\t"):
# Try to open as gzipped first, if that fails open as regular file
try:
f = gzip.open(per_read_tsv, "rt")
# Test if it's actually gzipped by reading a bit
f.read(1)
f.seek(0)
except (OSError, gzip.BadGzipFile):
f = open(per_read_tsv, "rt")

with f:
for row in csv.DictReader(f, delimiter="\t"):
# pdb.set_trace()
if (
wanted_types is None