30 changes: 30 additions & 0 deletions README.md
@@ -49,6 +49,28 @@ Then run it interactively in your current directory:
docker run -v $(pwd):/data -w /data -it ghcr.io/formbio/laava:latest bash
```

### Python Runner

LAAVA now includes a unified Python-based runner that orchestrates the entire analysis pipeline:

```bash
python src/laava_run.py --input-bam <input.bam> --vector-fasta <vector.fasta> \
--annotation-bed <annotation.bed> --itr-label ITR
```

This runner coordinates vector type analysis, read mapping, and alignment analysis in a single command. A minimal conda environment (`laava_minimal.conda_env.yml`) is also available for running the core analysis components.

Example using the test data:
```bash
# Using the included test samples
python src/laava_run.py \
--input-bam test/samples/ss.subsample005.bam \
--vector-fasta test/samples/ss.construct.fasta \
--annotation-bed test/samples/ss.annotation.bed \
--itr-label ITR \
--output-dir test_runner_results/ss \
--sample-id ss
```
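
Based on the runner code in this change, the output directory collects the per-sample intermediates from each step. A hypothetical listing for the test invocation above (file names are derived from `--sample-id`; the exact set of files may vary):

```bash
# Hypothetical listing; names follow the --sample-id argument ("ss" here)
ls test_runner_results/ss/
# ss.reference_names.tsv  ss.sort_by_name.sam  ss.sort_by_pos.bam  ...
```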

## For developers

@@ -85,12 +107,20 @@ This opens a Bash shell with the scripts in the PATH, and the original working d

### Option 2: Conda


The conda (or mamba) channels and dependencies are in the configuration files
`laava.conda_env.yml`, `laava_dev.conda_env.yml`, and `laava_minimal.conda_env.yml`.
The first two environments are similar and both will work for running LAAVA itself
(with `_dev` including additional developer tools), while `_minimal` provides just
the core components needed for the Python runner.
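
For example, a minimal sketch of creating and activating the `_minimal` environment (the environment name `laava_minimal` comes from that YAML file; `mamba` can be used in place of `conda`):

```bash
# Create the minimal environment and activate it
conda env create -f laava_minimal.conda_env.yml
conda activate laava_minimal
```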

The conda channels and dependencies are in the configuration file `conda_env.yml`.
With this environment and a LaTeX installation (via e.g. apt), you'll have all the
dependencies you need to run LAAVA scripts directly on Linux, and *nearly* everything
you need on Mac.
The same environment is also used by the Docker container images internally.


First, install conda via [Miniconda or
Anaconda](https://www.anaconda.com/download/success).

13 changes: 13 additions & 0 deletions laava_minimal.conda_env.yml
@@ -0,0 +1,13 @@
name: laava_minimal
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python>=3.7.6
- biopython
- pandas
- pysam
- pytest
- minimap2
- samtools
24 changes: 24 additions & 0 deletions pyproject.toml
@@ -1,3 +1,27 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "laava"
version = "0.1.0"
description = "Long-read AAV Analysis Pipeline"
readme = "README.md"
requires-python = ">=3.9"
license = { file = "LICENSE.md" }
authors = [
{ name = "Form Biosciences" }
]
dependencies = [
"numpy",
"pandas",
"pysam",
"rmarkdown",
]

[project.scripts]
laava-run = "src.laava_run:main"

[tool.ruff]
# Minimum version to assume for error checking
target-version = "py39"
1 change: 1 addition & 0 deletions src/__init__.py
@@ -0,0 +1 @@
"""LAAVA - Long-read AAV Analysis Pipeline."""
203 changes: 203 additions & 0 deletions src/laava_run.py
@@ -0,0 +1,203 @@
#!/usr/bin/env python3
"""LAAVA Runner Script - Unified command line interface for LAAVA analysis pipeline."""

import argparse
import logging
import os
import subprocess
import sys
from pathlib import Path

# Add the repository root to sys.path so the `src` package is importable
repo_root = str(Path(__file__).parent.parent)
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

# Import the pipeline step modules dynamically (after the sys.path setup above)
import importlib  # noqa: E402

get_reference_names = importlib.import_module("src.get_reference_names")
guess_vector_type_length = importlib.import_module("src.guess_vector_type_length")
summarize_alignment = importlib.import_module("src.summarize_alignment")

logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def parse_args() -> argparse.Namespace:
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="LAAVA Analysis Pipeline Runner",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# Required arguments
parser.add_argument(
"--input-bam", required=True, help="Input BAM file containing sequencing reads"
)
parser.add_argument(
"--vector-fasta",
required=True,
help="Vector reference sequence in FASTA format",
)
parser.add_argument(
"--annotation-bed", required=True, help="Vector annotation in BED format"
)
parser.add_argument(
"--itr-label", required=True, help="Label used for ITR in annotation"
)

# Optional arguments
parser.add_argument(
"--packaging-fasta", help="Packaging sequence reference in FASTA format"
)
parser.add_argument("--host-fasta", help="Host genome reference in FASTA format")
parser.add_argument(
"--output-dir",
default="./laava_results",
help="Output directory for analysis results",
)
parser.add_argument("--sample-id", default="sample", help="Sample identifier")
parser.add_argument("--sample-name", help="Human-readable sample name")
parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

return parser.parse_args()


def setup_output_directory(output_dir: str) -> Path:
"""Create and validate output directory."""
out_path = Path(output_dir)
out_path.mkdir(parents=True, exist_ok=True)
return out_path


def run_pipeline(args: argparse.Namespace) -> int:
"""Execute the LAAVA analysis pipeline."""
try:
# Setup output directory
output_dir = setup_output_directory(args.output_dir)

# Step 1: Analyze vector type
logger.info("Analyzing vector type...")
cassette_len = guess_vector_type_length.length_from_annotation(
args.annotation_bed, [args.itr_label]
)
logger.info(f"Expression cassette length is {cassette_len}")
vector_type = (
"sc" if cassette_len <= guess_vector_type_length.SC_MAX_THRESHOLD else "ss"
)
logger.info(f"Determined vector type: {vector_type}")

# Step 2: Generate reference names
logger.info("Generating reference names...")
ref_names_file = output_dir / f"{args.sample_id}.reference_names.tsv"
ref_names_args = argparse.Namespace(
vector=args.vector_fasta,
packaging=args.packaging_fasta,
host=args.host_fasta,
output=str(ref_names_file),
repcap_name=None,
helper_name=None,
lambda_name=None,
)
get_reference_names._main(ref_names_args)

# Step 3: Map reads
logger.info("Mapping reads...")

# Set up environment for the shell script
env = os.environ.copy()
env["PATH"] = (
f"{Path(__file__).parent}:{env['PATH']}" # Add src dir to PATH for Python scripts
)

# Get number of CPUs for parallel processing
try:
import multiprocessing

threads = str(multiprocessing.cpu_count())
except (NotImplementedError, ValueError):
threads = "1"
env["THREADS"] = threads

# Prepare command
map_script = Path(__file__).parent / "map_reads.sh"
if not map_script.exists():
raise FileNotFoundError(f"Map reads script not found: {map_script}")

# Build command with proper working directory
cmd = [
"bash",
str(map_script),
args.sample_id,
str(Path(args.input_bam).absolute()),
str(Path(args.vector_fasta).absolute()),
]
if args.packaging_fasta:
cmd.append(str(Path(args.packaging_fasta).absolute()))
if args.host_fasta:
cmd.append(str(Path(args.host_fasta).absolute()))

# Run mapping command in output directory
logger.debug(f"Running command: {' '.join(cmd)}")
try:
subprocess.run(
cmd,
check=True,
cwd=output_dir,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
except subprocess.CalledProcessError as e:
logger.error("Map reads command failed:")
logger.error(f"stdout:\n{e.stdout}")
logger.error(f"stderr:\n{e.stderr}")
raise

# Verify output file exists
mapped_reads = output_dir / f"{args.sample_id}.sort_by_name.sam"
if not mapped_reads.exists():
raise FileNotFoundError(
f"Expected mapped reads file not found: {mapped_reads}"
)

# Step 4: Analyze alignments and generate report
logger.info("Analyzing alignments and generating report...")
analysis_args = argparse.Namespace(
sam_filename=str(mapped_reads),
annotation_bed=args.annotation_bed,
reference_names=str(ref_names_file),
itr_labels=[args.itr_label],
output_prefix=str(output_dir / args.sample_id),
sample_id=args.sample_id,
vector_type=vector_type,
cpus=1,
debug=args.verbose,
)
summarize_alignment.main(analysis_args)

logger.info(f"Analysis complete. Results available in: {output_dir}")
return 0

except Exception as e:
logger.error(f"Pipeline failed: {e!s}")
if args.verbose:
logger.exception("Detailed error traceback:")
return 1


def main():
"""Main entry point for the LAAVA runner script."""
args = parse_args()
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)

sys.exit(run_pipeline(args))


if __name__ == "__main__":
main()
21 changes: 10 additions & 11 deletions src/map_reads.sh
@@ -15,13 +15,7 @@ lambda_name=$8

ls -Alh

get_reference_names.py "${vector_fa}" \
${packaging_fa:+--packaging "$packaging_fa"} \
${host_fa:+--host "$host_fa"} \
${repcap_name:+--repcap-name "$repcap_name"} \
${helper_name:+--helper-name "$helper_name"} \
${lambda_name:+--lambda-name "$lambda_name"} \
-o "${sample_id}.reference_names.tsv"
# Skip reference names generation since it's handled by the Python runner


ls -Alh
@@ -49,17 +43,22 @@ else
reads_fn="$reads"
fi

threads=$(nproc)
minimap2 --eqx -a --secondary=no -t $threads all_refs.fa "$reads_fn" > tmp.mapped.sam
threads=${LAAVA_THREADS:-1} # Use the LAAVA_THREADS env var if set, otherwise default to 1
minimap2 --eqx -a --secondary=no -t "$threads" all_refs.fa "$reads_fn" > tmp.mapped.sam
# Sort the mapped reads by name

name_sam="$sample_id.sort_by_name.sam"
samtools sort -@ "$threads" -n -O SAM -o "$name_sam" tmp.mapped.sam

name_bam="$sample_id.sort_by_name.bam"
samtools sort -@ $threads -n -O BAM -o "$name_bam" tmp.mapped.sam


# Make a position-sorted BAM output file for other downstream consumers
pos_bam="$sample_id.sort_by_pos.bam"
# Drop unmapped reads
samtools view -@ $threads --fast -o tmp.sorted.bam tmp.mapped.sam
samtools sort -@ $threads -o "$pos_bam" tmp.sorted.bam
samtools view -@ "$threads" --fast -o tmp.sorted.bam tmp.mapped.sam
samtools sort -@ "$threads" -o "$pos_bam" tmp.sorted.bam
samtools index "$pos_bam"

# Logging
13 changes: 11 additions & 2 deletions src/summarize_alignment.py
@@ -52,8 +52,17 @@ def subset_sam_by_readname_list(
exclude_type=False,
):
qname_lookup = {} # qname --> (a_type, a_subtype)
with gzip.open(per_read_tsv, "rt") as per_read_f:
for row in csv.DictReader(per_read_f, delimiter="\t"):
# Try to open as gzipped first, if that fails open as regular file
try:
f = gzip.open(per_read_tsv, "rt")
# Test if it's actually gzipped by reading a bit
f.read(1)
f.seek(0)
except (OSError, gzip.BadGzipFile):
f = open(per_read_tsv, "rt")

with f:
for row in csv.DictReader(f, delimiter="\t"):
# pdb.set_trace()
if (
wanted_types is None