Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ workflow laava {
flipflop_bams = make_report.out.flipflop_bams
flipflop_tsv = make_report.out.flipflop_tsv
sample_id = make_report.out.sample_id
vector_coverage = make_report.out.vector_coverage
}


Expand Down
1 change: 1 addition & 0 deletions modules/local/laava.nf
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ process make_report() {
path("${sample_id}.nonmatch.tsv.gz"), emit: nonmatch_tsv
path("${sample_id}.agg_ref_type.tsv"), emit: agg_ref_type_tsv
path("${sample_id}.agg_subtype.tsv"), emit: agg_subtype_tsv
path("${sample_id}.vector_coverage.tsv"), emit: vector_coverage
path("${sample_id}.flipflop.tsv.gz"), emit: flipflop_tsv, optional: true
path("${sample_id}.agg_flipflop.tsv"), emit: agg_flipflop_tsv, optional:true

Expand Down
2 changes: 1 addition & 1 deletion src/make_report.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ if [[ -n "$flipflop_name" || -n "$flipflop_fa" ]]; then
elif [ "${vector_type}" == "ss" ]; then
orientation="left"
fi

python "$(dirname $0)/get_flipflop_config.py" \
"$out_dir/${sample_id}.tagged.bam" "$out_dir/${sample_id}.per_read.tsv.gz" \
"$vector_type" \
Expand Down
87 changes: 87 additions & 0 deletions src/summarize_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,86 @@ def check_is_mitr_left(annotation_bed, itr_labels):
return None


def generate_vector_coverage(bam_filename, annotation, output_prefix):
    """Generate per-base coverage statistics for vectors in annotation.

    Sorts and indexes the input BAM by position, counts the unique mapped
    fragments (by query name), then pileups each annotated "vector" region
    and writes one row per reference base with raw and percent coverage.

    Parameters:
        bam_filename (str): Path to the input BAM file (any sort order)
        annotation (dict): Mapping of sequence name -> annotation info dict;
            entries with label "vector" and a (start, end) "region" are used
        output_prefix (str): Prefix for the output TSV (and the intermediate
            position-sorted BAM, which is left on disk alongside it)

    Returns:
        str: Path to the output TSV file ("<prefix>.vector_coverage.tsv")
    """
    # Pileup requires a coordinate-sorted, indexed BAM.
    pos_sorted_bam = f"{output_prefix}.pos_sorted.bam"
    logging.info(f"Sorting BAM by position: {bam_filename} -> {pos_sorted_bam}")
    pysam.sort("-o", pos_sorted_bam, bam_filename)

    logging.info(f"Creating index for position-sorted BAM: {pos_sorted_bam}")
    pysam.index(pos_sorted_bam)

    coverage_data = []

    # Context manager guarantees the BAM handle is closed even on error.
    with pysam.AlignmentFile(pos_sorted_bam, "rb", check_sq=False) as bam:
        # Count unique fragments by query name (a fragment may have
        # multiple alignment records, so dedupe on the read name).
        read_names = {read.query_name for read in bam if not read.is_unmapped}
        total_fragments = len(read_names)
        # Rewind so pileup below sees the whole file.
        bam.reset()

        for seq_name, info in annotation.items():
            if info["label"] != "vector":
                continue

            region = info["region"]
            if region is None:
                logging.warning(
                    f"No region defined for vector {seq_name}, "
                    "skipping coverage calculation"
                )
                continue

            start_pos, end_pos = region
            logging.info(
                f"Calculating coverage for vector {seq_name} ({start_pos}-{end_pos})"
            )

            # Pre-fill with zeros so uncovered positions still get a row.
            position_coverage = dict.fromkeys(range(start_pos, end_pos), 0)

            # Raise max_depth above pysam's default cap so deep coverage is
            # not truncated; clamp to >= 1 because max_depth=0 (empty BAM)
            # is not a valid depth limit. truncate=True restricts columns
            # to exactly [start_pos, end_pos).
            for column in bam.pileup(
                seq_name,
                start_pos,
                end_pos,
                max_depth=max(total_fragments, 1),
                truncate=True,
            ):
                position_coverage[column.reference_pos] = column.nsegments

            for position, coverage in position_coverage.items():
                percent_coverage = (
                    (coverage / total_fragments) * 100 if total_fragments > 0 else 0
                )
                coverage_data.append({
                    "vector_name": seq_name,
                    "position": position + 1,  # Convert to 1-based for output
                    "raw_coverage": coverage,
                    "percent_coverage": percent_coverage,
                })

    # newline="" is required by the csv module to avoid \r\n translation.
    output_path = f"{output_prefix}.vector_coverage.tsv"
    with open(output_path, "w", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["vector_name", "position", "raw_coverage", "percent_coverage"],
            delimiter="\t",
        )
        writer.writeheader()
        writer.writerows(coverage_data)

    logging.info(f"Vector coverage data written to {output_path}")
    return output_path


def main(args):
"""Entry point."""
annotation = load_annotation_bed(
Expand Down Expand Up @@ -899,6 +979,13 @@ def main(args):
num_chunks=args.cpus,
)

# Generate vector coverage file
generate_vector_coverage(
args.bam_filename,
annotation,
args.output_prefix
)

# subset BAM files into major categories for ease of loading into IGV for viewing
# subset_sam_by_readname_list(in_bam, out_bam, per_read_tsv, wanted_types, wanted_subtypes)
subset_bam_prefixes = []
Expand Down
1 change: 1 addition & 0 deletions test/Makefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

# Local tests: Run the core analysis steps on downsampled copies of the public PacBio example data

in_dir = samples
Expand Down
Loading