Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ workflow laava {
flipflop_bams = make_report.out.flipflop_bams
flipflop_tsv = make_report.out.flipflop_tsv
sample_id = make_report.out.sample_id
vector_coverage = make_report.out.vector_coverage
}


Expand Down
1 change: 1 addition & 0 deletions modules/local/laava.nf
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ process make_report() {
path("${sample_id}.nonmatch.tsv.gz"), emit: nonmatch_tsv
path("${sample_id}.agg_ref_type.tsv"), emit: agg_ref_type_tsv
path("${sample_id}.agg_subtype.tsv"), emit: agg_subtype_tsv
path("${sample_id}.vector_coverage.tsv"), emit: vector_coverage
path("${sample_id}.flipflop.tsv.gz"), emit: flipflop_tsv, optional: true
path("${sample_id}.agg_flipflop.tsv"), emit: agg_flipflop_tsv, optional:true

Expand Down
2 changes: 1 addition & 1 deletion src/make_report.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ if [[ -n "$flipflop_name" || -n "$flipflop_fa" ]]; then
elif [ "${vector_type}" == "ss" ]; then
orientation="left"
fi

python "$(dirname $0)/get_flipflop_config.py" \
"$out_dir/${sample_id}.tagged.bam" "$out_dir/${sample_id}.per_read.tsv.gz" \
"$vector_type" \
Expand Down
87 changes: 87 additions & 0 deletions src/summarize_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,86 @@ def check_is_mitr_left(annotation_bed, itr_labels):
return None


def generate_vector_coverage(bam_filename, annotation, output_prefix):
    """Generate per-base coverage statistics for vectors in annotation.

    Sorts and indexes the input BAM by position, counts the unique mapped
    fragments (by query name), then pileups each annotated "vector" region
    and writes one row per reference base with raw and percent coverage.

    Parameters:
        bam_filename (str): Path to the input BAM file (any sort order)
        annotation (dict): Mapping of sequence name -> annotation info dict;
            entries with label "vector" and a (start, end) "region" are used
        output_prefix (str): Prefix for the output TSV (and the intermediate
            position-sorted BAM, which is left on disk alongside it)

    Returns:
        str: Path to the output TSV file ("<prefix>.vector_coverage.tsv")
    """
    # Pileup requires a coordinate-sorted, indexed BAM.
    pos_sorted_bam = f"{output_prefix}.pos_sorted.bam"
    logging.info(f"Sorting BAM by position: {bam_filename} -> {pos_sorted_bam}")
    pysam.sort("-o", pos_sorted_bam, bam_filename)

    logging.info(f"Creating index for position-sorted BAM: {pos_sorted_bam}")
    pysam.index(pos_sorted_bam)

    coverage_data = []

    # Context manager guarantees the BAM handle is closed even on error.
    with pysam.AlignmentFile(pos_sorted_bam, "rb", check_sq=False) as bam:
        # Count unique fragments by query name (a fragment may have
        # multiple alignment records, so dedupe on the read name).
        read_names = {read.query_name for read in bam if not read.is_unmapped}
        total_fragments = len(read_names)
        # Rewind so pileup below sees the whole file.
        bam.reset()

        for seq_name, info in annotation.items():
            if info["label"] != "vector":
                continue

            region = info["region"]
            if region is None:
                logging.warning(
                    f"No region defined for vector {seq_name}, "
                    "skipping coverage calculation"
                )
                continue

            start_pos, end_pos = region
            logging.info(
                f"Calculating coverage for vector {seq_name} ({start_pos}-{end_pos})"
            )

            # Pre-fill with zeros so uncovered positions still get a row.
            position_coverage = dict.fromkeys(range(start_pos, end_pos), 0)

            # Raise max_depth above pysam's default cap so deep coverage is
            # not truncated; clamp to >= 1 because max_depth=0 (empty BAM)
            # is not a valid depth limit. truncate=True restricts columns
            # to exactly [start_pos, end_pos).
            for column in bam.pileup(
                seq_name,
                start_pos,
                end_pos,
                max_depth=max(total_fragments, 1),
                truncate=True,
            ):
                position_coverage[column.reference_pos] = column.nsegments

            for position, coverage in position_coverage.items():
                percent_coverage = (
                    (coverage / total_fragments) * 100 if total_fragments > 0 else 0
                )
                coverage_data.append({
                    "vector_name": seq_name,
                    "position": position + 1,  # Convert to 1-based for output
                    "raw_coverage": coverage,
                    "percent_coverage": percent_coverage,
                })

    # newline="" is required by the csv module to avoid \r\n translation.
    output_path = f"{output_prefix}.vector_coverage.tsv"
    with open(output_path, "w", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["vector_name", "position", "raw_coverage", "percent_coverage"],
            delimiter="\t",
        )
        writer.writeheader()
        writer.writerows(coverage_data)

    logging.info(f"Vector coverage data written to {output_path}")
    return output_path


def main(args):
"""Entry point."""
annotation = load_annotation_bed(
Expand Down Expand Up @@ -899,6 +979,13 @@ def main(args):
num_chunks=args.cpus,
)

# Generate vector coverage file
generate_vector_coverage(
args.bam_filename,
annotation,
args.output_prefix
)

# subset BAM files into major categories for ease of loading into IGV for viewing
# subset_sam_by_readname_list(in_bam, out_bam, per_read_tsv, wanted_types, wanted_subtypes)
subset_bam_prefixes = []
Expand Down
1 change: 1 addition & 0 deletions test/Makefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

# Local tests: Run the core analysis steps on downsampled copies of the public PacBio example data

in_dir = samples
Expand Down
Loading