diff --git a/main.nf b/main.nf index fa4d866..0d9ced0 100644 --- a/main.nf +++ b/main.nf @@ -124,6 +124,7 @@ workflow laava { flipflop_bams = make_report.out.flipflop_bams flipflop_tsv = make_report.out.flipflop_tsv sample_id = make_report.out.sample_id + vector_coverage = make_report.out.vector_coverage } diff --git a/modules/local/laava.nf b/modules/local/laava.nf index 3527273..bd0ea1c 100644 --- a/modules/local/laava.nf +++ b/modules/local/laava.nf @@ -81,6 +81,7 @@ process make_report() { path("${sample_id}.nonmatch.tsv.gz"), emit: nonmatch_tsv path("${sample_id}.agg_ref_type.tsv"), emit: agg_ref_type_tsv path("${sample_id}.agg_subtype.tsv"), emit: agg_subtype_tsv + path("${sample_id}.vector_coverage.tsv"), emit: vector_coverage path("${sample_id}.flipflop.tsv.gz"), emit: flipflop_tsv, optional: true path("${sample_id}.agg_flipflop.tsv"), emit: agg_flipflop_tsv, optional:true diff --git a/src/make_report.sh b/src/make_report.sh index d629f27..f6af988 100755 --- a/src/make_report.sh +++ b/src/make_report.sh @@ -75,7 +75,7 @@ if [[ -n "$flipflop_name" || -n "$flipflop_fa" ]]; then elif [ "${vector_type}" == "ss" ]; then orientation="left" fi - + python "$(dirname $0)/get_flipflop_config.py" \ "$out_dir/${sample_id}.tagged.bam" "$out_dir/${sample_id}.per_read.tsv.gz" \ "$vector_type" \ diff --git a/src/summarize_alignment.py b/src/summarize_alignment.py index e03a8ae..5e939dc 100755 --- a/src/summarize_alignment.py +++ b/src/summarize_alignment.py @@ -868,6 +868,86 @@ def check_is_mitr_left(annotation_bed, itr_labels): return None +def generate_vector_coverage(bam_filename, annotation, output_prefix): + """Generate per-base coverage statistics for vectors in annotation. + + Parameters: + bam_filename (str): Path to the sorted BAM file + annotation (dict): Dictionary of annotation objects already loaded + output_prefix (str): Prefix for output file + + Returns: + str: Path to the output TSV file + """ + # Sort BAM by position for coverage calculation + pos_sorted_bam = f"{output_prefix}.pos_sorted.bam" + logging.info(f"Sorting BAM by position: {bam_filename} -> {pos_sorted_bam}") + pysam.sort("-o", pos_sorted_bam, bam_filename) + + # Create index for position-sorted BAM + logging.info(f"Creating index for position-sorted BAM: {pos_sorted_bam}") + pysam.index(pos_sorted_bam) + + # Open position-sorted BAM file + bam = pysam.AlignmentFile(pos_sorted_bam, "rb", check_sq=False) + + # Count unique fragments (by query name) + read_names = set() + for read in bam: + if not read.is_unmapped: + read_names.add(read.query_name) + + total_fragments = len(read_names) + bam.reset() + + # Initialize list to store coverage data + coverage_data = [] + + # Process vectors from annotation + for seq_name, info in annotation.items(): + if info["label"] != "vector": + continue + + region = info["region"] + if region is None: + logging.warning(f"No region defined for vector {seq_name}, skipping coverage calculation") + continue + + start_pos, end_pos = region + + logging.info(f"Calculating coverage for vector {seq_name} ({start_pos}-{end_pos})") + + # Create a position-coverage mapping + position_coverage = {i: 0 for i in range(start_pos, end_pos)} + + # Use pileup with max_depth set to total fragments count (to avoid default limit) + for column in bam.pileup(seq_name, start_pos, end_pos, max_depth=total_fragments): + if start_pos <= column.pos < end_pos: + position_coverage[column.pos] = column.n + + # Convert to output format with percentages + for position, coverage in position_coverage.items(): + percent_coverage = (coverage / total_fragments) * 100 if total_fragments > 0 else 0 + + coverage_data.append({ + "vector_name": seq_name, + "position": position + 1, # Convert to 1-based for output + "raw_coverage": coverage, + "percent_coverage": percent_coverage + }) + + # Write coverage data to TSV + output_path = f"{output_prefix}.vector_coverage.tsv" + with open(output_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=["vector_name", "position", "raw_coverage", "percent_coverage"], + delimiter="\t") + writer.writeheader() + writer.writerows(coverage_data) + + logging.info(f"Vector coverage data written to {output_path}") + return output_path + + def main(args): """Entry point.""" annotation = load_annotation_bed( @@ -899,6 +979,13 @@ def main(args): num_chunks=args.cpus, ) + # Generate vector coverage file + generate_vector_coverage( + args.bam_filename, + annotation, + args.output_prefix + ) + # subset BAM files into major categories for ease of loading into IGV for viewing # subset_sam_by_readname_list(in_bam, out_bam, per_read_tsv, wanted_types, wanted_subtypes) subset_bam_prefixes = [] diff --git a/test/Makefile b/test/Makefile index 65f962b..8541249 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,3 +1,4 @@ + # Local tests: Run the core analysis steps on downsampled copies of the public PacBio example data in_dir = samples