diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml index 5edc917..5cac669 100644 --- a/.github/workflows/formatting.yml +++ b/.github/workflows/formatting.yml @@ -6,6 +6,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: chartboost/ruff-action@v1 + with: + version: 0.6.8 # TODO: Is it really necessary for these to be separate jobs? This seems redundant. ruff-format-check: runs-on: ubuntu-latest @@ -13,4 +15,5 @@ jobs: - uses: actions/checkout@v4 - uses: chartboost/ruff-action@v1 with: + version: 0.6.8 args: 'format --check' diff --git a/dimelo/export.py b/dimelo/export.py index 5f481a4..e2c973c 100644 --- a/dimelo/export.py +++ b/dimelo/export.py @@ -6,7 +6,7 @@ import pysam from tqdm.auto import tqdm -from . import utils +from . import load_processed, utils """ This module contains code to export indexed and compressed parse output files to other formats that may be helpful for downstream analysis. @@ -134,54 +134,31 @@ def pileup_to_bigwig( total=lines_by_contig[contig], leave=False, ): - # TODO: This code is copied from load_processed.pileup_counts_from_bedmethyl and should probably be consolidated at some point - tabix_fields = row.split("\t") - pileup_basemod = tabix_fields[3] - pileup_strand = tabix_fields[5] - keep_basemod = False - if (strand != ".") and (pileup_strand != strand): - # This entry is on the wrong strand - skip it - continue - elif len(pileup_basemod.split(",")) == 3: - pileup_modname, pileup_motif, pileup_mod_coord = ( - pileup_basemod.split(",") + keep_basemod, genomic_coord, modified_in_row, valid_in_row = ( + load_processed.process_pileup_row( + row=row, + parsed_motif=parsed_motif, + region_strand=strand, + single_strand=(strand != "."), ) - if ( - pileup_motif == parsed_motif.motif_seq - and int(pileup_mod_coord) == parsed_motif.modified_pos - and pileup_modname in parsed_motif.mod_codes - ): - keep_basemod = True - elif len(pileup_basemod.split(",")) == 1: - if pileup_basemod in parsed_motif.mod_codes: - 
keep_basemod = True - else: - raise ValueError( - f"Unexpected format in bedmethyl file: {row} contains {pileup_basemod} which cannot be parsed." - ) - # TODO: consolidate the above into a function; just do adding outside - if keep_basemod: - pileup_info = tabix_fields[9].split(" ") - valid_base_counts = int(pileup_info[0]) - modified_base_counts = int(pileup_info[2]) - if valid_base_counts > 0: - genomic_coord = int(tabix_fields[1]) - contig_list.append(contig) - start_list.append(genomic_coord) - end_list.append(genomic_coord + 1) - values_list.append(modified_base_counts / valid_base_counts) - - if len(values_list) > chunk_size: - bw.addEntries( - contig_list, # Contig names - start_list, # Start positions - ends=end_list, # End positions - values=values_list, # Corresponding values - ) - contig_list = [] - start_list = [] - end_list = [] - values_list = [] + ) + if keep_basemod and valid_in_row > 0: + contig_list.append(contig) + start_list.append(genomic_coord) + end_list.append(genomic_coord + 1) + values_list.append(modified_in_row / valid_in_row) + + if len(values_list) > chunk_size: + bw.addEntries( + contig_list, # Contig names + start_list, # Start positions + ends=end_list, # End positions + values=values_list, # Corresponding values + ) + contig_list = [] + start_list = [] + end_list = [] + values_list = [] bw.addEntries( contig_list, # Contig names start_list, # Start positions diff --git a/dimelo/load_processed.py b/dimelo/load_processed.py index 4570bf7..62a4556 100644 --- a/dimelo/load_processed.py +++ b/dimelo/load_processed.py @@ -1,7 +1,9 @@ +import concurrent.futures import gzip +import multiprocessing from collections import defaultdict -from concurrent.futures import ProcessPoolExecutor from functools import partial +from multiprocessing import shared_memory from pathlib import Path import h5py @@ -11,38 +13,46 @@ from . 
import test_data, utils +# the default chunk size is the number of bp to include per processing chunk in parallelization for loaders. +# 1e6 was empirically determined to be a good default: smaller than 1e5 we see slowdowns due to increased +# parallelization overhead, larger than 1e7 we see slowdowns due to worker utilization decreasing because even +# for whole chromosome processing there aren't always enough chunks to go around. In the 1e5-1e7 range, speed +# on 32 cores is fairly similar, but sitting in the middle of the range should support 10x more cores (beyond +# the reasonable upper bound) and 10x fewer cores (which is about the reasonable lower bound). +DEFAULT_CHUNK_SIZE = 1_000_000 -def process_region(region_string, function_handle, **kwargs): - """ - process_region simply exists to convert position arguments into keyword arguments to make executor.map work - - Args: - region_string: passed down with regions keyword - function_handle: function to call with regions and other kwargs - **kwargs: all keyword arguments passed to regions_to_list. These must be sufficient for whichever load_processed function - if being referenced by function_handle - Returns: - function_handle return value - """ - return function_handle(regions=region_string, **kwargs) +################################################################################################################ +#### Loader wrappers #### +################################################################################################################ def regions_to_list( function_handle, regions, window_size: int | None = None, + quiet: bool = True, cores: int | None = None, + split_large_regions: bool = False, **kwargs, ): """ + User-facing function. + Run any standard load_processed pileup or extract loader loading each region from the region specifier into a new element of a list. Args: function_handle: the loader function you want to run. - regions: the region specifier + regions: the region specifier. 
Typically we expect to get many regions for this function, in the form of a list + of strings or bed file paths. regions_to_list will run across all of these one-by-one returning a separate + function return for each independent region. window_size: window around centers of regions, defaults to None - cores: process count across which to parallelize. Each individual region will only ever get one core. + quiet: disables progress bars + cores: CPU cores across which to parallelize processing. Default to None, which means all available. + split_large_regions: if True, regions will be run sequentially in parallelized chunks. If False, + each individual region's chunks will be run sequentially but there will be parallelization across + regions, i.e. each core will be assigned one region at a time by the executor. Set to True if you + are running a small number of very large regions (e.g. one or two chromosomes), otherwise to to False (default). **kwargs: all necessary keyword arguments to pass down to the loader Returns: @@ -61,43 +71,71 @@ def regions_to_list( ] cores_to_run = utils.cores_to_run(cores) - - if cores_to_run > 1: - with ProcessPoolExecutor(max_workers=cores_to_run) as executor: - # Use functools.partial to pre-fill arguments - process_partial = partial( - process_region, function_handle=function_handle, cores=1, **kwargs - ) - - # Use executor.map without lambda - results = list( - tqdm( - executor.map(process_partial, region_strings), - total=len(region_strings), - desc=f"Processing regions in parallel across {cores_to_run}", - ) - ) - else: - # Single-threaded fallback - results = [ - process_region( - region_string=region, function_handle=function_handle, cores=1, **kwargs + # quiet and cores logic below is driven by the following: + # If the parallelization is within regions: + # (1) progress bars should happen within regions if at all, because we assume regions are + # large if they make sense to parallelize + # (2) the cores_to_run will be allocated 
to within-region parallelization, and the top-level + # jobs sequence is run sequentially + with concurrent.futures.ProcessPoolExecutor( + max_workers=1 if split_large_regions else cores_to_run + ) as executor: + # Use functools.partial to pre-fill arguments + process_partial = partial( + apply_loader_function_to_region, + function_handle=function_handle, + quiet=quiet or not split_large_regions, + cores=cores_to_run + if split_large_regions + else 1, # if parallelization is within region + **kwargs, + ) + results = list( + tqdm( + executor.map(process_partial, region_strings), + total=len(region_strings), + desc="Loading data", + disable=quiet or split_large_regions, + leave=False, ) - for region in tqdm(region_strings, desc="Processing regions") - ] + ) return results +def apply_loader_function_to_region(region_string, function_handle, **kwargs): + """ + apply_loader_function_to_region simply exists to convert position arguments into keyword arguments to make executor.map work + + Args: + region_string: passed down with regions keyword + function_handle: function to call with regions and other kwargs + **kwargs: all keyword arguments passed to regions_to_list. 
These must be sufficient for whichever load_processed function + if being referenced by function_handle + Returns: + function_handle return value + """ + return function_handle(regions=region_string, **kwargs) + + +################################################################################################################ +#### Pileup loaders #### +################################################################################################################ + + def pileup_counts_from_bedmethyl( bedmethyl_file: str | Path, motif: str, - regions: str | Path | list[str | Path] | None = None, + regions: str | Path | list[str | Path], window_size: int | None = None, single_strand: bool = False, - cores: int | None = None, # currently unused + quiet: bool = False, + cores: int | None = None, + chunk_size: int = DEFAULT_CHUNK_SIZE, ) -> tuple[int, int]: """ + User-facing function. + Extract number of modified bases and total number of bases from the given bedmethyl file. Called by plotters or by the user. @@ -118,101 +156,74 @@ def pileup_counts_from_bedmethyl( window_size: (currently disabled) window around center of region, +-window_size single_strand: True means we only grab counts from reads from the same strand as the region of interest, False means we always grab both strands within the regions - cores: cores across which to parallelize processes (currently unused) + quiet: disables progress bars + cores: CPU cores across which to parallelize processing. Default to None, which means all available. 
+ chunk_size: size of genomic subregions to assign out to each process Returns: tuple containing counts of (modified_bases, total_bases) """ - source_tabix = pysam.TabixFile(str(bedmethyl_file)) - # Don't need vectors, just need counts; also not guaranteed that windows are the same length - valid_base_count = 0 - modified_base_count = 0 - parsed_motif = utils.ParsedMotif(motif) - if regions is not None: - # Get counts from the specified regions - regions_dict = utils.regions_dict_from_input( - regions, - window_size, - ) - for chromosome, region_list in regions_dict.items(): - for start_coord, end_coord, strand in region_list: - # TODO: change to try-except - if chromosome in source_tabix.contigs: - for row in source_tabix.fetch(chromosome, start_coord, end_coord): - # TODO Consider using csv module - # TODO: probably this whole block should share logic with vectors_from_bedmethyl AND from export module functions - tabix_fields = row.split("\t") - pileup_basemod = tabix_fields[3] - pileup_strand = tabix_fields[5] - keep_basemod = False - if single_strand and pileup_strand != strand: - # This entry is on the wrong strand - skip it - continue - elif len(pileup_basemod.split(",")) == 3: - pileup_modname, pileup_motif, pileup_mod_coord = ( - pileup_basemod.split(",") - ) - if ( - pileup_motif == parsed_motif.motif_seq - and int(pileup_mod_coord) == parsed_motif.modified_pos - and pileup_modname in parsed_motif.mod_codes - ): - keep_basemod = True - elif len(pileup_basemod.split(",")) == 1: - if pileup_basemod in parsed_motif.mod_codes: - keep_basemod = True - else: - raise ValueError( - f"Unexpected format in bedmethyl file: {row} contains {pileup_basemod} which cannot be parsed." 
- ) - # TODO: consolidate the above into a function; just do adding outside - if keep_basemod: - pileup_info = tabix_fields[9].split(" ") - valid_base_count += int(pileup_info[0]) - modified_base_count += int(pileup_info[2]) - else: - # Get counts from the whole input file - for row in source_tabix.fetch(): - tabix_fields = row.split("\t") - pileup_basemod = tabix_fields[3] - keep_basemod = False - if len(pileup_basemod.split(",")) == 3: - pileup_modname, pileup_motif, pileup_mod_coord = pileup_basemod.split( - "," - ) - if ( - pileup_motif == parsed_motif.motif_seq - and int(pileup_mod_coord) == parsed_motif.modified_pos - and pileup_modname in parsed_motif.mod_codes - ): - keep_basemod = True - elif len(pileup_basemod.split(",")) == 1: - if pileup_basemod in parsed_motif.mod_codes: - keep_basemod = True - else: - raise ValueError( - f"Unexpected format in bedmethyl file: {row} contains {pileup_basemod} which cannot be parsed." - ) - if keep_basemod: - pileup_info = tabix_fields[9].split(" ") - valid_base_count += int(pileup_info[0]) - modified_base_count += int(pileup_info[2]) + regions_dict = utils.regions_dict_from_input(regions, window_size) + chunks_list = utils.process_chunks_from_regions_dict( + regions_dict, chunk_size=chunk_size + ) - return (modified_base_count, valid_base_count) + cores_to_run = utils.cores_to_run(cores) + # Initialize shared memory as length-one numpy arrays to make it easy to map to buffer in subprocesses + shm_valid = shared_memory.SharedMemory( + create=True, size=np.dtype(np.int32).itemsize + ) + shm_modified = shared_memory.SharedMemory( + create=True, size=np.dtype(np.int32).itemsize + ) -def counts_from_fake(*args, **kwargs) -> tuple[int, int]: - """ - Generates a fake set of enrichment counts. Ignores all arguments. 
+ manager = multiprocessing.Manager() + lock = manager.Lock() + + with concurrent.futures.ProcessPoolExecutor(max_workers=cores_to_run) as executor: + futures = [ + executor.submit( + pileup_counts_process_chunk, + bedmethyl_file, + parsed_motif, + chunk, + shm_modified.name, + shm_valid.name, + lock, + single_strand, + ) + for chunk in chunks_list + ] + for future in tqdm( + concurrent.futures.as_completed(futures), + total=len(futures), + disable=quiet, + desc="Loading data", + leave=False, + ): + try: + future.result() + except Exception as err: + raise RuntimeError("pileup_counts_process_chunk failed.") from err + + # Directly convert shared memory buffers to integers + modified_base_count = int.from_bytes( + shm_modified.buf[:4], byteorder="little", signed=True + ) + valid_base_count = int.from_bytes( + shm_valid.buf[:4], byteorder="little", signed=True + ) + # Close and unlink shared memory - not fully handled by garbage collection otherwise + shm_modified.close() + shm_modified.unlink() + shm_valid.close() + shm_valid.unlink() - Returns: - tuple containing counts of (modified_bases, total_bases) - """ - window_halfsize = 500 - return test_data.fake_peak_enrichment(halfsize=window_halfsize, peak_height=0.15) + return modified_base_count, valid_base_count def pileup_vectors_from_bedmethyl( @@ -222,9 +233,13 @@ def pileup_vectors_from_bedmethyl( window_size: int | None = None, single_strand: bool = False, regions_5to3prime: bool = False, - cores: int | None = None, # currently unused + quiet: bool = False, + cores: int | None = None, + chunk_size: int = DEFAULT_CHUNK_SIZE, ) -> tuple[np.ndarray, np.ndarray]: """ + User-facing function. + Extract per-position pileup counts at valid motifs across one or more superimposed regions. Called by profile plotters, can also be used by a user directly. 
@@ -254,86 +269,102 @@ def pileup_vectors_from_bedmethyl( single_strand: True means we only grab counts from reads from the same strand as the region of interest, False means we always grab both strands within the regions regions_5to3prime: True means negative strand regions get flipped, False means no flipping - cores: cores across which to parallelize processes (currently unused) + quiet: disables progress bars + cores: CPU cores across which to parallelize processing. Default to None, which means all available. + chunk_size: size of genomic subregions to assign out to each process Returns: tuple containing (modified_base_counts, valid_base_counts) """ - source_tabix = pysam.TabixFile(str(bedmethyl_file)) - parsed_motif = utils.ParsedMotif(motif) regions_dict = utils.regions_dict_from_input(regions, window_size) + chunks_list = utils.process_chunks_from_regions_dict( + regions_dict, chunk_size=chunk_size + ) + + cores_to_run = utils.cores_to_run(cores) # Peek at a region to figure out what size the vectors should be first_key = next(iter(regions_dict)) first_tuple = regions_dict[first_key][0] region_len = first_tuple[1] - first_tuple[0] - valid_base_counts = np.zeros(region_len, dtype=int) - modified_base_counts = np.zeros(region_len, dtype=int) - - for chromosome, region_list in regions_dict.items(): - for start_coord, end_coord, strand in region_list: - # TODO: This is not used anywhere; disabling for now - # center_coord = (start_coord+end_coord)//2 - if chromosome in source_tabix.contigs: - for row in source_tabix.fetch(chromosome, start_coord, end_coord): - # TODO: can we consolidate this with pileup_counts_from_bedmethyl? - # Just the checks? 
- # TODO: probably this whole block should share logic with counts_from_bedmethyl AND from export functions - tabix_fields = row.split("\t") - pileup_basemod = tabix_fields[3] - pileup_strand = tabix_fields[5] - keep_basemod = False - if single_strand and pileup_strand.strip() != strand: - # We are on the wrong strand, skip the rest of the steps for this row - continue - elif len(pileup_basemod.split(",")) == 3: - pileup_modname, pileup_motif, pileup_mod_coord = ( - pileup_basemod.split(",") - ) - if ( - pileup_motif == parsed_motif.motif_seq - and int(pileup_mod_coord) == parsed_motif.modified_pos - and pileup_modname in parsed_motif.mod_codes - ): - keep_basemod = True - elif len(pileup_basemod.split(",")) == 1: - if pileup_basemod in parsed_motif.mod_codes: - keep_basemod = True - else: - raise ValueError( - f"Unexpected format in bedmethyl file: {row} contains {pileup_basemod} which cannot be parsed." - ) - if keep_basemod: - pileup_info = tabix_fields[9].split(" ") - genomic_coord = int(tabix_fields[1]) - if regions_5to3prime and strand == "-": - # We want to flip the coordinates for this region so that it is recorded along the 5 prime to 3 prime direction - # This will enable analyses where the orientation of protein binding / transcriptional dynamics / etc is relevant for our pileup signal - pileup_coord_relative = end_coord - genomic_coord - 1 - else: - # Normal coordinates are the default. This will be used both for the '+' case and the '.' (no strand specified) case - pileup_coord_relative = genomic_coord - start_coord - if pileup_coord_relative > region_len: - print( - f"WARNING: You have specified a region {chromosome}:{start_coord}-{end_coord} that is longer than the first region; the end of the region will be skipped. To make a profile plot with differently-sized region, consider using the window_size parameter to make a profile across centered windows." 
- ) - break - else: - valid_base_counts[pileup_coord_relative] += int( - pileup_info[0] - ) - modified_base_counts[pileup_coord_relative] += int( - pileup_info[2] - ) + # Initialize shared memory as numpy arrays to make it easy to map to buffer in subprocesses + shm_valid = shared_memory.SharedMemory( + create=True, size=(region_len) * np.dtype(np.int32).itemsize + ) + shm_modified = shared_memory.SharedMemory( + create=True, size=(region_len) * np.dtype(np.int32).itemsize + ) + + manager = multiprocessing.Manager() + lock = manager.Lock() + + with concurrent.futures.ProcessPoolExecutor(max_workers=cores_to_run) as executor: + futures = [ + executor.submit( + pileup_vectors_process_chunk, + bedmethyl_file, + parsed_motif, + chunk, + region_len, + shm_modified.name, + shm_valid.name, + lock, + single_strand, + regions_5to3prime, + ) + for chunk in chunks_list + ] + for future in tqdm( + concurrent.futures.as_completed(futures), + total=len(futures), + disable=quiet, + desc="Loading data", + leave=False, + ): + try: + future.result() + except Exception as err: + raise RuntimeError("pileup_vectors_process_chunk failed.") from err + + # We need to convert these shared memory buffers to numpy arrays which + # we then copy, so that they no longer reference the shared memory which + # will soon be de-allocated + modified_base_counts = np.copy( + np.ndarray((region_len,), dtype=np.int32, buffer=shm_modified.buf) + ) + valid_base_counts = np.copy( + np.ndarray((region_len,), dtype=np.int32, buffer=shm_valid.buf) + ) + # Close and unlink shared memory - not fully handled by garbage collection otherwise + shm_modified.close() + shm_modified.unlink() + shm_valid.close() + shm_valid.unlink() + return modified_base_counts, valid_base_counts +def counts_from_fake(*args, **kwargs) -> tuple[int, int]: + """ + Test helper function. + + Generates a fake set of enrichment counts. Ignores all arguments. 
+ + Returns: + tuple containing counts of (modified_bases, total_bases) + """ + window_halfsize = 500 + return test_data.fake_peak_enrichment(halfsize=window_halfsize, peak_height=0.15) + + def vector_from_fake(window_size: int, *args, **kwargs) -> np.ndarray: """ + Test helper function. + Generates a fake peak trace. Ignores all arguments except window_size. Args: @@ -347,6 +378,246 @@ def vector_from_fake(window_size: int, *args, **kwargs) -> np.ndarray: ) +def pileup_vectors_process_chunk( + bedmethyl_file, + parsed_motif, + chunk, + region_len, + shm_name_modified, + shm_name_valid, + lock, + single_strand, + regions_5to3prime, +) -> None: + """ + Helper function to allow pileup_vectors_from_bedmethyl to operate in a parallized fashion. + + Sum up modified and valid counts for a subregion chunk in a bedmethyl file. + + Args: + bedmethyl_file: Path to bedmethyl file + parsed_motif: ParsedMotif object + chunk: a dict containing subregion chunk information + shm_name_modified: the name string for the shared memory location containing the modified counts array + shm_name_valid: the name string for the shared memory location containing the valid counts array + lock: a manager.Lock object to allow synchronization in accessing shared memory + single_strand: True if only single-strand mods are desired + regions_5to3prime: True means negative strand regions get flipped, False means no flipping + + Returns: + None. Counts are added to arrays in-place to shared memory. 
+ """ + source_tabix = pysam.TabixFile(str(bedmethyl_file)) + existing_valid = shared_memory.SharedMemory(name=shm_name_valid) + existing_modified = shared_memory.SharedMemory(name=shm_name_modified) + valid_base_counts = np.ndarray( + (region_len,), dtype=np.int32, buffer=existing_valid.buf + ) + modified_base_counts = np.ndarray( + (region_len,), dtype=np.int32, buffer=existing_modified.buf + ) + + chromosome = chunk["chromosome"] + region_start = chunk["region_start"] + region_end = chunk["region_end"] + subregion_start = chunk["subregion_start"] + subregion_end = chunk["subregion_end"] + strand = chunk["strand"] + + flip_coords = regions_5to3prime and strand == "-" + + if flip_coords: + subregion_offset = region_end - subregion_end + else: + subregion_offset = subregion_start - region_start + + if region_end - region_start > region_len: + print( + f"WARNING: You have specified a region at {chromosome}:{region_start}-{region_end} that is longer than the first region; the end of the region will be skipped. To make a profile plot with differently-sized region, consider using the window_size parameter to make a profile across centered windows." 
+ ) + + valid_base_subregion = np.zeros(subregion_end - subregion_start, dtype=int) + modified_base_subregion = np.zeros(subregion_end - subregion_start, dtype=int) + + # tabix throws and error if the contig is not present + # by the current design, this should be silent + if chromosome in source_tabix.contigs: + for row in source_tabix.fetch( + chromosome, max(subregion_start, 0), subregion_end + ): + ( + keep_basemod, + genomic_coord, + modified_in_row, + valid_in_row, + ) = process_pileup_row( + row=row, + parsed_motif=parsed_motif, + region_strand=strand, + single_strand=single_strand, + ) + if keep_basemod: + if flip_coords: + # We want to flip the coordinates for this region so that it is recorded along the 5 prime to 3 prime direction + # This will enable analyses where the orientation of protein binding / transcriptional dynamics / etc is relevant for our pileup signal + pileup_coord_in_subregion = subregion_end - genomic_coord - 1 + else: + # Normal coordinates are the default. This will be used both for the '+' case and the '.' (no strand specified) case + pileup_coord_in_subregion = genomic_coord - subregion_start + if pileup_coord_in_subregion < (subregion_end - subregion_start): + valid_base_subregion[pileup_coord_in_subregion] += valid_in_row + modified_base_subregion[pileup_coord_in_subregion] += ( + modified_in_row + ) + + with lock: + valid_base_counts[ + subregion_offset : subregion_offset + abs(subregion_end - subregion_start) + ] += valid_base_subregion + modified_base_counts[ + subregion_offset : subregion_offset + abs(subregion_end - subregion_start) + ] += modified_base_subregion + # Close the file descriptor/handle to the shared memory + existing_modified.close() + existing_valid.close() + + +def pileup_counts_process_chunk( + bedmethyl_file, + parsed_motif, + chunk, + shm_name_modified, + shm_name_valid, + lock, + single_strand, +) -> None: + """ + Helper function to allow pileup_counts_from_bedmethyl to operate in a parallized fashion. 
+ + Sum up modified and valid counts for a subregion chunk in a bedmethyl file. + + Args: + bedmethyl_file: Path to bedmethyl file + parsed_motif: ParsedMotif object + chunk: a dict containing subregion chunk information + shm_name_modified: the name string for the shared memory location containing the modified counts sum + shm_name_valid: the name string for the shared memory location containing the valid counts sum + lock: a manager.Lock object to allow synchronization in accessing shared memory + single_strand: True if only single-strand mods are desired + + Returns: + None. Counts are added in-place to shared memory. + """ + source_tabix = pysam.TabixFile(str(bedmethyl_file)) + existing_valid = shared_memory.SharedMemory(name=shm_name_valid) + existing_modified = shared_memory.SharedMemory(name=shm_name_modified) + valid_base_counts = np.ndarray((1,), dtype=np.int32, buffer=existing_valid.buf) + modified_base_counts = np.ndarray( + (1,), dtype=np.int32, buffer=existing_modified.buf + ) + + chromosome = chunk["chromosome"] + subregion_start = chunk["subregion_start"] + subregion_end = chunk["subregion_end"] + strand = chunk["strand"] + + valid_base_subregion_counts = 0 + modified_base_subregion_counts = 0 + + # tabix throws and error if the contig is not present + # by the current design, this should be silent + if chromosome in source_tabix.contigs: + for row in source_tabix.fetch( + chromosome, max(subregion_start, 0), subregion_end + ): + ( + keep_basemod, + _, + modified_in_row, + valid_in_row, + ) = process_pileup_row( + row=row, + parsed_motif=parsed_motif, + region_strand=strand, + single_strand=single_strand, + ) + if keep_basemod: + valid_base_subregion_counts += valid_in_row + modified_base_subregion_counts += modified_in_row + + with lock: + valid_base_counts[0] += valid_base_subregion_counts + modified_base_counts[0] += modified_base_subregion_counts + + # Close the file descriptor/handle to the shared memory + existing_valid.close() + 
existing_modified.close() + + +def process_pileup_row( + row: str, + parsed_motif: utils.ParsedMotif, + region_strand: str, + single_strand: bool = False, +) -> tuple[bool, int, int, int]: + """ + Helper function designed for pileup_counts_from_bedmethyl via pileup_counts_process_chunk, pileup_vectors_from_bedmethyl + via pileup_vectors_process_chunk, and export.pileup_to_bigwig; changes to logic here may impact some or all of + these. + + Process a row from a pileup, determining whether the basemod is relevant and passing back its coordinate, + modification count, and valid read count. + + Args: + row: a string row from a bedmethyl file + parsed_motif: a ParsedMotif object + region_strand: the strand from the query region + single_strand: True if only mods on the region_strand are to be kept + + Returns: keep_basemod, genomic_coord, modified_in_row, valid_in_row. Values are provided even if keep_basemod is False. + """ + tabix_fields = row.split("\t") + pileup_basemod = tabix_fields[3] + pileup_strand = tabix_fields[5] + + if single_strand and pileup_strand.strip() != region_strand: + # We are on the wrong strand, can't keep this position + keep_basemod = False + elif len(pileup_basemod.split(",")) == 3: + pileup_modname, pileup_motif, pileup_mod_coord = pileup_basemod.split(",") + if ( + pileup_motif == parsed_motif.motif_seq + and int(pileup_mod_coord) == parsed_motif.modified_pos + and pileup_modname in parsed_motif.mod_codes + ): + keep_basemod = True + else: + keep_basemod = False + elif len(pileup_basemod.split(",")) == 1: + keep_basemod = pileup_basemod in parsed_motif.mod_codes + else: + raise ValueError( + f"Unexpected format in bedmethyl file: {row} contains {pileup_basemod} which cannot be parsed." 
+ ) + + pileup_info = tabix_fields[9].split(" ") + genomic_coord = int(tabix_fields[1]) + valid_in_row = int(pileup_info[0]) + modified_in_row = int(pileup_info[2]) + + return ( + keep_basemod, + genomic_coord, + modified_in_row, + valid_in_row, + ) + + +################################################################################################################ +#### Single read loaders #### +################################################################################################################ + + def read_vectors_from_hdf5( file: str | Path, motifs: list[str], @@ -355,10 +626,13 @@ def read_vectors_from_hdf5( single_strand: bool = False, sort_by: str | list[str] = ["chromosome", "region_start", "read_start"], calculate_mod_fractions: bool = True, + quiet: bool = True, # currently unused; change to default False when pbars are implemented cores: int | None = None, # currently unused subset_parameters: dict | None = None, ) -> tuple[list[tuple], list[str], dict | None]: """ + User-facing function. + Pulls a list of read data out of an .h5 file containing processed read vectors, formatted for read-by-read vector processing downstream use cases. @@ -376,6 +650,8 @@ def read_vectors_from_hdf5( After this processing, we calculate modification fractions, sort, and return. + TODO: Implement progress bars and parallelization as with pileup loaders + Args: file: Path to an hdf5 (.h5) file containing modification data for single reads, stored in datasets read_name, chromosome, read_start, @@ -398,6 +674,7 @@ def read_vectors_from_hdf5( sort_by: Read properties by which to sort, either one string or a list of strings. Options include chromosome, region_start, region_end, read_start, read_end, and motif. More to be added in future. + quiet: silences progress bars (currently unused) cores: cores across which to parallelize processes (currently unused) subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the reads to be returned. 
If not None, at least one of n or frac must be provided. The array @@ -604,10 +881,13 @@ def readwise_binary_modification_arrays( sort_by: str | list[str] = ["chromosome", "region_start", "read_start"], thresh: float | None = None, relative: bool = True, + quiet: bool = True, # currently unused; change to default False when pbars are implemented cores: int | None = None, # currently unused subset_parameters: dict | None = None, ) -> tuple[list[np.ndarray], np.ndarray[int], np.ndarray[str], dict | None]: """ + Primarily designed as a helper function for single-read plotting, but can be used by a user. + Pulls a list of read data out of a file containing processed read vectors, formatted with seaborn plotting in mind. Currently we only support .h5 files. @@ -619,6 +899,8 @@ def readwise_binary_modification_arrays( coordinates. If positions are relative, regions_5to3prime can be used to show all regions as upstream-to-downstream along their respective strands. + TODO: Implement progress bars and parallelization as with pileup loaders + Args: file: Path to an hdf5 (.h5) file containing modification data for single reads, stored in datasets read_name, chromosome, read_start, @@ -647,6 +929,7 @@ def readwise_binary_modification_arrays( in the genomes, centered at the center of the region. If False, absolute coordinates are provided. There is not currently a check for all reads being on the same chromosome if relative=False, but this could create unexpected behaviour for a the standard visualizations. + quiet: silences progress bars (currently unused) cores: cores across which to parallelize processes (currently unused) subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the reads to be returned. If not None, at least one of n or frac must be provided. 
The array @@ -764,6 +1047,8 @@ def reads_from_fake( file: Path, regions: Path, motifs: list[str] ) -> tuple[list[np.ndarray], np.ndarray[int], np.ndarray[str], dict]: """ + Helper function to support testing. + TODO: What does the bed file represent in this method? This one is breaking my brain a bit. TODO: Variable names in this method stink. TODO: Currently assumes mod calling (thresholding probabilities) was already performed elsewhere @@ -816,17 +1101,27 @@ def convert_bytes_to_strings(tup): - """Convert all bytes elements in a tuple to strings.""" + """ + Helper function for single read loading. + Convert all bytes elements in a tuple to strings. + """ return tuple(item.decode() if isinstance(item, bytes) else item for item in tup) # tuple(convert_bytes(item) for item in tup) def adjust_mod_probs_in_arrays(mod_array, val_array): + """ + Helper function to correct for an idiosyncrasy in modkit single-read parsing wherein 0-255 + "mod quality" values are parsed as floating-point values from 1/512 to 511/512.
+ """ mod_array[np.flatnonzero(val_array)] += 1 / 512 return mod_array def adjust_mod_probs_in_tuples(tup, mod_idx, val_idx): + """ + Helper function to apply mod prob adjustments + """ return tuple( item if index != mod_idx else adjust_mod_probs_in_arrays(item, tup[val_idx]) for index, item in enumerate(tup) @@ -834,6 +1129,9 @@ def adjust_mod_probs_in_tuples(tup, mod_idx, val_idx): def binary_to_np_array(compressed_bytes, dtype, decompressor, binarized, int8tofloat): + """ + Helper function to decompress binary data to boolean or floating point arrays + """ if binarized: return np.frombuffer(decompressor(compressed_bytes), dtype=dtype).astype(bool) elif int8tofloat: diff --git a/dimelo/parse_bam.py b/dimelo/parse_bam.py index 4837472..88a5310 100644 --- a/dimelo/parse_bam.py +++ b/dimelo/parse_bam.py @@ -531,7 +531,7 @@ def verify_inputs( ) elif correct_bases / total_bases < 0.35: raise ValueError( - f"First {NUM_READS_TO_CHECK} reads have anomalously low alignment quality: only {100*correct_bases/total_bases}% of bases align.\nPlease verify that {input_file.name} is actually aligned to {ref_genome.name}." + f"First {NUM_READS_TO_CHECK} reads have anomalously low alignment quality: only {100 * correct_bases / total_bases}% of bases align.\nPlease verify that {input_file.name} is actually aligned to {ref_genome.name}." ) return @@ -621,7 +621,7 @@ def check_bam_format( print( f""" WARNING: no modified appropriately-coded values found for {missing_bases} in the first {counter} reads. -Do you expect this file to contain these modifications? parse_bam is looking for {motifs} but for {missing_bases} found only found {[f'{base}+{mod_codes}' for base, mod_codes in mod_codes_found_dict.items()]}. +Do you expect this file to contain these modifications? parse_bam is looking for {motifs} but for {missing_bases} found only found {[f"{base}+{mod_codes}" for base, mod_codes in mod_codes_found_dict.items()]}. Consider passing only the motifs and mod codes (e.g. 
m,h,a) that you expect to be present in your file. You can use modkit adjust-mods --convert [OPTIONS] to update or consolidate mod codes. @@ -956,7 +956,7 @@ def read_by_base_txt_to_hdf5( iterator = tqdm( iterator, total=num_lines, - desc=f"Transferring {num_reads} from {input_txt.name} into {output_h5.name}, new size {old_size+num_reads}", + desc=f"Transferring {num_reads} from {input_txt.name} into {output_h5.name}, new size {old_size + num_reads}", bar_format="{bar}| {desc} {percentage:3.0f}% | {elapsed}<{remaining}", ) diff --git a/dimelo/plot_depth_histogram.py b/dimelo/plot_depth_histogram.py index 53afcd8..95b1f09 100644 --- a/dimelo/plot_depth_histogram.py +++ b/dimelo/plot_depth_histogram.py @@ -14,7 +14,9 @@ def plot_depth_histogram( window_size: int | None = None, single_strand: bool = False, one_depth_per_region: bool = False, - cores=None, + quiet: bool = False, + cores: int | None = None, + split_large_regions: bool = False, **kwargs, ) -> Axes: """ @@ -36,6 +38,12 @@ the region of interest, False means we always grab both strands within the regions one_depth_per_region: if True, each region will only report a single depth value, averaging across all non-zero depths. If False depths will be reported separately for all nonzero count positions in each region for a more granular view of depth distribution. + quiet: disables progress bars + cores: CPU cores across which to parallelize processing. Default to None, which means all available. + split_large_regions: if True, regions will be run sequentially in parallelized chunks. If False, + each individual region's chunks will be run sequentially but there will be parallelization across + regions, i.e. each core will be assigned one region at a time by the executor. Set to True if you + are running a small number of very large regions (e.g. one or two chromosomes), otherwise to False (default).
kwargs: other keyword parameters passed through to utils.line_plot Returns: @@ -51,6 +59,7 @@ def plot_depth_histogram( window_size=window_size, single_strand=single_strand, one_depth_per_region=one_depth_per_region, + quiet=quiet, cores=cores, ) @@ -144,7 +153,8 @@ def get_depth_counts( window_size: int | None, single_strand: bool = False, one_depth_per_region: bool = False, - cores=1, + quiet: bool = False, + cores: int | None = 1, ) -> list[np.ndarray]: """ Get the depth counts, ready for plotting. @@ -161,6 +171,10 @@ def get_depth_counts( the region of interest, False means we always grab both strands within the regions one_depth_per_region: if True, each region will only report a single depth value, averaging across all non-zero depths. If False depths will be reported separately for all nonzero count positions in each region for a more granular view of depth distribution. + regions_5to3prime: True means negative strand regions get flipped, False means no flipping + smooth_window: size of the moving window to use for smoothing. 
If set to None, no smoothing is performed + quiet: disables progress bars + cores: CPU cores across which to parallelize processing Returns: List of depth vectors for histogram @@ -182,6 +196,7 @@ def get_depth_counts( motif=motif, window_size=window_size, single_strand=single_strand, + quiet=quiet, cores=cores, ) # places where read depth is zero are assumed to not have the motif present - this may not always be true, diff --git a/dimelo/plot_depth_profile.py b/dimelo/plot_depth_profile.py index 9c65651..f13962f 100644 --- a/dimelo/plot_depth_profile.py +++ b/dimelo/plot_depth_profile.py @@ -15,6 +15,8 @@ def plot_depth_profile( single_strand: bool = False, regions_5to3prime: bool = False, smooth_window: int | None = None, + quiet: bool = False, + cores: int | None = None, **kwargs, ) -> Axes: """ @@ -36,6 +38,8 @@ def plot_depth_profile( the region of interest, False means we always grab both strands within the regions regions_5to3prime: True means negative strand regions get flipped, False means no flipping smooth_window: size of the moving window to use for smoothing. If set to None, no smoothing is performed + quiet: disables progress bars + cores: CPU cores across which to parallelize processing. Default to None, which means all available. kwargs: other keyword parameters passed through to utils.line_plot Returns: @@ -52,6 +56,8 @@ def plot_depth_profile( single_strand=single_strand, regions_5to3prime=regions_5to3prime, smooth_window=smooth_window, + quiet=quiet, + cores=cores, ) axes = make_depth_profile_plot( @@ -141,6 +147,8 @@ def get_depth_profiles( single_strand: bool = False, regions_5to3prime: bool = False, smooth_window: int | None = None, + quiet: bool = False, + cores: int | None = None, ) -> list[np.ndarray]: """ Get the depth profile traces, ready for plotting. 
@@ -157,6 +165,8 @@ def get_depth_profiles( the region of interest, False means we always grab both strands within the regions regions_5to3prime: True means negative strand regions get flipped, False means no flipping smooth_window: size of the moving window to use for smoothing. If set to None, no smoothing is performed + quiet: disables progress bars + cores: CPU cores across which to parallelize processing Returns: List of depth profile traces @@ -178,6 +188,8 @@ def get_depth_profiles( window_size=window_size, single_strand=single_strand, regions_5to3prime=regions_5to3prime, + quiet=quiet, + cores=cores, ) trace = valid_base_counts.astype(float) trace[trace == 0] = np.nan diff --git a/dimelo/plot_enrichment.py b/dimelo/plot_enrichment.py index 74c80d6..d747b91 100644 --- a/dimelo/plot_enrichment.py +++ b/dimelo/plot_enrichment.py @@ -12,6 +12,8 @@ def plot_enrichment( sample_names: list[str], window_size: int | None = None, single_strand: bool = False, + quiet: bool = False, + cores: int | None = None, **kwargs, ) -> Axes: """ @@ -31,6 +33,8 @@ def plot_enrichment( window_size: (currently disabled) window around center of region, +-window_size//2 single_strand: True means we only grab counts from reads from the same strand as the region of interest, False means we always grab both strands within the regions + quiet: disables progress bars + cores: CPU cores across which to parallelize processing. Default to None, which means all available. kwargs: other keyword parameters passed through to utils.bar_plot Returns: @@ -45,6 +49,8 @@ def plot_enrichment( motifs=motifs, window_size=window_size, single_strand=single_strand, + quiet=quiet, + cores=cores, ) axes = make_enrichment_plot( @@ -143,6 +149,8 @@ def get_enrichments( motifs: list[str], window_size: int | None = None, single_strand: bool = False, + quiet: bool = False, + cores: int | None = None, ) -> list[float]: """ Get the enrichment values, ready for plotting. 
@@ -160,6 +168,8 @@ def get_enrichments( window_size: (currently disabled) window around center of region, +-window_size//2 single_strand: True means we only grab counts from reads from the same strand as the region of interest, False means we always grab both strands within the regions + quiet: disables progress bars + cores: CPU cores across which to parallelize processing Returns: List of modified fraction values. @@ -180,6 +190,8 @@ def get_enrichments( motif=motif, window_size=window_size, single_strand=single_strand, + quiet=quiet, + cores=cores, ) case ".fake": n_mod, n_total = load_processed.counts_from_fake( diff --git a/dimelo/plot_enrichment_profile.py b/dimelo/plot_enrichment_profile.py index 8a33cbf..97ec461 100644 --- a/dimelo/plot_enrichment_profile.py +++ b/dimelo/plot_enrichment_profile.py @@ -15,6 +15,8 @@ def plot_enrichment_profile( single_strand: bool = False, regions_5to3prime: bool = False, smooth_window: int | None = None, + quiet: bool = False, + cores: int | None = None, **kwargs, ) -> Axes: """ @@ -41,6 +43,8 @@ def plot_enrichment_profile( the region of interest, False means we always grab both strands within the regions regions_5to3prime: True means negative strand regions get flipped, False means no flipping smooth_window: size of the moving window to use for smoothing. If set to None, no smoothing is performed + quiet: disables progress bars + cores: CPU cores across which to parallelize processing. Default to None, which means all available. 
kwargs: other keyword parameters passed through to utils.line_plot Returns: @@ -57,6 +61,8 @@ def plot_enrichment_profile( single_strand=single_strand, regions_5to3prime=regions_5to3prime, smooth_window=smooth_window, + quiet=quiet, + cores=cores, ) axes = make_enrichment_profile_plot( @@ -157,6 +163,8 @@ def get_enrichment_profiles( single_strand: bool = False, regions_5to3prime: bool = False, smooth_window: int | None = None, + quiet: bool = False, + cores: int | None = None, ) -> list[np.ndarray]: """ Get the enrichment profile traces, ready for plotting. @@ -176,6 +184,8 @@ def get_enrichment_profiles( single_strand: True means we only grab counts from reads from the same strand as the region of interest, False means we always grab both strands within the regions regions_5to3prime: True means negative strand regions get flipped, False means no flipping + quiet: disables progress bars + cores: CPU cores across which to parallelize processing smooth_window: size of the moving window to use for smoothing. If set to None, no smoothing is performed Returns: @@ -199,6 +209,8 @@ def get_enrichment_profiles( window_size=window_size, single_strand=single_strand, regions_5to3prime=regions_5to3prime, + quiet=quiet, + cores=cores, ) ) # Default to nan so we can skip over unfilled values when plotting or doing a rolling average diff --git a/dimelo/run_modkit.py b/dimelo/run_modkit.py index aa8e707..c49106c 100644 --- a/dimelo/run_modkit.py +++ b/dimelo/run_modkit.py @@ -28,15 +28,16 @@ print( f"PATH does not include the conda environment /bin folder. Adding {env_bin_path}." 
) - os.environ["PATH"] = f'{env_bin_path}:{os.environ["PATH"]}' - print(f'PATH is now {os.environ["PATH"]}') + os.environ["PATH"] = f"{env_bin_path}:{os.environ['PATH']}" + print(f"PATH is now {os.environ['PATH']}") # Check modkit on first import: does it run; does it have the right version try: result = subprocess.run(["modkit", "--version"], stdout=subprocess.PIPE, text=True) modkit_version = result.stdout if modkit_version.split()[1] == EXPECTED_MODKIT_VERSION: - print(f"modkit found with expected version {EXPECTED_MODKIT_VERSION}") + pass + # print(f"modkit found with expected version {EXPECTED_MODKIT_VERSION}") else: print( f"modkit found with unexpected version {modkit_version.split()[1]}. Versions other than {EXPECTED_MODKIT_VERSION} may exhibit unexpected behavior. It is recommended that you use v{EXPECTED_MODKIT_VERSION}" diff --git a/dimelo/test/cases.py b/dimelo/test/cases.py index a645ad5..402db58 100644 --- a/dimelo/test/cases.py +++ b/dimelo/test/cases.py @@ -38,6 +38,10 @@ "sort_by": ["read_start", "read_name", "motif"], "smooth_window": 1, "title": "megalodon_peaks_190", + "single_strand": False, + "regions_5to3prime": False, + "chunk_size": 1_000_000, + "cores": 1, }, # outputs dict function:values {}, # populated by generate_targets.py @@ -55,6 +59,10 @@ "sort_by": ["read_start", "read_name", "motif"], "smooth_window": 10, "title": "megalodon_single_190", + "single_strand": False, + "regions_5to3prime": False, + "chunk_size": 100, + "cores": 2, }, # outputs dict function:values {}, # populated by generate_targets.py @@ -72,6 +80,10 @@ "sort_by": ["read_start", "read_name", "motif"], "smooth_window": 100, "title": "megalodon_single_and_peaks_190", + "single_strand": True, + "regions_5to3prime": True, + "chunk_size": 10_000, + "cores": 3, }, # outputs dict function:values {}, # populated by generate_targets.py @@ -89,6 +101,10 @@ "sort_by": ["read_start", "read_name", "motif"], "smooth_window": 100, "title": "megalodon_peaks_nothresh", + 
"single_strand": True, + "regions_5to3prime": False, + "chunk_size": 1000, + "cores": 4, }, # outputs dict function:values {}, # populated by generate_targets.py @@ -106,6 +122,10 @@ "sort_by": ["read_start", "read_name", "motif"], "smooth_window": 1, "title": "megalodon_single_nothresh", + "single_strand": False, + "regions_5to3prime": True, + "chunk_size": 100, + "cores": None, }, # outputs dict function:values {}, # populated by generate_targets.py diff --git a/dimelo/test/data/ctcf_demo_not_peak.bed b/dimelo/test/data/ctcf_demo_not_peak.bed index 920b114..edfe622 100644 --- a/dimelo/test/data/ctcf_demo_not_peak.bed +++ b/dimelo/test/data/ctcf_demo_not_peak.bed @@ -1,200 +1,200 @@ -chr14 44123158 44123308 + 465.63275 . -chr14 44127026 44127176 + 465.63275 . -chr14 44123159 44123308 - 465.63275 . -chr14 44127026 44127175 - 465.63275 . -chr1 114356587 114356736 - 441.00102 . -chr1 114360454 114360603 - 441.00102 . -chr1 114356586 114356736 + 441.00102 . -chr1 114360454 114360604 + 441.00102 . -chrX 9700334 9700484 - 418.31102 . -chrX 9704202 9704352 - 418.31102 . -chr15 54632158 54632308 + 410.28758 . -chr15 54636060 54636176 + 410.28758 . -chr15 54632159 54632308 - 410.28758 . -chr15 54636060 54636175 - 410.28758 . -chr15 54632192 54632308 - 410.28758 . -chr15 54636060 54636210 - 410.28758 . -chr15 54632193 54632308 + 410.28758 . -chr15 54636060 54636209 + 410.28758 . -chr6 53009844 53009994 + 398.10126 . -chr6 53013712 53013862 + 398.10126 . -chr16 4279536 4279624 + 394.95247 . -chr16 4283404 4283554 + 394.95247 . -chr16 4279474 4279624 + 394.95247 . -chr16 4283404 4283490 + 394.95247 . -chr16 4279537 4279624 - 394.95247 . -chr16 4283404 4283553 - 394.95247 . -chr7 152343847 152343935 - 390.20159 . -chr7 152347715 152347865 - 390.20159 . -chr7 152343848 152343935 + 390.20159 . -chr7 152347715 152347864 + 390.20159 . -chr7 152343786 152343935 + 390.20159 . -chr7 152347715 152347802 + 390.20159 . -chr7 152343785 152343935 - 390.20159 . 
-chr7 152347715 152347803 - 390.20159 . -chr16 63442391 63442541 - 389.524 . -chr16 63446302 63446409 - 389.524 . -chr16 63442392 63442541 + 389.524 . -chr16 63446302 63446408 + 389.524 . -chr16 63442436 63442541 - 389.524 . -chr16 63446302 63446452 - 389.524 . -chr6 41196658 41196741 - 388.86052 . -chr6 41200525 41200674 - 388.86052 . -chr6 41196657 41196741 + 388.86052 . -chr6 41200525 41200675 + 388.86052 . -chr6 41196592 41196741 + 388.86052 . -chr6 41200525 41200608 + 388.86052 . -chr6 41196591 41196741 - 388.86052 . -chr6 41200525 41200609 - 388.86052 . -chr20 36033923 36034073 + 384.11318 . -chr20 36037791 36037941 + 384.11318 . -chr20 36033924 36034073 - 384.11318 . -chr20 36037791 36037940 - 384.11318 . -chr12 57871837 57871987 - 380.55342 . -chr12 57875753 57875855 - 380.55342 . -chr12 57871885 57871987 - 380.55342 . -chr12 57875753 57875903 - 380.55342 . -chr8 22846344 22846494 + 372.97972 . -chr8 22850296 22850362 + 372.97972 . -chr8 22846345 22846494 - 372.97972 . -chr8 22850296 22850361 - 372.97972 . -chr8 22846428 22846494 + 372.97972 . -chr8 22850296 22850446 + 372.97972 . -chr17 45987626 45987736 + 370.32321 . -chr17 45991494 45991644 + 370.32321 . -chr17 45987627 45987736 - 370.32321 . -chr17 45991494 45991643 - 370.32321 . -chr17 45987587 45987736 - 370.32321 . -chr17 45991494 45991603 - 370.32321 . -chr17 45987586 45987736 + 370.32321 . -chr17 45991494 45991604 + 370.32321 . -chr3 48462689 48462778 + 367.84554 . -chr3 48466556 48466705 + 367.84554 . -chr3 48462688 48462778 - 367.84554 . -chr3 48466556 48466706 - 367.84554 . -chr3 48462628 48462778 - 367.84554 . -chr3 48466556 48466646 - 367.84554 . -chr20 33335286 33335348 - 367.5176 . -chr20 33339154 33339304 - 367.5176 . -chr20 33335198 33335348 - 367.5176 . -chr20 33339154 33339216 - 367.5176 . -chr20 33335199 33335348 + 367.5176 . -chr20 33339154 33339215 + 367.5176 . -chr1 150101535 150101649 + 365.77314 . -chr1 150105402 150105551 + 365.77314 . -chr1 150101534 150101649 - 365.77314 . 
-chr1 150105402 150105552 - 365.77314 . -chr1 150101499 150101649 + 365.77314 . -chr1 150105402 150105517 + 365.77314 . -chr17 42517551 42517701 - 359.33549 . -chr17 42521481 42521569 - 359.33549 . -chr17 42517613 42517701 - 359.33549 . -chr17 42521481 42521631 - 359.33549 . -chr17 42517552 42517701 + 359.33549 . -chr17 42521481 42521568 + 359.33549 . -chr6 15561986 15562057 - 358.9469 . -chr6 15565853 15566002 - 358.9469 . -chr6 15561907 15562057 + 358.9469 . -chr6 15565853 15565925 + 358.9469 . -chr6 15561908 15562057 - 358.9469 . -chr6 15565853 15565924 - 358.9469 . -chr6 15561985 15562057 + 358.9469 . -chr6 15565853 15566003 + 358.9469 . -chrX 99209453 99209603 - 358.41391 . -chrX 99213321 99213471 - 358.41391 . -chrX 99209454 99209603 + 358.41391 . -chrX 99213321 99213470 + 358.41391 . -chr2 44482827 44482977 + 357.20939 . -chr2 44486695 44486845 + 357.20939 . -chr19 43453207 43453357 - 355.83581 . -chr19 43457120 43457225 - 355.83581 . -chr19 43453253 43453357 + 355.83581 . -chr19 43457120 43457269 + 355.83581 . -chr19 43453252 43453357 - 355.83581 . -chr19 43457120 43457270 - 355.83581 . -chr17 44334564 44334713 - 354.9178 . -chr17 44338431 44338580 - 354.9178 . -chr17 44334563 44334713 + 354.9178 . -chr17 44338431 44338581 + 354.9178 . -chr5 138876711 138876861 + 352.26403 . -chr5 138880579 138880729 + 352.26403 . -chr17 82008681 82008700 - 351.42341 . -chr17 82012549 82012699 - 351.42341 . -chr17 82008550 82008700 - 351.42341 . -chr17 82012549 82012568 - 351.42341 . -chr17 82008682 82008700 + 351.42341 . -chr17 82012549 82012698 + 351.42341 . -chr15 38664851 38665000 + 350.08504 . -chr15 38668760 38668867 + 350.08504 . -chr15 38664850 38665000 - 350.08504 . -chr15 38668760 38668868 - 350.08504 . -chr15 38664892 38665000 - 350.08504 . -chr15 38668760 38668910 - 350.08504 . -chr15 38664893 38665000 + 350.08504 . -chr15 38668760 38668909 + 350.08504 . -chr1 9168135 9168218 + 349.08889 . -chr1 9172003 9172153 + 349.08889 . -chr1 9168136 9168218 - 349.08889 . 
-chr1 9172003 9172152 - 349.08889 . -chr1 9168068 9168218 + 349.08889 . -chr1 9172003 9172086 + 349.08889 . -chr9 145234697 145234810 - 348.5771 . -chr9 145238565 145238715 - 348.5771 . -chr9 145234661 145234810 - 348.5771 . -chr9 145238565 145238677 - 348.5771 . -chr9 145234660 145234810 + 348.5771 . -chr9 145238565 145238678 + 348.5771 . -chr6 73588200 73588289 + 345.89289 . -chr6 73592068 73592218 + 345.89289 . -chr6 73588139 73588289 - 345.89289 . -chr6 73592068 73592157 - 345.89289 . -chr19 41732348 41732498 - 345.26706 . -chr19 41736216 41736366 - 345.26706 . -chr19 41732349 41732498 + 345.26706 . -chr19 41736216 41736365 + 345.26706 . -chr20 59989038 59989135 + 345.12156 . -chr20 59992905 59993054 + 345.12156 . -chr20 59988985 59989135 - 345.12156 . -chr20 59992905 59993003 - 345.12156 . -chr20 59989037 59989135 - 345.12156 . -chr20 59992905 59993055 - 345.12156 . -chr21 24894312 24894462 - 344.78198 . -chr21 24898180 24898330 - 344.78198 . -chr21 24894313 24894462 + 344.78198 . -chr21 24898180 24898329 + 344.78198 . -chr14 17377274 17377423 - 341.73484 . -chr14 17381209 17381290 - 341.73484 . -chr14 17377273 17377423 + 341.73484 . -chr14 17381209 17381291 + 341.73484 . -chr14 17377341 17377423 + 341.73484 . -chr14 17381209 17381359 + 341.73484 . -chr19 44926037 44926186 - 341.02209 . -chr19 44929904 44930053 - 341.02209 . -chr19 44926036 44926186 + 341.02209 . -chr19 44929904 44930054 + 341.02209 . -chr5 111666908 111667058 + 339.94407 . -chr5 111670776 111670926 + 339.94407 . -chr20 40161013 40161163 - 339.91828 . -chr20 40164881 40165031 - 339.91828 . -chr20 40161014 40161163 + 339.91828 . -chr20 40164881 40165030 + 339.91828 . -chr16 61261359 61261509 + 338.57132 . -chr16 61265306 61265377 + 338.57132 . -chr16 61261360 61261509 - 338.57132 . -chr16 61265306 61265376 - 338.57132 . -chr16 61261438 61261509 + 338.57132 . -chr16 61265306 61265456 + 338.57132 . -chr16 61261439 61261509 - 338.57132 . -chr16 61265306 61265455 - 338.57132 . 
-chr6 155876025 155876076 - 336.32967 . -chr6 155879892 155880041 - 336.32967 . -chr6 155876024 155876076 + 336.32967 . -chr6 155879892 155880042 + 336.32967 . -chr6 155875926 155876076 - 336.32967 . -chr6 155879892 155879942 - 336.32967 . -chr1 224661531 224661681 - 334.48922 . -chr1 224665451 224665547 - 334.48922 . -chr1 224661583 224661681 + 334.48922 . -chr1 224665451 224665601 + 334.48922 . -chr1 224661584 224661681 - 334.48922 . -chr1 224665451 224665600 - 334.48922 . \ No newline at end of file +chr14 44123158 44123308 + 465.63275 + +chr14 44127026 44127176 + 465.63275 + +chr14 44123159 44123308 - 465.63275 - +chr14 44127026 44127175 - 465.63275 - +chr1 114356587 114356736 - 441.00102 - +chr1 114360454 114360603 - 441.00102 - +chr1 114356586 114356736 + 441.00102 + +chr1 114360454 114360604 + 441.00102 + +chrX 9700334 9700484 - 418.31102 - +chrX 9704202 9704352 - 418.31102 - +chr15 54632158 54632308 + 410.28758 + +chr15 54636060 54636176 + 410.28758 + +chr15 54632159 54632308 - 410.28758 - +chr15 54636060 54636175 - 410.28758 - +chr15 54632192 54632308 - 410.28758 - +chr15 54636060 54636210 - 410.28758 - +chr15 54632193 54632308 + 410.28758 + +chr15 54636060 54636209 + 410.28758 + +chr6 53009844 53009994 + 398.10126 + +chr6 53013712 53013862 + 398.10126 + +chr16 4279536 4279624 + 394.95247 + +chr16 4283404 4283554 + 394.95247 + +chr16 4279474 4279624 + 394.95247 + +chr16 4283404 4283490 + 394.95247 + +chr16 4279537 4279624 - 394.95247 - +chr16 4283404 4283553 - 394.95247 - +chr7 152343847 152343935 - 390.20159 - +chr7 152347715 152347865 - 390.20159 - +chr7 152343848 152343935 + 390.20159 + +chr7 152347715 152347864 + 390.20159 + +chr7 152343786 152343935 + 390.20159 + +chr7 152347715 152347802 + 390.20159 + +chr7 152343785 152343935 - 390.20159 - +chr7 152347715 152347803 - 390.20159 - +chr16 63442391 63442541 - 389.524 - +chr16 63446302 63446409 - 389.524 - +chr16 63442392 63442541 + 389.524 + +chr16 63446302 63446408 + 389.524 + +chr16 63442436 63442541 
- 389.524 - +chr16 63446302 63446452 - 389.524 - +chr6 41196658 41196741 - 388.86052 - +chr6 41200525 41200674 - 388.86052 - +chr6 41196657 41196741 + 388.86052 + +chr6 41200525 41200675 + 388.86052 + +chr6 41196592 41196741 + 388.86052 + +chr6 41200525 41200608 + 388.86052 + +chr6 41196591 41196741 - 388.86052 - +chr6 41200525 41200609 - 388.86052 - +chr20 36033923 36034073 + 384.11318 + +chr20 36037791 36037941 + 384.11318 + +chr20 36033924 36034073 - 384.11318 - +chr20 36037791 36037940 - 384.11318 - +chr12 57871837 57871987 - 380.55342 - +chr12 57875753 57875855 - 380.55342 - +chr12 57871885 57871987 - 380.55342 - +chr12 57875753 57875903 - 380.55342 - +chr8 22846344 22846494 + 372.97972 + +chr8 22850296 22850362 + 372.97972 + +chr8 22846345 22846494 - 372.97972 - +chr8 22850296 22850361 - 372.97972 - +chr8 22846428 22846494 + 372.97972 + +chr8 22850296 22850446 + 372.97972 + +chr17 45987626 45987736 + 370.32321 + +chr17 45991494 45991644 + 370.32321 + +chr17 45987627 45987736 - 370.32321 - +chr17 45991494 45991643 - 370.32321 - +chr17 45987587 45987736 - 370.32321 - +chr17 45991494 45991603 - 370.32321 - +chr17 45987586 45987736 + 370.32321 + +chr17 45991494 45991604 + 370.32321 + +chr3 48462689 48462778 + 367.84554 + +chr3 48466556 48466705 + 367.84554 + +chr3 48462688 48462778 - 367.84554 - +chr3 48466556 48466706 - 367.84554 - +chr3 48462628 48462778 - 367.84554 - +chr3 48466556 48466646 - 367.84554 - +chr20 33335286 33335348 - 367.5176 - +chr20 33339154 33339304 - 367.5176 - +chr20 33335198 33335348 - 367.5176 - +chr20 33339154 33339216 - 367.5176 - +chr20 33335199 33335348 + 367.5176 + +chr20 33339154 33339215 + 367.5176 + +chr1 150101535 150101649 + 365.77314 + +chr1 150105402 150105551 + 365.77314 + +chr1 150101534 150101649 - 365.77314 - +chr1 150105402 150105552 - 365.77314 - +chr1 150101499 150101649 + 365.77314 + +chr1 150105402 150105517 + 365.77314 + +chr17 42517551 42517701 - 359.33549 - +chr17 42521481 42521569 - 359.33549 - +chr17 42517613 
42517701 - 359.33549 - +chr17 42521481 42521631 - 359.33549 - +chr17 42517552 42517701 + 359.33549 + +chr17 42521481 42521568 + 359.33549 + +chr6 15561986 15562057 - 358.9469 - +chr6 15565853 15566002 - 358.9469 - +chr6 15561907 15562057 + 358.9469 + +chr6 15565853 15565925 + 358.9469 + +chr6 15561908 15562057 - 358.9469 - +chr6 15565853 15565924 - 358.9469 - +chr6 15561985 15562057 + 358.9469 + +chr6 15565853 15566003 + 358.9469 + +chrX 99209453 99209603 - 358.41391 - +chrX 99213321 99213471 - 358.41391 - +chrX 99209454 99209603 + 358.41391 + +chrX 99213321 99213470 + 358.41391 + +chr2 44482827 44482977 + 357.20939 + +chr2 44486695 44486845 + 357.20939 + +chr19 43453207 43453357 - 355.83581 - +chr19 43457120 43457225 - 355.83581 - +chr19 43453253 43453357 + 355.83581 + +chr19 43457120 43457269 + 355.83581 + +chr19 43453252 43453357 - 355.83581 - +chr19 43457120 43457270 - 355.83581 - +chr17 44334564 44334713 - 354.9178 - +chr17 44338431 44338580 - 354.9178 - +chr17 44334563 44334713 + 354.9178 + +chr17 44338431 44338581 + 354.9178 + +chr5 138876711 138876861 + 352.26403 + +chr5 138880579 138880729 + 352.26403 + +chr17 82008681 82008700 - 351.42341 - +chr17 82012549 82012699 - 351.42341 - +chr17 82008550 82008700 - 351.42341 - +chr17 82012549 82012568 - 351.42341 - +chr17 82008682 82008700 + 351.42341 + +chr17 82012549 82012698 + 351.42341 + +chr15 38664851 38665000 + 350.08504 + +chr15 38668760 38668867 + 350.08504 + +chr15 38664850 38665000 - 350.08504 - +chr15 38668760 38668868 - 350.08504 - +chr15 38664892 38665000 - 350.08504 - +chr15 38668760 38668910 - 350.08504 - +chr15 38664893 38665000 + 350.08504 + +chr15 38668760 38668909 + 350.08504 + +chr1 9168135 9168218 + 349.08889 + +chr1 9172003 9172153 + 349.08889 + +chr1 9168136 9168218 - 349.08889 - +chr1 9172003 9172152 - 349.08889 - +chr1 9168068 9168218 + 349.08889 + +chr1 9172003 9172086 + 349.08889 + +chr9 145234697 145234810 - 348.5771 - +chr9 145238565 145238715 - 348.5771 - +chr9 145234661 145234810 - 
348.5771 - +chr9 145238565 145238677 - 348.5771 - +chr9 145234660 145234810 + 348.5771 + +chr9 145238565 145238678 + 348.5771 + +chr6 73588200 73588289 + 345.89289 + +chr6 73592068 73592218 + 345.89289 + +chr6 73588139 73588289 - 345.89289 - +chr6 73592068 73592157 - 345.89289 - +chr19 41732348 41732498 - 345.26706 - +chr19 41736216 41736366 - 345.26706 - +chr19 41732349 41732498 + 345.26706 + +chr19 41736216 41736365 + 345.26706 + +chr20 59989038 59989135 + 345.12156 + +chr20 59992905 59993054 + 345.12156 + +chr20 59988985 59989135 - 345.12156 - +chr20 59992905 59993003 - 345.12156 - +chr20 59989037 59989135 - 345.12156 - +chr20 59992905 59993055 - 345.12156 - +chr21 24894312 24894462 - 344.78198 - +chr21 24898180 24898330 - 344.78198 - +chr21 24894313 24894462 + 344.78198 + +chr21 24898180 24898329 + 344.78198 + +chr14 17377274 17377423 - 341.73484 - +chr14 17381209 17381290 - 341.73484 - +chr14 17377273 17377423 + 341.73484 + +chr14 17381209 17381291 + 341.73484 + +chr14 17377341 17377423 + 341.73484 + +chr14 17381209 17381359 + 341.73484 + +chr19 44926037 44926186 - 341.02209 - +chr19 44929904 44930053 - 341.02209 - +chr19 44926036 44926186 + 341.02209 + +chr19 44929904 44930054 + 341.02209 + +chr5 111666908 111667058 + 339.94407 + +chr5 111670776 111670926 + 339.94407 + +chr20 40161013 40161163 - 339.91828 - +chr20 40164881 40165031 - 339.91828 - +chr20 40161014 40161163 + 339.91828 + +chr20 40164881 40165030 + 339.91828 + +chr16 61261359 61261509 + 338.57132 + +chr16 61265306 61265377 + 338.57132 + +chr16 61261360 61261509 - 338.57132 - +chr16 61265306 61265376 - 338.57132 - +chr16 61261438 61261509 + 338.57132 + +chr16 61265306 61265456 + 338.57132 + +chr16 61261439 61261509 - 338.57132 - +chr16 61265306 61265455 - 338.57132 - +chr6 155876025 155876076 - 336.32967 - +chr6 155879892 155880041 - 336.32967 - +chr6 155876024 155876076 + 336.32967 + +chr6 155879892 155880042 + 336.32967 + +chr6 155875926 155876076 - 336.32967 - +chr6 155879892 155879942 - 
336.32967 - +chr1 224661531 224661681 - 334.48922 - +chr1 224665451 224665547 - 334.48922 - +chr1 224661583 224661681 + 334.48922 + +chr1 224665451 224665601 + 334.48922 + +chr1 224661584 224661681 - 334.48922 - +chr1 224665451 224665600 - 334.48922 - \ No newline at end of file diff --git a/dimelo/test/data/ctcf_demo_peak.bed b/dimelo/test/data/ctcf_demo_peak.bed index ef01c44..279cd25 100644 --- a/dimelo/test/data/ctcf_demo_peak.bed +++ b/dimelo/test/data/ctcf_demo_peak.bed @@ -1,100 +1,100 @@ -chr14 44125008 44125326 + 465.63275 . -chr14 44125009 44125325 - 465.63275 . -chr1 114358437 114358753 - 441.00102 . -chr1 114358436 114358754 + 441.00102 . -chrX 9702184 9702502 - 418.31102 . -chr15 54634008 54634326 + 410.28758 . -chr15 54634009 54634325 - 410.28758 . -chr15 54634042 54634360 - 410.28758 . -chr15 54634043 54634359 + 410.28758 . -chr6 53011694 53012012 + 398.10126 . -chr16 4281386 4281704 + 394.95247 . -chr16 4281324 4281640 + 394.95247 . -chr16 4281387 4281703 - 394.95247 . -chr7 152345697 152346015 - 390.20159 . -chr7 152345698 152346014 + 390.20159 . -chr7 152345636 152345952 + 390.20159 . -chr7 152345635 152345953 - 390.20159 . -chr16 63444241 63444559 - 389.524 . -chr16 63444242 63444558 + 389.524 . -chr16 63444286 63444602 - 389.524 . -chr6 41198508 41198824 - 388.86052 . -chr6 41198507 41198825 + 388.86052 . -chr6 41198442 41198758 + 388.86052 . -chr6 41198441 41198759 - 388.86052 . -chr20 36035773 36036091 + 384.11318 . -chr20 36035774 36036090 - 384.11318 . -chr12 57873687 57874005 - 380.55342 . -chr12 57873735 57874053 - 380.55342 . -chr8 22848194 22848512 + 372.97972 . -chr8 22848195 22848511 - 372.97972 . -chr8 22848278 22848596 + 372.97972 . -chr17 45989476 45989794 + 370.32321 . -chr17 45989477 45989793 - 370.32321 . -chr17 45989437 45989753 - 370.32321 . -chr17 45989436 45989754 + 370.32321 . -chr3 48464539 48464855 + 367.84554 . -chr3 48464538 48464856 - 367.84554 . -chr3 48464478 48464796 - 367.84554 . 
-chr20 33337136 33337454 - 367.5176 . -chr20 33337048 33337366 - 367.5176 . -chr20 33337049 33337365 + 367.5176 . -chr1 150103385 150103701 + 365.77314 . -chr1 150103384 150103702 - 365.77314 . -chr1 150103349 150103667 + 365.77314 . -chr17 42519401 42519719 - 359.33549 . -chr17 42519463 42519781 - 359.33549 . -chr17 42519402 42519718 + 359.33549 . -chr6 15563836 15564152 - 358.9469 . -chr6 15563757 15564075 + 358.9469 . -chr6 15563758 15564074 - 358.9469 . -chr6 15563835 15564153 + 358.9469 . -chrX 99211303 99211621 - 358.41391 . -chrX 99211304 99211620 + 358.41391 . -chr2 44484677 44484995 + 357.20939 . -chr19 43455057 43455375 - 355.83581 . -chr19 43455103 43455419 + 355.83581 . -chr19 43455102 43455420 - 355.83581 . -chr17 44336414 44336730 - 354.9178 . -chr17 44336413 44336731 + 354.9178 . -chr5 138878561 138878879 + 352.26403 . -chr17 82010531 82010849 - 351.42341 . -chr17 82010400 82010718 - 351.42341 . -chr17 82010532 82010848 + 351.42341 . -chr15 38666701 38667017 + 350.08504 . -chr15 38666700 38667018 - 350.08504 . -chr15 38666742 38667060 - 350.08504 . -chr15 38666743 38667059 + 350.08504 . -chr1 9169985 9170303 + 349.08889 . -chr1 9169986 9170302 - 349.08889 . -chr1 9169918 9170236 + 349.08889 . -chr9 145236547 145236865 - 348.5771 . -chr9 145236511 145236827 - 348.5771 . -chr9 145236510 145236828 + 348.5771 . -chr6 73590050 73590368 + 345.89289 . -chr6 73589989 73590307 - 345.89289 . -chr19 41734198 41734516 - 345.26706 . -chr19 41734199 41734515 + 345.26706 . -chr20 59990888 59991204 + 345.12156 . -chr20 59990835 59991153 - 345.12156 . -chr20 59990887 59991205 - 345.12156 . -chr21 24896162 24896480 - 344.78198 . -chr21 24896163 24896479 + 344.78198 . -chr14 17379124 17379440 - 341.73484 . -chr14 17379123 17379441 + 341.73484 . -chr14 17379191 17379509 + 341.73484 . -chr19 44927887 44928203 - 341.02209 . -chr19 44927886 44928204 + 341.02209 . -chr5 111668758 111669076 + 339.94407 . -chr20 40162863 40163181 - 339.91828 . 
-chr20 40162864 40163180 + 339.91828 . -chr16 61263209 61263527 + 338.57132 . -chr16 61263210 61263526 - 338.57132 . -chr16 61263288 61263606 + 338.57132 . -chr16 61263289 61263605 - 338.57132 . -chr6 155877875 155878191 - 336.32967 . -chr6 155877874 155878192 + 336.32967 . -chr6 155877776 155878092 - 336.32967 . -chr1 224663381 224663697 - 334.48922 . -chr1 224663433 224663751 + 334.48922 . -chr1 224663434 224663750 - 334.48922 . \ No newline at end of file +chr14 44125008 44125326 + 465.63275 + +chr14 44125009 44125325 - 465.63275 - +chr1 114358437 114358753 - 441.00102 - +chr1 114358436 114358754 + 441.00102 + +chrX 9702184 9702502 - 418.31102 - +chr15 54634008 54634326 + 410.28758 + +chr15 54634009 54634325 - 410.28758 - +chr15 54634042 54634360 - 410.28758 - +chr15 54634043 54634359 + 410.28758 + +chr6 53011694 53012012 + 398.10126 + +chr16 4281386 4281704 + 394.95247 + +chr16 4281324 4281640 + 394.95247 + +chr16 4281387 4281703 - 394.95247 - +chr7 152345697 152346015 - 390.20159 - +chr7 152345698 152346014 + 390.20159 + +chr7 152345636 152345952 + 390.20159 + +chr7 152345635 152345953 - 390.20159 - +chr16 63444241 63444559 - 389.524 - +chr16 63444242 63444558 + 389.524 + +chr16 63444286 63444602 - 389.524 - +chr6 41198508 41198824 - 388.86052 - +chr6 41198507 41198825 + 388.86052 + +chr6 41198442 41198758 + 388.86052 + +chr6 41198441 41198759 - 388.86052 - +chr20 36035773 36036091 + 384.11318 + +chr20 36035774 36036090 - 384.11318 - +chr12 57873687 57874005 - 380.55342 - +chr12 57873735 57874053 - 380.55342 - +chr8 22848194 22848512 + 372.97972 + +chr8 22848195 22848511 - 372.97972 - +chr8 22848278 22848596 + 372.97972 + +chr17 45989476 45989794 + 370.32321 + +chr17 45989477 45989793 - 370.32321 - +chr17 45989437 45989753 - 370.32321 - +chr17 45989436 45989754 + 370.32321 + +chr3 48464539 48464855 + 367.84554 + +chr3 48464538 48464856 - 367.84554 - +chr3 48464478 48464796 - 367.84554 - +chr20 33337136 33337454 - 367.5176 - +chr20 33337048 33337366 - 367.5176 
- +chr20 33337049 33337365 + 367.5176 + +chr1 150103385 150103701 + 365.77314 + +chr1 150103384 150103702 - 365.77314 - +chr1 150103349 150103667 + 365.77314 + +chr17 42519401 42519719 - 359.33549 - +chr17 42519463 42519781 - 359.33549 - +chr17 42519402 42519718 + 359.33549 + +chr6 15563836 15564152 - 358.9469 - +chr6 15563757 15564075 + 358.9469 + +chr6 15563758 15564074 - 358.9469 - +chr6 15563835 15564153 + 358.9469 + +chrX 99211303 99211621 - 358.41391 - +chrX 99211304 99211620 + 358.41391 + +chr2 44484677 44484995 + 357.20939 + +chr19 43455057 43455375 - 355.83581 - +chr19 43455103 43455419 + 355.83581 + +chr19 43455102 43455420 - 355.83581 - +chr17 44336414 44336730 - 354.9178 - +chr17 44336413 44336731 + 354.9178 + +chr5 138878561 138878879 + 352.26403 + +chr17 82010531 82010849 - 351.42341 - +chr17 82010400 82010718 - 351.42341 - +chr17 82010532 82010848 + 351.42341 + +chr15 38666701 38667017 + 350.08504 + +chr15 38666700 38667018 - 350.08504 - +chr15 38666742 38667060 - 350.08504 - +chr15 38666743 38667059 + 350.08504 + +chr1 9169985 9170303 + 349.08889 + +chr1 9169986 9170302 - 349.08889 - +chr1 9169918 9170236 + 349.08889 + +chr9 145236547 145236865 - 348.5771 - +chr9 145236511 145236827 - 348.5771 - +chr9 145236510 145236828 + 348.5771 + +chr6 73590050 73590368 + 345.89289 + +chr6 73589989 73590307 - 345.89289 - +chr19 41734198 41734516 - 345.26706 - +chr19 41734199 41734515 + 345.26706 + +chr20 59990888 59991204 + 345.12156 + +chr20 59990835 59991153 - 345.12156 - +chr20 59990887 59991205 - 345.12156 - +chr21 24896162 24896480 - 344.78198 - +chr21 24896163 24896479 + 344.78198 + +chr14 17379124 17379440 - 341.73484 - +chr14 17379123 17379441 + 341.73484 + +chr14 17379191 17379509 + 341.73484 + +chr19 44927887 44928203 - 341.02209 - +chr19 44927886 44928204 + 341.02209 + +chr5 111668758 111669076 + 339.94407 + +chr20 40162863 40163181 - 339.91828 - +chr20 40162864 40163180 + 339.91828 + +chr16 61263209 61263527 + 338.57132 + +chr16 61263210 61263526 - 
338.57132 - +chr16 61263288 61263606 + 338.57132 + +chr16 61263289 61263605 - 338.57132 - +chr6 155877875 155878191 - 336.32967 - +chr6 155877874 155878192 + 336.32967 + +chr6 155877776 155878092 - 336.32967 - +chr1 224663381 224663697 - 334.48922 - +chr1 224663433 224663751 + 334.48922 + +chr1 224663434 224663750 - 334.48922 - \ No newline at end of file diff --git a/dimelo/test/data/test_targets/test_matrix.pickle b/dimelo/test/data/test_targets/test_matrix.pickle index ba4d5d0..2632872 100644 Binary files a/dimelo/test/data/test_targets/test_matrix.pickle and b/dimelo/test/data/test_targets/test_matrix.pickle differ diff --git a/dimelo/test/dimelo_test.py b/dimelo/test/dimelo_test.py index f6f2d0b..f7b84b2 100644 --- a/dimelo/test/dimelo_test.py +++ b/dimelo/test/dimelo_test.py @@ -77,6 +77,13 @@ def test_unit__extract( ): kwargs_extract = filter_kwargs_for_func(dm.parse_bam.extract, kwargs) kwargs_extract["output_directory"] = cls.outDir + # extract can behave non-deterministically in terms of read output order + # if run on multiple cores. This is not an issue for sorted read loads, + # but means the file itself changes even though the meaningful contents don't + # given that parallelization here is purely within modkit anyway, the design + # choice was made to always single-core for extract testing + if "cores" in kwargs_extract: + del kwargs_extract["cores"] extract_h5, regions_processed = dm.parse_bam.extract( **kwargs_extract, ref_genome=cls.reference_genome, @@ -263,9 +270,9 @@ def test_integration__extract_load_plot( assert np.allclose( actual[key], expected[key], atol=1e-5 ), f"""{test_case}: Arrays for {key} are not equal -mismatch at {np.where(value!=actual[key])} -mismatch values expected {value[np.where(value!=actual[key])]} vs actual {actual[key][np.where(value!=actual[key])]} -{value[np.where(value!=actual[key])[0]]} vs {actual[key][np.where(value!=actual[key])[0]]}. 
+mismatch at {np.where(value != actual[key])} +mismatch values expected {value[np.where(value != actual[key])]} vs actual {actual[key][np.where(value != actual[key])]} +{value[np.where(value != actual[key])[0]]} vs {actual[key][np.where(value != actual[key])[0]]}. """ elif isinstance(value, (str, int, bool)): assert ( @@ -436,9 +443,9 @@ def test_unit__read_vectors_from_hdf5( assert np.allclose( actual[key], expected[key], atol=1e-5 ), f"""{test_case}: Arrays for {key} are not equal -mismatch at {np.where(value!=actual[key])} -mismatch values expected {value[np.where(value!=actual[key])]} vs actual {actual[key][np.where(value!=actual[key])]} -{value[np.where(value!=actual[key])[0]]} vs {actual[key][np.where(value!=actual[key])[0]]}. +mismatch at {np.where(value != actual[key])} +mismatch values expected {value[np.where(value != actual[key])]} vs actual {actual[key][np.where(value != actual[key])]} +{value[np.where(value != actual[key])[0]]} vs {actual[key][np.where(value != actual[key])[0]]}. 
""" elif isinstance(value, (str, int, bool)): assert ( diff --git a/dimelo/test/generate_targets.py b/dimelo/test/generate_targets.py index 9febefc..50dc141 100644 --- a/dimelo/test/generate_targets.py +++ b/dimelo/test/generate_targets.py @@ -36,7 +36,8 @@ def generate_extract(test_matrix, case_subset): for case in case_subset if case_subset is not None else test_matrix.keys(): kwargs, results = test_matrix[case] kwargs_extract = filter_kwargs_for_func(parse_bam.extract, kwargs) - # if kwargs['regions']==region: # for now, we only want to extract with the single region due to output file size + if "cores" in kwargs_extract: + del kwargs_extract["cores"] extract_file, extract_regions = parse_bam.extract( **kwargs_extract, ref_genome=ref_genome_file, @@ -167,9 +168,18 @@ def main(test_matrix): if Path(args.initial_target_pickle).exists(): with open(RelativePath(args.initial_target_pickle), "rb") as file: old_test_matrix = pickle.load(file) - for key, value in old_test_matrix.items(): + # loop through the old test matrix + for key, (old_kwargs, old_results) in old_test_matrix.items(): + # if the old test case is in the new test matrix, bring over the results + # either the test targets will be regenerated, and the results replaced, or they won't be regenerated, and we'll need the old results if key in test_matrix: - test_matrix[key] = value + new_kwargs = test_matrix[key][0] + # if case will be covered, bring in new kwargs + if args.case_subset is None or key in args.case_subset: + test_matrix[key] = (new_kwargs, old_results) + # if case will not be covered, keep old kwargs + else: + test_matrix[key] = (old_kwargs, old_results) print( f"Running {'all cases' if args.case_subset is None else args.case_subset} through {args.target_subset} to supplement test targets from {args.initial_target_pickle}. Any new tests in cases.py will be added to the test matrix." 
) @@ -178,6 +188,11 @@ def main(test_matrix): f"Cannot run subset {args.target_subset} without a pre-existing complete set of generated targets, {args.initial_target_pickle} does not exist. Either specify an --initial-target-pickle that does exist or run without subsetting." ) + print("Generating targets for the following test matrix kwargs") + for test_name, (kwargs, _) in test_matrix.items(): + print(test_name) + print(kwargs) + for subset in args.target_subset: function_name = f"generate_{subset}" globals()[function_name](test_matrix, args.case_subset) diff --git a/dimelo/utils.py b/dimelo/utils.py index e213493..e4ad62b 100644 --- a/dimelo/utils.py +++ b/dimelo/utils.py @@ -89,7 +89,7 @@ def adjust_threshold( if thresh > 1: if not quiet: print( - f"Modification threshold of {thresh} assumed to be for range 0-255. {thresh}/255={thresh/255} will be sent to modkit." + f"Modification threshold of {thresh} assumed to be for range 0-255. {thresh}/255={thresh / 255} will be sent to modkit." ) thresh_scaled = thresh / 255 else: @@ -103,6 +103,33 @@ def adjust_threshold( return thresh +def process_chunks_from_regions_dict( + regions_dict: dict, + chunk_size: int, +): + """ + returns: a list of chunk specifier dictionaries, which contain region and subregion information. The subregion start and end + are always within the region. This information is sufficient for a downstream process to operate on the subregion chunk while + knowing where it lies within the larger region. 
+ """ + chunk_list = [] + for chromosome, region_list in regions_dict.items(): + for start_coord, end_coord, strand in region_list: + for subregion_start in range(start_coord, end_coord, chunk_size): + subregion_end = min(end_coord, subregion_start + chunk_size) + chunk_list.append( + { + "chromosome": chromosome, + "region_start": start_coord, + "region_end": end_coord, + "subregion_start": subregion_start, + "subregion_end": subregion_end, + "strand": strand, + } + ) + return chunk_list + + def regions_dict_from_input( regions: str | Path | list[str | Path] | None = None, window_size: int | None = None,