diff --git a/examples/juicer_example/README.txt b/examples/juicer_example/README.txt new file mode 100644 index 000000000..0dd1e9872 --- /dev/null +++ b/examples/juicer_example/README.txt @@ -0,0 +1,2 @@ +Using Juicer to Run PASTIS on a .hic File +========================================= diff --git a/examples/juicer_example/data/Pfalciparum_trophozoite_Ay2014.hic b/examples/juicer_example/data/Pfalciparum_trophozoite_Ay2014.hic new file mode 100644 index 000000000..bf34b0968 Binary files /dev/null and b/examples/juicer_example/data/Pfalciparum_trophozoite_Ay2014.hic differ diff --git a/examples/juicer_example/data/counts.bed b/examples/juicer_example/data/counts.bed new file mode 100644 index 000000000..76bce5126 --- /dev/null +++ b/examples/juicer_example/data/counts.bed @@ -0,0 +1,26 @@ +Chr01 1 1 0 +Chr01 2 2 1 +Chr01 3 3 2 +Chr01 4 4 3 +Chr01 5 5 4 +Chr01 6 6 5 +Chr01 7 7 6 +Chr01 8 8 7 +Chr01 9 9 8 +Chr01 10 10 9 +Chr01 11 11 10 +Chr01 12 12 11 +Chr01 13 13 12 +Chr01 14 14 13 +Chr01 15 15 14 +Chr01 16 16 15 +Chr01 17 17 16 +Chr01 18 18 17 +Chr01 19 19 18 +Chr01 20 20 19 +Chr01 21 21 20 +Chr01 22 22 21 +Chr01 23 23 22 +Chr01 24 24 23 +Chr01 25 25 24 +Chr01 26 26 25 diff --git a/examples/juicer_example/data/counts.npy b/examples/juicer_example/data/counts.npy new file mode 100644 index 000000000..d9ef74829 Binary files /dev/null and b/examples/juicer_example/data/counts.npy differ diff --git a/examples/juicer_example/juicer_for_pastis.py b/examples/juicer_example/juicer_for_pastis.py new file mode 100644 index 000000000..c4b945e9c --- /dev/null +++ b/examples/juicer_example/juicer_for_pastis.py @@ -0,0 +1,89 @@ +""" +Using a .hic file (output of Juicer) with PASTIS +================================================ + +This example showcases how to convert a .hic file output by Juicer into a numpy +array, output a .bed file for the lengths, and then run PASTIS. +""" + + +import numpy as np +import strawC +import os +from scipy import sparse + + +############################################################################## +# Returns a numpy array with counts data from the given .hic file. +# val_type : "observed" for raw values; "oe" for observed/expected values +# norm : the normalization to apply ("NONE" for no normalization) +# hic_file : the path to the .hic file we want to get counts data for +# width : the size of the returned counts matrix +# chrom1 : the first chromosome +# chrom2 : the second chromosome +# resolution : the BP resolution +# -------------------------------------------------------------------------- +def grabRegion(val_type, norm, hic_file, width, chrom1, chrom2, resolution): + chrom_range1 = str(chrom1) + chrom_range2 = str(chrom2) + result = strawC.strawC(val_type, norm, hic_file, chrom_range1, + chrom_range2, 'BP', resolution) + row_indices, col_indices, data = list(), list(), list() + for record in result: + row_indices.append(record.binX) + col_indices.append(record.binY) + data.append(record.counts) + if chrom1 == chrom2 and record.binX != record.binY: + row_indices.append(record.binY) + col_indices.append(record.binX) + data.append(record.counts) + row_indices = (np.asarray(row_indices)) / resolution + col_indices = (np.asarray(col_indices)) / resolution + matrix = sparse.coo_matrix((data, (row_indices.astype(int), + col_indices.astype(int))), + shape=(width, width)).toarray() + matrix[np.isnan(matrix)] = 0 + matrix[np.isinf(matrix)] = 0 + return matrix + + +############################################################################## +# Define the path to our .hic file. +# --------------------------------- +FILE_PATH = './data/Pfalciparum_trophozoite_Ay2014.hic' + + +############################################################################## +# Use the grabRegion method to get counts data for the first chromosome +# in the .hic file at a resolution of 25000. We won't apply any normalization. +# ---------------------------------------------------------------------------- +counts = grabRegion('observed', 'NONE', FILE_PATH, 26, 1, 1, 25000) + + +############################################################################## +# Output a .bed file of lengths. Let's call the file "counts.bed". +# ---------------------------------------------------------------- +with open("./data/counts.bed", "w") as f: + the_length = len(counts) + for i in range(the_length): + curr_line = 'Chr01\t' + str(i + 1) + '\t' + str(i + 1) + '\t' + curr_line += str(i) + print(curr_line, file=f) + + +############################################################################## +# Save the counts as a .npy file. Let's call the file "counts.npy". +# ----------------------------------------------------------------- +np.save("./data/counts.npy", counts) + + +############################################################################## +# Make the following call using os to run PASTIS with the counts data. +# The call could be made in command line as well. Note that there are other +# settings we could run PASTIS with; this is simply one set of those settings. +# ---------------------------------------------------------------------------- +settings = "pastis-poisson --seed 0 --counts ./data/counts.npy --outdir" +settings += " ./results --lengths ./data/counts.bed --ploidy 1" +settings += " --filter_threshold 0.04 --multiscale_rounds 1 --max_iter 50000" +settings += " --dont-normalize --alpha 3" +os.system(settings)