Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
101 commits
Select commit Hold shift + click to select a range
fd91c25
Merge pull request #4 from dekkerlab/master
sergpolly May 3, 2019
05748ef
added initial dot-calling notebook
sergpolly May 3, 2019
cd08bc0
updated notebook
sergpolly May 8, 2019
956c5da
updated the dots notebook
sergpolly Jun 6, 2019
950d465
updated notebook again
sergpolly Jun 7, 2019
f3928b3
before vacation notebook updates
sergpolly Jul 9, 2019
b125e1c
dot-calling notebook cleanup
sergpolly Jul 9, 2019
ea21dba
Remove deprecated functions
nvictus Nov 14, 2019
74c622a
Merge pull request #8 from mirnylab/master
sergpolly Jan 26, 2020
9bff1ce
Merge pull request #9 from mirnylab/master
sergpolly Jan 26, 2020
e39c7c1
added logbin_expected
mimakaev Jan 31, 2020
bd52d3a
split logbin_expected into two functions
mimakaev Feb 2, 2020
dc7302a
improved logbins; added persistent logbins
mimakaev Feb 2, 2020
ec4acd5
updated docs for logbin_expected; black
mimakaev Mar 4, 2020
20cd01b
merge with upstream
mimakaev Mar 4, 2020
192cd18
added black
mimakaev Mar 4, 2020
6ebb1b4
expected: rename regions->supports
golobor Apr 1, 2020
c0e42db
common.py: add assign_regions_to_bins
golobor Apr 7, 2020
f413608
Merge branch 'master' into develop
nvictus Apr 7, 2020
b2b27f3
Merge branch 'master' into develop
nvictus Apr 7, 2020
379ab78
expected.py: bugfix; rename diag->dist
golobor Apr 14, 2020
2851367
Allow regions in expected CLI (#146)
Phlya Apr 14, 2020
d038b1e
Merge branch 'master' of http://github.com/mirnylab/cooltools
mimakaev Apr 20, 2020
21cec77
merge latest from master to develop (#155)
sergpolly May 5, 2020
a129acf
Bump again to 0.4.0-dev
nvictus May 5, 2020
32be1c5
Start using CliRunner for testing cli commands
nvictus May 5, 2020
34b4681
Develop (#156)
nvictus May 5, 2020
cab2fb3
Merge remote-tracking branch 'upstream/master'
sergpolly May 5, 2020
34034e6
Merge branch 'master' into develop
sergpolly May 5, 2020
2fbb2eb
fixed saddle code to accept bedslice
mimakaev May 7, 2020
4f421f4
Merge pull request #160 from mimakaev/expected_refactoring
itsameerkat May 8, 2020
093b28d
refactored diagsum
mimakaev May 8, 2020
46e6c29
Merge branch 'master' into expected_refactoring
mimakaev May 9, 2020
da8bfb9
implement bad_bins in make_diag_tables
sergpolly May 9, 2020
5daacd4
deprecate/remove cis_/trans_expected
sergpolly May 9, 2020
28efb3d
implement bad_bins in count_bad_pixels_per_block
sergpolly May 9, 2020
bc206fa
fix bad_bins docstrings in diag and block-sums
sergpolly May 9, 2020
8e957d3
changes in saddle and expected to accept new expected format
mimakaev May 9, 2020
5d28e20
Merge branch 'expected_refactoring' into develop
sergpolly May 9, 2020
f648a5c
fix silly changes brought from master
sergpolly May 9, 2020
590ac08
now actually fixed
sergpolly May 9, 2020
2d64f9a
minimize diffs for Max, resolve new bioframe
sergpolly May 9, 2020
a9043ef
saddles tentatively working
mimakaev May 9, 2020
7f2250a
renamed normalize_regions
mimakaev May 17, 2020
bd55d24
bad_bins masking for diagsum_symm
sergpolly May 19, 2020
1a50812
mask var name fix
sergpolly May 19, 2020
6dbdb33
added compose masking and transforms to diagsum_asymm and blocksum
sergpolly May 19, 2020
02159cf
column names fix in new _diagsum_asymm
sergpolly May 21, 2020
e87f8a5
more var names fixing in _diagsum_asymm
sergpolly May 21, 2020
2cfe933
attempt to unify block/diagsum API
sergpolly May 29, 2020
1a02bc9
quick fixes, add contact-type docstrings
sergpolly May 29, 2020
5a2b495
typo fix bad_bins_dict
sergpolly May 29, 2020
be5186c
change transforms default to empty dict
sergpolly May 30, 2020
6c54fe1
Merge branch 'expected_refactoring' into develop
sergpolly May 30, 2020
3a5935a
Merge branch 'master' of http://github.com/mirnylab/cooltools into ex…
mimakaev Jun 1, 2020
885d56d
Merge pull request #2 from dekkerlab/develop
mimakaev Jun 2, 2020
d755a83
Merge branch 'develop' of http://github.com/mirnylab/cooltools into e…
mimakaev Jun 4, 2020
4cf0299
fixed some bugs so more tests pass
mimakaev Jun 4, 2020
22a2ae9
Merge branch 'expected_refactoring' of http://github.com/mimakaev/coo…
mimakaev Jun 4, 2020
a102478
blocksum and diagsum with arbitrary regions
sergpolly Jun 16, 2020
57dd6b5
Merge branch 'develop' of https://github.com/dekkerlab/cooltools into…
sergpolly Jun 16, 2020
1a8444a
Merge pull request #3 from dekkerlab/develop
mimakaev Jun 17, 2020
2f0d462
minor fix in diagsum_asymm
Jun 19, 2020
b92199b
minor fix in diagsum_asymm
sergpolly Jun 19, 2020
59d9129
Merge branch 'develop' of https://github.com/dekkerlab/cooltools into…
sergpolly Jun 19, 2020
f7fea84
cis eig rewritten; added mp.Pool.map to it
mimakaev Jun 21, 2020
3a89d28
allow symmetric diagsum to handle overlapping regions
sergpolly Jun 23, 2020
e6d9f09
fix diagsum regions casting to ndarray
sergpolly Jun 23, 2020
f9ec065
docstrings regarding regions2 touchedup in expected
sergpolly Jun 23, 2020
8a2a706
initial tests for newer expected
sergpolly Jun 23, 2020
2eb417c
minor test expected fixes, names and typos
sergpolly Jun 23, 2020
68931f9
fixing requirements conflict: conda vs pip, travis please work
sergpolly Jun 24, 2020
b69befd
remove pytables from conda-env in favor of tables in pip-reqs
sergpolly Jun 25, 2020
fb4bfbe
drop conda-environment-yaml, update travis-config
sergpolly Jun 26, 2020
f54dc9b
travis fix: source activate instead of conda activate
sergpolly Jun 26, 2020
125d1ea
typo fix
sergpolly Jun 26, 2020
40c2c4b
Update .travis.yml
nvictus Jun 26, 2020
16686a1
temporarily use bioframe-develop, make travis go
sergpolly Jun 26, 2020
c4a8180
Merge remote-tracking branch 'max/expected_refactoring' into develop
sergpolly Jun 26, 2020
6a90aef
Restore bioframe>=0.1.0 requirement
mimakaev Jun 30, 2020
f54b629
typo in bioframe requirement
mimakaev Jun 30, 2020
f0ffacd
Merge pull request #5 from dekkerlab/develop
mimakaev Jun 30, 2020
8687c8a
Merge branch 'expected_refactoring' into develop
mimakaev Jun 30, 2020
b801272
Merge branch 'develop' of https://github.com/mirnylab/cooltools into …
nvictus Sep 8, 2020
6cc2adf
Update formatting
nvictus Sep 9, 2020
4cd1e8b
Update formatting
nvictus Sep 9, 2020
bbb0505
Merge branch 'master' into develop
nvictus Sep 9, 2020
977a03f
Install bioframe from github develop branch
nvictus Sep 9, 2020
66866d5
Merge branch 'develop' of https://github.com/mirnylab/cooltools into …
nvictus Sep 9, 2020
19f755f
Fix eigdecomp and saddle to work with new bioframe
nvictus Sep 9, 2020
0c83037
Formatting
nvictus Sep 9, 2020
a5c2ef6
Merge pull request #11 from open2c/develop
Phlya Oct 13, 2020
409fb10
logbin-expected cli
Phlya Oct 13, 2020
245d48b
Merge branch 'master' into develop
nvictus Oct 16, 2020
557e31e
innocent update to Ilya's PR
sergpolly Oct 20, 2020
c2969e2
added csv validation and custom exp_summary_name
sergpolly Oct 20, 2020
51f3670
added simple CLI test for logbin expected
sergpolly Oct 20, 2020
07a2533
Merge branch 'develop' into develop
sergpolly Oct 20, 2020
fa8b551
final minor fixes - imports,flake8,test pass
sergpolly Oct 20, 2020
ef28860
logbin expected refactor: renamings,consistency,newlines
sergpolly Oct 21, 2020
341904c
Merge branch 'master' into logbin-refactor
sergpolly Oct 21, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions cooltools/cli/logbin_expected.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ def logbin_expected(
verbose=False,
)

# The name of the column holding the probability of contacts is
# derived from the name of the diagonal-summary statistics column
# in the input expected DataFrame:
exp_summary_base, *_ = exp_summary_name.split(".")
Pc_name = f"{exp_summary_base}.avg"

lb_cvd, lb_slopes, lb_distbins = expected.logbin_expected(
cvd,
exp_summary_name=exp_summary_name,
Expand All @@ -156,15 +162,17 @@ def logbin_expected(
min_nvalid=min_nvalid,
min_count=min_count
)
# combine Probabilities of contact for the regions:
lb_cvd_agg, lb_slopes_agg = expected.combine_binned_expected(
lb_cvd,
Pc_name=Pc_name,
binned_exp_slope=lb_slopes,
spread_funcs=spread_funcs,
spread_funcs_slope=spread_funcs_slope
)
if resolution is not None:
lb_cvd_agg['s_bp'] = lb_cvd_agg['x'] * resolution
lb_slopes_agg['s_bp'] = lb_slopes_agg['x'] * resolution
lb_cvd_agg['s_bp'] = lb_cvd_agg['diag.avg'] * resolution
lb_slopes_agg['s_bp'] = lb_slopes_agg['diag.avg'] * resolution

lb_cvd_agg.to_csv(
f'{output_prefix}.log.tsv',
Expand Down
132 changes: 82 additions & 50 deletions cooltools/expected.py
Original file line number Diff line number Diff line change
Expand Up @@ -1054,70 +1054,84 @@ def logbin_expected(
"""
from cooltools.lib.numutils import logbins

raw_summary_name = "count.sum"
exp_summary_base, *_ = exp_summary_name.split(".")
Pc_name = f"{exp_summary_base}.avg"
diag_name = "diag"
diag_avg_name = f"{diag_name}.avg"

exp = exp[~pd.isna(exp[exp_summary_name])]
exp["x"] = exp.pop("diag")
diagmax = exp["x"].max()
exp[diag_avg_name] = exp.pop(diag_name) # "average" or weighted diagonals
diagmax = exp[diag_avg_name].max()

# create diag_bins based on chosen layout:
if bin_layout == "fixed":
bins = numutils.persistent_log_bins(
10, bins_per_order_magnitude=bins_per_order_magnitude
diag_bins = numutils.persistent_log_bins(
10,
bins_per_order_magnitude=bins_per_order_magnitude
)
elif bin_layout == "longest_region":
bins = logbins(1, diagmax + 1, ratio=10 ** (1 / bins_per_order_magnitude))
diag_bins = logbins(
1,
diagmax + 1,
ratio=10 ** (1 / bins_per_order_magnitude)
)
else:
bins = bin_layout
diag_bins = bin_layout

if bins[-1] < diagmax:
raise ValueError("Bins end is less than the size of the largest region")
if diag_bins[-1] < diagmax:
raise ValueError("Genomic separation bins end is less than the size of the largest region")

exp["bin_id"] = np.searchsorted(bins, exp["x"], side="right") - 1
exp = exp[exp["bin_id"] >= 0]
# assign diagonals in exp DataFrame to diag_bins, i.e. give them ids:
exp["diag_bin_id"] = np.searchsorted(diag_bins, exp[diag_avg_name], side="right") - 1
exp = exp[exp["diag_bin_id"] >= 0]

# constructing expected grouped by region
byReg = exp.copy()

# this averages x with the weight equal to n_valid, and sums everything else
byReg["x"] *= byReg["n_valid"]
byRegExp = byReg.groupby(["region", "bin_id"]).sum()
byRegExp["x"] /= byRegExp["n_valid"]
# this averages diag_avg with the weight equal to n_valid, and sums everything else
byReg[diag_avg_name] *= byReg["n_valid"]
byRegExp = byReg.groupby(["region", "diag_bin_id"]).sum()
byRegExp[diag_avg_name] /= byRegExp["n_valid"]

byRegExp = byRegExp.reset_index()
byRegExp = byRegExp[byRegExp["n_valid"] > min_nvalid] # filtering by n_valid
byRegExp["Pc"] = byRegExp[exp_summary_name] / byRegExp["n_valid"]
byRegExp = byRegExp[byRegExp["Pc"] > 0] # drop bins with 0 counts
byRegExp[Pc_name] = byRegExp[exp_summary_name] / byRegExp["n_valid"]
byRegExp = byRegExp[byRegExp[Pc_name] > 0] # drop diag_bins with 0 counts
if min_count:
if "count.sum" in byRegExp:
byRegExp = byRegExp[byRegExp["count.sum"] > min_count]
if raw_summary_name in byRegExp:
byRegExp = byRegExp[byRegExp[raw_summary_name] > min_count]
else:
warnings.warn(RuntimeWarning("counts not found"))
warnings.warn(RuntimeWarning(f"{raw_summary_name} not found in the input expected"))

byRegExp["bin_start"] = bins[byRegExp["bin_id"].values]
byRegExp["bin_end"] = bins[byRegExp["bin_id"].values + 1] - 1
byRegExp["diag_bin_start"] = diag_bins[byRegExp["diag_bin_id"].values]
byRegExp["diag_bin_end"] = diag_bins[byRegExp["diag_bin_id"].values + 1] - 1

byRegDer = []
for reg, subdf in byRegExp.groupby("region"):
subdf = subdf.sort_values("bin_id")
valid = np.minimum(subdf.n_valid.values[:-1], subdf.n_valid.values[1:])
mids = np.sqrt(subdf.x.values[:-1] * subdf.x.values[1:])
subdf = subdf.sort_values("diag_bin_id")
valid = np.minimum(subdf["n_valid"].values[:-1], subdf["n_valid"].values[1:])
mids = np.sqrt(subdf[diag_avg_name].values[:-1] * subdf[diag_avg_name].values[1:])
f = der_smooth_function_by_reg
slope = np.diff(f(np.log(subdf.Pc.values))) / np.diff(f(np.log(subdf.x.values)))
slope = np.diff(f(np.log(subdf[Pc_name].values))) / np.diff(f(np.log(subdf[diag_avg_name].values)))
newdf = pd.DataFrame(
{
"x": mids,
diag_avg_name: mids,
"slope": slope,
"n_valid": valid,
"bin_id": subdf.bin_id.values[:-1],
"diag_bin_id": subdf["diag_bin_id"].values[:-1],
}
)
newdf["region"] = reg
byRegDer.append(newdf)
byRegDer = pd.concat(byRegDer).reset_index(drop=True)
return byRegExp, byRegDer, bins[: byRegExp.bin_id.max() + 2]
return byRegExp, byRegDer, diag_bins[: byRegExp["diag_bin_id"].max() + 2]


def combine_binned_expected(
binned_exp,
binned_exp_slope=None,
Pc_name="balanced.avg",
der_smooth_function_combined=lambda x: numutils.robust_gauss_filter(x, 1.3),
spread_funcs="logstd",
spread_funcs_slope="std",
Expand All @@ -1136,6 +1150,10 @@ def combine_binned_expected(
If provided, estimates spread of slopes.
Is necessary if concat_original is True

Pc_name : str
Name of the column with the probability of contacts.
Defaults to "balanced.avg".

der_smooth_function_combined : callable
A smoothing function for calculating slopes on combined data

Expand Down Expand Up @@ -1181,8 +1199,12 @@ def combine_binned_expected(
noisy, and may become a 0 if only one region is contributing to the last
pixel.
"""
diag_avg_name = "diag.avg"
scal = numutils.weighted_groupby_mean(
binned_exp.select_dtypes(np.number), "bin_id", "n_valid", mode="mean"
binned_exp.select_dtypes(np.number),
group_by="diag_bin_id",
weigh_by="n_valid",
mode="mean"
)

if spread_funcs == "minmax":
Expand All @@ -1192,21 +1214,26 @@ def combine_binned_expected(
byRegVar.groupby("region")["n_valid"].tail(minmax_drop_bins).index
)
]
low_err = byRegVar.groupby("bin_id")["Pc"].min()
high_err = byRegVar.groupby("bin_id")["Pc"].max()
low_err = byRegVar.groupby("diag_bin_id")[Pc_name].min()
high_err = byRegVar.groupby("diag_bin_id")[Pc_name].max()
elif spread_funcs == "std":
var = numutils.weighted_groupby_mean(
binned_exp[["Pc", "bin_id", "n_valid"]], "bin_id", "n_valid", mode="std"
)["Pc"]
low_err = scal["Pc"] - var
high_err = scal["Pc"] + var
binned_exp[[Pc_name, "diag_bin_id", "n_valid"]],
group_by="diag_bin_id",
weigh_by="n_valid",
mode="std"
)[Pc_name]
low_err = scal[Pc_name] - var
high_err = scal[Pc_name] + var
elif spread_funcs == "logstd":
var = numutils.weighted_groupby_mean(
binned_exp[["Pc", "bin_id", "n_valid"]], "bin_id", "n_valid", mode="logstd"
)["Pc"]
low_err = scal["Pc"] / var
high_err = scal["Pc"] * var

binned_exp[[Pc_name, "diag_bin_id", "n_valid"]],
group_by="diag_bin_id",
weigh_by="n_valid",
mode="logstd"
)[Pc_name]
low_err = scal[Pc_name] / var
high_err = scal[Pc_name] * var
else:
low_err, high_err = spread_funcs(binned_exp, scal)

Expand All @@ -1215,13 +1242,18 @@ def combine_binned_expected(

f = der_smooth_function_combined

slope = np.diff(f(np.log(scal.Pc.values))) / np.diff(f(np.log(scal.x.values)))
valid = np.minimum(scal.n_valid.values[:-1], scal.n_valid.values[1:])
mids = np.sqrt(scal.x.values[:-1] * scal.x.values[1:])
slope = np.diff(f(np.log(scal[Pc_name].values))) / np.diff(f(np.log(scal[diag_avg_name].values)))
valid = np.minimum(scal["n_valid"].values[:-1], scal["n_valid"].values[1:])
mids = np.sqrt(scal[diag_avg_name].values[:-1] * scal[diag_avg_name].values[1:])
slope_df = pd.DataFrame(
{"x": mids, "slope": slope, "n_valid": valid, "bin_id": scal.index.values[:-1]}
{
diag_avg_name: mids,
"slope": slope,
"n_valid": valid,
"diag_bin_id": scal.index.values[:-1],
}
)
slope_df = slope_df.set_index("bin_id")
slope_df = slope_df.set_index("diag_bin_id")

if binned_exp_slope is not None:
if spread_funcs_slope == "minmax":
Expand All @@ -1231,13 +1263,13 @@ def combine_binned_expected(
byRegDer.groupby("region")["n_valid"].tail(minmax_drop_bins).index
)
]
low_err = byRegDer.groupby("bin_id")["slope"].min()
high_err = byRegDer.groupby("bin_id")["slope"].max()
low_err = byRegDer.groupby("diag_bin_id")["slope"].min()
high_err = byRegDer.groupby("diag_bin_id")["slope"].max()
elif spread_funcs_slope == "std":
var = numutils.weighted_groupby_mean(
binned_exp_slope[["slope", "bin_id", "n_valid"]],
"bin_id",
"n_valid",
binned_exp_slope[["slope", "diag_bin_id", "n_valid"]],
group_by="diag_bin_id",
weigh_by="n_valid",
mode="std",
)["slope"]
low_err = slope_df["slope"] - var
Expand Down