# Method for the plotting class in comut/comut.py (it reads ``self.samples``).
# Reassembled from the two-commit patch series that added the function
# ("add `waterfall` function to sort genes and samples to plot waterfall")
# and then its documentation ("add docs for `waterfall`").
def waterfall(self, data):
    '''Sort genes and samples for waterfall plot.

    Genes (categories) are ranked by the number of samples in which they
    occur at least once. Samples are then sorted hierarchically on the
    presence/absence of each gene, most frequent gene first, so samples
    carrying the most frequent gene come first, ties are broken by the
    second most frequent gene, and so on. Remaining ties keep the sorted
    order of the sample names, which makes the result deterministic.

    Params:
    -------
    data: pandas dataframe
        A tidy dataframe containing data. Must contain the columns
        sample and category (typically alongside value). Every row
        counts as one occurrence of that category in that sample;
        all other columns are ignored.

        Example:
        -------
        sample   | category | value
        ----------------------------
        Sample_1 | TP53     | Missense
        Sample_1 | Gender   | Male

    Returns:
    --------
    sorder: list of samples ordered by genes. Samples listed in
        self.samples but absent from data are appended at the end,
        in the order they appear in self.samples.
    gorder: list of genes in ascending order of mutation frequency
        (least frequently mutated gene first), i.e. the reverse of
        the ranking used to sort the samples.

    '''
    if data.empty:
        # Nothing to rank; only the known samples (if any) are returned.
        sample_order, genes_desc = [], []
    else:
        # Count occurrences per (sample, category) pair. Grouping on these
        # two columns only keeps unrelated columns out of the result.
        counts = (data.groupby(['sample', 'category'])
                      .size()
                      .unstack(fill_value=0))

        # Presence/absence matrix: 1 if the sample has the gene, else 0.
        present = counts.gt(0).astype(int)

        # Genes by decreasing number of mutated samples (stable on ties).
        genes_desc = (present.sum()
                             .sort_values(ascending=False, kind='stable')
                             .index.tolist())

        # Lexicographic, descending sort of the rows over the ranked genes.
        ranked = present[genes_desc].sort_values(
            by=genes_desc, ascending=False, kind='stable')
        sample_order = ranked.index.tolist()

    # Append samples that are known to the plot but absent from data.
    # Iterating self.samples itself (rather than a set difference) is what
    # preserves their original order; `seen` also guards against repeats.
    samples = self.samples
    if samples is not None and len(samples) > 0:
        seen = set(sample_order)
        for name in samples:
            if name not in seen:
                seen.add(name)
                sample_order.append(name)

    sorder = sample_order
    gorder = genes_desc[::-1]

    return sorder, gorder