From 90f8371c4caf12fba52cc00234f2607ea2b8fcbc Mon Sep 17 00:00:00 2001
From: qhu <qiang.hu@roswellpark.org>
Date: Wed, 3 Apr 2024 17:00:59 -0400
Subject: [PATCH 1/2] add `waterfall` function to sort genes and samples to
 plot waterfall

---
 comut/comut.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/comut/comut.py b/comut/comut.py
index 270656c..0302fbf 100644
--- a/comut/comut.py
+++ b/comut/comut.py
@@ -1776,3 +1776,30 @@ def add_unified_legend(self, axis_name=None, border_white=None, headers=True,
                             draw_area.set_visible(False)
 
         return leg
+
+    def waterfall(self, data):
+        # Reshape data: wide format with samples as rows and genes as columns, counting occurrences
+        wide_data = data.pivot_table(index='sample', columns='category', aggfunc=len, fill_value=0)
+    
+        # Presence/absence (1/0); keep one column per gene so extra value columns are ignored
+        values = wide_data.astype(bool).astype(int)
+        values = values.loc[:, ~values.columns.get_level_values('category').duplicated()]
+
+        # Order columns by decreasing frequency of mutations
+        gorder = values.sum().sort_values(ascending=False).index
+        wide_boolean = values[gorder]
+
+        # Hierarchical sort: rows compared as tuples, most frequently mutated gene first
+        sample_order = wide_boolean.apply(tuple, axis=1).sort_values(ascending=False).index
+
+        # Append samples missing from data in the order given by self.samples
+        # (a set difference here would scramble their order between runs)
+        if self.samples:
+            seen = set(sample_order)
+            not_in = [s for s in self.samples if s not in seen]
+            sample_order = sample_order.append(pd.Index(not_in))
+
+        # Genes are returned least frequent first (reverse of the column order above)
+        sorder = sample_order.values.tolist()
+        gorder = gorder.to_frame()['category'].values.tolist()[::-1]
+        return sorder, gorder

From dd1c6b2f5e2e7da860e10d1e7d40a69a373253dc Mon Sep 17 00:00:00 2001
From: qhu <qiang.hu@roswellpark.org>
Date: Wed, 3 Apr 2024 17:07:20 -0400
Subject: [PATCH 2/2] add docs for `waterfall`

---
 comut/comut.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/comut/comut.py b/comut/comut.py
index 0302fbf..c57f41e 100644
--- a/comut/comut.py
+++ b/comut/comut.py
@@ -1778,6 +1778,27 @@ def add_unified_legend(self, axis_name=None, border_white=None, headers=True,
         return leg
 
     def waterfall(self, data):
+        '''Sort genes and samples for waterfall plot.
+
+        Params:
+        -------
+        data: pandas dataframe
+            A tidy dataframe containing data. Required columns are
+            sample, category, and value. Other columns are ignored.
+
+            Example:
+            -------
+            sample   | category | value
+            ----------------------------
+            Sample_1 | TP53     | Missense
+            Sample_1 | Gender   | Male
+
+        Returns:
+        --------
+        sorder: list of samples sorted by mutation presence, most frequent genes first.
+        gorder: list of genes in ascending order of mutation frequency.
+
+        '''
         # Reshape data: wide format with samples as rows and genes as columns, counting occurrences
         wide_data = data.pivot_table(index='sample', columns='category', aggfunc=len, fill_value=0)