From 1c1e439d2275045fbc6e9205ce352604a6458428 Mon Sep 17 00:00:00 2001 From: Z_Wael Date: Sun, 21 Nov 2021 15:56:29 +0100 Subject: [PATCH 1/6] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2333524..fc6e452 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ 1. Outliers detection and batch detection & correction are shown in Methods_Description_-_Batch_correction.ipynb (Melanoma data) 1. Clusters generation from scaled signatures is shown in clustering_example.py. The process is the same for melanoma and pancan analysis 1. Finally, having a dataset with scaled signatures and known clusters we can classify another datasets using classification_example.py +1. A walkthrough of the clustering and classification (labeling samples as IE, IE/F, F or D) is illustrated in a Python notebook. _.ipynb files could by opened at https://nbviewer.jupyter.org/ or downloaded as HTML files from upstream_html folder_ @@ -30,4 +31,4 @@ The Molecular Functional (MF) Portrait is a planetary schematic representation o Visual tool available at https://science.bostongene.com/tumor-portrait/
-© 2020 BostonGene Corporation. \ No newline at end of file +© 2020 BostonGene Corporation. From 3b8cd2c39169ae05d8d096687fd66efcecef3519 Mon Sep 17 00:00:00 2001 From: Z_Wael Date: Sun, 21 Nov 2021 16:24:35 +0100 Subject: [PATCH 2/6] Create Clustering Walkthrow.ipynb --- Clustering Walkthrow.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 Clustering Walkthrow.ipynb diff --git a/Clustering Walkthrow.ipynb b/Clustering Walkthrow.ipynb new file mode 100644 index 0000000..1a9afde --- /dev/null +++ b/Clustering Walkthrow.ipynb @@ -0,0 +1 @@ +python note book file From 526db07482c011d8262b1f2cc1cdaca2a7da7b4a Mon Sep 17 00:00:00 2001 From: Z_Wael Date: Sun, 21 Nov 2021 16:27:17 +0100 Subject: [PATCH 3/6] Delete Clustering Walkthrow.ipynb --- Clustering Walkthrow.ipynb | 1 - 1 file changed, 1 deletion(-) delete mode 100644 Clustering Walkthrow.ipynb diff --git a/Clustering Walkthrow.ipynb b/Clustering Walkthrow.ipynb deleted file mode 100644 index 1a9afde..0000000 --- a/Clustering Walkthrow.ipynb +++ /dev/null @@ -1 +0,0 @@ -python note book file From 4103013022b5d25787c1a79f1ff50baf91f8fdf4 Mon Sep 17 00:00:00 2001 From: Z_Wael Date: Sun, 21 Nov 2021 16:28:57 +0100 Subject: [PATCH 4/6] Add files via upload --- walkthrough.ipynb | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 walkthrough.ipynb diff --git a/walkthrough.ipynb b/walkthrough.ipynb new file mode 100644 index 0000000..d05f891 --- /dev/null +++ b/walkthrough.ipynb @@ -0,0 +1,30 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "walkthrough.ipynb", + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "WPaBxo9UHC4G" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From 1f490b1f61d226043752877dfb075a2e0669d126 
Mon Sep 17 00:00:00 2001 From: Z_Wael Date: Sun, 21 Nov 2021 19:52:15 +0000 Subject: [PATCH 5/6] Walkthrough example and code modif to run properly --- final_clusters.tsv | 48 ++++++++++++++++++++++++++++++++++++++++ portraits/clustering.py | 4 ++-- portraits/detect_type.py | 33 +++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 final_clusters.tsv create mode 100644 portraits/detect_type.py diff --git a/final_clusters.tsv b/final_clusters.tsv new file mode 100644 index 0000000..11f0f8f --- /dev/null +++ b/final_clusters.tsv @@ -0,0 +1,48 @@ + MFP +GSM478912 D +GSM478913 D +GSM478937 D +GSM478939 D +GSM478946 D +GSM478947 D +GSM478948 D +GSM478914 F +GSM478915 F +GSM478916 F +GSM478925 F +GSM478943 F +GSM478932 F +GSM478924 F +GSM478942 F +GSM478944 F +GSM478945 F +GSM478956 F +GSM478917 D +GSM478920 IE +GSM478949 D +GSM478963 D +GSM478918 IE +GSM478919 IE +GSM478926 IE +GSM478927 IE +GSM478952 IE +GSM478960 IE +GSM478922 IE/F +GSM478923 IE/F +GSM478961 IE/F +GSM478955 F +GSM478934 IE +GSM478935 IE +GSM478954 IE +GSM478958 IE +GSM478950 IE +GSM478928 +GSM478929 +GSM478930 +GSM478959 +GSM478933 IE +GSM478936 D +GSM478938 D +GSM478940 IE/F +GSM478941 F +GSM478951 IE diff --git a/portraits/clustering.py b/portraits/clustering.py index 079b1cb..8f5cec2 100644 --- a/portraits/clustering.py +++ b/portraits/clustering.py @@ -1,6 +1,6 @@ import warnings -import community # louvain +import community.community_louvain as community import matplotlib.pyplot as plt import networkx as nx import numpy as np @@ -122,7 +122,7 @@ def clustering_profile_metrics_plot(cluster_metrics, num_clusters_ylim_max=7): clusters_perc = pd.DataFrame([x.value_counts() for x in cluster_metrics.perc], index=cluster_metrics.index).iloc[:, :10] - clusters_perc.plot(kind='bar', stached=True, ax=next(af), offset=.5) + clusters_perc.plot(kind='bar', stacked=True, ax=next(af)) #offset=.5 was specified in the original code ax.set_xticks(ax.get_xticks() - .5) 
ax.set_xticklabels(ax.get_xticklabels(), rotation=90) diff --git a/portraits/detect_type.py b/portraits/detect_type.py new file mode 100644 index 0000000..82825ed --- /dev/null +++ b/portraits/detect_type.py @@ -0,0 +1,33 @@ +import pandas as pd +from portraits.clustering import clustering_profile_metrics, clustering_profile_metrics_plot +from portraits.utils import read_gene_sets, ssgsea_formula, median_scale + + +def detect_type(data, threshold, scores): + ser = data.loc[threshold].perc # here threshold and ser were added to the original code + cmeans = pd.DataFrame({cg: scores.loc[samps.index].mean() for cg, samps in ser.groupby(ser)}) + mapper = {} + deltas = (cmeans.loc[['Angiogenesis', 'Endothelium', 'CAF', 'Matrix', 'Matrix_remodeling']].mean() - + cmeans.loc[['MHCII', 'Antitumor_cytokines', 'Coactivation_molecules', + 'B_cells', 'NK_cells', 'Checkpoint_inhibition', + 'Effector_cells', 'T_cells', 'Th1_signature', + 'T_cell_traffic', 'MHCI']].mean()).sort_values() + + mapper[deltas.index[-1]] = 'F' # That's fibrotic + mapper[deltas.index[0]] = 'IE' # Immune enriched, non-fibrotic + cmeans.pop(deltas.index[-1]) + cmeans.pop(deltas.index[0]) + + deltas = (cmeans.loc[['Angiogenesis', 'Endothelium', 'CAF', 'Matrix', 'Matrix_remodeling', + 'Protumor_cytokines', 'Neutrophil_signature', 'Granulocyte_traffic', + 'Macrophages', 'Macrophage_DC_traffic', 'MDSC_traffic', 'MDSC', + 'Th2_signature', 'T_reg_traffic', 'Treg', 'M1_signatures', 'MHCII', + 'Antitumor_cytokines', 'Coactivation_molecules', 'B_cells', 'NK_cells', + 'Checkpoint_inhibition', 'Effector_cells', 'T_cells', 'Th1_signature', + 'T_cell_traffic', 'MHCI', 'EMT_signature']].mean() - + cmeans.loc['Proliferation_rate']).sort_values() + + mapper[deltas.index[-1]] = 'IE/F' # Immune enriched & fibrotic + mapper[deltas.index[0]] = 'D' # Desert + return ser.map(mapper).rename('MFP') + From 3e9959c419549cc50ab1b832525767e2602b9d79 Mon Sep 17 00:00:00 2001 From: Z_Wael Date: Sun, 21 Nov 2021 21:19:38 +0100 Subject: 
[PATCH 6/6] A walkthrough runing well --- walkthrough.ipynb | 1066 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 1065 insertions(+), 1 deletion(-) diff --git a/walkthrough.ipynb b/walkthrough.ipynb index d05f891..78b2e97 100644 --- a/walkthrough.ipynb +++ b/walkthrough.ipynb @@ -4,7 +4,9 @@ "metadata": { "colab": { "name": "walkthrough.ipynb", - "provenance": [] + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true }, "kernelspec": { "name": "python3", @@ -15,11 +17,1073 @@ } }, "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kE12_q3ZKZ9N", + "outputId": "777eb119-508f-4f8f-a712-e0433c41e545" + }, + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AuPIKrDqKCNF", + "outputId": "8d4013fe-b6f7-42ab-a587-593474b96746" + }, + "source": [ + "%cd /content/drive/MyDrive/MFP" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/drive/MyDrive/MFP\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_fB04PzHJEpY", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "bcb8cb0e-9616-4a8c-b84b-b015c59d74c1" + }, + "source": [ + "!ls" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "classification_example.py\t\t portraits\n", + "clustering_example.py\t\t\t README.md\n", + 
"Cohorts\t\t\t\t\t requirements.txt\n", + "From_cell_files.ipynb\t\t\t signatures\n", + "img\t\t\t\t\t upstream_html\n", + "license.md\t\t\t\t walkthrough.ipynb\n", + "Methods_Description_-_Batch_correction.ipynb\n" + ] + } + ] + }, { "cell_type": "code", "metadata": { "id": "WPaBxo9UHC4G" }, + "source": [ + "import pandas as pd\n", + "\n", + "from portraits.clustering import clustering_profile_metrics, clustering_profile_metrics_plot\n", + "from portraits.utils import read_gene_sets, ssgsea_formula, median_scale" + ], + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vzWSwF9WMu6x" + }, + "source": [ + "# Read signatures" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "recvnCI8K2ZH" + }, + "source": [ + "gmt = read_gene_sets('signatures/gene_signatures.gmt') " + ], + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NrJSyfapL6VD", + "outputId": "c2401638-45f4-4b0d-e3bf-91f67ebb4af0" + }, + "source": [ + "gmt['Angiogenesis']" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o0V31B14Mm7I" + }, + "source": [ + "# Read expressions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "h-RV4CzTM3FB" + }, + "source": [ + "Here as an illustration we will use the Augustine cohort tsv file " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "t1BSHBmOMfc4" + }, + "source": [ + "exp = pd.read_csv('Cohorts/Augustine/expressions.tsv.gz', sep='\\t', index_col=0)" + ], + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 181 + }, + "id": "ejzK6pz7NWMu", + "outputId": "55d924d9-2456-403b-fe32-c0ddd0a286f3" + }, + 
"source": [ + "exp.head(3)" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + "
GSM478912GSM478913GSM478914GSM478915GSM478916GSM478917GSM478918GSM478919GSM478920GSM478921GSM478922GSM478923GSM478924GSM478925GSM478926GSM478927GSM478928GSM478929GSM478930GSM478931GSM478932GSM478933GSM478934GSM478935GSM478936GSM478937GSM478938GSM478939GSM478940GSM478941GSM478942GSM478943GSM478944GSM478945GSM478946GSM478947GSM478948GSM478949GSM478950GSM478951GSM478952GSM478953GSM478954GSM478955GSM478956GSM478957GSM478958GSM478959GSM478960GSM478961GSM478962GSM478963
A1BG44.76385640.57002247.41220638.60571238.60571240.57002262.70334865.15480242.46289263.50689250.63108158.72610790.42974955.58159940.48814940.48814940.57002240.57002246.12159040.57002273.69291329.86688812.43019624.35200740.38953440.57002240.57002236.90102340.57002240.48814938.60571238.83990961.52025156.48153338.60571230.61038448.34330047.98469840.57002239.49396740.63087360.48160140.38953471.05770538.60571238.60571238.60571251.89308340.57002241.63004543.34798440.580599
A1BG-AS111.48833511.48833511.48833511.48833511.02762911.4883359.59239911.48833511.48833512.8154796.6426236.64262320.76378311.48833526.54751920.0330158.67440311.46243111.48833511.4883358.4560576.2785818.74212111.48833511.48833510.73987111.48833511.97325411.48833511.53941821.69442018.67555615.15235211.47202513.11920111.48833511.48833511.48833514.63878711.48833513.45855113.5003837.66744228.26616014.65970011.92339311.48833511.74056617.10692312.65453816.93823411.488335
A1CF3.7767003.7767003.7767003.7767003.7767004.0474154.7920493.7767003.7767003.7767003.7767006.0572153.7767003.7767003.7767004.3118733.7767003.7767003.7767003.7767003.7767003.7767003.7767003.7767004.1517283.7767003.7767003.7767003.7767003.7767003.7767003.7767003.7767004.1381513.7767003.7767003.7949193.7767003.7767003.7832543.7767003.7767003.7767003.7767003.7767009.7697233.7767003.7767003.7767003.7767003.7767003.776700
\n", + "
" + ], + "text/plain": [ + " GSM478912 GSM478913 GSM478914 ... GSM478961 GSM478962 GSM478963\n", + "A1BG 44.763856 40.570022 47.412206 ... 41.630045 43.347984 40.580599\n", + "A1BG-AS1 11.488335 11.488335 11.488335 ... 12.654538 16.938234 11.488335\n", + "A1CF 3.776700 3.776700 3.776700 ... 3.776700 3.776700 3.776700\n", + "\n", + "[3 rows x 52 columns]" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xFyHKILqN5Pq" + }, + "source": [ + "# Calculate signature scores and scale them" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "NkYpAGpbNjOt" + }, + "source": [ + "signature_scores = ssgsea_formula(exp.T, gmt) # here we transpose the expression table so that samples are rows \n", + "# Scale signatures\n", + "signature_scores_scaled = median_scale(signature_scores)" + ], + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 163 + }, + "id": "8lUbuKgTOF9q", + "outputId": "209fb2ce-4f1f-49fb-c33d-f877dddb7990" + }, + "source": [ + "signature_scores_scaled.head(3)" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MHCIMHCIICoactivation_moleculesEffector_cellsT_cell_trafficNK_cellsT_cellsB_cellsM1_signaturesTh1_signatureAntitumor_cytokinesCheckpoint_inhibitionTregT_reg_trafficNeutrophil_signatureGranulocyte_trafficMDSCMDSC_trafficMacrophagesMacrophage_DC_trafficTh2_signatureProtumor_cytokinesCAFMatrixMatrix_remodelingAngiogenesisEndotheliumProliferation_rateEMT_signature
GSM478912-2.602821-7.350003-0.400397-0.457742-1.714376-1.041894-0.2130450.3717310.4665770.519831-2.252631-0.6585720.2031392.047850-0.7529900.140557-2.109979-2.541594-2.547508-2.4109800.291442-1.403598-5.230648-4.031180-1.369586-1.809785-2.9180183.649245-1.403900
GSM478913-2.645689-6.595055-0.556757-0.424833-1.694343-0.887625-0.2959660.4292641.2139580.356238-2.511562-0.1033071.1123091.030445-2.325240-0.215443-1.887350-2.316983-2.762238-2.6935581.406431-0.980988-5.198766-3.925075-1.196709-1.905589-2.6257373.365133-1.076704
GSM478914-1.316890-0.510827-0.424608-0.497104-0.9230380.528284-0.2815490.0536921.415917-1.409104-1.2214940.3181191.167260-0.732170-0.7783633.7045501.5989951.792117-0.2268960.748523-1.2656111.8923600.198815-0.1374860.0189850.902991-0.709575-1.1511772.785649
\n", + "
" + ], + "text/plain": [ + " MHCI MHCII ... Proliferation_rate EMT_signature\n", + "GSM478912 -2.602821 -7.350003 ... 3.649245 -1.403900\n", + "GSM478913 -2.645689 -6.595055 ... 3.365133 -1.076704\n", + "GSM478914 -1.316890 -0.510827 ... -1.151177 2.785649\n", + "\n", + "[3 rows x 29 columns]" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g6SFv1BDVFzY" + }, + "source": [ + "Check the clustering within a range of 30 to 65% similarity.\n", + ">65% - usually graph is not connected; \n", + "<30% - unreasonable correlation." + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WHQ2nxTANt13", + "outputId": "a6da7bb2-fff2-4fd3-872d-2249e69ff70f" + }, + "source": [ + "clustering_metrics = clustering_profile_metrics(signature_scores_scaled, threshold_mm=(.3, .65), step=.01)" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 35/35 [00:03<00:00, 11.34it/s]\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "2JhZfdNAhylP", + "outputId": "036244d4-590b-44bf-dc84-0d0a49676745" + }, + "source": [ + "clustering_metrics.head(3) # here some modification in the clustering.py were of necessity to run properly" + ], + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chdbscNperc
0.309.640161.498640.04569354GSM478912 4\n", + "GSM478913 4\n", + "GSM478931 4\n", + "G...
0.319.666571.569430.07152084GSM478912 1\n", + "GSM478913 1\n", + "GSM478931 1\n", + "G...
0.329.393981.597020.04177434GSM478912 1\n", + "GSM478913 1\n", + "GSM478931 1\n", + "G...
\n", + "
" + ], + "text/plain": [ + " ch db ... N perc\n", + "0.30 9.64016 1.49864 ... 4 GSM478912 4\n", + "GSM478913 4\n", + "GSM478931 4\n", + "G...\n", + "0.31 9.66657 1.56943 ... 4 GSM478912 1\n", + "GSM478913 1\n", + "GSM478931 1\n", + "G...\n", + "0.32 9.39398 1.59702 ... 4 GSM478912 1\n", + "GSM478913 1\n", + "GSM478931 1\n", + "G...\n", + "\n", + "[3 rows x 5 columns]" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "gIvbu1of0R52", + "outputId": "2d499964-7d56-45be-9d28-00854c59bd9a" + }, + "source": [ + "clustering_metrics.tail(3)" + ], + "execution_count": 13, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chdbscNperc
0.626.471371.331110.10966210GSM478912 2\n", + "GSM478913 2\n", + "GSM478946 ...
0.636.553561.312350.11789310GSM478912 10\n", + "GSM478913 10\n", + "GSM478946 1...
0.649.97311.097090.24586712GSM478912 1\n", + "GSM478913 1\n", + "GSM478914 ...
\n", + "
" + ], + "text/plain": [ + " ch db ... N perc\n", + "0.62 6.47137 1.33111 ... 10 GSM478912 2\n", + "GSM478913 2\n", + "GSM478946 ...\n", + "0.63 6.55356 1.31235 ... 10 GSM478912 10\n", + "GSM478913 10\n", + "GSM478946 1...\n", + "0.64 9.9731 1.09709 ... 12 GSM478912 1\n", + "GSM478913 1\n", + "GSM478914 ...\n", + "\n", + "[3 rows x 5 columns]" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6vBT6cuwh5y_" + }, + "source": [ + "This table summarises the clustering metrics:\n", + "\n", + "- ch: calinski_harabasz_score\n", + "- db: davies_bouldin_score\n", + "- sc: silhouette_score\n", + "\n", + "ch, db and sc are as described in sklearn.metrics\n", + "\n", + "- N: number of clusters\n", + "- perc: cluster label assigned to each sample\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aKhzUEL9kTBK" + }, + "source": [ + "# Visualize the partitions" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 708 + }, + "id": "M9UTVfL-j8ET", + "outputId": "db0fbbbd-9314-41fe-85a6-cf8c6de1bafe" + }, + "source": [ + "clustering_profile_metrics_plot(clustering_metrics) # a typo fix in clustering.py (stached -> stacked) was necessary for this to run" + ], + "execution_count": 14, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 14 + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OIPRILPRlrOm" + }, + "source": [ + "Note that the x axis spans 0.30 to 0.64, the clustering range explained in the previous paragraph \n", + "\n", + "For the illustration we will use 0.47 as the optimal threshold " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xVuPxJGjoJ_z" + }, + "source": [ + "best_threshold = '0.47'" + ], + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TMpM4vXj86sz" + }, + "source": [ + "# Detect cluster types" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "A_MUTSBA1Zqy" + }, + "source": [ + "# defining detect_type as a function in the portraits package is better than defining it inline in the example walkthrough\n", + "from portraits.detect_type import detect_type " + ], + "execution_count": 19, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VX_FGeSf4P--" + }, + "source": [ + "final_clusters = detect_type(clustering_metrics, best_threshold, signature_scores_scaled)" + ], + "execution_count": 18, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "onR0QdkV8suB", + "outputId": "58da9aeb-240a-48c1-e1bf-69343a7879a3" + }, + "source": [ + "final_clusters.head(3)" + ], + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "GSM478912 D\n", + "GSM478913 D\n", + "GSM478937 D\n", + "Name: MFP, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iUMI0wCw8xV1" + }, + "source": [ + "# Exporting " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "eAu38vGW80za" + }, + "source": [ + "final_clusters.to_csv('final_clusters.tsv', sep='\\t', index=True)" + ], + "execution_count": 21, + "outputs": [] + }, + { + 
"cell_type": "code", + "metadata": { + "id": "KNQ-pIcu9NXx" + }, "source": [ "" ],