From 3d9f3d8bbf44fddef108abd2e243cc73c037c324 Mon Sep 17 00:00:00 2001 From: kellyyuan333 Date: Mon, 10 Feb 2025 15:16:26 -0500 Subject: [PATCH 1/3] implement KMeans clustering --- .vscode/launch.json | 15 +++ .../bitbirchX/.idea/workspace.xml | 93 +++++++++++++++++++ development-code/bitbirch_dev.py | 49 +++++++++- jt_fit_label.txt | 2 + 4 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 .vscode/launch.json create mode 100644 development-code/bitbirchX/.idea/workspace.xml create mode 100644 jt_fit_label.txt diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..6b76b4f --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/development-code/bitbirchX/.idea/workspace.xml b/development-code/bitbirchX/.idea/workspace.xml new file mode 100644 index 0000000..a2ef5ee --- /dev/null +++ b/development-code/bitbirchX/.idea/workspace.xml @@ -0,0 +1,93 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1707287077383 + + + + + + \ No newline at end of file diff --git a/development-code/bitbirch_dev.py b/development-code/bitbirch_dev.py index 4caf6e5..ca82849 100644 --- a/development-code/bitbirch_dev.py +++ b/development-code/bitbirch_dev.py @@ -26,8 +26,11 @@ ### Joel Nothman ### License: BSD 3 clause +import time import numpy as np from scipy import sparse +from sklearn.cluster import KMeans +from sklearn.metrics import pairwise_distances_argmin def jt_distances(X): """Calculates the matrix of Tanimoto distances @@ -439,6 +442,9 @@ class BitBirch(): subcluster_labels_ : ndarray Labels assigned to the centroids of the subclusters after they are clustered globally. + + labels_ : ndarray of shape (n_samples,) + Array of labels assigned to the input data. Notes ----- @@ -558,7 +564,7 @@ def _fit(self, X, partial): self._n_features_out = self.subcluster_centers_.shape[0] # TODO: Incorporate global_clustering option - #self._global_clustering(X) + self._global_clustering(X) self.first_call = False return self @@ -581,3 +587,44 @@ def _get_leaves(self): def retrieveVal(self): print() + def _global_clustering(self, X): + clusters = self.n_clusters + centroids = self.subcluster_centers_ + compute_labels = (X is not None) and self.compute_labels + + if isinstance(clusters, int): + km = KMeans(n_clusters=clusters) + self.subcluster_labels_ = km.fit_predict(centroids) + else: + # argument is None (skip global clustering) + self.subcluster_labels_ = np.arange(len(centroids)) + return + + if compute_labels: + argmin = pairwise_distances_argmin(X, centroids) + self.labels_ = self.subcluster_labels_[argmin] + +# Simple example running random sets with 1000 to 50000 molecules +z = '' +for n in range(1000, 2001, 1000): + print(n) + np.random.seed(0) # for testing + dat = np.random.randint(2, size=(n, 100), dtype='int64') + # filename = f"random_data_{n}.npy" + # np.save(filename, dat) + # print(dat) + brc = BitBirch(n_clusters=5, branching_factor=50, threshold = 0.50) + v = time.time() + brc.fit(dat) + #brc.check_threshold() + #labels = brc.predict(dat) + # leaves = brc._get_leaves() + # for leave in leaves: + # for subcluster in leave.subclusters_: + # print(len(subcluster.mol_indices)) + + z += '{:10} {:10.6}\n'.format(n, time.time() - v) + #z += '{:10} {:10}\n'.format(n, len(brc.subcluster_centers_)) + +with open('jt_fit_label.txt', 'w') as outfile: + outfile.write(z[:-1]) \ No newline at end of file diff --git a/jt_fit_label.txt b/jt_fit_label.txt new file mode 100644 index 0000000..af076ad --- /dev/null +++ b/jt_fit_label.txt @@ -0,0 +1,2 @@ + 1000 0.235511 + 2000 0.188385 \ No newline at end of file From d496c310f5d6b744eb5b70affaf1be048976b0cc Mon Sep 17 00:00:00 2001 From: kellyyuan333 Date: Wed, 26 Feb 2025 19:16:16 -0500 Subject: [PATCH 2/3] add perform_clustering boolean attribute --- development-code/bitbirch_dev.py | 33 +++++--------------------------- jt_fit_label.txt | 4 ++-- 2 files changed, 7 insertions(+), 30 deletions(-) diff --git a/development-code/bitbirch_dev.py b/development-code/bitbirch_dev.py index ca82849..0a4c589 100644 --- a/development-code/bitbirch_dev.py +++ b/development-code/bitbirch_dev.py @@ -470,6 +470,7 @@ def __init__( n_clusters=3, compute_labels=True, copy=True, + perform_clustering=False, ): self.threshold = threshold self.branching_factor = branching_factor @@ -478,6 +479,7 @@ def __init__( self.copy = copy self.index_tracker = 0 self.first_call = True + self.perform_clustering = perform_clustering def fit(self, X, y=None): """ @@ -563,8 +565,8 @@ def _fit(self, X, partial): self.subcluster_centers_ = centroids self._n_features_out = self.subcluster_centers_.shape[0] - # TODO: Incorporate global_clustering option - self._global_clustering(X) + if(self.perform_clustering): + self._global_clustering(X) self.first_call = False return self @@ -602,29 +604,4 @@ def _global_clustering(self, X): if compute_labels: argmin = pairwise_distances_argmin(X, centroids) - self.labels_ = self.subcluster_labels_[argmin] - -# Simple example running random sets with 1000 to 50000 molecules -z = '' -for n in range(1000, 2001, 1000): - print(n) - np.random.seed(0) # for testing - dat = np.random.randint(2, size=(n, 100), dtype='int64') - # filename = f"random_data_{n}.npy" - # np.save(filename, dat) - # print(dat) - brc = BitBirch(n_clusters=5, branching_factor=50, threshold = 0.50) - v = time.time() - brc.fit(dat) - #brc.check_threshold() - #labels = brc.predict(dat) - # leaves = brc._get_leaves() - # for leave in leaves: - # for subcluster in leave.subclusters_: - # print(len(subcluster.mol_indices)) - - z += '{:10} {:10.6}\n'.format(n, time.time() - v) - #z += '{:10} {:10}\n'.format(n, len(brc.subcluster_centers_)) - -with open('jt_fit_label.txt', 'w') as outfile: - outfile.write(z[:-1]) \ No newline at end of file + self.labels_ = self.subcluster_labels_[argmin] \ No newline at end of file diff --git a/jt_fit_label.txt b/jt_fit_label.txt index af076ad..1378674 100644 --- a/jt_fit_label.txt +++ b/jt_fit_label.txt @@ -1,2 +1,2 @@ - 1000 0.235511 - 2000 0.188385 \ No newline at end of file + 1000 0.0737119 + 2000 0.140676 \ No newline at end of file From ee0062cfb1adb625459a5c17b33a15a135eada4b Mon Sep 17 00:00:00 2001 From: kellyyuan333 Date: Wed, 26 Feb 2025 20:18:45 -0500 Subject: [PATCH 3/3] implemented hierarchical clustering --- development-code/bitbirch_dev.py | 19 ++++++++++++++----- jt_fit_label.txt | 4 ++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/development-code/bitbirch_dev.py b/development-code/bitbirch_dev.py index 0a4c589..19017c7 100644 --- a/development-code/bitbirch_dev.py +++ b/development-code/bitbirch_dev.py @@ -29,7 +29,7 @@ import time import numpy as np from scipy import sparse -from sklearn.cluster import KMeans +from sklearn.cluster import AgglomerativeClustering, KMeans from sklearn.metrics import pairwise_distances_argmin def jt_distances(X): @@ -471,6 +471,7 @@ def __init__( compute_labels=True, copy=True, perform_clustering=False, + clustering_type="" ): self.threshold = threshold self.branching_factor = branching_factor @@ -480,6 +481,7 @@ def __init__( self.index_tracker = 0 self.first_call = True self.perform_clustering = perform_clustering + self.clustering_type = clustering_type def fit(self, X, y=None): """ @@ -590,15 +592,22 @@ def retrieveVal(self): print() def _global_clustering(self, X): + """ + Global clustering for the subclusters obtained after fitting + """ clusters = self.n_clusters centroids = self.subcluster_centers_ + clustering_type = self.clustering_type compute_labels = (X is not None) and self.compute_labels - if isinstance(clusters, int): - km = KMeans(n_clusters=clusters) - self.subcluster_labels_ = km.fit_predict(centroids) + if clustering_type == "kmeans" and isinstance(clusters, int): + clusterer = KMeans(n_clusters=clusters) + self.subcluster_labels_ = clusterer.fit_predict(centroids) + elif clustering_type == "hierarchical" and isinstance(clusters, int): + clusterer = AgglomerativeClustering(n_clusters=clusters) + self.subcluster_labels_ = clusterer.fit_predict(centroids) else: - # argument is None (skip global clustering) + # n_clusters is None and/or clustering_type == "" (skip global clustering) self.subcluster_labels_ = np.arange(len(centroids)) return diff --git a/jt_fit_label.txt b/jt_fit_label.txt index 1378674..2aa3190 100644 --- a/jt_fit_label.txt +++ b/jt_fit_label.txt @@ -1,2 +1,2 @@ - 1000 0.0737119 - 2000 0.140676 \ No newline at end of file + 1000 0.386942 + 2000 0.20582 \ No newline at end of file