diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..6b76b4f --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/development-code/bitbirchX/.idea/workspace.xml b/development-code/bitbirchX/.idea/workspace.xml new file mode 100644 index 0000000..a2ef5ee --- /dev/null +++ b/development-code/bitbirchX/.idea/workspace.xml @@ -0,0 +1,93 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1707287077383 + + + + + + \ No newline at end of file diff --git a/development-code/bitbirch_dev.py b/development-code/bitbirch_dev.py index 4caf6e5..19017c7 100644 --- a/development-code/bitbirch_dev.py +++ b/development-code/bitbirch_dev.py @@ -26,8 +26,11 @@ ### Joel Nothman ### License: BSD 3 clause +import time import numpy as np from scipy import sparse +from sklearn.cluster import AgglomerativeClustering, KMeans +from sklearn.metrics import pairwise_distances_argmin def jt_distances(X): """Calculates the matrix of Tanimoto distances @@ -439,6 +442,9 @@ class BitBirch(): subcluster_labels_ : ndarray Labels assigned to the centroids of the subclusters after they are clustered globally. + + labels_ : ndarray of shape (n_samples,) + Array of labels assigned to the input data. Notes ----- @@ -464,6 +470,8 @@ def __init__( n_clusters=3, compute_labels=True, copy=True, + perform_clustering=False, + clustering_type="" ): self.threshold = threshold self.branching_factor = branching_factor @@ -472,6 +480,8 @@ def __init__( self.copy = copy self.index_tracker = 0 self.first_call = True + self.perform_clustering = perform_clustering + self.clustering_type = clustering_type def fit(self, X, y=None): """ @@ -557,8 +567,8 @@ def _fit(self, X, partial): self.subcluster_centers_ = centroids self._n_features_out = self.subcluster_centers_.shape[0] - # TODO: Incorporate global_clustering option - #self._global_clustering(X) + if(self.perform_clustering): + self._global_clustering(X) self.first_call = False return self @@ -581,3 +591,26 @@ def _get_leaves(self): def retrieveVal(self): print() + def _global_clustering(self, X): + """ + Global clustering for the subclusters obtained after fitting + """ + clusters = self.n_clusters + centroids = self.subcluster_centers_ + clustering_type = self.clustering_type + compute_labels = (X is not None) and self.compute_labels + + if clustering_type == "kmeans" and isinstance(clusters, int): + clusterer = KMeans(n_clusters=clusters) + self.subcluster_labels_ = clusterer.fit_predict(centroids) + elif clustering_type == "hierarchical" and isinstance(clusters, int): + clusterer = AgglomerativeClustering(n_clusters=clusters) + self.subcluster_labels_ = clusterer.fit_predict(centroids) + else: + # n_clusters is None and/or clustering_type == "" (skip global clustering) + self.subcluster_labels_ = np.arange(len(centroids)) + return + + if compute_labels: + argmin = pairwise_distances_argmin(X, centroids) + self.labels_ = self.subcluster_labels_[argmin] \ No newline at end of file diff --git a/jt_fit_label.txt b/jt_fit_label.txt new file mode 100644 index 0000000..2aa3190 --- /dev/null +++ b/jt_fit_label.txt @@ -0,0 +1,2 @@ + 1000 0.386942 + 2000 0.20582 \ No newline at end of file