From 3d9f3d8bbf44fddef108abd2e243cc73c037c324 Mon Sep 17 00:00:00 2001
From: kellyyuan333 <kellyyuan333@gmail.com>
Date: Mon, 10 Feb 2025 15:16:26 -0500
Subject: [PATCH 1/3] implement KMeans clustering

---
 .vscode/launch.json                           | 15 +++
 .../bitbirchX/.idea/workspace.xml             | 93 +++++++++++++++++++
 development-code/bitbirch_dev.py              | 49 +++++++++-
 jt_fit_label.txt                              |  2 +
 4 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 .vscode/launch.json
 create mode 100644 development-code/bitbirchX/.idea/workspace.xml
 create mode 100644 jt_fit_label.txt
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..6b76b4f
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/development-code/bitbirchX/.idea/workspace.xml b/development-code/bitbirchX/.idea/workspace.xml
new file mode 100644
index 0000000..a2ef5ee
--- /dev/null
+++ b/development-code/bitbirchX/.idea/workspace.xml
@@ -0,0 +1,93 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="AutoImportSettings">
+    <option name="autoReloadType" value="SELECTIVE" />
+  </component>
+  <component name="CMakePresetLoader"><![CDATA[{
+  "useNewFormat": true
+}]]></component>
+  <component name="CMakeReloadState">
+    <option name="reloaded" value="true" />
+  </component>
+  <component name="CMakeRunConfigurationManager">
+    <generated />
+  </component>
+  <component name="CMakeSettings">
+    <configurations>
+      <configuration PROFILE_NAME="Debug" ENABLED="true" CONFIG_NAME="Debug" />
+    </configurations>
+  </component>
+  <component name="ChangeListManager">
+    <list default="true" id="8a307921-de39-44d5-a6be-d584e738c01e" name="Changes" comment="" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="ClangdSettings">
+    <option name="formatViaClangd" value="false" />
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectApplicationVersion">
+    <option name="ide" value="CLion" />
+    <option name="majorVersion" value="2022" />
+    <option name="minorVersion" value="3" />
+  </component>
+  <component name="ProjectId" id="2c1lgTPS2d1QfhTyLTP51EpdO01" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "RunOnceActivity.OpenProjectViewOnStart": "true",
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "RunOnceActivity.cidr.known.project.marker": "true",
+    "WebServerToolWindowFactoryState": "false",
+    "cf.first.check.clang-format": "false",
+    "cidr.known.project.marker": "true",
+    "last_opened_file_path": "C:/Users/Kelly/Projects/bitbirchX",
+    "node.js.detected.package.eslint": "true",
+    "node.js.detected.package.tslint": "true",
+    "node.js.selected.package.eslint": "(autodetect)",
+    "node.js.selected.package.tslint": "(autodetect)",
+    "nodejs_package_manager_path": "npm",
+    "vue.rearranger.settings.migration": "true"
+  }
+}]]></component>
+  <component name="RunManager">
+    <configuration default="true" type="CLionExternalRunConfiguration" factoryName="Application" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" PASS_PARENT_ENVS_2="true">
+      <method v="2">
+        <option name="CLION.EXTERNAL.BUILD" enabled="true" />
+      </method>
+    </configuration>
+    <configuration name="main.cpp" type="CppFileRunConfiguration" factoryName="CppFileRunConfiguration" temporary="true" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" PASS_PARENT_ENVS_2="true" PROJECT_NAME="bitbirchX" TARGET_NAME="main.cpp" CONFIG_NAME="main.cpp">
+      <option name="sourceFile" value="main.cpp" />
+      <method v="2">
+        <option name="com.jetbrains.cidr.cpp.runfile.CppFileBuildBeforeRunTaskProvider$BasicBuildBeforeRunTask" enabled="true" />
+      </method>
+    </configuration>
+    <recent_temporary>
+      <list>
+        <item itemvalue="C/C++ File.main.cpp" />
+      </list>
+    </recent_temporary>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="8a307921-de39-44d5-a6be-d584e738c01e" name="Changes" comment="" />
+      <created>1707287077383</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1707287077383</updated>
+      <workItem from="1707287079020" duration="715000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/development-code/bitbirch_dev.py b/development-code/bitbirch_dev.py
index 4caf6e5..ca82849 100644
--- a/development-code/bitbirch_dev.py
+++ b/development-code/bitbirch_dev.py
@@ -26,8 +26,11 @@
 ###          Joel Nothman <joel.nothman@gmail.com>
 ### License: BSD 3 clause
 
+import time
 import numpy as np
 from scipy import sparse
+from sklearn.cluster import KMeans
+from sklearn.metrics import pairwise_distances_argmin
 
 def jt_distances(X):
     """Calculates the matrix of Tanimoto distances
@@ -439,6 +442,9 @@ class BitBirch():
     subcluster_labels_ : ndarray
         Labels assigned to the centroids of the subclusters after
         they are clustered globally.
+    
+    labels_ : ndarray of shape (n_samples,)
+        Array of labels assigned to the input data.
 
     Notes
     -----
@@ -558,7 +564,7 @@ def _fit(self, X, partial):
         self._n_features_out = self.subcluster_centers_.shape[0]
         
         # TODO: Incorporate global_clustering option
-        #self._global_clustering(X)
+        self._global_clustering(X)
         self.first_call = False
         return self
 
@@ -581,3 +587,44 @@ def _get_leaves(self):
     def retrieveVal(self):
         print()
 
+    def _global_clustering(self, X):
+        clusters = self.n_clusters
+        centroids = self.subcluster_centers_
+        compute_labels = (X is not None) and self.compute_labels
+
+        if isinstance(clusters, int):
+            km = KMeans(n_clusters=clusters)
+            self.subcluster_labels_ = km.fit_predict(centroids)
+        else:
+            # argument is None (skip global clustering)
+            self.subcluster_labels_ = np.arange(len(centroids))
+            return
+
+        if compute_labels:
+            argmin = pairwise_distances_argmin(X, centroids)
+            self.labels_ = self.subcluster_labels_[argmin]
+
+# Simple example running random sets with 1000 to 50000 molecules
+z = ''
+for n in range(1000, 2001, 1000):
+    print(n)
+    np.random.seed(0) # for testing
+    dat = np.random.randint(2, size=(n, 100), dtype='int64')
+    # filename = f"random_data_{n}.npy"
+    # np.save(filename, dat)
+    # print(dat)
+    brc = BitBirch(n_clusters=5, branching_factor=50, threshold = 0.50)
+    v = time.time()
+    brc.fit(dat)
+    #brc.check_threshold()
+    #labels = brc.predict(dat)
+    # leaves = brc._get_leaves()
+    # for leave in leaves:
+    #    for subcluster in leave.subclusters_:
+    #        print(len(subcluster.mol_indices))
+            
+    z += '{:10}    {:10.6}\n'.format(n, time.time() - v)
+    #z += '{:10}    {:10}\n'.format(n, len(brc.subcluster_centers_))
+
+with open('jt_fit_label.txt', 'w') as outfile:
+    outfile.write(z[:-1])
\ No newline at end of file
diff --git a/jt_fit_label.txt b/jt_fit_label.txt
new file mode 100644
index 0000000..af076ad
--- /dev/null
+++ b/jt_fit_label.txt
@@ -0,0 +1,2 @@
+      1000      0.235511
+      2000      0.188385
\ No newline at end of file

From d496c310f5d6b744eb5b70affaf1be048976b0cc Mon Sep 17 00:00:00 2001
From: kellyyuan333 <kellyyuan333@gmail.com>
Date: Wed, 26 Feb 2025 19:16:16 -0500
Subject: [PATCH 2/3] add perform_clustering boolean attribute

---
 development-code/bitbirch_dev.py | 33 +++++---------------------------
 jt_fit_label.txt                 |  4 ++--
 2 files changed, 7 insertions(+), 30 deletions(-)

diff --git a/development-code/bitbirch_dev.py b/development-code/bitbirch_dev.py
index ca82849..0a4c589 100644
--- a/development-code/bitbirch_dev.py
+++ b/development-code/bitbirch_dev.py
@@ -470,6 +470,7 @@ def __init__(
         n_clusters=3,
         compute_labels=True,
         copy=True,
+        perform_clustering=False,
     ):
         self.threshold = threshold
         self.branching_factor = branching_factor
@@ -478,6 +479,7 @@ def __init__(
         self.copy = copy
         self.index_tracker = 0
         self.first_call = True
+        self.perform_clustering = perform_clustering
 
     def fit(self, X, y=None):
         """
@@ -563,8 +565,8 @@ def _fit(self, X, partial):
         self.subcluster_centers_ = centroids
         self._n_features_out = self.subcluster_centers_.shape[0]
         
-        # TODO: Incorporate global_clustering option
-        self._global_clustering(X)
+        if(self.perform_clustering):
+            self._global_clustering(X)
         self.first_call = False
         return self
 
@@ -602,29 +604,4 @@ def _global_clustering(self, X):
 
         if compute_labels:
             argmin = pairwise_distances_argmin(X, centroids)
-            self.labels_ = self.subcluster_labels_[argmin]
-
-# Simple example running random sets with 1000 to 50000 molecules
-z = ''
-for n in range(1000, 2001, 1000):
-    print(n)
-    np.random.seed(0) # for testing
-    dat = np.random.randint(2, size=(n, 100), dtype='int64')
-    # filename = f"random_data_{n}.npy"
-    # np.save(filename, dat)
-    # print(dat)
-    brc = BitBirch(n_clusters=5, branching_factor=50, threshold = 0.50)
-    v = time.time()
-    brc.fit(dat)
-    #brc.check_threshold()
-    #labels = brc.predict(dat)
-    # leaves = brc._get_leaves()
-    # for leave in leaves:
-    #    for subcluster in leave.subclusters_:
-    #        print(len(subcluster.mol_indices))
-            
-    z += '{:10}    {:10.6}\n'.format(n, time.time() - v)
-    #z += '{:10}    {:10}\n'.format(n, len(brc.subcluster_centers_))
-
-with open('jt_fit_label.txt', 'w') as outfile:
-    outfile.write(z[:-1])
\ No newline at end of file
+            self.labels_ = self.subcluster_labels_[argmin]
\ No newline at end of file
diff --git a/jt_fit_label.txt b/jt_fit_label.txt
index af076ad..1378674 100644
--- a/jt_fit_label.txt
+++ b/jt_fit_label.txt
@@ -1,2 +1,2 @@
-      1000      0.235511
-      2000      0.188385
\ No newline at end of file
+      1000     0.0737119
+      2000      0.140676
\ No newline at end of file

From ee0062cfb1adb625459a5c17b33a15a135eada4b Mon Sep 17 00:00:00 2001
From: kellyyuan333 <kellyyuan333@gmail.com>
Date: Wed, 26 Feb 2025 20:18:45 -0500
Subject: [PATCH 3/3] implement hierarchical clustering

---
 development-code/bitbirch_dev.py | 19 ++++++++++++++-----
 jt_fit_label.txt                 |  4 ++--
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/development-code/bitbirch_dev.py b/development-code/bitbirch_dev.py
index 0a4c589..19017c7 100644
--- a/development-code/bitbirch_dev.py
+++ b/development-code/bitbirch_dev.py
@@ -29,7 +29,7 @@
 import time
 import numpy as np
 from scipy import sparse
-from sklearn.cluster import KMeans
+from sklearn.cluster import AgglomerativeClustering, KMeans
 from sklearn.metrics import pairwise_distances_argmin
 
 def jt_distances(X):
@@ -471,6 +471,7 @@ def __init__(
         compute_labels=True,
         copy=True,
         perform_clustering=False,
+        clustering_type=""
     ):
         self.threshold = threshold
         self.branching_factor = branching_factor
@@ -480,6 +481,7 @@ def __init__(
         self.index_tracker = 0
         self.first_call = True
         self.perform_clustering = perform_clustering
+        self.clustering_type = clustering_type
 
     def fit(self, X, y=None):
         """
@@ -590,15 +592,22 @@ def retrieveVal(self):
         print()
 
     def _global_clustering(self, X):
+        """
+        Cluster the subcluster centroids globally and, if requested, label the input data
+        """
         clusters = self.n_clusters
         centroids = self.subcluster_centers_
+        clustering_type = self.clustering_type
         compute_labels = (X is not None) and self.compute_labels
 
-        if isinstance(clusters, int):
-            km = KMeans(n_clusters=clusters)
-            self.subcluster_labels_ = km.fit_predict(centroids)
+        if clustering_type == "kmeans" and isinstance(clusters, int):
+            clusterer = KMeans(n_clusters=clusters)
+            self.subcluster_labels_ = clusterer.fit_predict(centroids)
+        elif clustering_type == "hierarchical" and isinstance(clusters, int):
+            clusterer = AgglomerativeClustering(n_clusters=clusters)
+            self.subcluster_labels_ = clusterer.fit_predict(centroids)
         else:
-            # argument is None (skip global clustering)
+            # n_clusters is not an int or clustering_type is not "kmeans"/"hierarchical" (skip global clustering)
             self.subcluster_labels_ = np.arange(len(centroids))
             return
 
diff --git a/jt_fit_label.txt b/jt_fit_label.txt
index 1378674..2aa3190 100644
--- a/jt_fit_label.txt
+++ b/jt_fit_label.txt
@@ -1,2 +1,2 @@
-      1000     0.0737119
-      2000      0.140676
\ No newline at end of file
+      1000      0.386942
+      2000       0.20582
\ No newline at end of file