Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:

strategy:
matrix:
python-version: [3.11, 3.12]
python-version: [3.11]

steps:
- uses: actions/checkout@v3
Expand Down
38 changes: 30 additions & 8 deletions jale/core/utils/hierarchical.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,10 @@ def hierarchical_clustering_pipeline(
project_path=project_path,
meta_name=meta_name,
silhouette_scores=silhouette_scores,
null_silhouette_scores=null_silhouette_scores,
silhouette_scores_z=silhouette_scores_z,
calinski_harabasz_scores=calinski_harabasz_scores,
null_calinski_harabasz_scores=null_calinski_harabasz_scores,
calinski_harabasz_scores_z=calinski_harabasz_scores_z,
exp_separation_density=exp_separation_density,
correlation_type=correlation_type,
Expand Down Expand Up @@ -299,7 +301,7 @@ def compute_hierarchical_clustering(correlation_matrix, k, linkage_method):
Z = linkage(condensed_distance, method=linkage_method)
cluster_labels = fcluster(Z, k, criterion="maxclust")

# Safeguard: If only one unique cluster label, set metrics to np.nan
# Safeguard: If number of cluster labels is smaller than k, set metrics to np.nan
if len(np.unique(cluster_labels)) < 2:
silhouette = np.nan
calinski_harabasz = np.nan
Expand Down Expand Up @@ -328,21 +330,21 @@ def compute_hc_metrics_z(
def pooled_std(sample1, sample2):
"""Compute the pooled standard deviation of two samples."""
n1, n2 = sample1.shape[1], sample2.shape[1]
var1, var2 = np.var(sample1, axis=1, ddof=1), np.var(sample2, axis=1, ddof=1)
var1, var2 = np.nanvar(sample1, axis=1, ddof=1), np.nanvar(sample2, axis=1, ddof=1)
return np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))

silhouette_scores_avg = np.average(silhouette_scores, axis=1)
null_silhouette_scores_avg = np.average(null_silhouette_scores, axis=1)
silhouette_scores_avg = np.nanmean(silhouette_scores, axis=1)
null_silhouette_scores_avg = np.nanmean(null_silhouette_scores, axis=1)

if use_pooled_std:
silhouette_std = pooled_std(silhouette_scores, null_silhouette_scores)
else:
silhouette_std = np.std(null_silhouette_scores, axis=1, ddof=1)
silhouette_std = np.nanstd(null_silhouette_scores, axis=1, ddof=1)

silhouette_z = (silhouette_scores_avg - null_silhouette_scores_avg) / silhouette_std

calinski_harabasz_scores_avg = np.average(calinski_harabasz_scores, axis=1)
null_calinski_harabasz_scores_avg = np.average(
calinski_harabasz_scores_avg = np.nanmean(calinski_harabasz_scores, axis=1)
null_calinski_harabasz_scores_avg = np.nanmean(
null_calinski_harabasz_scores, axis=1
)

Expand All @@ -351,7 +353,7 @@ def pooled_std(sample1, sample2):
calinski_harabasz_scores, null_calinski_harabasz_scores
)
else:
calinski_harabasz_std = np.std(null_calinski_harabasz_scores, axis=1, ddof=1)
calinski_harabasz_std = np.nanstd(null_calinski_harabasz_scores, axis=1, ddof=1)

calinski_harabasz_z = (
calinski_harabasz_scores_avg - null_calinski_harabasz_scores_avg
Expand Down Expand Up @@ -435,8 +437,10 @@ def save_hc_metrics(
project_path,
meta_name,
silhouette_scores,
null_silhouette_scores,
silhouette_scores_z,
calinski_harabasz_scores,
null_calinski_harabasz_scores,
calinski_harabasz_scores_z,
exp_separation_density,
correlation_type,
Expand All @@ -448,9 +452,13 @@ def save_hc_metrics(
"Number of Clusters": range(2, max_k + 1),
"Silhouette Scores": np.average(silhouette_scores, axis=1),
"Silhouette Scores SD": np.std(silhouette_scores, axis=1),
"Silhouette Scores NULL": np.nanmean(null_silhouette_scores, axis=1),
"Silhouette Scores NULL SD": np.nanstd(null_silhouette_scores, axis=1),
"Silhouette Scores Z": silhouette_scores_z,
"Calinski-Harabasz Scores": np.average(calinski_harabasz_scores, axis=1),
"Calinski-Harabasz Scores SD": np.std(calinski_harabasz_scores, axis=1),
"Calinski-Harabasz Scores NULL": np.nanmean(null_calinski_harabasz_scores, axis=1),
"Calinski-Harabasz Scores NULL SD": np.nanstd(null_calinski_harabasz_scores, axis=1),
"Calinski-Harabasz Scores Z": calinski_harabasz_scores_z,
# Pad with NaN for k=2 as metrics start at k=3
"Experiment Separation Density": np.concatenate(
Expand All @@ -471,13 +479,27 @@ def save_hc_metrics(
header=[f"k={k}" for k in range(2, max_k + 1)],
)

pd.DataFrame(null_silhouette_scores.T).to_csv(
project_path
/ f"Results/MA_Clustering/metrics/{meta_name}_NULLsilhouette_scores_{correlation_type}_hc_{linkage_method}.csv",
index=False,
header=[f"k={k}" for k in range(2, max_k + 1)],
)

pd.DataFrame(calinski_harabasz_scores.T).to_csv(
project_path
/ f"Results/MA_Clustering/metrics/{meta_name}_calinski_harabasz_scores_{correlation_type}_hc_{linkage_method}.csv",
index=False,
header=[f"k={k}" for k in range(2, max_k + 1)],
)

pd.DataFrame(null_calinski_harabasz_scores.T).to_csv(
project_path
/ f"Results/MA_Clustering/metrics/{meta_name}_NULLcalinski_harabasz_scores_{correlation_type}_hc_{linkage_method}.csv",
index=False,
header=[f"k={k}" for k in range(2, max_k + 1)],
)

pd.DataFrame(exp_separation_density.T).to_csv(
project_path
/ f"Results/MA_Clustering/metrics/{meta_name}_exp_separation_density_{correlation_type}_hc_{linkage_method}.csv",
Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@ name = "jale"
version = "0.2.9"
description = "Package allowing users to run Activation Likelihood Estimation Meta-Analysis"
readme = "README.md"
requires-python = ">=3.9, <3.13"
requires-python = ">=3.9, <3.12"
license = "MIT"
authors = [{ name = "Lennart Frahm", email = "l.frahm@mailbox.org" }]
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Operating System :: OS Independent",
]
dependencies = [
Expand Down