From a3794975f49e5e4f0ba87a61e217d4c0dcd9829b Mon Sep 17 00:00:00 2001 From: Mark Robinson Date: Tue, 20 Jan 2026 20:59:55 +0100 Subject: [PATCH 1/4] add everything to conda yml --- Clustering_conda.yml | 175 ++++++++----------------------------------- 1 file changed, 30 insertions(+), 145 deletions(-) diff --git a/Clustering_conda.yml b/Clustering_conda.yml index f0ece39..b9bd5dc 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -11,25 +11,21 @@ software_environments: clustbench: description: "clustbench on py3.12.6" conda: envs/clustbench.yml - envmodule: clustbench - apptainer: oras://quay.io/imallona/clustering_example/clustbench:latest fcps: description: "CRAN's FCPS" conda: envs/fcps.yml - apptainer: oras://quay.io/imallona/clustering_example/fcps:latest - envmodule: fcps -metric_collectors: - - id: plotting - name: "Single-backend metric collector." - software_environment: "fcps" - repository: - url: https://github.com/imallona/clustering_report - commit: 040 - inputs: - - metrics.scores - outputs: - - id: plotting.html - path: "{name}/plotting_report.html" +#metric_collectors: +# - id: plotting +# name: "Single-backend metric collector." +# software_environment: "fcps" +# repository: +# url: https://github.com/imallona/clustering_report +# commit: 040 +# inputs: +# - metrics.scores +# outputs: +# - id: plotting.html +# path: "{name}/plotting_report.html" stages: ## clustbench data ########################################################## @@ -43,115 +39,17 @@ stages: commit: fc67ebd parameters: # comments depict the possible cardinalities and the number of curated labelsets - dataset_generator: "fcps" - dataset_name: ["atom", "chainlink"] # 2 1 - # - dataset_generator: "fcps" - # dataset_name: ["engytime", "hepta", "lsun", "target", "tetra", "twodiamonds", "wingnut"] # 7 1, 3 1, 2,6 2, 4 1, 2 1, 2 1 - # - dataset_generator: "graves" - # dataset_name: ["dense"] # 2 1 - # - dataset_generator: "graves" - # dataset_name: ["fuzzyx"] # 2,4,5 6 - # - dataset_generator: "graves" - # dataset_name: ["line"] # 2 1 - # - dataset_generator: "graves" - # dataset_name: ["parabolic"] # 2,4 2 - # - dataset_generator: "graves" - # dataset_name: ["ring"] # 2 1 - # - dataset_generator: "graves" - # dataset_name: ["ring_noisy"] # 2 1 - # - dataset_generator: "graves" - # dataset_name: ["ring_outliers"] # 2,5 2 - # - dataset_generator: "graves" - # dataset_name: ["zigzag"] # 3,5 2 - # - dataset_generator: "graves" - # dataset_name: ["zigzag_noisy"] # 3,5 2 - # - dataset_generator: "graves" - # dataset_name: ["zigzag_outliers"] # 3,5 2 - # - dataset_generator: "other" - # dataset_name: ["chameleon_t4_8k"] # 6 1 - # - dataset_generator: "other" - # dataset_name: ["chameleon_t5_8k"] # 6 1 - # - dataset_generator: "other" - # dataset_name: ["hdbscan"] # 6 1 - # - dataset_generator: "other" - # dataset_name: ["iris"] # 3 1 - # - dataset_generator: "other" - # dataset_name: ["iris5"] # 3 1 - # - dataset_generator: "other" - # dataset_name: ["square"] # 2 1 - # - dataset_generator: "sipu" - # dataset_name: ["aggregation"] # 7 1 - # - dataset_generator: "sipu" - # dataset_name: ["compound"] # 4,5,6 5 - # - dataset_generator: "sipu" - # dataset_name: ["flame"] # 2 2 - # - dataset_generator: "sipu" - # dataset_name: ["jain"] # 2 1 - # - dataset_generator: "sipu" - # dataset_name: ["pathbased"] # 3,4 2 - # - dataset_generator: "sipu" - # dataset_name: ["r15"] # 8,9,15 3 - # - dataset_generator: "sipu" - # dataset_name: ["spiral"] # 3 1 - # - dataset_generator: "sipu" - # dataset_name: ["unbalance"] # 8 1 - # - dataset_generator: "uci" - # dataset_name: ["ecoli"] # 8 1 - # - dataset_generator: "uci" - # dataset_name: ["ionosphere"] # 2 1 - # - dataset_generator: "uci" - # dataset_name: ["sonar"] # 2 1 - # - dataset_generator: "uci" - # dataset_name: ["statlog"] # 7 1 - # - dataset_generator: "uci" - # dataset_name: ["wdbc"] # 2 1 - # - dataset_generator: "uci" - # dataset_name: ["wine"] # 3 1 - # - dataset_generator: "uci" - # dataset_name: ["yeast"] # 10 1 - # - dataset_generator: "wut" - # dataset_name: ["circles"] # 4 1 - # - dataset_generator: "wut" - # dataset_name: ["cross"] # 4 1 - # - dataset_generator: "wut" - # dataset_name: ["graph"] # 10 1 - # - dataset_generator: "wut" - # dataset_name: ["isolation"] # 3 1 - # - dataset_generator: "wut" - # dataset_name: ["labirynth"] # 6 1 - # - dataset_generator: "wut" - # dataset_name: ["mk1"] # 3 1 - # - dataset_generator: "wut" - # dataset_name: ["mk2"] # 2 1 - # - dataset_generator: "wut" - # dataset_name: ["mk3"] # 3 1 - # - dataset_generator: "wut" - # dataset_name: ["mk4"] # 3 1 - # - dataset_generator: "wut" - # dataset_name: ["olympic"] # 5 1 - # - dataset_generator: "wut" - # dataset_name: ["smile"] # 4,6 2 - # - dataset_generator: "wut" - # dataset_name: ["stripes"] # 2 1 - # - dataset_generator: "wut" - # dataset_name: ["trajectories"] # 4 1 - # - dataset_generator: "wut" - # dataset_name: ["trapped_lovers"] # 3 1 - # - dataset_generator: "wut" - # dataset_name: ["twosplashes"] # 2 1 - # - dataset_generator: "wut" - # dataset_name: ["windows"] # 5 1 - # - dataset_generator: "wut" - # dataset_name: ["x1"] # 3 1 - # - dataset_generator: "wut" - # dataset_name: ["x2"] # 3 1 - # - dataset_generator: "wut" - # dataset_name: ["x3"] # 4 1 - # - dataset_generator: "wut" - # dataset_name: ["z1"] # 3 1 - # - dataset_generator: "wut" - # dataset_name: ["z2"] # 5 1 - # - dataset_generator: "wut" - # dataset_name: ["z3"] # 4 1 + dataset_name: ["atom", "chainlink", "engytime", "hepta", "lsun", "target", "tetra", "twodiamonds", "wingnut"] + - dataset_generator: "graves" + dataset_name: ["dense", "fuzzyx", "line", "parabolic", "ring", "ring_noisy", "ring_outliers", "zigzag", "zigzag_noisy", "zigzag_outliers"] + - dataset_generator: "other" + dataset_name: ["chameleon_t4_8k", "chameleon_t5_8k", "hdbscan", "iris", "iris5", "square"] + - dataset_generator: "sipu" + dataset_name: ["aggregation", "compound", "flame", "jain", "pathbased", "r15", "spiral", "unbalance"] + - dataset_generator: "uci" + dataset_name: ["ecoli", "ionosphere", "sonar", "statlog", "wdbc", "wine", "yeast"] + - dataset_generator: "wut" + dataset_name: ["circles", "cross", "graph", "isolation", "labirynth", "mk1", "mk2", "mk3", "mk4", "olympic", "smile", "stripes", "trajectories", "trapped_lovers", "twosplashes", "windows", "x1", "x2", "x3", "z1", "z2", "z3"] outputs: - id: data.matrix path: "{dataset}.data.gz" @@ -169,8 +67,7 @@ stages: url: https://github.com/imallona/clustbench_fastcluster commit: e644ce5 parameters: - - linkage: "complete" - #- linkage: ["ward", "average", "weighted", "median", "centroid"] + - linkage: ["complete", "ward", "average", "weighted", "median", "centroid"] - id: sklearn name: "sklearn" software_environment: "clustbench" @@ -178,9 +75,8 @@ stages: url: https://github.com/imallona/clustbench_sklearn commit: dcf35e1 parameters: - - method: "birch" - # ["kmeans, "gm"] - # ["spectral"] ## too slow + - method: ["birch", "kmeans", "gm"] + # "spectral" ## too slow - id: agglomerative name: "agglomerative" software_environment: "clustbench" @@ -188,8 +84,7 @@ stages: url: https://github.com/imallona/clustbench_agglomerative commit: 9d086a9 parameters: - - linkage: "average" - # ["complete", "ward"] + - linkage: ["average", "complete", "ward"] - id: genieclust name: "genieclust" software_environment: "clustbench" @@ -198,8 +93,8 @@ stages: commit: 7d9e799 parameters: - method: "genie" - # method: ["gic", "ica"] gini_threshold: 0.5 + - method: ["gic", "ica"] - id: fcps name: "fcps" software_environment: "fcps" @@ -207,19 +102,9 @@ stages: url: https://github.com/imallona/clustbench_fcps commit: e780fed parameters: - - method: "FCPS_Minimax" + - method: ["FCPS_Minimax", "FCPS_MinEnergy", "FCPS_HDBSCAN_2", "FCPS_HDBSCAN_4", "FCPS_HDBSCAN_8", "FCPS_Diana", "FCPS_Fanny", "FCPS_Hardcl", "FCPS_Softcl", "FCPS_Clara", "FCPS_PAM"] seed: 2 # - "FCPS_AdaptiveDensityPeak" # not in Conda - # - "FCPS_MinEnergy", - # - "FCPS_HDBSCAN_2", - # - "FCPS_HDBSCAN_4", - # - "FCPS_HDBSCAN_8", - # - "FCPS_Diana", - # - "FCPS_Fanny", - # - "FCPS_Hardcl", - # - "FCPS_Softcl", - # - "FCPS_Clara", - # - "FCPS_PAM" inputs: - data.matrix - data.true_labels @@ -236,7 +121,7 @@ stages: url: https://github.com/imallona/clustbench_metrics commit: c4eda85 parameters: - - metric: ["normalized_clustering_accuracy", "adjusted_fm_score"] + - metric: ["normalized_clustering_accuracy", "adjusted_fm_score", "adjusted_rand_score"] # - "adjusted_mi_score" # - "adjusted_rand_score" # - "fm_score" From ce838e25c3c8231069046e966731db265def0fb0 Mon Sep 17 00:00:00 2001 From: Mark Robinson Date: Tue, 20 Jan 2026 21:11:28 +0100 Subject: [PATCH 2/4] updates to README.md + conda plan --- Clustering_conda.yml | 2 +- README.md | 96 +++++++++++++++++++++----------------------- 2 files changed, 47 insertions(+), 51 deletions(-) diff --git a/Clustering_conda.yml b/Clustering_conda.yml index b9bd5dc..3159e86 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,4 +1,4 @@ -id: clustering_example_conda +id: omni-clustbench description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. version: "1.5.0" benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" diff --git a/README.md b/README.md index 8532c72..955393e 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ A clustering example for omnibenchmark # How to run 1. Install omnibenchmark using [our tutorial](https://docs.omnibenchmark.org/latest/howto/#install-omnibenchmark) -2. Clone the benchmark definition / this repository with `git clone git@github.com:omnibenchmark/clustering_example.git` +2. Clone the benchmark plan / this repository with `git clone git@github.com:omnibenchmark/clustering_example.git` 3. Move to the cloned repository `cd clustering_example` -4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML --local-storage --cores 6`. Choose `Clustering.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). +4. Run locally, somewhat in parallel `ob run CLUSTERING.YAML --cores 6`. Choose `Clustering.yml` plan based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). # Disclaimer @@ -13,7 +13,7 @@ This does not work in arm64. # Clustbench attribution -by Marek Gagolewski, modified by Izaskun Mallona +by Marek Gagolewski, modified by Izaskun Mallona (some edits to the plan(s) by Ben Carrillo, Mark Robinson) # Data disclaimer @@ -21,65 +21,61 @@ Some datasets are commented out to speed up calculations. From [Are cluster validity measures (in) valid?](https://www.sciencedirect.com/science/article/pii/S0020025521010082): -> The original benchmark battery consists of 79 data instances, however 16 datasets are accompanied by labels that yield ; they were omitted for their computation would be too lengthy (namely: mnist/digits, mnist/fashion, other/chameleon_t7_10k, other/chameleon_t8_8k, sipu/a1, sipu/a2, sipu/a3, sipu/birch1, sipu/birch2, sipu/d31, sipu/s1, sipu/s2, sipu/s3, sipu/s4, sipu/worms_2, sipu/worms_64). Also uci/glass has been removed as one of its 25-near-neighbour graph’s connected components was too small for the NN-based methods to succeed. This leaves us with 62 datasets in total, see Table 1. +> The original benchmark battery consists of 79 data instances, however 16 datasets are accompanied by labels that yield ; they were omitted for their computation would be too lengthy (namely: mnist/digits, mnist/fashion, other/chameleon_t7_10k, other/chameleon_t8_8k, sipu/a1, sipu/a2, sipu/a3, sipu/birch1, sipu/birch2, sipu/d31, sipu/s1, sipu/s2, sipu/s3, sipu/s4, sipu/worms_2, sipu/worms_64). Also uci/glass has been removed as one of its 25-near-neighbour graph’s connected components was too small for the NN-based methods to succeed. This leaves us with 62 datasets in total. A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blob/0a88c910bbda62d1b593f4215a682770227f39ff/Clustering.yaml) with 30 cores should run half of the stuff in ~4 h and reach 97% completion in ~8h. # Summary -- Data. Example datasets (not a comprehensive list, it's >79 of them): +- Data. Example datasets (not a comprehensive list, it's >60 of them): - https://github.com/imallona/clustbench_data - - args: ["--dataset_generator", "mnist", "--dataset_name", "fashion"] - - args: ["--dataset_generator", "other", "--dataset_name", "iris"] - - args: ["--dataset_generator", "mnist", "--dataset_name", "digits"] - - args: ["--dataset_generator", "wut", "--dataset_name", "circles"] + parameters: + - dataset_generator: "fcps" # 9 + dataset_name: ["atom", "chainlink", "engytime", "hepta", "lsun", "target", "tetra", "twodiamonds", "wingnut"] + - dataset_generator: "graves" # 10 datasets + dataset_name: ["dense", "fuzzyx", "line", "parabolic", "ring", "ring_noisy", "ring_outliers", "zigzag", "zigzag_noisy", "zigzag_outliers"] + - dataset_generator: "other" # 6 datasets + dataset_name: ["chameleon_t4_8k", "chameleon_t5_8k", "hdbscan", "iris", "iris5", "square"] + - dataset_generator: "sipu" # 8 datasets + dataset_name: ["aggregation", "compound", "flame", "jain", "pathbased", "r15", "spiral", "unbalance"] + - dataset_generator: "uci" # 7 datasets + dataset_name: ["ecoli", "ionosphere", "sonar", "statlog", "wdbc", "wine", "yeast"] + - dataset_generator: "wut" # 22 datasets + dataset_name: ["circles", "cross", "graph", "isolation", "labirynth", "mk1", "mk2", "mk3", "mk4", "olympic", "smile", "stripes", "trajectories", "trapped_lovers", "twosplashes", "windows", "x1", "x2", "x3", "z1", "z2", "z3"] - Method families/providers (they include several methods each) - https://github.com/imallona/clustbench_fastcluster - - args: ["--linkage", "complete"] - - args: ["--linkage", "ward"] - - args: ["--linkage", "average"] - - args: ["--linkage", "weighted"] - - args: ["--linkage", "median"] - - args: ["--linkage", "centroid"] + parameters: + - linkage: ["complete", "ward", "average", "weighted", "median", "centroid"] - https://github.com/imallona/clustbench_sklearn - - args: ["--method", "birch"] - - args: ["--method", "kmeans"] - - args: ["--method", "spectral"] ## too slow - - args: ["--method", "gm"] + parameters: + - method: ["birch", "kmeans", "gm"] + # "spectral" ## too slow - https://github.com/imallona/clustbench_agglomerative - - args: ["--linkage", "average"] - - args: ["--linkage", "complete"] - - args: ["--linkage", "ward"] + parameters: + - linkage: ["average", "complete", "ward"] - https://github.com/imallona/clustbench_genieclust - - args: ["--method", "genie", "--gini_threshold", 0.5] - - args: ["--method", "gic"] - - args: ["--method", "ica"] + parameters: + - method: "genie" + gini_threshold: 0.5 + - method: ["gic", "ica"] - https://github.com/imallona/clustbench_fcps - - args: ["--method", "FCPS_Minimax"] - - args: ["--method", "FCPS_MinEnergy"] - - args: ["--method", "FCPS_HDBSCAN_2"] - - args: ["--method", "FCPS_HDBSCAN_4"] - - args: ["--method", "FCPS_HDBSCAN_8"] - - args: ["--method", "FCPS_Diana"] - - args: ["--method", "FCPS_Fanny"] - - args: ["--method", "FCPS_Hardcl"] - - args: ["--method", "FCPS_Softcl"] - - args: ["--method", "FCPS_Clara"] - - args: ["--method", "FCPS_PAM"] + parameters: + - method: ["FCPS_Minimax", "FCPS_MinEnergy", "FCPS_HDBSCAN_2", "FCPS_HDBSCAN_4", "FCPS_HDBSCAN_8", "FCPS_Diana", "FCPS_Fanny", "FCPS_Hardcl", "FCPS_Softcl", "FCPS_Clara", "FCPS_PAM"] + seed: 2 + # - "FCPS_AdaptiveDensityPeak" # not in Conda - Metric providers (several metrics) - https://github.com/imallona/clustbench_metrics - - args: ["--metric", "normalized_clustering_accuracy"] - - args: ["--metric", "adjusted_fm_score"] - - args: ["--metric", "adjusted_mi_score"] - - args: ["--metric", "adjusted_rand_score"] - - args: ["--metric", "fm_score"] - - args: ["--metric", "mi_score"] - - args: ["--metric", "normalized_clustering_accuracy"] - - args: ["--metric", "normalized_mi_score"] - - args: ["--metric", "normalized_pivoted_accuracy"] - - args: ["--metric", "pair_sets_index"] - - args: ["--metric", "rand_score"] -- Metric collector + parameters: + - metric: ["normalized_clustering_accuracy", "adjusted_fm_score", "adjusted_rand_score"] + # - "adjusted_mi_score" + # - "adjusted_rand_score" + # - "fm_score" + # - "mi_score" + # - "normalized_clustering_accuracy" + # - "normalized_mi_score" + # - "normalized_pivoted_accuracy" + # - "pair_sets_index" + # - "rand_score" - https://github.com/imallona/clustering_report - Daniel modules (independent from clustbench) - https://github.com/omnibenchmark-example/iris.git @@ -96,6 +92,6 @@ In `envs`: conda, apptainer, easybuild (lmod modules) # Warnings -Mind we try to run clusterings specifying the true number of clusters +- 2. But sometimes the true number is k=3. Then we do `k=2, k=2, k=3, k=5, k=6` filling with k=2s as needed, and recomputing the same values multiple times (so runtimes are comparable across datasets, regardless of their true number of clusters). +Note that we try to run clusterings specifying the true number of clusters +- 2. But sometimes the true number is k=3. Then we do `k=2, k=2, k=3, k=5, k=6` filling with k=2s as needed, and recomputing the same values multiple times (so runtimes are comparable across datasets, regardless of their true number of clusters). -Also, we have modules by Daniel not fully incorporated into Gagolewski's flow. +Also, we have modules by Daniel Incicau that are not fully incorporated into Gagolewski's flow. From d2c4458919dda7b6ce40425779446d28026576e7 Mon Sep 17 00:00:00 2001 From: Mark Robinson Date: Tue, 20 Jan 2026 21:24:54 +0100 Subject: [PATCH 3/4] harmonize all three backends to have the same data/methods/metrics --- Clustering_conda.yml | 2 +- Clustering_envmodules.yml | 192 +++++++------------------------------- Clustering_oras.yml | 192 +++++++------------------------------- 3 files changed, 65 insertions(+), 321 deletions(-) diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 3159e86..3dfd3cc 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,4 +1,4 @@ -id: omni-clustbench +id: omni-clustbench-conda description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. version: "1.5.0" benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 47d35b3..ab90b12 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -1,7 +1,7 @@ -id: clustering_example_envmodules +id: omni-clustbench-envmodules description: "Clustering benchmark on Gagolewski's, true number of clusters plus minus 2." version: "1.5.0" -benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo, Mark Robinson" storage: api: S3 endpoint: http://omnibenchmark.mls.uzh.ch:9000 @@ -10,26 +10,22 @@ software_backend: envmodules software_environments: clustbench: description: "clustbench python" - conda: envs/clustbench.yml envmodule: clustbench - apptainer: oras://quay.io/imallona/clustering_example/clustbench:latest fcps: description: "R deps" - conda: envs/fcps.yml - apptainer: oras://quay.io/imallona/clustering_example/fcps:latest envmodule: fcps -metric_collectors: - - id: plotting - name: "Single-backend metric collector." - software_environment: "fcps" - repository: - url: https://github.com/imallona/clustering_report - commit: 040 - inputs: - - metrics.scores - outputs: - - id: plotting.html - path: "{name}/plotting_report.html" +#metric_collectors: +# - id: plotting +# name: "Single-backend metric collector." +# software_environment: "fcps" +# repository: +# url: https://github.com/imallona/clustering_report +# commit: 040 +# inputs: +# - metrics.scores +# outputs: +# - id: plotting.html +# path: "{name}/plotting_report.html" stages: ## clustbench data ########################################################## @@ -43,128 +39,17 @@ stages: commit: fc67ebd parameters: # comments depict the possible cardinalities and the number of curated labelsets - dataset_generator: "fcps" - dataset_name: ["atom", "chainlink"] # 2 1 - - # - dataset_generator: "fcps" - # dataset_name: "engytime" # 2 2 - # - dataset_generator: "fcps" - # dataset_name: "hepta" # 7 1 - # - dataset_generator: "fcps" - # dataset_name: "lsun" # 3 1 - # - dataset_generator: "fcps" - # dataset_name: "target" # 2, 6 2 - # - dataset_generator: "fcps" - # dataset_name: "tetra" # 4 1 - # - dataset_generator: "fcps" - # dataset_name: "twodiamonds" # 2 1 - # - dataset_generator: "fcps" - # dataset_name: "wingnut" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "dense" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "fuzzyx" # 2, 4, 5 6 - # - dataset_generator: "graves" - # dataset_name: "line" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "parabolic" # 2, 42 - # - dataset_generator: "graves" - # dataset_name: "ring" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "ring_noisy" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "ring_outliers" # 2, 52 - # - dataset_generator: "graves" - # dataset_name: "zigzag" # 3, 5 2 - # - dataset_generator: "graves" - # dataset_name: "zigzag_noisy" # 3, 52 - # - dataset_generator: "graves" - # dataset_name: "zigzag_outliers" # 3, 52 - # - dataset_generator: "other" - # dataset_name: "chameleon_t4_8k" # 6 1 - # - dataset_generator: "other" - # dataset_name: "chameleon_t5_8k" # 6 1 - # - dataset_generator: "other" - # dataset_name: "hdbscan" # 6 1 - # - dataset_generator: "other" - # dataset_name: "iris" # 3 1 - # - dataset_generator: "other" - # dataset_name: "iris5" # 3 1 - # - dataset_generator: "other" - # dataset_name: "square" # 2 1 - # - dataset_generator: "sipu" - # dataset_name: "aggregation" # 7 1 - # - dataset_generator: "sipu" - # dataset_name: "compound" # 4, 5, 6 5 - # - dataset_generator: "sipu" - # dataset_name: "flame" # 2 2 - # - dataset_generator: "sipu" - # dataset_name: "jain" # 2 1 - # - dataset_generator: "sipu" - # dataset_name: "pathbased" # 3, 4 2 - # - dataset_generator: "sipu" - # dataset_name: "r15" # 8, 9, 15 3 - # - dataset_generator: "sipu" - # dataset_name: "spiral" # 3 1 - # - dataset_generator: "sipu" - # dataset_name: "unbalance" # 8 1 - # - dataset_generator: "uci" - # dataset_name: "ecoli" # 8 1 - # - dataset_generator: "uci" - # dataset_name: "ionosphere" # 2 1 - # - dataset_generator: "uci" - # dataset_name: "sonar" # 2 1 - # - dataset_generator: "uci" - # dataset_name: "statlog" # 7 1 - # - dataset_generator: "uci" - # dataset_name: "wdbc" # 2 1 - # - dataset_generator: "uci" - # dataset_name: "wine" # 3 1 - # - dataset_generator: "uci" - # dataset_name: "yeast" # 10 1 - # - dataset_generator: "wut" - # dataset_name: "circles" # 4 1 - # - dataset_generator: "wut" - # dataset_name: "cross" # 4 1 - # - dataset_generator: "wut" - # dataset_name: "graph" # 10 1 - # - dataset_generator: "wut" - # dataset_name: "isolation" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "labirynth" # 6 1 - # - dataset_generator: "wut" - # dataset_name: "mk1" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "mk2" # 2 1 - # - dataset_generator: "wut" - # dataset_name: "mk3" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "mk4" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "olympic" # 5 1 - # - dataset_generator: "wut" - # dataset_name: "smile" # 4, 6 2 - # - dataset_generator: "wut" - # dataset_name: "stripes" # 2 1 - # - dataset_generator: "wut" - # dataset_name: "trajectories" # 4 1 - # - dataset_generator: "wut" - # dataset_name: "trapped_lovers" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "twosplashes" # 2 1 - # - dataset_generator: "wut" - # dataset_name: "windows" # 5 1 - # - dataset_generator: "wut" - # dataset_name: "x1" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "x2" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "x3" # 4 1 - # - dataset_generator: "wut" - # dataset_name: "z1" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "z2" # 5 1 - # - dataset_generator: "wut" - # dataset_name: "z3" # 4 1 + dataset_name: ["atom", "chainlink", "engytime", "hepta", "lsun", "target", "tetra", "twodiamonds", "wingnut"] + - dataset_generator: "graves" + dataset_name: ["dense", "fuzzyx", "line", "parabolic", "ring", "ring_noisy", "ring_outliers", "zigzag", "zigzag_noisy", "zigzag_outliers"] + - dataset_generator: "other" + dataset_name: ["chameleon_t4_8k", "chameleon_t5_8k", "hdbscan", "iris", "iris5", "square"] + - dataset_generator: "sipu" + dataset_name: ["aggregation", "compound", "flame", "jain", "pathbased", "r15", "spiral", "unbalance"] + - dataset_generator: "uci" + dataset_name: ["ecoli", "ionosphere", "sonar", "statlog", "wdbc", "wine", "yeast"] + - dataset_generator: "wut" + dataset_name: ["circles", "cross", "graph", "isolation", "labirynth", "mk1", "mk2", "mk3", "mk4", "olympic", "smile", "stripes", "trajectories", "trapped_lovers", "twosplashes", "windows", "x1", "x2", "x3", "z1", "z2", "z3"] outputs: - id: data.matrix path: "{dataset}.data.gz" @@ -182,8 +67,7 @@ stages: url: https://github.com/imallona/clustbench_fastcluster commit: e644ce5 parameters: - - linkage: "complete" - #- linkage: ["ward", "average", "weighted", "median", "centroid"] + - linkage: ["complete", "ward", "average", "weighted", "median", "centroid"] - id: sklearn name: "sklearn" software_environment: "clustbench" @@ -191,9 +75,8 @@ stages: url: https://github.com/imallona/clustbench_sklearn commit: dcf35e1 parameters: - - method: "birch" - # ["kmeans, "gm"] - # ["spectral"] ## too slow + - method: ["birch", "kmeans", "gm"] + # "spectral" ## too slow - id: agglomerative name: "agglomerative" software_environment: "clustbench" @@ -201,8 +84,7 @@ stages: url: https://github.com/imallona/clustbench_agglomerative commit: 9d086a9 parameters: - - linkage: "average" - # ["complete", "ward"] + - linkage: ["average", "complete", "ward"] - id: genieclust name: "genieclust" software_environment: "clustbench" @@ -212,7 +94,7 @@ stages: parameters: - method: "genie" gini_threshold: 0.5 - # method: ["gic", "ica"] + - method: ["gic", "ica"] - id: fcps name: "fcps" software_environment: "fcps" @@ -220,19 +102,9 @@ stages: url: https://github.com/imallona/clustbench_fcps commit: e780fed parameters: - - method: "FCPS_Minimax" + - method: ["FCPS_Minimax", "FCPS_MinEnergy", "FCPS_HDBSCAN_2", "FCPS_HDBSCAN_4", "FCPS_HDBSCAN_8", "FCPS_Diana", "FCPS_Fanny", "FCPS_Hardcl", "FCPS_Softcl", "FCPS_Clara", "FCPS_PAM"] seed: 2 # - "FCPS_AdaptiveDensityPeak" # not in Conda - # - "FCPS_MinEnergy", - # - "FCPS_HDBSCAN_2", - # - "FCPS_HDBSCAN_4", - # - "FCPS_HDBSCAN_8", - # - "FCPS_Diana", - # - "FCPS_Fanny", - # - "FCPS_Hardcl", - # - "FCPS_Softcl", - # - "FCPS_Clara", - # - "FCPS_PAM" inputs: - data.matrix - data.true_labels @@ -249,7 +121,7 @@ stages: url: https://github.com/imallona/clustbench_metrics commit: c4eda85 parameters: - - metric: ["normalized_clustering_accuracy", "adjusted_fm_score"] + - metric: ["normalized_clustering_accuracy", "adjusted_fm_score", "adjusted_rand_score"] # - "adjusted_mi_score" # - "adjusted_rand_score" # - "fm_score" diff --git a/Clustering_oras.yml b/Clustering_oras.yml index 606f3e0..2e1c7f0 100644 --- a/Clustering_oras.yml +++ b/Clustering_oras.yml @@ -1,7 +1,7 @@ -id: clustering_example_oras +id: omni-clustbench-apptainer description: "Clustering benchmark on Gagolewski's, true number of clusters plus minus 2." version: "1.5.0" -benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo, Mark Robinson" storage: api: S3 endpoint: http://omnibenchmark.mls.uzh.ch:9000 @@ -10,26 +10,22 @@ software_backend: apptainer software_environments: clustbench: description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench apptainer: oras://quay.io/imallona/clustering_example/clustbench:latest fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml apptainer: oras://quay.io/imallona/clustering_example/fcps:latest - envmodule: fcps -metric_collectors: - - id: plotting - name: "Single-backend metric collector." - software_environment: fcps - repository: - url: https://github.com/imallona/clustering_report - commit: 040 - inputs: - - metrics.scores - outputs: - - id: plotting.html - path: "{name}/plotting_report.html" +#metric_collectors: +# - id: plotting +# name: "Single-backend metric collector." +# software_environment: fcps +# repository: +# url: https://github.com/imallona/clustering_report +# commit: 040 +# inputs: +# - metrics.scores +# outputs: +# - id: plotting.html +# path: "{name}/plotting_report.html" stages: ## clustbench data ########################################################## @@ -43,128 +39,17 @@ stages: commit: fc67ebd parameters: # comments depict the possible cardinalities and the number of curated labelsets - dataset_generator: "fcps" - dataset_name: ["atom", "chainlink"] # 2 1 - - # - dataset_generator: "fcps" - # dataset_name: "engytime" # 2 2 - # - dataset_generator: "fcps" - # dataset_name: "hepta" # 7 1 - # - dataset_generator: "fcps" - # dataset_name: "lsun" # 3 1 - # - dataset_generator: "fcps" - # dataset_name: "target" # 2, 6 2 - # - dataset_generator: "fcps" - # dataset_name: "tetra" # 4 1 - # - dataset_generator: "fcps" - # dataset_name: "twodiamonds" # 2 1 - # - dataset_generator: "fcps" - # dataset_name: "wingnut" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "dense" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "fuzzyx" # 2, 4, 5 6 - # - dataset_generator: "graves" - # dataset_name: "line" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "parabolic" # 2, 42 - # - dataset_generator: "graves" - # dataset_name: "ring" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "ring_noisy" # 2 1 - # - dataset_generator: "graves" - # dataset_name: "ring_outliers" # 2, 52 - # - dataset_generator: "graves" - # dataset_name: "zigzag" # 3, 5 2 - # - dataset_generator: "graves" - # dataset_name: "zigzag_noisy" # 3, 52 - # - dataset_generator: "graves" - # dataset_name: "zigzag_outliers" # 3, 52 - # - dataset_generator: "other" - # dataset_name: "chameleon_t4_8k" # 6 1 - # - dataset_generator: "other" - # dataset_name: "chameleon_t5_8k" # 6 1 - # - dataset_generator: "other" - # dataset_name: "hdbscan" # 6 1 - # - dataset_generator: "other" - # dataset_name: "iris" # 3 1 - # - dataset_generator: "other" - # dataset_name: "iris5" # 3 1 - # - dataset_generator: "other" - # dataset_name: "square" # 2 1 - # - dataset_generator: "sipu" - # dataset_name: "aggregation" # 7 1 - # - dataset_generator: "sipu" - # dataset_name: "compound" # 4, 5, 6 5 - # - dataset_generator: "sipu" - # dataset_name: "flame" # 2 2 - # - dataset_generator: "sipu" - # dataset_name: "jain" # 2 1 - # - dataset_generator: "sipu" - # dataset_name: "pathbased" # 3, 4 2 - # - dataset_generator: "sipu" - # dataset_name: "r15" # 8, 9, 15 3 - # - dataset_generator: "sipu" - # dataset_name: "spiral" # 3 1 - # - dataset_generator: "sipu" - # dataset_name: "unbalance" # 8 1 - # - dataset_generator: "uci" - # dataset_name: "ecoli" # 8 1 - # - dataset_generator: "uci" - # dataset_name: "ionosphere" # 2 1 - # - dataset_generator: "uci" - # dataset_name: "sonar" # 2 1 - # - dataset_generator: "uci" - # dataset_name: "statlog" # 7 1 - # - dataset_generator: "uci" - # dataset_name: "wdbc" # 2 1 - # - dataset_generator: "uci" - # dataset_name: "wine" # 3 1 - # - dataset_generator: "uci" - # dataset_name: "yeast" # 10 1 - # - dataset_generator: "wut" - # dataset_name: "circles" # 4 1 - # - dataset_generator: "wut" - # dataset_name: "cross" # 4 1 - # - dataset_generator: "wut" - # dataset_name: "graph" # 10 1 - # - dataset_generator: "wut" - # dataset_name: "isolation" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "labirynth" # 6 1 - # - dataset_generator: "wut" - # dataset_name: "mk1" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "mk2" # 2 1 - # - dataset_generator: "wut" - # dataset_name: "mk3" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "mk4" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "olympic" # 5 1 - # - dataset_generator: "wut" - # dataset_name: "smile" # 4, 6 2 - # - dataset_generator: "wut" - # dataset_name: "stripes" # 2 1 - # - dataset_generator: "wut" - # dataset_name: "trajectories" # 4 1 - # - dataset_generator: "wut" - # dataset_name: "trapped_lovers" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "twosplashes" # 2 1 - # - dataset_generator: "wut" - # dataset_name: "windows" # 5 1 - # - dataset_generator: "wut" - # dataset_name: "x1" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "x2" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "x3" # 4 1 - # - dataset_generator: "wut" - # dataset_name: "z1" # 3 1 - # - dataset_generator: "wut" - # dataset_name: "z2" # 5 1 - # - dataset_generator: "wut" - # dataset_name: "z3" # 4 1 + dataset_name: ["atom", "chainlink", "engytime", "hepta", "lsun", "target", "tetra", "twodiamonds", "wingnut"] + - dataset_generator: "graves" + dataset_name: ["dense", "fuzzyx", "line", "parabolic", "ring", "ring_noisy", "ring_outliers", "zigzag", "zigzag_noisy", "zigzag_outliers"] + - dataset_generator: "other" + dataset_name: ["chameleon_t4_8k", "chameleon_t5_8k", "hdbscan", "iris", "iris5", "square"] + - dataset_generator: "sipu" + dataset_name: ["aggregation", "compound", "flame", "jain", "pathbased", "r15", "spiral", "unbalance"] + - dataset_generator: "uci" + dataset_name: ["ecoli", "ionosphere", "sonar", "statlog", "wdbc", "wine", "yeast"] + - dataset_generator: "wut" + dataset_name: ["circles", "cross", "graph", "isolation", "labirynth", "mk1", "mk2", "mk3", "mk4", "olympic", "smile", "stripes", "trajectories", "trapped_lovers", "twosplashes", "windows", "x1", "x2", "x3", "z1", "z2", "z3"] outputs: - id: data.matrix path: "{dataset}.data.gz" @@ -182,8 +67,7 @@ stages: url: https://github.com/imallona/clustbench_fastcluster commit: e644ce5 parameters: - - linkage: "complete" - #- linkage: ["ward", "average", "weighted", "median", "centroid"] + - linkage: ["complete", "ward", "average", "weighted", "median", "centroid"] - id: sklearn name: "sklearn" software_environment: "clustbench" @@ -191,9 +75,8 @@ stages: url: https://github.com/imallona/clustbench_sklearn commit: dcf35e1 parameters: - - method: "birch" - # ["kmeans, "gm"] - # ["spectral"] ## too slow + - method: ["birch", "kmeans", "gm"] + # "spectral" ## too slow - id: agglomerative name: "agglomerative" software_environment: "clustbench" @@ -201,8 +84,7 @@ stages: url: https://github.com/imallona/clustbench_agglomerative commit: 9d086a9 parameters: - - linkage: "average" - # ["complete", "ward"] + - linkage: ["average", "complete", "ward"] - id: genieclust name: "genieclust" software_environment: "clustbench" @@ -211,8 +93,8 @@ stages: commit: 7d9e799 parameters: - method: "genie" - # method: ["gic", "ica"] gini_threshold: 0.5 + - method: ["gic", "ica"] - id: fcps name: "fcps" software_environment: "fcps" @@ -220,19 +102,9 @@ stages: url: https://github.com/imallona/clustbench_fcps commit: e780fed parameters: - - method: "FCPS_Minimax" + - method: ["FCPS_Minimax", "FCPS_MinEnergy", "FCPS_HDBSCAN_2", "FCPS_HDBSCAN_4", "FCPS_HDBSCAN_8", "FCPS_Diana", "FCPS_Fanny", "FCPS_Hardcl", "FCPS_Softcl", "FCPS_Clara", "FCPS_PAM"] seed: 2 # - "FCPS_AdaptiveDensityPeak" # not in Conda - # - "FCPS_MinEnergy", - # - "FCPS_HDBSCAN_2", - # - "FCPS_HDBSCAN_4", - # - "FCPS_HDBSCAN_8", - # - "FCPS_Diana", - # - "FCPS_Fanny", - # - "FCPS_Hardcl", - # - "FCPS_Softcl", - # - "FCPS_Clara", - # - "FCPS_PAM" inputs: - data.matrix - data.true_labels @@ -249,7 +121,7 @@ stages: url: https://github.com/imallona/clustbench_metrics commit: c4eda85 parameters: - - metric: ["normalized_clustering_accuracy", "adjusted_fm_score"] + - metric: ["normalized_clustering_accuracy", "adjusted_fm_score", "adjusted_rand_score"] # - "adjusted_mi_score" # - "adjusted_rand_score" # - "fm_score" From 97bed635cb60228f991af7c18b204ab379759fe6 Mon Sep 17 00:00:00 2001 From: Mark Robinson Date: Tue, 20 Jan 2026 21:30:14 +0100 Subject: [PATCH 4/4] few more harmonizations - naming mostly --- Clustering_conda.yml | 6 +++--- Clustering_envmodules.yml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 3dfd3cc..de30450 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,7 +1,7 @@ id: omni-clustbench-conda -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +description: "Clustering benchmark on Gagolewski's, true number of clusters plus minus 2." version: "1.5.0" -benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo, Mark Robinson" storage: api: S3 endpoint: http://omnibenchmark.mls.uzh.ch:9000 @@ -17,7 +17,7 @@ software_environments: #metric_collectors: # - id: plotting # name: "Single-backend metric collector." -# software_environment: "fcps" +# software_environment: fcps # repository: # url: https://github.com/imallona/clustering_report # commit: 040 diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index ab90b12..14214ae 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -9,10 +9,10 @@ storage: software_backend: envmodules software_environments: clustbench: - description: "clustbench python" + description: "clustbench on py3.12.6" envmodule: clustbench fcps: - description: "R deps" + description: "CRAN's FCPS" envmodule: fcps #metric_collectors: # - id: plotting