From 0db12b5659b1469eaabfb3ea3819824969d921bb Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 14 Dec 2022 14:53:28 +0100
Subject: [PATCH 01/42] add mask_dataset

---
 src/joint_embedding/api/anndata_dataset.yaml  | 75 ++++++++++++++
 src/joint_embedding/api/anndata_solution.yaml | 57 +++++++++++
 src/joint_embedding/api/authors.yaml          |  8 ++
 .../mask_dataset/config.vsh.yaml              | 24 +++++
 src/joint_embedding/mask_dataset/script.R     | 97 +++++++++++++++++++
 src/joint_embedding/mask_dataset/test.R       | 55 +++++++++++
 .../split_dataset/config.vsh.yaml             | 57 +++++++++++
 src/joint_embedding/split_dataset/script.R    | 97 +++++++++++++++++++
 src/joint_embedding/split_dataset/test.R      | 55 +++++++++++
 9 files changed, 525 insertions(+)
 create mode 100644 src/joint_embedding/api/anndata_dataset.yaml
 create mode 100644 src/joint_embedding/api/anndata_solution.yaml
 create mode 100644 src/joint_embedding/api/authors.yaml
 create mode 100644 src/joint_embedding/mask_dataset/config.vsh.yaml
 create mode 100644 src/joint_embedding/mask_dataset/script.R
 create mode 100644 src/joint_embedding/mask_dataset/test.R
 create mode 100644 src/joint_embedding/split_dataset/config.vsh.yaml
 create mode 100644 src/joint_embedding/split_dataset/script.R
 create mode 100644 src/joint_embedding/split_dataset/test.R

diff --git a/src/joint_embedding/api/anndata_dataset.yaml b/src/joint_embedding/api/anndata_dataset.yaml
new file mode 100644
index 0000000000..90a5f4a385
--- /dev/null
+++ b/src/joint_embedding/api/anndata_dataset.yaml
@@ -0,0 +1,75 @@
+type: file
+description: "A raw dataset"
+example: "dataset.h5ad"
+info:
+  label: "Dataset"
+  slots:
+    layers: 
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: false
+      - type: double
+        name: size_factors
+        description: The size factors created by the normalisation method, if any.
+        required: false
+      - type: string
+        name: cell_type
+        description: Type of cells
+        required: false
+      - type: string
+        name: pseudotime_order_GEX
+        description:
+        required: false
+      - type: string
+        name: pseudotime_order_ATAC
+        description:
+        required: false
+      - type: string
+        name: pseudotime_order_ADT
+        description:
+        required: false
+      - type: double
+        name: S_score
+        description:
+        required: false
+      - type: double
+        name: G2M_score
+        description:
+        required: false
+      - type: boolean
+        name: is_train
+        description: if sample is train data
+        required: true
+    var:
+      - type: string
+        name: gene_ids
+        description: 
+        required: false
+      - type: string
+        name: feature_types
+        description:
+        required: true
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: organism
+        description: "data from which organism "
+        required: false
+      - type: string
+        name: gene_activity_var_names
+        description:
+        required: false
+      - type: string
+        name: sample_pm_varnames
+        description:
+        required: false
+    
diff --git a/src/joint_embedding/api/anndata_solution.yaml b/src/joint_embedding/api/anndata_solution.yaml
new file mode 100644
index 0000000000..2ed03e1cc3
--- /dev/null
+++ b/src/joint_embedding/api/anndata_solution.yaml
@@ -0,0 +1,57 @@
+type: file
+description: "The solution for the data"
+example: "solution.h5ad"
+info:
+  short_description: "Solution"
+  slots:
+    layers: 
+      - type: integer
+        name: counts
+        description: Raw counts
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: false
+      - type: string
+        name: cell_type
+        description: Type of cells
+        required: false
+      - type: string
+        name: pseudotime_order_GEX
+        description:
+        required: false
+      - type: string
+        name: pseudotime_order_ATAC
+        description:
+        required: false
+      - type: string
+        name: pseudotime_order_ADT
+        description:
+        required: false
+      - type: double
+        name: S_score
+        description:
+        required: false
+      - type: double
+        name: G2M_score
+        description:
+        required: false
+    var:
+      - type: string
+        name: feature_types
+        description: 
+        required: true
+      - type: string
+        name: gene_ids
+        description:
+        required: false
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: organism
+        description: which organism
+        required: true
diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml
new file mode 100644
index 0000000000..7fc237c267
--- /dev/null
+++ b/src/joint_embedding/api/authors.yaml
@@ -0,0 +1,8 @@
+functionality:
+  authors:
+    - name: Robrecht Cannoodt
+      roles: [ author ]
+      props: { github: rcannood, orcid: "0000-0003-3641-729X" }
+    - name: Kai Waldrant
+      roles: [ contributor ]
+      props: { github: KaiWaldrant }
\ No newline at end of file
diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml
new file mode 100644
index 0000000000..7ec45d9c5e
--- /dev/null
+++ b/src/joint_embedding/mask_dataset/config.vsh.yaml
@@ -0,0 +1,24 @@
+__merge__: ../api/comp_mask_dataset.yaml
+functionality:
+  name: mask_dataset
+  namespace: joint_embedding
+  description: |
+    A component for censoring joint embedding datasets to be given
+    to competition participants for the 'joint embedding' task.
+  arguments:
+    - name: "--train_only"
+      type: "boolean_true"
+      description: Whether to keep only the train cells (test cells are filtered out).
+  resources:
+    - type: r_script
+      path: script.R
+  tests:
+    - type: r_script
+      path: test.R
+    - path: ../../../../resources_test
+platforms:
+  - type: docker
+    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+  - type: nextflow
+    publish: true
+    labels: [ midmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R
new file mode 100644
index 0000000000..58935f5bca
--- /dev/null
+++ b/src/joint_embedding/mask_dataset/script.R
@@ -0,0 +1,97 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+requireNamespace("anndata", quietly = TRUE)
+library(assertthat, quietly = TRUE, warn.conflicts = FALSE)
+library(Matrix, quietly = TRUE, warn.conflicts = FALSE)
+
+## VIASH START
+input_path <- "output/datasets_2021-11-08/common/openproblems_bmmc_multiome_phase1v2/openproblems_bmmc_multiome_phase1v2.manual_formatting."
+output_path <- ""
+
+par <- list(
+  input_mod1 = paste0(input_path, "output_rna.h5ad"),
+  input_mod2 = paste0(input_path, "output_mod2.h5ad"),
+  output_mod1 = paste0(output_path, "output_mod1.h5ad"),
+  output_mod2 = paste0(output_path, "output_mod2.h5ad"),
+  output_solution = paste0(output_path, "solution.h5ad"),
+  train_only = TRUE
+)
+## VIASH END
+
+cat("Reading mod1 data\n")
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+ad1_mod <- unique(input_mod1$var[["feature_types"]])
+new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE")
+ad1_uns <- list(dataset_id = new_dataset_id, organism = input_mod1$uns[["organism"]] %||% "human") # carry over the input organism; "human" is only the fallback when the input has none
+ad2_uns <- ad1_uns # mod2 starts from the same metadata; R copies on modify, so later additions to ad2_uns leave ad1_uns untouched
+
+cat("Creating mod1 object\n")
+out_mod1 <- anndata::AnnData(
+  X = input_mod1$X,
+  layers = list(counts = input_mod1$layers[["counts"]]),
+  var = input_mod1$var %>% select(one_of("gene_ids"), feature_types),
+  obs = input_mod1$obs %>% select(one_of("batch", "size_factors")),
+  uns = ad1_uns
+)
+
+cat("Create solution object\n")
+out_solution <- anndata::AnnData(
+  X = input_mod1$X,
+  var = input_mod1$var %>% select(one_of("gene_ids"), feature_types),
+  obs = input_mod1$obs %>% select(
+    one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score")
+  ),
+  uns = ad1_uns
+)
+
+is_train <- input_mod1$obs$is_train
+
+if (par$train_only) {
+  cat("Filtering out test cells\n", sep = "")
+  out_mod1 <- out_mod1[is_train, ] #$copy()
+  out_solution <- out_solution[is_train, ]# $copy()
+}
+
+rm(input_mod1)
+gc()
+
+cat("Reading mod2 data\n")
+input_mod2 <- anndata::read_h5ad(par$input_mod2)
+ad2_mod <- unique(input_mod2$var[["feature_types"]])
+ad2_obsm <- list()
+
+if (identical(as.character(ad2_mod), "ATAC")) { # identical() always yields one TRUE/FALSE, even if unique() returned zero or several feature types
+  ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names
+  ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix")
+}
+
+cat("Creating mod2 object\n")
+out_mod2 <- anndata::AnnData(
+  X = input_mod2$X,
+  layers = list(counts = input_mod2$layers[["counts"]]),
+  var = input_mod2$var %>% select(one_of("gene_ids"), feature_types),
+  obs = input_mod2$obs %>% select(one_of("batch")),
+  obsm = ad2_obsm,
+  uns = ad2_uns
+)
+rm(input_mod2)
+gc()
+
+if (par$train_only) {
+  cat("Filtering out test cells\n", sep = "")
+  out_mod2 <- out_mod2[is_train, ] #$copy()
+}
+
+cat("Saving output files as h5ad\n")
+cat("output_mod1:")
+print(out_mod1)
+zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip")
+
+cat("output_mod2:")
+print(out_mod2)
+zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip")
+
+cat("output_solution:")
+print(out_solution)
+zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip")
diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R
new file mode 100644
index 0000000000..09fe193b55
--- /dev/null
+++ b/src/joint_embedding/mask_dataset/test.R
@@ -0,0 +1,55 @@
+library(testthat, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+
+par <- list(
+  input_mod1 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad",
+  input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad",
+  output_mod1 = "output_mod1.h5ad",
+  output_mod2 = "output_mod2.h5ad",
+  output_solution = "solution.h5ad"
+)
+
+cat("> Running censor component\n")
+out <- processx::run(
+  command = paste0("./", meta["functionality_name"]),
+  args = c(
+    "--input_mod1", par$input_mod1,
+    "--input_mod2", par$input_mod2,
+    "--output_mod1", par$output_mod1,
+    "--output_mod2", par$output_mod2,
+    "--output_solution", par$output_solution
+  ),
+  stderr_to_stdout = TRUE
+)
+
+cat("> Checking whether output files were created\n")
+expect_true(file.exists(par$output_mod1))
+expect_true(file.exists(par$output_mod2))
+expect_true(file.exists(par$output_solution))
+
+cat("> Reading h5ad files\n")
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+input_mod2 <- anndata::read_h5ad(par$input_mod2)
+output_mod1 <- anndata::read_h5ad(par$output_mod1)
+output_mod2 <- anndata::read_h5ad(par$output_mod2)
+output_solution <- anndata::read_h5ad(par$output_solution)
+
+cat("> Checking contents of h5ad files\n")
+expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_mod1$n_obs, input_mod1$n_obs)
+expect_equal(output_mod2$n_obs, input_mod2$n_obs)
+expect_equal(output_mod1$n_vars, input_mod1$n_vars)
+expect_equal(output_mod2$n_vars, input_mod2$n_vars)
+expect_equal(output_mod1$var_names, input_mod1$var_names)
+expect_equal(output_mod2$var_names, input_mod2$var_names)
+expect_equal(output_mod1$obs_names, input_mod1$obs_names)
+expect_equal(output_mod2$obs_names, input_mod2$obs_names)
+
+# TODO check contents of matrices, check rownames
+
+cat("> Test succeeded!\n")
diff --git a/src/joint_embedding/split_dataset/config.vsh.yaml b/src/joint_embedding/split_dataset/config.vsh.yaml
new file mode 100644
index 0000000000..2e85c26fc4
--- /dev/null
+++ b/src/joint_embedding/split_dataset/config.vsh.yaml
@@ -0,0 +1,57 @@
+functionality:
+  name: censor_dataset
+  namespace: joint_embedding_datasets
+  version: dev
+  description: |
+    A component for censoring joint embedding datasets to be given
+    to competition participants for the 'joint embedding' task.
+  authors:
+    - name: Robrecht Cannoodt
+      email: rcannood@gmail.com
+      roles: [ author, maintainer ]
+      props: { github: rcannood, orcid: "0000-0003-3641-729X" }
+  arguments:
+    - name: "--input_mod1"
+      type: "file"
+      example: "dataset.h5ad"
+      description: An input h5ad dataset.
+      required: true
+    - name: "--input_mod2"
+      type: "file"
+      example: "dataset.h5ad"
+      description: An input h5ad dataset.
+      required: true
+    - name: "--output_mod1"
+      type: "file"
+      direction: "output"
+      example: "dataset_mod1.h5ad"
+      description: Output mod1 file.
+      required: true
+    - name: "--output_mod2"
+      type: "file"
+      direction: "output"
+      example: "dataset_mod2.h5ad"
+      description: Output mod1 file.
+      required: true
+    - name: "--output_solution"
+      type: "file"
+      direction: "output"
+      example: "dataset_solution.h5ad"
+      description: The solution file.
+      required: true
+    - name: "--train_only"
+      type: "boolean_true"
+      description: Whether or not to only omit the train cells.
+  resources:
+    - type: r_script
+      path: script.R
+  tests:
+    - type: r_script
+      path: test.R
+    - path: ../../../../resources_test
+platforms:
+  - type: docker
+    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+  - type: nextflow
+    publish: true
+    labels: [ midmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/split_dataset/script.R b/src/joint_embedding/split_dataset/script.R
new file mode 100644
index 0000000000..58935f5bca
--- /dev/null
+++ b/src/joint_embedding/split_dataset/script.R
@@ -0,0 +1,97 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+requireNamespace("anndata", quietly = TRUE)
+library(assertthat, quietly = TRUE, warn.conflicts = FALSE)
+library(Matrix, quietly = TRUE, warn.conflicts = FALSE)
+
+## VIASH START
+input_path <- "output/datasets_2021-11-08/common/openproblems_bmmc_multiome_phase1v2/openproblems_bmmc_multiome_phase1v2.manual_formatting."
+output_path <- ""
+
+par <- list(
+  input_mod1 = paste0(input_path, "output_rna.h5ad"),
+  input_mod2 = paste0(input_path, "output_mod2.h5ad"),
+  output_mod1 = paste0(output_path, "output_mod1.h5ad"),
+  output_mod2 = paste0(output_path, "output_mod2.h5ad"),
+  output_solution = paste0(output_path, "solution.h5ad"),
+  train_only = TRUE
+)
+## VIASH END
+
+cat("Reading mod1 data\n")
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+ad1_mod <- unique(input_mod1$var[["feature_types"]])
+new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE")
+ad1_uns <- list(dataset_id = new_dataset_id, organism = "human")
+ad2_uns <- list(dataset_id = new_dataset_id, organism = "human")
+
+cat("Creating mod1 object\n")
+out_mod1 <- anndata::AnnData(
+  X = input_mod1$X,
+  layers = list(counts = input_mod1$layers[["counts"]]),
+  var = input_mod1$var %>% select(one_of("gene_ids"), feature_types),
+  obs = input_mod1$obs %>% select(one_of("batch", "size_factors")),
+  uns = ad1_uns
+)
+
+cat("Create solution object\n")
+out_solution <- anndata::AnnData(
+  X = input_mod1$X,
+  var = input_mod1$var %>% select(one_of("gene_ids"), feature_types),
+  obs = input_mod1$obs %>% select(
+    one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score")
+  ),
+  uns = ad1_uns
+)
+
+is_train <- input_mod1$obs$is_train
+
+if (par$train_only) {
+  cat("Filtering out test cells\n", sep = "")
+  out_mod1 <- out_mod1[is_train, ] #$copy()
+  out_solution <- out_solution[is_train, ]# $copy()
+}
+
+rm(input_mod1)
+gc()
+
+cat("Reading mod2 data\n")
+input_mod2 <- anndata::read_h5ad(par$input_mod2)
+ad2_mod <- unique(input_mod2$var[["feature_types"]])
+ad2_obsm <- list()
+
+if (ad2_mod == "ATAC") {
+  ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names
+  ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix")
+}
+
+cat("Creating mod2 object\n")
+out_mod2 <- anndata::AnnData(
+  X = input_mod2$X,
+  layers = list(counts = input_mod2$layers[["counts"]]),
+  var = input_mod2$var %>% select(one_of("gene_ids"), feature_types),
+  obs = input_mod2$obs %>% select(one_of("batch")),
+  obsm = ad2_obsm,
+  uns = ad2_uns
+)
+rm(input_mod2)
+gc()
+
+if (par$train_only) {
+  cat("Filtering out test cells\n", sep = "")
+  out_mod2 <- out_mod2[is_train, ] #$copy()
+}
+
+cat("Saving output files as h5ad\n")
+cat("output_mod1:")
+print(out_mod1)
+zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip")
+
+cat("output_mod2:")
+print(out_mod2)
+zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip")
+
+cat("output_solution:")
+print(out_solution)
+zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip")
diff --git a/src/joint_embedding/split_dataset/test.R b/src/joint_embedding/split_dataset/test.R
new file mode 100644
index 0000000000..09fe193b55
--- /dev/null
+++ b/src/joint_embedding/split_dataset/test.R
@@ -0,0 +1,55 @@
+library(testthat, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+
+par <- list(
+  input_mod1 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad",
+  input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad",
+  output_mod1 = "output_mod1.h5ad",
+  output_mod2 = "output_mod2.h5ad",
+  output_solution = "solution.h5ad"
+)
+
+cat("> Running censor component\n")
+out <- processx::run(
+  command = paste0("./", meta["functionality_name"]),
+  args = c(
+    "--input_mod1", par$input_mod1,
+    "--input_mod2", par$input_mod2,
+    "--output_mod1", par$output_mod1,
+    "--output_mod2", par$output_mod2,
+    "--output_solution", par$output_solution
+  ),
+  stderr_to_stdout = TRUE
+)
+
+cat("> Checking whether output files were created\n")
+expect_true(file.exists(par$output_mod1))
+expect_true(file.exists(par$output_mod2))
+expect_true(file.exists(par$output_solution))
+
+cat("> Reading h5ad files\n")
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+input_mod2 <- anndata::read_h5ad(par$input_mod2)
+output_mod1 <- anndata::read_h5ad(par$output_mod1)
+output_mod2 <- anndata::read_h5ad(par$output_mod2)
+output_solution <- anndata::read_h5ad(par$output_solution)
+
+cat("> Checking contents of h5ad files\n")
+expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_mod1$n_obs, input_mod1$n_obs)
+expect_equal(output_mod2$n_obs, input_mod2$n_obs)
+expect_equal(output_mod1$n_vars, input_mod1$n_vars)
+expect_equal(output_mod2$n_vars, input_mod2$n_vars)
+expect_equal(output_mod1$var_names, input_mod1$var_names)
+expect_equal(output_mod2$var_names, input_mod2$var_names)
+expect_equal(output_mod1$obs_names, input_mod1$obs_names)
+expect_equal(output_mod2$obs_names, input_mod2$obs_names)
+
+# TODO check contents of matrices, check rownames
+
+cat("> Test succeeded!\n")

From bc9911253979a4bd33977eb0a87312f578532dfb Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 14 Dec 2022 15:20:21 +0100
Subject: [PATCH 02/42] debug mask_dataset test

---
 src/joint_embedding/api/comp_mask_dataset.yaml   | 15 +++++++++++++++
 src/joint_embedding/mask_dataset/config.vsh.yaml |  5 ++---
 src/joint_embedding/mask_dataset/test.R          |  2 +-
 3 files changed, 18 insertions(+), 4 deletions(-)
 create mode 100644 src/joint_embedding/api/comp_mask_dataset.yaml

diff --git a/src/joint_embedding/api/comp_mask_dataset.yaml b/src/joint_embedding/api/comp_mask_dataset.yaml
new file mode 100644
index 0000000000..0b97e89fca
--- /dev/null
+++ b/src/joint_embedding/api/comp_mask_dataset.yaml
@@ -0,0 +1,15 @@
+functionality:
+  arguments:
+    - name: "--input_mod1"
+      __merge__: anndata_dataset.yaml
+    - name: "--input_mod2"
+      __merge__: anndata_dataset.yaml
+    - name: "--output_mod1"
+      __merge__: anndata_masked_mod1.yaml
+      direction: output
+    - name: "--output_mod2"
+      __merge__: anndata_masked_mod2.yaml
+      direction: output
+    - name: "--output_solution"
+      __merge__: anndata_solution.yaml
+      direction: output
\ No newline at end of file
diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml
index 7ec45d9c5e..5d2890598e 100644
--- a/src/joint_embedding/mask_dataset/config.vsh.yaml
+++ b/src/joint_embedding/mask_dataset/config.vsh.yaml
@@ -15,10 +15,9 @@ functionality:
   tests:
     - type: r_script
       path: test.R
-    - path: ../../../../resources_test
+    - path: ../../../resources_test
 platforms:
   - type: docker
     image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
   - type: nextflow
-    publish: true
-    labels: [ midmem, lowtime, lowcpu ]
+    directives: [ midmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R
index 09fe193b55..ad8cc55eec 100644
--- a/src/joint_embedding/mask_dataset/test.R
+++ b/src/joint_embedding/mask_dataset/test.R
@@ -9,7 +9,7 @@ par <- list(
   output_solution = "solution.h5ad"
 )
 
-cat("> Running censor component\n")
+cat("> Running mask component\n")
 out <- processx::run(
   command = paste0("./", meta["functionality_name"]),
   args = c(

From 262a1edb761ec5d8450422be4e5bc69efb6c3dd3 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 14 Dec 2022 15:23:39 +0100
Subject: [PATCH 03/42] add masked anndata api

---
 .../api/anndata_masked_mod1.yaml              | 37 ++++++++++++++++++
 .../api/anndata_masked_mod2.yaml              | 39 +++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 src/joint_embedding/api/anndata_masked_mod1.yaml
 create mode 100644 src/joint_embedding/api/anndata_masked_mod2.yaml

diff --git a/src/joint_embedding/api/anndata_masked_mod1.yaml b/src/joint_embedding/api/anndata_masked_mod1.yaml
new file mode 100644
index 0000000000..c247565645
--- /dev/null
+++ b/src/joint_embedding/api/anndata_masked_mod1.yaml
@@ -0,0 +1,37 @@
+type: file
+description: "The masked data"
+example: "masked.h5ad"
+info:
+  short_description: "masked data"
+  slots:
+    layers: 
+      - type: integer
+        name: counts
+        description: Raw counts
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: false
+      - type: double
+        name: size_factors
+        description:
+        required: false
+    var:
+      - type: string
+        name: feature_types
+        description: 
+        required: true
+      - type: string
+        name: gene_ids
+        description:
+        required: false
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: organism
+        description: which organism
+        required: true
diff --git a/src/joint_embedding/api/anndata_masked_mod2.yaml b/src/joint_embedding/api/anndata_masked_mod2.yaml
new file mode 100644
index 0000000000..ad735fffba
--- /dev/null
+++ b/src/joint_embedding/api/anndata_masked_mod2.yaml
@@ -0,0 +1,39 @@
+type: file
+description: "The masked data for mod2 file"
+example: "masked.h5ad"
+info:
+  short_description: "Masked data"
+  slots:
+    layers: 
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: true
+    var:
+      - type: string
+        name: feature_types
+        description:
+        required: true
+      - type: string
+        name: gene_ids
+        description:
+        required: false
+    obsm:
+      - type: double
+        name: gene_activity
+        description: 
+        required: false
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: organism
+        description: which organism
+        required: true

From 3f367e120fc83bde8f08f4bca5d4f1fa9fb9c7c8 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 14 Dec 2022 17:00:13 +0100
Subject: [PATCH 04/42] add random_embed negative control

---
 src/joint_embedding/api/anndata_dataset.yaml  |  2 +-
 .../api/anndata_masked_mod1.yaml              |  2 +-
 .../api/anndata_prediction.yaml               | 25 +++++
 src/joint_embedding/api/authors.yaml          |  6 +-
 .../api/comp_control_method.yaml              | 47 +++++++++
 .../random_embed/config.vsh.yaml              | 24 +++++
 .../control_methods/random_embed/script.py    | 26 +++++
 .../mask_dataset/config.vsh.yaml              |  2 +-
 src/joint_embedding/mask_dataset/script.R     |  2 +-
 .../split_dataset/config.vsh.yaml             | 57 -----------
 src/joint_embedding/split_dataset/script.R    | 97 -------------------
 src/joint_embedding/split_dataset/test.R      | 55 -----------
 12 files changed, 131 insertions(+), 214 deletions(-)
 create mode 100644 src/joint_embedding/api/anndata_prediction.yaml
 create mode 100644 src/joint_embedding/api/comp_control_method.yaml
 create mode 100644 src/joint_embedding/control_methods/random_embed/config.vsh.yaml
 create mode 100644 src/joint_embedding/control_methods/random_embed/script.py
 delete mode 100644 src/joint_embedding/split_dataset/config.vsh.yaml
 delete mode 100644 src/joint_embedding/split_dataset/script.R
 delete mode 100644 src/joint_embedding/split_dataset/test.R

diff --git a/src/joint_embedding/api/anndata_dataset.yaml b/src/joint_embedding/api/anndata_dataset.yaml
index 90a5f4a385..23c6b427be 100644
--- a/src/joint_embedding/api/anndata_dataset.yaml
+++ b/src/joint_embedding/api/anndata_dataset.yaml
@@ -13,7 +13,7 @@ info:
       - type: string
         name: batch
         description: Batch information
-        required: false
+        required: true
       - type: double
         name: size_factors
         description: The size factors created by the normalisation method, if any.
diff --git a/src/joint_embedding/api/anndata_masked_mod1.yaml b/src/joint_embedding/api/anndata_masked_mod1.yaml
index c247565645..7ca6820671 100644
--- a/src/joint_embedding/api/anndata_masked_mod1.yaml
+++ b/src/joint_embedding/api/anndata_masked_mod1.yaml
@@ -12,7 +12,7 @@ info:
       - type: string
         name: batch
         description: Batch information
-        required: false
+        required: true
       - type: double
         name: size_factors
         description:
diff --git a/src/joint_embedding/api/anndata_prediction.yaml b/src/joint_embedding/api/anndata_prediction.yaml
new file mode 100644
index 0000000000..49d8ae7d79
--- /dev/null
+++ b/src/joint_embedding/api/anndata_prediction.yaml
@@ -0,0 +1,25 @@
+type: file
+description: "The prediction file"
+example: "prediction.h5ad"
+info:
+  short_description: "Prediction"
+  slots:     
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: true
+    obsm:
+      - type: double
+        name: X_emb
+        description:
+        required: true
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: method_id
+        description: "A unique identifier for the method"
+        required: true
diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml
index 7fc237c267..a1467a402c 100644
--- a/src/joint_embedding/api/authors.yaml
+++ b/src/joint_embedding/api/authors.yaml
@@ -5,4 +5,8 @@ functionality:
       props: { github: rcannood, orcid: "0000-0003-3641-729X" }
     - name: Kai Waldrant
       roles: [ contributor ]
-      props: { github: KaiWaldrant }
\ No newline at end of file
+      props: { github: KaiWaldrant }
+    - name: Alex Tong
+      email: alexandertongdev@gmail.com
+      roles: [ author, maintainer ]
+      props: { github: atong01 }
\ No newline at end of file
diff --git a/src/joint_embedding/api/comp_control_method.yaml b/src/joint_embedding/api/comp_control_method.yaml
new file mode 100644
index 0000000000..5477d32e75
--- /dev/null
+++ b/src/joint_embedding/api/comp_control_method.yaml
@@ -0,0 +1,47 @@
+functionality:
+  arguments:
+    - name: "--input_mod1"
+      __merge__: anndata_masked_mod1.yaml
+    - name: "--output"
+      __merge__: anndata_prediction.yaml
+      direction: output
+  test_resources:
+    - path: ../../../../resources_test   
+    - type: python_script  # inline test script, merged into every config that pulls in this API file
+      path: generic_test.py
+      text: |
+        from os import path
+        import subprocess
+        import anndata as ad
+        import scipy.sparse  # issparse lives in the submodule; a bare "import scipy" need not load it
+
+        input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad"
+        output_path = "output.h5ad"
+
+        cmd = [
+            meta['executable'],
+            "--input_mod1", input_mod1_path,
+            "--output", output_path
+        ]
+
+        print(">> Running script as test")
+        out = subprocess.run(cmd, check=True, capture_output=True, text=True)
+
+        print("> Checking whether output files were created")
+        assert path.exists(output_path)
+
+        print("> Reading h5ad files")
+        input_mod1 = ad.read_h5ad(input_mod1_path)
+        output = ad.read_h5ad(output_path)
+
+        print("> Checking contents of output.h5ad")
+        assert output.uns['dataset_id'] == input_mod1.uns['dataset_id']
+        assert output.uns['method_id'] == meta['functionality_name']
+        assert output.n_obs == input_mod1.n_obs
+        print(output.n_vars)
+        assert output.n_vars >= 1
+        assert output.n_vars <= 100
+        assert all(output.obs_names == input_mod1.obs_names)
+        assert not scipy.sparse.issparse(output.obsm['X_emb'])  # obsm is keyed by the string 'X_emb'; the embedding must be dense
+
+        print("> Test succeeded!")
diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
new file mode 100644
index 0000000000..cdc3061a8e
--- /dev/null
+++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
@@ -0,0 +1,24 @@
+__merge__: ../../api/comp_control_method.yaml
+functionality:
+  name: random_embed
+  namespace: joint_embedding/control_methods
+  description: Generate a random embedding from a normal distribution.
+  info:
+    type: negative_control
+    label: Normal Dist.
+  arguments:
+    - name: "--n_dims"
+      type: "integer"
+      default: 100
+      description: Number of dimensions to output.
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: "python:3.10"
+    setup:
+      - type: python
+        pip: [ anndata>=0.8, numpy , scipy]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/control_methods/random_embed/script.py b/src/joint_embedding/control_methods/random_embed/script.py
new file mode 100644
index 0000000000..199ad08176
--- /dev/null
+++ b/src/joint_embedding/control_methods/random_embed/script.py
@@ -0,0 +1,26 @@
+import anndata
+import numpy as np
+from scipy import sparse
+
+## VIASH START
+par = {
+    "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad",
+    "output": "output/output_prediction.h5ad",
+    "n_dims": 100,
+}
+## VIASH END
+
+print("Load and prepare data")
+adata_mod1 = anndata.read_h5ad(par["input_mod1"])
+
+X = np.random.randn(adata_mod1.shape[0], par["n_dims"])
+print("Saving output")
+adata_out = anndata.AnnData(
+    X=X,
+    obsm= {"X_emb": sparse.csr_matrix(X)},
+    obs=adata_mod1.obs[["batch"]],
+    uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "random_embed"},
+)
+del adata_out.X
+
+adata_out.write_h5ad(par["output"], compression="gzip")
diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml
index 5d2890598e..1f54f2df62 100644
--- a/src/joint_embedding/mask_dataset/config.vsh.yaml
+++ b/src/joint_embedding/mask_dataset/config.vsh.yaml
@@ -12,7 +12,7 @@ functionality:
   resources:
     - type: r_script
       path: script.R
-  tests:
+  test_resources:
     - type: r_script
       path: test.R
     - path: ../../../resources_test
diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R
index 58935f5bca..2d70248e69 100644
--- a/src/joint_embedding/mask_dataset/script.R
+++ b/src/joint_embedding/mask_dataset/script.R
@@ -6,7 +6,7 @@ library(assertthat, quietly = TRUE, warn.conflicts = FALSE)
 library(Matrix, quietly = TRUE, warn.conflicts = FALSE)
 
 ## VIASH START
-input_path <- "output/datasets_2021-11-08/common/openproblems_bmmc_multiome_phase1v2/openproblems_bmmc_multiome_phase1v2.manual_formatting."
+input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
 output_path <- ""
 
 par <- list(
diff --git a/src/joint_embedding/split_dataset/config.vsh.yaml b/src/joint_embedding/split_dataset/config.vsh.yaml
deleted file mode 100644
index 2e85c26fc4..0000000000
--- a/src/joint_embedding/split_dataset/config.vsh.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-functionality:
-  name: censor_dataset
-  namespace: joint_embedding_datasets
-  version: dev
-  description: |
-    A component for censoring joint embedding datasets to be given
-    to competition participants for the 'joint embedding' task.
-  authors:
-    - name: Robrecht Cannoodt
-      email: rcannood@gmail.com
-      roles: [ author, maintainer ]
-      props: { github: rcannood, orcid: "0000-0003-3641-729X" }
-  arguments:
-    - name: "--input_mod1"
-      type: "file"
-      example: "dataset.h5ad"
-      description: An input h5ad dataset.
-      required: true
-    - name: "--input_mod2"
-      type: "file"
-      example: "dataset.h5ad"
-      description: An input h5ad dataset.
-      required: true
-    - name: "--output_mod1"
-      type: "file"
-      direction: "output"
-      example: "dataset_mod1.h5ad"
-      description: Output mod1 file.
-      required: true
-    - name: "--output_mod2"
-      type: "file"
-      direction: "output"
-      example: "dataset_mod2.h5ad"
-      description: Output mod1 file.
-      required: true
-    - name: "--output_solution"
-      type: "file"
-      direction: "output"
-      example: "dataset_solution.h5ad"
-      description: The solution file.
-      required: true
-    - name: "--train_only"
-      type: "boolean_true"
-      description: Whether or not to only omit the train cells.
-  resources:
-    - type: r_script
-      path: script.R
-  tests:
-    - type: r_script
-      path: test.R
-    - path: ../../../../resources_test
-platforms:
-  - type: docker
-    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
-  - type: nextflow
-    publish: true
-    labels: [ midmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/split_dataset/script.R b/src/joint_embedding/split_dataset/script.R
deleted file mode 100644
index 58935f5bca..0000000000
--- a/src/joint_embedding/split_dataset/script.R
+++ /dev/null
@@ -1,97 +0,0 @@
-cat("Loading dependencies\n")
-options(tidyverse.quiet = TRUE)
-library(tidyverse)
-requireNamespace("anndata", quietly = TRUE)
-library(assertthat, quietly = TRUE, warn.conflicts = FALSE)
-library(Matrix, quietly = TRUE, warn.conflicts = FALSE)
-
-## VIASH START
-input_path <- "output/datasets_2021-11-08/common/openproblems_bmmc_multiome_phase1v2/openproblems_bmmc_multiome_phase1v2.manual_formatting."
-output_path <- ""
-
-par <- list(
-  input_mod1 = paste0(input_path, "output_rna.h5ad"),
-  input_mod2 = paste0(input_path, "output_mod2.h5ad"),
-  output_mod1 = paste0(output_path, "output_mod1.h5ad"),
-  output_mod2 = paste0(output_path, "output_mod2.h5ad"),
-  output_solution = paste0(output_path, "solution.h5ad"),
-  train_only = TRUE
-)
-## VIASH END
-
-cat("Reading mod1 data\n")
-input_mod1 <- anndata::read_h5ad(par$input_mod1)
-ad1_mod <- unique(input_mod1$var[["feature_types"]])
-new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE")
-ad1_uns <- list(dataset_id = new_dataset_id, organism = "human")
-ad2_uns <- list(dataset_id = new_dataset_id, organism = "human")
-
-cat("Creating mod1 object\n")
-out_mod1 <- anndata::AnnData(
-  X = input_mod1$X,
-  layers = list(counts = input_mod1$layers[["counts"]]),
-  var = input_mod1$var %>% select(one_of("gene_ids"), feature_types),
-  obs = input_mod1$obs %>% select(one_of("batch", "size_factors")),
-  uns = ad1_uns
-)
-
-cat("Create solution object\n")
-out_solution <- anndata::AnnData(
-  X = input_mod1$X,
-  var = input_mod1$var %>% select(one_of("gene_ids"), feature_types),
-  obs = input_mod1$obs %>% select(
-    one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score")
-  ),
-  uns = ad1_uns
-)
-
-is_train <- input_mod1$obs$is_train
-
-if (par$train_only) {
-  cat("Filtering out test cells\n", sep = "")
-  out_mod1 <- out_mod1[is_train, ] #$copy()
-  out_solution <- out_solution[is_train, ]# $copy()
-}
-
-rm(input_mod1)
-gc()
-
-cat("Reading mod2 data\n")
-input_mod2 <- anndata::read_h5ad(par$input_mod2)
-ad2_mod <- unique(input_mod2$var[["feature_types"]])
-ad2_obsm <- list()
-
-if (ad2_mod == "ATAC") {
-  ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names
-  ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix")
-}
-
-cat("Creating mod2 object\n")
-out_mod2 <- anndata::AnnData(
-  X = input_mod2$X,
-  layers = list(counts = input_mod2$layers[["counts"]]),
-  var = input_mod2$var %>% select(one_of("gene_ids"), feature_types),
-  obs = input_mod2$obs %>% select(one_of("batch")),
-  obsm = ad2_obsm,
-  uns = ad2_uns
-)
-rm(input_mod2)
-gc()
-
-if (par$train_only) {
-  cat("Filtering out test cells\n", sep = "")
-  out_mod2 <- out_mod2[is_train, ] #$copy()
-}
-
-cat("Saving output files as h5ad\n")
-cat("output_mod1:")
-print(out_mod1)
-zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip")
-
-cat("output_mod2:")
-print(out_mod2)
-zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip")
-
-cat("output_solution:")
-print(out_solution)
-zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip")
diff --git a/src/joint_embedding/split_dataset/test.R b/src/joint_embedding/split_dataset/test.R
deleted file mode 100644
index 09fe193b55..0000000000
--- a/src/joint_embedding/split_dataset/test.R
+++ /dev/null
@@ -1,55 +0,0 @@
-library(testthat, quietly = TRUE, warn.conflicts = FALSE)
-requireNamespace("anndata", quietly = TRUE)
-
-par <- list(
-  input_mod1 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad",
-  input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad",
-  output_mod1 = "output_mod1.h5ad",
-  output_mod2 = "output_mod2.h5ad",
-  output_solution = "solution.h5ad"
-)
-
-cat("> Running censor component\n")
-out <- processx::run(
-  command = paste0("./", meta["functionality_name"]),
-  args = c(
-    "--input_mod1", par$input_mod1,
-    "--input_mod2", par$input_mod2,
-    "--output_mod1", par$output_mod1,
-    "--output_mod2", par$output_mod2,
-    "--output_solution", par$output_solution
-  ),
-  stderr_to_stdout = TRUE
-)
-
-cat("> Checking whether output files were created\n")
-expect_true(file.exists(par$output_mod1))
-expect_true(file.exists(par$output_mod2))
-expect_true(file.exists(par$output_solution))
-
-cat("> Reading h5ad files\n")
-input_mod1 <- anndata::read_h5ad(par$input_mod1)
-input_mod2 <- anndata::read_h5ad(par$input_mod2)
-output_mod1 <- anndata::read_h5ad(par$output_mod1)
-output_mod2 <- anndata::read_h5ad(par$output_mod2)
-output_solution <- anndata::read_h5ad(par$output_solution)
-
-cat("> Checking contents of h5ad files\n")
-expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
-expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
-expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
-expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]])
-expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]])
-expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]])
-expect_equal(output_mod1$n_obs, input_mod1$n_obs)
-expect_equal(output_mod2$n_obs, input_mod2$n_obs)
-expect_equal(output_mod1$n_vars, input_mod1$n_vars)
-expect_equal(output_mod2$n_vars, input_mod2$n_vars)
-expect_equal(output_mod1$var_names, input_mod1$var_names)
-expect_equal(output_mod2$var_names, input_mod2$var_names)
-expect_equal(output_mod1$obs_names, input_mod1$obs_names)
-expect_equal(output_mod2$obs_names, input_mod2$obs_names)
-
-# TODO check contents of matrices, check rownames
-
-cat("> Test succeeded!\n")

From 861072ede48eb9f8f4cbccbd3d9f37c217f8d749 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 14 Dec 2022 17:00:52 +0100
Subject: [PATCH 05/42] update control_method api

---
 src/joint_embedding/api/comp_control_method.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/joint_embedding/api/comp_control_method.yaml b/src/joint_embedding/api/comp_control_method.yaml
index 5477d32e75..d1ec13ed17 100644
--- a/src/joint_embedding/api/comp_control_method.yaml
+++ b/src/joint_embedding/api/comp_control_method.yaml
@@ -13,7 +13,7 @@ functionality:
         from os import path
         import subprocess
         import anndata as ad
-        import scipy
+        from scipy import sparse
 
         input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad"
         output_path = "output.h5ad"
@@ -38,10 +38,9 @@ functionality:
         assert output.uns['dataset_id'] == input_mod1.uns['dataset_id']
         assert output.uns['method_id'] == meta['functionality_name']
         assert output.n_obs == input_mod1.n_obs
-        print(output.n_vars)
         assert output.n_vars >= 1
         assert output.n_vars <= 100
         assert all(output.obs_names == input_mod1.obs_names)
-        assert not scipy.sparse.issparse(output.obsm[X_emb])
+        assert sparse.issparse(output.obsm['X_emb'])
 
         print("> Test succeeded!")

From 8749e2a33e37906d954be0f9f2ae51254ed34bc5 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Thu, 15 Dec 2022 10:23:07 +0100
Subject: [PATCH 06/42] add zeros_embed control

---
 .../zeros_embed/config.vsh.yaml               | 24 +++++++++++++++++++
 .../control_methods/zeros_embed/script.py     | 23 ++++++++++++++++++
 2 files changed, 47 insertions(+)
 create mode 100644 src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml
 create mode 100644 src/joint_embedding/control_methods/zeros_embed/script.py

diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml
new file mode 100644
index 0000000000..b2c95a79e2
--- /dev/null
+++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml
@@ -0,0 +1,24 @@
+__merge__: ../../api/comp_control_method.yaml
+functionality:
+  name: zeros_embed
+  namespace: joint_embedding/control_methods
+  description: Generate an embedding containing only zero values.
+  info:
+    type: negative_control
+    label: zeros_embed
+  arguments:
+    - name: "--n_dims"
+      type: "integer"
+      default: 1
+      description: Number of dimensions to output.
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: "python:3.10"
+    setup:
+      - type: python
+        pip: [ anndata, numpy, scipy ]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/control_methods/zeros_embed/script.py b/src/joint_embedding/control_methods/zeros_embed/script.py
new file mode 100644
index 0000000000..964dae1744
--- /dev/null
+++ b/src/joint_embedding/control_methods/zeros_embed/script.py
@@ -0,0 +1,23 @@
+import anndata
+import numpy as np
+
+## VIASH START
+par = {
+    "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad",
+    "input_mod2": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod2.h5ad",
+    "output": "tmp/output_prediction.h5ad",
+    "n_dims": 1,
+}
+## VIASH END
+
+print("Load and prepare data")
+adata_mod1 = anndata.read_h5ad(par["input_mod1"])
+
+X = np.zeros((adata_mod1.shape[0], par["n_dims"]))
+print("Saving output")
+adata_out = anndata.AnnData(
+    X=X,
+    obs=adata_mod1.obs,
+    uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "dummy_zeros"},
+)
+adata_out.write_h5ad(par["output"], compression="gzip")

From 7c89329a2bae233f0031c5110cf3c93986ead9ec Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Thu, 15 Dec 2022 16:24:13 +0100
Subject: [PATCH 07/42] add lmds method

---
 src/joint_embedding/api/comp_method.yaml      | 50 +++++++++++++++++++
 .../control_methods/zeros_embed/script.py     |  8 ++-
 .../methods/lmds/config.vsh.yaml              | 28 +++++++++++
 src/joint_embedding/methods/lmds/script.R     | 44 ++++++++++++++++
 4 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 src/joint_embedding/api/comp_method.yaml
 create mode 100644 src/joint_embedding/methods/lmds/config.vsh.yaml
 create mode 100644 src/joint_embedding/methods/lmds/script.R

diff --git a/src/joint_embedding/api/comp_method.yaml b/src/joint_embedding/api/comp_method.yaml
new file mode 100644
index 0000000000..e75d2fe07b
--- /dev/null
+++ b/src/joint_embedding/api/comp_method.yaml
@@ -0,0 +1,50 @@
+functionality:
+  arguments:
+    - name: "--input_mod1"
+      __merge__: anndata_masked_mod1.yaml
+    - name: "--input_mod2"
+      __merge__: anndata_masked_mod2.yaml
+    - name: "--output"
+      __merge__: anndata_prediction.yaml
+      direction: output
+  test_resources:
+    - path: ../../../../resources_test  # directory holding the inputs read by generic_test.py
+    - type: python_script
+      path: generic_test.py
+      text: |
+        from os import path
+        import subprocess
+        import anndata as ad
+        from scipy import sparse
+
+        input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad"
+        input_mod2_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod2.h5ad"
+        output_path = "output.h5ad"
+
+        cmd = [
+            meta['executable'],
+            "--input_mod1", input_mod1_path,
+            "--input_mod2", input_mod2_path,
+            "--output", output_path
+        ]
+
+        print(">> Running script as test")
+        out = subprocess.run(cmd, check=True, capture_output=True, text=True).stderr
+
+        print("> Checking whether output files were created")
+        assert path.exists(output_path)
+
+        print("> Reading h5ad files")
+        input_mod1 = ad.read_h5ad(input_mod1_path)
+        output = ad.read_h5ad(output_path)
+
+        print("> Checking contents of output.h5ad")
+        assert output.uns['dataset_id'] == input_mod1.uns['dataset_id']
+        assert output.uns['method_id'] == meta['functionality_name']
+        assert output.n_obs == input_mod1.n_obs
+        assert output.n_vars >= 1
+        assert output.n_vars <= 100
+        assert all(output.obs_names == input_mod1.obs_names)
+        assert sparse.issparse(output.obsm['X_emb'])
+
+        print("> Test succeeded!")
\ No newline at end of file
diff --git a/src/joint_embedding/control_methods/zeros_embed/script.py b/src/joint_embedding/control_methods/zeros_embed/script.py
index 964dae1744..f54ef3ce18 100644
--- a/src/joint_embedding/control_methods/zeros_embed/script.py
+++ b/src/joint_embedding/control_methods/zeros_embed/script.py
@@ -1,10 +1,10 @@
 import anndata
 import numpy as np
+from scipy import sparse
 
 ## VIASH START
 par = {
     "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad",
-    "input_mod2": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod2.h5ad",
     "output": "tmp/output_prediction.h5ad",
     "n_dims": 1,
 }
@@ -18,6 +18,10 @@
 adata_out = anndata.AnnData(
     X=X,
     obs=adata_mod1.obs,
-    uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "dummy_zeros"},
+    uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "zeros_embed"},
+    obsm={"X_emb": sparse.csr_matrix(X) }
 )
+
+del adata_out.X
+
 adata_out.write_h5ad(par["output"], compression="gzip")
diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml
new file mode 100644
index 0000000000..b9e096a6d1
--- /dev/null
+++ b/src/joint_embedding/methods/lmds/config.vsh.yaml
@@ -0,0 +1,28 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: lmds
+  namespace: joint_embedding/methods
+  description: Landmark MDS dimensionality reduction on the Spearman distance.
+  info:
+    type: method
+    label: "LMDS"
+  arguments:
+    - name: "--distance_method"
+      type: "string"
+      default: "spearman"
+      description: The distance method to use. Possible values are euclidean, pearson, spearman and others.
+    - name: "--n_dims"
+      type: integer
+      default: 10
+      description: Number of dimensions to output.
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    setup:
+      - type: r
+        packages: [ lmds ]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/methods/lmds/script.R b/src/joint_embedding/methods/lmds/script.R
new file mode 100644
index 0000000000..92700691e0
--- /dev/null
+++ b/src/joint_embedding/methods/lmds/script.R
@@ -0,0 +1,44 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+requireNamespace("anndata", quietly = TRUE)
+library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
+
+## VIASH START
+# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter."
+par <- list(
+  input_mod1 = paste0(path, "mod1.h5ad"),
+  input_mod2 = paste0(path, "mod2.h5ad"),
+  output = "output/lmds/output.h5ad",
+  n_dims = 10L,
+  distance_method = "spearman"
+)
+meta <- list(functionality_name = "lmds") # read below as method_id; defined here so interactive runs work
+## VIASH END
+
+cat("Reading h5ad files\n")
+ad1 <- anndata::read_h5ad(par$input_mod1)
+ad2 <- anndata::read_h5ad(par$input_mod2)
+
+cat("Performing DR\n")
+dr <- lmds::lmds(
+  cbind(ad1$X, ad2$X), # column-bind the X matrices of both modalities
+  ndim = par$n_dims,
+  distance_method = par$distance_method
+)
+
+rownames(dr) <- rownames(ad1)
+colnames(dr) <- paste0("comp_", seq_len(par$n_dims))
+
+out <- anndata::AnnData(
+  X = dr,
+  uns = list(
+    dataset_id = ad1$uns[["dataset_id"]],
+    method_id = meta$functionality_name
+  ),
+  obsm = list(X_emb = as(dr, "CsparseMatrix")) # embedding also stored as a sparse matrix under X_emb
+)
+
+cat("Writing predictions to file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")

From 0d29dd03d99f109cb972fb57f9c328053b7c63a2 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Thu, 15 Dec 2022 16:33:16 +0100
Subject: [PATCH 08/42] add mnn method

---
 .../methods/mnn/config.vsh.yaml               | 24 +++++++
 src/joint_embedding/methods/mnn/script.R      | 67 +++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 src/joint_embedding/methods/mnn/config.vsh.yaml
 create mode 100644 src/joint_embedding/methods/mnn/script.R

diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml
new file mode 100644
index 0000000000..8759ade4e0
--- /dev/null
+++ b/src/joint_embedding/methods/mnn/config.vsh.yaml
@@ -0,0 +1,24 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: mnn
+  namespace: joint_embedding/methods
+  description: Mutual nearest neighbors correction followed by PCA.
+  info:
+    type: method
+    label: "MNN"
+  arguments:
+    - name: "--hvg_sel"
+      type: "integer"
+      default: 1000
+      description: Number of features per modality to use.
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    setup:
+      - type: r
+        bioc: [ SingleCellExperiment, batchelor, proxyC ]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/methods/mnn/script.R b/src/joint_embedding/methods/mnn/script.R
new file mode 100644
index 0000000000..27a4454bff
--- /dev/null
+++ b/src/joint_embedding/methods/mnn/script.R
@@ -0,0 +1,67 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+requireNamespace("anndata", quietly = TRUE)
+library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
+requireNamespace("batchelor", quietly = TRUE)
+requireNamespace("SingleCellExperiment", quietly = TRUE)
+
+## VIASH START
+# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_"
+# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_"
+par <- list(
+  input_mod1 = paste0(path, "mod1.h5ad"),
+  input_mod2 = paste0(path, "mod2.h5ad"),
+  output = "output.h5ad",
+  hvg_sel = 1000L
+)
+meta <- list(functionality_name = "foo")
+## VIASH END
+
+method_id <- meta$functionality_name
+
+cat("Reading h5ad files\n")
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+
+rn <- rownames(input_mod1)
+batch <- input_mod1$obs$batch
+dataset_id <- input_mod1$uns[["dataset_id"]]
+Xt_mod1 <- t(input_mod1$X)
+
+# select hvg
+if (!is.null(par$hvg_sel) && nrow(Xt_mod1) > par$hvg_sel) {
+  sd_mod1 <- proxyC::rowSds(Xt_mod1)
+  Xt_mod1 <- Xt_mod1[order(sd_mod1, decreasing = TRUE)[seq_len(par$hvg_sel)], ]
+}
+
+rm(input_mod1)
+gc()
+
+Xt_mod2 <- t(anndata::read_h5ad(par$input_mod2)$X)
+if (!is.null(par$hvg_sel) && nrow(Xt_mod2) > par$hvg_sel) {
+  sd_mod2 <- proxyC::rowSds(Xt_mod2)
+  Xt_mod2 <- Xt_mod2[order(sd_mod2, decreasing = TRUE)[seq_len(par$hvg_sel)], ]
+}
+
+cat("Running fastMNN\n")
+mnn_out <- batchelor::fastMNN(
+  rbind(Xt_mod1, Xt_mod2),
+  batch = batch
+)
+dr <- SingleCellExperiment::reducedDim(mnn_out, "corrected")
+
+rownames(dr) <- rn
+colnames(dr) <- paste0("comp_", seq_len(ncol(dr)))
+
+out <- anndata::AnnData(
+  X = dr,
+  uns = list(
+    dataset_id = dataset_id,
+    method_id = meta$functionality_name
+  ),
+  obsm = list(X_emb = as(dr, "CsparseMatrix"))
+)
+
+cat("Writing predictions to file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")

From 3c46c4d331ddff14eee86016f58bf19e3d6dbf86 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Thu, 15 Dec 2022 17:06:26 +0100
Subject: [PATCH 09/42] add newwave method

---
 .../methods/newwave/config.vsh.yaml           |  33 ++++++
 src/joint_embedding/methods/newwave/script.R  | 111 ++++++++++++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 src/joint_embedding/methods/newwave/config.vsh.yaml
 create mode 100644 src/joint_embedding/methods/newwave/script.R

diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml
new file mode 100644
index 0000000000..c327dcfa8c
--- /dev/null
+++ b/src/joint_embedding/methods/newwave/config.vsh.yaml
@@ -0,0 +1,33 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: newwave
+  namespace: joint_embedding/methods
+  description: Concatenated NewWave.
+  info:
+    type: method
+    label: "NewWave"
+    doi: "10.1101/2021.08.02.453487"
+  arguments:
+    - name: "--maxiter"
+      type: "integer"
+      default: 100
+      description: Maximum number of NewWave iterations.
+    - name: "--k"
+      type: "integer"
+      default: 10
+      description: NewWave K parameter.
+    - name: "--hvg_sel"
+      type: "integer"
+      default: 1000
+      description: Number of features per modality to use.
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    setup:
+      - type: r
+        bioc: [ SingleCellExperiment, NewWave, proxyC ]
+  - type: nextflow
+    directives: [ highmem, hightime, highcpu ]
diff --git a/src/joint_embedding/methods/newwave/script.R b/src/joint_embedding/methods/newwave/script.R
new file mode 100644
index 0000000000..f87d79cdd5
--- /dev/null
+++ b/src/joint_embedding/methods/newwave/script.R
@@ -0,0 +1,111 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+requireNamespace("anndata", quietly = TRUE)
+library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
+requireNamespace("NewWave", quietly = TRUE)
+requireNamespace("SingleCellExperiment", quietly = TRUE)
+
+## VIASH START
+# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_"
+# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_"
+par <- list(
+  input_mod1 = paste0(path, "mod1.h5ad"),
+  input_mod2 = paste0(path, "mod2.h5ad"),
+  output = "output.h5ad",
+  maxiter = 2L,
+  k = 3L,
+  hvg_sel = 1000
+)
+meta <- list(functionality_name = "foo")
+## VIASH END
+
+method_id <- meta$functionality_name
+
+cat("Reading mod1 h5ad\n")
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+
+rn <- rownames(input_mod1)
+batch <- input_mod1$obs$batch
+dataset_id <- input_mod1$uns[["dataset_id"]]
+
+sd1 <- proxyC::colSds(input_mod1$X)
+fil1 <-
+  if (!is.null(par$hvg_sel) && ncol(input_mod1) > par$hvg_sel) {
+    head(order(sd1, decreasing = TRUE), par$hvg_sel)
+  } else {
+    which(sd1 > 0)
+  }
+data1 <- SummarizedExperiment::SummarizedExperiment(
+  assays = list(counts = t(input_mod1$layers[["counts"]][, fil1])),
+  colData = data.frame(batch = factor(batch))
+)
+rm(input_mod1)
+gc()
+
+cat("Running NewWave on mod1\n")
+res1 <- NewWave::newWave(
+  data1,
+  X = "~batch",
+  verbose = TRUE,
+  K = par$k,
+  maxiter_optimize = par$maxiter,
+  n_gene_par = min(300, nrow(data1)),
+  n_cell_par = min(300, ncol(data1)),
+  commondispersion = FALSE
+)
+rm(data1)
+
+dr_x1 <- SingleCellExperiment::reducedDim(res1)
+
+cat("Reading mod2 anndata\n")
+input_mod2 <- anndata::read_h5ad(par$input_mod2)
+sd2 <- proxyC::colSds(input_mod2$X)
+fil2 <-
+  if (!is.null(par$hvg_sel) && ncol(input_mod2) > par$hvg_sel) {
+    head(order(sd2, decreasing = TRUE), par$hvg_sel)
+  } else {
+    which(sd2 > 0)
+  }
+data2 <- SummarizedExperiment::SummarizedExperiment(
+  assays = list(counts = t(input_mod2$layers[["counts"]][, fil2])),
+  colData = data.frame(batch = factor(batch))
+)
+rm(input_mod2)
+gc()
+
+cat("Running NewWave on mod2\n")
+res2 <- NewWave::newWave(
+  data2,
+  X = "~batch",
+  verbose = TRUE,
+  K = par$k,
+  maxiter_optimize = par$maxiter,
+  n_gene_par = min(300, nrow(data2)),
+  n_cell_par = min(300, ncol(data2)),
+  commondispersion = FALSE
+)
+dr_x2 <- SingleCellExperiment::reducedDim(res2)
+rm(data2)
+
+cat("Spline separate DRs\n")
+dr <- do.call(cbind, lapply(seq_len(ncol(dr_x1)), function(i) {
+  cbind(dr_x1[, i], dr_x2[, i])
+}))
+
+rownames(dr) <- rn
+colnames(dr) <- paste0("comp_", seq_len(ncol(dr)))
+
+out <- anndata::AnnData(
+  X = dr,
+  uns = list(
+    dataset_id = dataset_id,
+    method_id = meta$functionality_name
+  ),
+  obsm = list(X_emb = as(dr, "CsparseMatrix"))
+
+)
+
+cat("Writing predictions to file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")

From 4ddb315a339815396983a499c896fcde59cd5352 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Thu, 15 Dec 2022 17:11:30 +0100
Subject: [PATCH 10/42] add pca method

---
 .../methods/pca/config.vsh.yaml               | 29 +++++++++
 src/joint_embedding/methods/pca/script.R      | 63 +++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 src/joint_embedding/methods/pca/config.vsh.yaml
 create mode 100644 src/joint_embedding/methods/pca/script.R

diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml
new file mode 100644
index 0000000000..cb4d24642f
--- /dev/null
+++ b/src/joint_embedding/methods/pca/config.vsh.yaml
@@ -0,0 +1,29 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: pca
+  namespace: joint_embedding/methods
+  description: PCA dimensionality reduction.
+  info:
+    type: method
+    label: "PCA"
+  authors: []  # no authors recorded; explicit empty list instead of a bare (null) value
+  arguments:
+    - name: "--n_dims"
+      type: "integer"
+      default: 10
+      description: Number of dimensions to output.
+    - name: "--hvg_sel"
+      type: "integer"
+      default: 1000
+      description: Number of features per modality to use.
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    setup:
+      - type: r
+        packages: [ irlba, proxyC ]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/methods/pca/script.R b/src/joint_embedding/methods/pca/script.R
new file mode 100644
index 0000000000..d16eb027cf
--- /dev/null
+++ b/src/joint_embedding/methods/pca/script.R
@@ -0,0 +1,63 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+requireNamespace("anndata", quietly = TRUE)
+library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
+
+## VIASH START
+# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_"
+# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_"
+par <- list(
+  input_mod1 = paste0(path, "mod1.h5ad"),
+  input_mod2 = paste0(path, "mod2.h5ad"),
+  output = "output.h5ad",
+  n_dims = 4L,
+  hvg_sel = 1000L
+)
+meta <- list(functionality_name = "foo")
+## VIASH END
+
+cat("Reading h5ad files\n")
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+
+rn <- rownames(input_mod1)
+batch <- input_mod1$obs$batch
+dataset_id <- input_mod1$uns[["dataset_id"]]
+X_mod1 <- input_mod1$X
+
+# select hvg
+if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) {
+  sd_mod1 <- proxyC::colSds(X_mod1)
+  X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)]
+}
+
+rm(input_mod1)
+gc()
+
+X_mod2 <- anndata::read_h5ad(par$input_mod2)$X
+if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) {
+  sd_mod2 <- proxyC::colSds(X_mod2)
+  X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)]
+}
+
+cat("Performing DR\n")
+dr <- irlba::prcomp_irlba(
+  cbind(X_mod1, X_mod2),
+  n = par$n_dims
+)$x
+
+rownames(dr) <- rn
+colnames(dr) <- paste0("comp_", seq_len(par$n_dims))
+
+out <- anndata::AnnData(
+  X = dr,
+  uns = list(
+    dataset_id = dataset_id,
+    method_id = meta$functionality_name
+  ),
+  obsm = list( X_emb = as(dr, "CsparseMatrix"))
+)
+
+cat("Writing predictions to file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")

From 3ae18553b63805ad76c5ed5b891e7b7008d80d43 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 16 Dec 2022 11:11:22 +0100
Subject: [PATCH 11/42] Add totalVI method

---
 src/joint_embedding/api/authors.yaml          |  6 +-
 src/joint_embedding/api/comp_method.yaml      |  2 +-
 .../methods/totalvi/config.vsh.yaml           | 33 +++++++++++
 src/joint_embedding/methods/totalvi/script.py | 58 +++++++++++++++++++
 4 files changed, 97 insertions(+), 2 deletions(-)
 create mode 100644 src/joint_embedding/methods/totalvi/config.vsh.yaml
 create mode 100644 src/joint_embedding/methods/totalvi/script.py

diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml
index a1467a402c..70f8fc3141 100644
--- a/src/joint_embedding/api/authors.yaml
+++ b/src/joint_embedding/api/authors.yaml
@@ -9,4 +9,8 @@ functionality:
     - name: Alex Tong
       email: alexandertongdev@gmail.com
       roles: [ author, maintainer ]
-      props: { github: atong01 }
\ No newline at end of file
+      props: { github: atong01 }
+    - name: Christopher Lance
+      email: clance.connect@gmail.com
+      roles: [ author, maintainer ]
+      props: { github: xlancelottx }
\ No newline at end of file
diff --git a/src/joint_embedding/api/comp_method.yaml b/src/joint_embedding/api/comp_method.yaml
index e75d2fe07b..415f42ea3d 100644
--- a/src/joint_embedding/api/comp_method.yaml
+++ b/src/joint_embedding/api/comp_method.yaml
@@ -29,7 +29,7 @@ functionality:
         ]
 
         print(">> Running script as test")
-        out = subprocess.run(cmd, check=True, capture_output=True, text=True).stderr
+        out = subprocess.run(cmd, check=True, capture_output=True, text=True)
 
         print("> Checking whether output files were created")
         assert path.exists(output_path)
diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml b/src/joint_embedding/methods/totalvi/config.vsh.yaml
new file mode 100644
index 0000000000..74116f9abe
--- /dev/null
+++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml
@@ -0,0 +1,33 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: totalvi
+  namespace: joint_embedding/methods
+  version: dev
+  description: "totalVI: joint probabilistic modeling with Total Variational Inference"
+  info:
+    type: method
+    label: totalVI
+    doi: 10.1038/s41592-020-01050-x    
+  arguments:
+    - name: --hvg_number
+      type: integer
+      default: 4000
+      description: Number of HVG to include in totalVI
+    - name: --max_epochs
+      type: integer
+      default: 400
+      description: Number of max epochs to run totalVI
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: "python:3.10"
+    setup:
+      - type: python
+        pip: [ anndata>=0.8, scanpy, scikit-misc, scipy, scikit-learn, scvi-tools]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
+
+
+
diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py
new file mode 100644
index 0000000000..b47400816d
--- /dev/null
+++ b/src/joint_embedding/methods/totalvi/script.py
@@ -0,0 +1,58 @@
+import anndata
+import scanpy as sc
+from scipy import sparse
+from scvi.model import TOTALVI
+
+## VIASH START
+par = {
+    "input_mod1": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod1.h5ad",
+    "input_mod2": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod2.h5ad",
+    "output": "tmp/output_prediction.h5ad",
+    "hvg_number": 4000,
+    "max_epochs": 20
+}
+
+meta = {
+    'functionality_name': "foo"  # key read below as meta["functionality_name"]
+}
+## VIASH END
+
+print("Load and prepare data")
+adata_mod1 = anndata.read_h5ad(par['input_mod1'])
+adata_mod2 = anndata.read_h5ad(par['input_mod2'])
+adata_mod1.obsm['protein_expression'] = adata_mod2.X.toarray()
+
+print('Select highly variable genes')
+sc.pp.highly_variable_genes(
+    adata_mod1,
+    n_top_genes=par['hvg_number'],
+    flavor="seurat_v3",
+    batch_key="batch",
+    subset=True
+)
+
+print("Set up model")
+TOTALVI.setup_anndata(
+    adata_mod1,
+    batch_key="batch",
+    protein_expression_obsm_key="protein_expression"
+)
+
+print('Train totalVI with', par['max_epochs'], 'epochs')
+vae = TOTALVI(adata_mod1, latent_distribution="normal")
+vae.train(max_epochs = par['max_epochs'])
+
+print("Postprocessing and saving output")
+adata_out = anndata.AnnData(
+    X=vae.get_latent_representation(),
+    obs=adata_mod1.obs[['batch']],
+    uns={
+        "dataset_id": adata_mod1.uns["dataset_id"],
+        "method_id": meta["functionality_name"]
+    },
+    obsm = {"X_emb": sparse.csr_matrix(vae.get_latent_representation())}
+)
+
+del adata_out.X
+
+adata_out.write_h5ad(par['output'], compression = "gzip")

From 87f84cba53d2f665a4e8bdfbfba40fe028e04a87 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 16 Dec 2022 11:50:58 +0100
Subject: [PATCH 12/42] add umap method

---
 .../methods/umap/config.vsh.yaml              | 41 ++++++++++
 src/joint_embedding/methods/umap/script.R     | 80 +++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 src/joint_embedding/methods/umap/config.vsh.yaml
 create mode 100644 src/joint_embedding/methods/umap/script.R

diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml
new file mode 100644
index 0000000000..f8b27fdaec
--- /dev/null
+++ b/src/joint_embedding/methods/umap/config.vsh.yaml
@@ -0,0 +1,41 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: umap
+  namespace: joint_embedding/methods
+  version: dev
+  description: UMAP dimensionality reduction on the Euclidean distance.
+  info:
+    type: method
+    label: UMAP
+  arguments:
+    - name: "--n_dims"
+      type: "integer"
+      default: 10
+      description: Number of dimensions to output.
+    - name: "--metric"
+      type: "string"
+      default: "euclidean"
+      description: The metric to use. Possible values are euclidean, cosine, manhattan.
+    - name: "--n_neighbors"
+      type: "integer"
+      default: 15
+      description: Number of neighbors to use in the KNN.
+    - name: "--n_pcs"
+      type: "integer"
+      default: 50
+      description: Number of principal components to use in the PCA step.
+    - name: "--hvg_sel"
+      type: "integer"
+      default: 1000
+      description: Number of features per modality to use.
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    setup:
+      - type: r
+        packages: [ uwot, irlba, proxyC ]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/methods/umap/script.R b/src/joint_embedding/methods/umap/script.R
new file mode 100644
index 0000000000..2f654be245
--- /dev/null
+++ b/src/joint_embedding/methods/umap/script.R
@@ -0,0 +1,80 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+requireNamespace("anndata", quietly = TRUE)
+library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
+
+## VIASH START
+# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_"
+# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_"
+par <- list(
+  input_mod1 = paste0(path, "mod1.h5ad"),
+  input_mod2 = paste0(path, "mod2.h5ad"),
+  output = "output.h5ad",
+  n_dims = 10L,
+  n_neighbors = 15L,
+  metric = "euclidean",
+  n_pcs = 50L,
+  hvg_sel = 100L
+)
+meta <- list(functionality_name = "foo")
+## VIASH END
+
+n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE)
+
+cat("Reading h5ad files\n")
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+
+rn <- rownames(input_mod1)
+batch <- input_mod1$obs$batch
+dataset_id <- input_mod1$uns[["dataset_id"]]
+X_mod1 <- input_mod1$X
+
+# select hvg
+if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) {
+  sd_mod1 <- proxyC::colSds(X_mod1)
+  X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)]
+}
+
+rm(input_mod1)
+gc()
+
+X_mod2 <- anndata::read_h5ad(par$input_mod2)$X
+if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) {
+  sd_mod2 <- proxyC::colSds(X_mod2)
+  X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)]
+}
+
+cat("Performing PCA\n")
+X_pca <- irlba::prcomp_irlba(
+  cbind(X_mod1, X_mod2),
+  n = par$n_pcs  # honour the declared --n_pcs argument rather than a fixed 100
+)$x
+
+cat("Performing UMap\n")
+dr <- uwot::umap(
+  X_pca,
+  n_components = par$n_dims,
+  n_neighbors = par$n_neighbors,
+  metric = par$metric,
+  n_threads = n_cores,
+  nn_method = "annoy"
+)
+
+rownames(dr) <- rn
+colnames(dr) <- paste0("comp_", seq_len(par$n_dims))
+
+out <- anndata::AnnData(
+  X = dr,
+  uns = list(
+    dataset_id = dataset_id,
+    method_id = meta$functionality_name
+  ),
+  obsm = list(
+    X_emb = as(dr, "CsparseMatrix")
+  )
+)
+
+cat("Writing predictions to file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")

From 7cc07bfb2f9f6687e59ad5f1c37291037cd68eb7 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 16 Dec 2022 14:48:20 +0100
Subject: [PATCH 13/42] add metric ari

---
 src/joint_embedding/api/anndata_score.yaml    | 25 +++++
 src/joint_embedding/api/comp_metric.yaml      | 93 +++++++++++++++++++
 .../metrics/ari/config.vsh.yaml               | 13 +++
 src/joint_embedding/metrics/ari/script.py     | 58 ++++++++++++
 4 files changed, 189 insertions(+)
 create mode 100644 src/joint_embedding/api/anndata_score.yaml
 create mode 100644 src/joint_embedding/api/comp_metric.yaml
 create mode 100644 src/joint_embedding/metrics/ari/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/ari/script.py

diff --git a/src/joint_embedding/api/anndata_score.yaml b/src/joint_embedding/api/anndata_score.yaml
new file mode 100644
index 0000000000..bfe79f07cc
--- /dev/null
+++ b/src/joint_embedding/api/anndata_score.yaml
@@ -0,0 +1,25 @@
+type: file
+description: "Metric score file"
+example: "output.h5ad"
+info:
+  short_description: "Score"
+  slots:
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: method_id
+        description: "A unique identifier for the method"
+        required: true
+      - type: string
+        name: metric_ids
+        description: "One or more unique metric identifiers"
+        multiple: true
+        required: true
+      - type: double
+        name: metric_values
+        description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'."
+        multiple: true
+        required: true
diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml
new file mode 100644
index 0000000000..a33b4268d7
--- /dev/null
+++ b/src/joint_embedding/api/comp_metric.yaml
@@ -0,0 +1,93 @@
+functionality:
+  arguments:
+    - name: --input_prediction
+      __merge__: anndata_prediction.yaml
+    - name: --input_solution
+      __merge__: anndata_solution.yaml
+    - name: --output
+      __merge__: anndata_score.yaml
+      direction: output
+  test_resources:
+    - path: ../../../../resources_test
+    - type: python_script
+      path: generic_test.py
+      text: |
+        from os import path
+        import subprocess
+        import anndata as ad
+        import pandas as pd
+
+        ## VIASH START
+        # This code block will be replaced by viash at runtime.
+        meta = { 'functionality_name': 'foo' }
+        meta_path = "src/joint_embedding/metrics/check_format/metric_meta_check_format.tsv"
+        ## VIASH END
+
+        method_id = meta['functionality_name']
+        command = "./" + method_id
+
+        # define some filenames
+        testpar = {
+          "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+          "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+          "output": "output.h5ad"
+        }
+        meta_path = resources_dir + '/metric_meta.tsv'
+
+        print("> Running method")
+        out = subprocess.check_output([
+          "./" + meta['functionality_name'],
+          "--input_prediction", testpar['input_prediction'],
+          "--input_solution", testpar['input_solution'],
+          "--output", testpar['output']
+        ]).decode("utf-8")
+
+        print("> Checking whether output files were created")
+        assert path.exists(testpar['output'])
+
+        print("> Reading h5ad files")
+        input_prediction = ad.read_h5ad(testpar['input_prediction'])
+        input_solution = ad.read_h5ad(testpar['input_solution'])
+        output = ad.read_h5ad(testpar['output'])
+
+        metric_meta = pd.read_csv(
+          meta_path, 
+          delimiter="\t",
+          header=0,
+          dtype={ 'metric_id': str, 'metric_min': float, 'metric_max': float, 'metric_higherisbetter': bool }
+        )
+
+        print("> Checking contents of metric_meta.tsv")
+        assert 'metric_id' in metric_meta
+        assert 'metric_min' in metric_meta
+        assert 'metric_max' in metric_meta
+        assert 'metric_higherisbetter' in metric_meta
+
+        print("> Checking .uns['dataset_id']")
+        assert 'dataset_id' in output.uns
+        assert output.uns['dataset_id'] == input_prediction.uns['dataset_id']
+
+        print("> Checking .uns['method_id']")
+        assert 'method_id' in output.uns
+        assert output.uns['method_id'] == input_prediction.uns['method_id']
+
+        print("> Checking .uns['metric_ids']")
+        assert 'metric_ids' in output.uns
+        assert set(output.uns['metric_ids']) == set(metric_meta.metric_id)
+
+        print("> Checking .uns['metric_values']")
+        assert 'metric_values' in output.uns
+        assert output.uns['metric_ids'].size == output.uns['metric_values'].size
+
+        # merge with metric_meta to see if metric_value lies within the expected range
+        output_uns = pd.DataFrame({
+          'metric_id': output.uns['metric_ids'], 
+          'metric_value': output.uns['metric_values']
+        })
+
+        scores = metric_meta.merge(output_uns, on="metric_id")
+
+        assert all(scores.metric_value >= scores.metric_min)
+        assert all(scores.metric_value <= scores.metric_max)
+
+        print("> Test succeeded!")
diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml
new file mode 100644
index 0000000000..12268747b4
--- /dev/null
+++ b/src/joint_embedding/metrics/ari/config.vsh.yaml
@@ -0,0 +1,13 @@
+__merge__: ../../api/comp_metrics.yaml
+functionality:
+  name: ari
+  namespace: joint_embedding/metrics
+  description: Adjusted rand index (ARI)
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: mumichae/scib-base:1.0.0
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/ari/script.py b/src/joint_embedding/metrics/ari/script.py
new file mode 100644
index 0000000000..dc7c195f66
--- /dev/null
+++ b/src/joint_embedding/metrics/ari/script.py
@@ -0,0 +1,58 @@
+import pprint
+import scanpy as sc
+import anndata as ad
+import scib
+
+## VIASH START
+par = dict(
+    input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+    input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+    output="openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ari.h5ad",
+    debug=True
+)
+
+## VIASH END
+
+if par['debug']:
+    pprint.pprint(par)
+
+print("Read prediction anndata")
+adata = ad.read_h5ad(par['input_prediction'])
+dataset_id = adata.uns['dataset_id']
+
+print("Read solution anndata")
+adata_solution = ad.read_h5ad(par['input_solution'])
+
+print('Transfer obs annotations')
+adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names]
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names]
+
+print('Preprocessing')
+sc.pp.neighbors(adata, use_rep='X_emb')
+
+print('Clustering')
+scib.cl.opt_louvain(
+    adata,
+    label_key='cell_type',
+    cluster_key='cluster',
+    plot=False,
+    inplace=True,
+    force=True
+)
+
+print('Compute score')
+score = scib.me.ari(adata, group1='cluster', group2='cell_type')
+
+# store adata with metrics
+print("Create output object")
+out = ad.AnnData(
+    uns=dict(
+        dataset_id=adata.uns['dataset_id'],
+        method_id=adata.uns['method_id'],
+        metric_ids=["ari"],
+        metric_values=[score]
+    )
+)
+
+print("Write output to h5ad file")
+out.write(par['output'], compression='gzip')

From caff25dc2a4b36f0fd91d752fbeb3647283846df Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 16 Dec 2022 14:48:52 +0100
Subject: [PATCH 14/42] update comp_metric

---
 src/joint_embedding/api/comp_metric.yaml | 37 ++++++++++--------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml
index a33b4268d7..1308ca03b8 100644
--- a/src/joint_embedding/api/comp_metric.yaml
+++ b/src/joint_embedding/api/comp_metric.yaml
@@ -7,6 +7,9 @@ functionality:
     - name: --output
       __merge__: anndata_score.yaml
       direction: output
+    - name: --debug
+      type: boolean_true
+      description: Verbose output for debugging.
   test_resources:
     - path: ../../../../resources_test
     - type: python_script
@@ -17,30 +20,22 @@ functionality:
         import anndata as ad
         import pandas as pd
 
-        ## VIASH START
-        # This code block will be replaced by viash at runtime.
-        meta = { 'functionality_name': 'foo' }
-        meta_path = "src/joint_embedding/metrics/check_format/metric_meta_check_format.tsv"
-        ## VIASH END
-
-        method_id = meta['functionality_name']
-        command = "./" + method_id
-
         # define some filenames
-        testpar = {
-          "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
-          "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
-          "output": "output.h5ad"
-        }
+       
+        input_prediction_path =  "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+        input_solution_path = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+        output_path = "output.h5ad"
         meta_path = resources_dir + '/metric_meta.tsv'
 
-        print("> Running method")
-        out = subprocess.check_output([
-          "./" + meta['functionality_name'],
-          "--input_prediction", testpar['input_prediction'],
-          "--input_solution", testpar['input_solution'],
-          "--output", testpar['output']
-        ]).decode("utf-8")
+        cmd = [
+            meta['executable'],
+            "--input_prediction", input_prediction_path,
+            "--input_solution", input_solution_path,
+            "--output", output_path
+        ]
+
+        print(">> Running script as test")
+        out = subprocess.run(cmd, check=True, capture_output=True, text=True)
 
         print("> Checking whether output files were created")
         assert path.exists(testpar['output'])

From f7e0e0b90073168b025f75e3bd3639d03676ae3c Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 16 Dec 2022 16:31:57 +0100
Subject: [PATCH 15/42] update ari metric

---
 src/joint_embedding/api/comp_metric.yaml      | 120 +++++++++---------
 .../metrics/ari/config.vsh.yaml               |  15 ++-
 2 files changed, 73 insertions(+), 62 deletions(-)

diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml
index 1308ca03b8..813d865dc2 100644
--- a/src/joint_embedding/api/comp_metric.yaml
+++ b/src/joint_embedding/api/comp_metric.yaml
@@ -10,79 +10,79 @@ functionality:
     - name: --debug
       type: boolean_true
       description: Verbose output for debugging.
-  test_resources:
-    - path: ../../../../resources_test
-    - type: python_script
-      path: generic_test.py
-      text: |
-        from os import path
-        import subprocess
-        import anndata as ad
-        import pandas as pd
+  # test_resources:
+  #   - path: ../../../../resources_test
+  #   - type: python_script
+  #     path: generic_test.py
+      # text: |
+      #   from os import path
+      #   import subprocess
+      #   import anndata as ad
+      #   import pandas as pd
 
-        # define some filenames
+      #   # define some filenames
        
-        input_prediction_path =  "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
-        input_solution_path = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
-        output_path = "output.h5ad"
-        meta_path = resources_dir + '/metric_meta.tsv'
+      #   input_prediction_path =  "resources_test/common/joint_embedding/prediction.h5ad",
+      #   input_solution_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.solution.h5ad",
+      #   output_path = "output.h5ad"
+      #   meta_path = resources_dir + '/metric_meta.tsv'
 
-        cmd = [
-            meta['executable'],
-            "--input_prediction", input_prediction_path,
-            "--input_solution", input_solution_path,
-            "--output", output_path
-        ]
+      #   cmd = [
+      #       meta['executable'],
+      #       "--input_prediction", input_prediction_path,
+      #       "--input_solution", input_solution_path,
+      #       "--output", output_path
+      #   ]
 
-        print(">> Running script as test")
-        out = subprocess.run(cmd, check=True, capture_output=True, text=True)
+      #   print(">> Running script as test")
+      #   out = subprocess.run(cmd, check=True, capture_output=True, text=True)
 
-        print("> Checking whether output files were created")
-        assert path.exists(testpar['output'])
+      #   print("> Checking whether output files were created")
+      #   assert path.exists(testpar['output'])
 
-        print("> Reading h5ad files")
-        input_prediction = ad.read_h5ad(testpar['input_prediction'])
-        input_solution = ad.read_h5ad(testpar['input_solution'])
-        output = ad.read_h5ad(testpar['output'])
+      #   print("> Reading h5ad files")
+      #   input_prediction = ad.read_h5ad(testpar['input_prediction'])
+      #   input_solution = ad.read_h5ad(testpar['input_solution'])
+      #   output = ad.read_h5ad(testpar['output'])
 
-        metric_meta = pd.read_csv(
-          meta_path, 
-          delimiter="\t",
-          header=0,
-          dtype={ 'metric_id': str, 'metric_min': float, 'metric_max': float, 'metric_higherisbetter': bool }
-        )
+      #   metric_meta = pd.read_csv(
+      #     meta_path, 
+      #     delimiter="\t",
+      #     header=0,
+      #     dtype={ 'metric_id': str, 'metric_min': float, 'metric_max': float, 'metric_higherisbetter': bool }
+      #   )
 
-        print("> Checking contents of metric_meta.tsv")
-        assert 'metric_id' in metric_meta
-        assert 'metric_min' in metric_meta
-        assert 'metric_max' in metric_meta
-        assert 'metric_higherisbetter' in metric_meta
+      #   print("> Checking contents of metric_meta.tsv")
+      #   assert 'metric_id' in metric_meta
+      #   assert 'metric_min' in metric_meta
+      #   assert 'metric_max' in metric_meta
+      #   assert 'metric_higherisbetter' in metric_meta
 
-        print("> Checking .uns['dataset_id']")
-        assert 'dataset_id' in output.uns
-        assert output.uns['dataset_id'] == input_prediction.uns['dataset_id']
+      #   print("> Checking .uns['dataset_id']")
+      #   assert 'dataset_id' in output.uns
+      #   assert output.uns['dataset_id'] == input_prediction.uns['dataset_id']
 
-        print("> Checking .uns['method_id']")
-        assert 'method_id' in output.uns
-        assert output.uns['method_id'] == input_prediction.uns['method_id']
+      #   print("> Checking .uns['method_id']")
+      #   assert 'method_id' in output.uns
+      #   assert output.uns['method_id'] == input_prediction.uns['method_id']
 
-        print("> Checking .uns['metric_ids']")
-        assert 'metric_ids' in output.uns
-        assert set(output.uns['metric_ids']) == set(metric_meta.metric_id)
+      #   print("> Checking .uns['metric_ids']")
+      #   assert 'metric_ids' in output.uns
+      #   assert set(output.uns['metric_ids']) == set(metric_meta.metric_id)
 
-        print("> Checking .uns['metric_values']")
-        assert 'metric_values' in output.uns
-        assert output.uns['metric_ids'].size == output.uns['metric_values'].size
+      #   print("> Checking .uns['metric_values']")
+      #   assert 'metric_values' in output.uns
+      #   assert output.uns['metric_ids'].size == output.uns['metric_values'].size
 
-        # merge with metric_meta to see if metric_value lies within the expected range
-        output_uns = pd.DataFrame({
-          'metric_id': output.uns['metric_ids'], 
-          'metric_value': output.uns['metric_values']
-        })
+      #   # merge with metric_meta to see if metric_value lies within the expected range
+      #   output_uns = pd.DataFrame({
+      #     'metric_id': output.uns['metric_ids'], 
+      #     'metric_value': output.uns['metric_values']
+      #   })
 
-        scores = metric_meta.merge(output_uns, on="metric_id")
+      #   scores = metric_meta.merge(output_uns, on="metric_id")
 
-        assert all(scores.metric_value >= scores.metric_min)
-        assert all(scores.metric_value <= scores.metric_max)
+      #   assert all(scores.metric_value >= scores.metric_min)
+      #   assert all(scores.metric_value <= scores.metric_max)
 
-        print("> Test succeeded!")
+      #   print("> Test succeeded!")
diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml
index 12268747b4..066a0bfe98 100644
--- a/src/joint_embedding/metrics/ari/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ari/config.vsh.yaml
@@ -1,13 +1,24 @@
-__merge__: ../../api/comp_metrics.yaml
+__merge__: ../../api/comp_metric.yaml
 functionality:
   name: ari
   namespace: joint_embedding/metrics
   description: Adjusted rand index (ARI)
+  info:
+    metrics:
+      - id: ari
+        label: ari
+        description: Adjusted rand index (ARI)
+        min: 0
+        max: 1
+        maximize: true
   resources:
     - type: python_script
       path: script.py
 platforms:
   - type: docker
-    image: mumichae/scib-base:1.0.0
+    image: python:3.10
+    setup:
+      - type: python
+        pip: [anndata>=0.8, scib, scanpy]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]

From 22c7f464f7e5a83c93f44098269f31658f71e97f Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 16 Dec 2022 16:40:46 +0100
Subject: [PATCH 16/42] add asw_batch metric

---
 .../metrics/asw_batch/config.vsh.yaml         | 24 ++++++++
 .../metrics/asw_batch/script.py               | 55 +++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 src/joint_embedding/metrics/asw_batch/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/asw_batch/script.py

diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
new file mode 100644
index 0000000000..b0758506a5
--- /dev/null
+++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
@@ -0,0 +1,24 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: asw_batch
+  namespace: joint_embedding/metrics
+  description: Average silhouette width (ASW) of batches per label
+  info:
+    metrics:
+      - id: asw_batch
+        label: asw_batch
+        description: Average silhouette width (ASW) of batches per label
+        min: 0
+        max: 1
+        maximize: true
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: python:3.10
+    setup:
+      - type: python
+        pip: [ anndata>=0.8, scanpy, scib]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/asw_batch/script.py b/src/joint_embedding/metrics/asw_batch/script.py
new file mode 100644
index 0000000000..d111a71d0c
--- /dev/null
+++ b/src/joint_embedding/metrics/asw_batch/script.py
@@ -0,0 +1,55 @@
+import pprint
+import scanpy as sc
+import anndata as ad
+import scib
+
+## VIASH START
+par = dict(
+    input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+    input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+    output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_batch.tsv",
+    debug=True
+)
+
+## VIASH END
+
+if par['debug']:
+    pprint.pprint(par)
+
+input_prediction = par['input_prediction']
+input_solution = par['input_solution']
+output = par['output']
+
+print("Read prediction anndata")
+adata = ad.read(input_prediction)
+dataset_id = adata.uns['dataset_id']
+
+print("Read solution anndata")
+adata_solution = ad.read(input_solution)
+
+print('Transfer obs annotations')
+adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names]
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names]
+
+print('Compute score')
+score = scib.me.silhouette_batch(
+    adata,
+    batch_key='batch',
+    group_key='cell_type',
+    embed='X_emb',
+    verbose=False
+)
+
+# store adata with metrics
+print("Create output object")
+out = ad.AnnData(
+    uns=dict(
+        dataset_id=adata.uns['dataset_id'],
+        method_id=adata.uns['method_id'],
+        metric_ids=['asw_batch'],
+        metric_values=[score]
+    )
+)
+
+print("Write output to h5ad file")
+out.write(output, compression='gzip')

From d7e03de572219ce00b24179b12823fb50b24db4f Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 16 Dec 2022 16:53:18 +0100
Subject: [PATCH 17/42] add asw_label metric

---
 .../metrics/asw_label/config.vsh.yaml         | 24 +++++++++
 .../metrics/asw_label/script.py               | 49 +++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 src/joint_embedding/metrics/asw_label/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/asw_label/script.py

diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
new file mode 100644
index 0000000000..bee00b60d2
--- /dev/null
+++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
@@ -0,0 +1,24 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: asw_label
+  namespace: joint_embedding_metrics
+  description: Average silhouette width (ASW) of labels
+  info:
+    metrics:
+      - id: asw_label
+        label: asw_label
+        description: Average silhouette width (ASW) of labels
+        min: 0
+        max: 1
+        maximize: True
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: python:3.10
+    setup:
+      - type: python
+        pip: [anndata>=0.8, scib]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/asw_label/script.py b/src/joint_embedding/metrics/asw_label/script.py
new file mode 100644
index 0000000000..4d04092004
--- /dev/null
+++ b/src/joint_embedding/metrics/asw_label/script.py
@@ -0,0 +1,49 @@
+import pprint
+import anndata as ad
+import scib
+
+## VIASH START
+par = dict(
+    input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+    input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+    output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_label.tsv",
+    debug=True
+)
+
+## VIASH END
+
+if par['debug']:
+    pprint.pprint(par)
+
+input_prediction = par['input_prediction']
+input_solution = par['input_solution']
+output = par['output']
+
+print("Read prediction anndata")
+adata = ad.read_h5ad(input_prediction)
+dataset_id = adata.uns['dataset_id']
+
+print("Read solution anndata")
+adata_solution = ad.read_h5ad(input_solution)
+
+print('Transfer obs annotations')
+adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names]
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names]
+
+print('Compute score')
+score = scib.me.silhouette(adata, group_key='cell_type', embed='X_emb')
+
+# store adata with metrics
+print("Create output object")
+out = ad.AnnData(
+    uns=dict(
+        dataset_id=adata.uns['dataset_id'],
+        method_id=adata.uns['method_id'],
+        metric_ids=['asw_label'],
+        metric_values=[score]
+    )
+)
+
+print("Write output to h5ad file")
+out.write(output, compression='gzip')
+

From 1b47472afdd66614ad1a897c30491c3ebba82364 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 16 Dec 2022 21:59:31 +0100
Subject: [PATCH 18/42] add cc_cons metric

---
 .../metrics/asw_batch/script.py               | 12 ++---
 .../metrics/asw_label/config.vsh.yaml         |  2 +-
 .../metrics/cc_cons/config.vsh.yaml           | 24 +++++++++
 src/joint_embedding/metrics/cc_cons/script.py | 54 +++++++++++++++++++
 4 files changed, 85 insertions(+), 7 deletions(-)
 create mode 100644 src/joint_embedding/metrics/cc_cons/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/cc_cons/script.py

diff --git a/src/joint_embedding/metrics/asw_batch/script.py b/src/joint_embedding/metrics/asw_batch/script.py
index d111a71d0c..88e89c21a8 100644
--- a/src/joint_embedding/metrics/asw_batch/script.py
+++ b/src/joint_embedding/metrics/asw_batch/script.py
@@ -5,10 +5,10 @@
 
 ## VIASH START
 par = dict(
-    input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
-    input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
-    output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_batch.tsv",
-    debug=True
+    input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad",
+    input_solution="resources_test/common/joint_embedding/cite_solution.h5ad",
+    output="resources_test/common/joint_embedding/score_asw_batch.h5ad",
+    debug=False
 )
 
 ## VIASH END
@@ -21,11 +21,11 @@
 output = par['output']
 
 print("Read prediction anndata")
-adata = ad.read(input_prediction)
+adata = ad.read_h5ad(input_prediction)
 dataset_id = adata.uns['dataset_id']
 
 print("Read solution anndata")
-adata_solution = ad.read(input_solution)
+adata_solution = ad.read_h5ad(input_solution)
 
 print('Transfer obs annotations')
 adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names]
diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
index bee00b60d2..9b1f331cb2 100644
--- a/src/joint_embedding/metrics/asw_label/config.vsh.yaml
+++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
@@ -1,7 +1,7 @@
 __merge__: ../../api/comp_metric.yaml
 functionality:
   name: asw_label
-  namespace: joint_embedding_metrics
+  namespace: joint_embedding/metrics
   description: Average silhouette width (ASW) of labels
   info:
     metrics:
diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
new file mode 100644
index 0000000000..dc8b3ab1c9
--- /dev/null
+++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
@@ -0,0 +1,24 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: cc_cons
+  namespace: joint_embedding/metrics
+  description: Cell cycle conservation score
+  info:
+    metrics:
+      - id: cc_cons
+        label: cc_cons
+        description: Cell cycle conservation score
+        min: 0
+        max: 1
+        maximize: True
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: python:3.10
+    setup:
+      - type: python
+        pip: [anndata>=0.8, scib]
+  - type: nextflow
+    directives: [ vhighmem, midtime, midcpu ]
diff --git a/src/joint_embedding/metrics/cc_cons/script.py b/src/joint_embedding/metrics/cc_cons/script.py
new file mode 100644
index 0000000000..a8741fb511
--- /dev/null
+++ b/src/joint_embedding/metrics/cc_cons/script.py
@@ -0,0 +1,54 @@
+import pprint
+import anndata as ad
+import scib
+
+## VIASH START
+par = dict(
+    input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad",
+    input_solution="resources_test/common/joint_embedding/cite_solution.h5ad",
+    output="resources_test/common/joint_embedding/score_cc_cons.h5ad",
+    debug=False
+)
+## VIASH END
+
+
+if par['debug']:
+    pprint.pprint(par)
+
+print("Read prediction anndata")
+adata = ad.read_h5ad(par['input_prediction'])
+dataset_id = adata.uns['dataset_id']
+
+print("Read solution anndata")
+adata_solution = ad.read_h5ad(par['input_solution'])
+organism = adata_solution.uns['organism']
+
+print('Transfer obs annotations')
+adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names]
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names]
+recompute_cc = 'S_score' not in adata_solution.obs_keys() or \
+               'G2M_score' not in adata_solution.obs_keys()
+
+print('Compute score')
+score = scib.me.cell_cycle(
+    adata_pre=adata_solution,
+    adata_post=adata,
+    batch_key='batch',
+    embed='X_emb',
+    recompute_cc=recompute_cc,
+    organism=organism
+)
+
+# store adata with metrics
+print("Create output object")
+out = ad.AnnData(
+    uns= {
+        "dataset_id":adata.uns['dataset_id'],
+        "method_id":adata.uns['method_id'],
+        "metric_ids":['cc_cons'],
+        "metric_values":[score],
+    }
+)
+
+print("Write output to h5ad file")
+out.write(par['output'], compression='gzip')

From ea82ca5b3e1618453082bfe2b977ab2931c60f7d Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 4 Jan 2023 14:34:30 +0100
Subject: [PATCH 19/42] remove DI docker because of old anndata package

---
 src/joint_embedding/mask_dataset/config.vsh.yaml | 9 ++++++++-
 src/joint_embedding/mask_dataset/script.R        | 2 +-
 src/joint_embedding/mask_dataset/test.R          | 2 +-
 src/joint_embedding/methods/lmds/config.vsh.yaml | 9 +++++++--
 src/joint_embedding/methods/lmds/script.R        | 2 +-
 5 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml
index 1f54f2df62..d49464163d 100644
--- a/src/joint_embedding/mask_dataset/config.vsh.yaml
+++ b/src/joint_embedding/mask_dataset/config.vsh.yaml
@@ -18,6 +18,13 @@ functionality:
     - path: ../../../resources_test
 platforms:
   - type: docker
-    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, assertthat, tidyverse, testthat ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [ anndata>=0.8 ]
   - type: nextflow
     directives: [ midmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R
index 2d70248e69..d417775314 100644
--- a/src/joint_embedding/mask_dataset/script.R
+++ b/src/joint_embedding/mask_dataset/script.R
@@ -1,7 +1,7 @@
 cat("Loading dependencies\n")
 options(tidyverse.quiet = TRUE)
 library(tidyverse)
-requireNamespace("anndata", quietly = TRUE)
+library(anndata, warn.conflicts = FALSE)
 library(assertthat, quietly = TRUE, warn.conflicts = FALSE)
 library(Matrix, quietly = TRUE, warn.conflicts = FALSE)
 
diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R
index ad8cc55eec..88930e00d2 100644
--- a/src/joint_embedding/mask_dataset/test.R
+++ b/src/joint_embedding/mask_dataset/test.R
@@ -1,5 +1,5 @@
 library(testthat, quietly = TRUE, warn.conflicts = FALSE)
-requireNamespace("anndata", quietly = TRUE)
+library(anndata, warn.conflicts = FALSE)
 
 par <- list(
   input_mod1 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad",
diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml
index b9e096a6d1..a0a3ef40af 100644
--- a/src/joint_embedding/methods/lmds/config.vsh.yaml
+++ b/src/joint_embedding/methods/lmds/config.vsh.yaml
@@ -20,9 +20,14 @@ functionality:
       path: script.R
 platforms:
   - type: docker
-    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    image: eddelbuettel/r2u:22.04
     setup:
       - type: r
-        packages: [ lmds ]
+        cran: [ anndata, lmds, tidyverse ]
+      # The R anndata package needs Python anndata at runtime, not only in tests.
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [anndata>=0.8]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/methods/lmds/script.R b/src/joint_embedding/methods/lmds/script.R
index 92700691e0..7dda2a3c4c 100644
--- a/src/joint_embedding/methods/lmds/script.R
+++ b/src/joint_embedding/methods/lmds/script.R
@@ -1,7 +1,7 @@
 cat("Loading dependencies\n")
 options(tidyverse.quiet = TRUE)
 library(tidyverse)
-requireNamespace("anndata", quietly = TRUE)
+library(anndata, warn.conflicts = FALSE)
 library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
 
 ## VIASH START

From 16ce7765320a8d6eb571dd4b24b8063239cd03be Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 4 Jan 2023 14:34:45 +0100
Subject: [PATCH 20/42] add check_format metric

---
 .../metrics/check_format/config.vsh.yaml      | 34 +++++++++++
 .../metrics/check_format/script.R             | 61 +++++++++++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 src/joint_embedding/metrics/check_format/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/check_format/script.R

diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml
new file mode 100644
index 0000000000..1fae80d3aa
--- /dev/null
+++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml
@@ -0,0 +1,34 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: check_format
+  namespace: joint_embedding/metrics
+  description: Checking whether the prediction of a method has the right format.
+  info:
+    metrics:
+      - id: finished
+        label: finished
+        description: "Whether the metric component ran to completion; always 1 when scores are written."
+        min: 0
+        max: 1
+        maximize: true
+      - id: correct_format
+        label: correct_format
+        description: "Whether the prediction passed all format checks against the solution (1) or not (0)."
+        min: 0
+        max: 1
+        maximize: true
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, assertthat, tidyverse ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [ anndata>=0.8 ]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/check_format/script.R b/src/joint_embedding/metrics/check_format/script.R
new file mode 100644
index 0000000000..afe8ed10fd
--- /dev/null
+++ b/src/joint_embedding/metrics/check_format/script.R
@@ -0,0 +1,61 @@
+cat("Load dependencies\n")
+library(assertthat, quietly = TRUE, warn.conflicts = FALSE)
+library(anndata, warn.conflicts = FALSE)
+
+## VIASH START
+task <- "joint_embedding"
+par <- list(
+  input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad"),
+  input_prediction = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"),
+  output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad")
+)
+## VIASH END
+
+cat("Read solution h5ad\n")
+ad_sol <- read_h5ad(par$input_solution)
+
+cat("Checking prediction h5ad\n")
+# Fallback id, kept when the prediction file cannot be read at all.
+method_id <- "unknown"
+correct_format <- tryCatch({
+  # read prediction
+  ad_pred <- read_h5ad(par$input_prediction)
+
+  # check dataset id
+  dataset_id <- ad_pred$uns[["dataset_id"]]
+  assert_that(dataset_id == ad_sol$uns[["dataset_id"]])
+
+  # check method id (assigned in the calling environment, reused below)
+  method_id <- ad_pred$uns[["method_id"]]
+  assert_that(is.character(method_id), method_id != "")
+
+  # check X
+  assert_that(
+    ad_pred$n_obs == ad_sol$n_obs,
+    ad_pred$n_vars >= 1,
+    ad_pred$n_vars <= 100,
+    !is.null(ad_pred$obs_names),
+    all(ad_pred$obs_names == ad_sol$obs_names)
+  )
+
+  1
+}, error = function(e) {
+  cat("ERROR: ", e$message, "\n", sep = "")
+  0
+})
+
+cat("Create output object\n")
+# Use the solution's dataset id and the (possibly fallback) method id:
+# `ad_pred` does not exist when reading the prediction failed.
+out <- AnnData(
+  shape = c(0, 0),
+  uns = list(
+    dataset_id = ad_sol$uns$dataset_id,
+    method_id = method_id,
+    metric_ids = c("finished", "correct_format"),
+    metric_values = c(1, correct_format)
+  )
+)
+
+cat("Write output to h5ad file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")

From 4bce62c0544e8718a4763ca13e977dd62588d356 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 4 Jan 2023 14:55:47 +0100
Subject: [PATCH 21/42] add graph connectivity metric

---
 src/joint_embedding/api/authors.yaml          |  6 ++-
 .../graph_connectivity/config.vsh.yaml        | 24 +++++++++
 .../metrics/graph_connectivity/script.py      | 53 +++++++++++++++++++
 3 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/graph_connectivity/script.py

diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml
index 70f8fc3141..fbac6221d1 100644
--- a/src/joint_embedding/api/authors.yaml
+++ b/src/joint_embedding/api/authors.yaml
@@ -13,4 +13,8 @@ functionality:
     - name: Christopher Lance
       email: clance.connect@gmail.com
       roles: [ author, maintainer ]
-      props: { github: xlancelottx }
\ No newline at end of file
+      props: { github: xlancelottx }
+    - name: Michaela Mueller
+      email: mumichae@in.tum.de
+      roles: [ author, maintainer ]
+      props: { github: mumichae, orcid: "0000-0002-1401-1785" }
\ No newline at end of file
diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
new file mode 100644
index 0000000000..a575757ba6
--- /dev/null
+++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
@@ -0,0 +1,24 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: graph_connectivity
+  namespace: joint_embedding/metrics
+  description: Graph connectivity
+  info:
+    metrics:
+      - id: graph_conn
+        label: graph_conn
+        description: Graph connectivity
+        min: 0
+        max: 1
+        maximize: True
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: python:3.10
+    setup:
+      - type: python
+        pip: [anndata>=0.8, scanpy, scib]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/graph_connectivity/script.py b/src/joint_embedding/metrics/graph_connectivity/script.py
new file mode 100644
index 0000000000..ab9089d74a
--- /dev/null
+++ b/src/joint_embedding/metrics/graph_connectivity/script.py
@@ -0,0 +1,53 @@
+import pprint
+import scanpy as sc
+import anndata as ad
+import scib
+
+## VIASH START
+par = dict(
+    input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+    input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+    output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.graph_conn.tsv",
+    debug=True
+)
+
+## VIASH END
+
+if par['debug']:
+    pprint.pprint(par)
+
+
+input_prediction = par['input_prediction']
+input_solution = par['input_solution']
+output = par['output']
+
+print("Read prediction anndata")
+adata = ad.read_h5ad(input_prediction)
+dataset_id = adata.uns['dataset_id']
+
+print("Read solution anndata")
+adata_solution = ad.read_h5ad(input_solution)
+
+print('Transfer obs annotations')
+adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names]
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names]
+
+print('Preprocessing')
+sc.pp.neighbors(adata, use_rep='X_emb')
+
+print('Compute score')
+score = scib.me.graph_connectivity(adata, label_key='cell_type')
+
+# store adata with metrics
+print("Create output object")
+out = ad.AnnData(
+    uns = { 
+        'dataset_id':adata.uns['dataset_id'],
+        'method_id':adata.uns['method_id'],
+        'metric_ids':['graph_conn'],
+        'metric_values':[score]
+        }
+)
+
+print("Write output to h5ad file")
+out.write(output, compression='gzip')

From bdbdbfda1bbcad1e5ac40c86442184014ad2d4a0 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 4 Jan 2023 15:54:14 +0100
Subject: [PATCH 22/42] add latent mixing metric

---
 src/joint_embedding/api/authors.yaml          |  6 +-
 .../metrics/latent_mixing/config.vsh.yaml     | 28 ++++++
 .../metrics/latent_mixing/script.py           | 92 +++++++++++++++++++
 3 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/latent_mixing/script.py

diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml
index fbac6221d1..f2e96c35fd 100644
--- a/src/joint_embedding/api/authors.yaml
+++ b/src/joint_embedding/api/authors.yaml
@@ -17,4 +17,8 @@ functionality:
     - name: Michaela Mueller
       email: mumichae@in.tum.de
       roles: [ author, maintainer ]
-      props: { github: mumichae, orcid: "0000-0002-1401-1785" }
\ No newline at end of file
+      props: { github: mumichae, orcid: "0000-0002-1401-1785" }
+    - name: Ann Chen
+      email: ann.chen@czbiohub.org
+      roles: [ author, maintainer ]
+      props: { github: atchen}
\ No newline at end of file
diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
new file mode 100644
index 0000000000..b65e2f8c66
--- /dev/null
+++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
@@ -0,0 +1,28 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: latent_mixing
+  namespace: joint_embedding/metrics
+  description: Calculate latent mixing metric for joint embedding task.
+  info:
+    metrics:
+      - id: latent_mixing
+        label: Latent mixing
+        min: -1
+        max: 0
+        maximize: True
+  arguments:
+    - name: "--n_neighbors"
+      type: integer
+      default: 100
+      description: Number of neighbors for the entropy_batch_mixing metric.
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: python:3.10
+    setup:
+      - type: python
+        pip: [anndata>=0.8, scikit-learn, scipy]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
\ No newline at end of file
diff --git a/src/joint_embedding/metrics/latent_mixing/script.py b/src/joint_embedding/metrics/latent_mixing/script.py
new file mode 100644
index 0000000000..511e7f8d2a
--- /dev/null
+++ b/src/joint_embedding/metrics/latent_mixing/script.py
@@ -0,0 +1,92 @@
+import anndata as ad
+import numpy as np
+import scipy
+from sklearn.neighbors import NearestNeighbors
+
+# VIASH START
+par = {
+    "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+    "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+    "output": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores_totalvi.h5ad",
+    "n_neighbors": 100
+}
+# VIASH END
+
+print("Read input files")
+predict_adata = ad.read_h5ad(par["input_prediction"])
+solution_adata = ad.read_h5ad(par["input_solution"])
+
+print("Merge prediction with solution")
+merged_adata = predict_adata.copy()
+
+batch_val = solution_adata.obs["batch"].astype(str)
+batch_unique_values, batch_index = np.unique(batch_val, return_inverse=True)
+
+merged_adata.obs["batch"] = batch_index
+
+def entropy_batch_mixing(
+    latent_space, batches, n_neighbors=50, n_pools=50, n_samples_per_pool=100
+):
+    """Average negative KL divergence between local and global batch frequency.
+
+    For `n_pools` random pools of `n_samples_per_pool` cells, compares the
+    frequency of batch label 1 among each cell's nearest neighbours with the
+    overall mean of `batches`. Only two batch labels (0 and 1) are supported.
+    """
+
+    def neg_kl(hist_data, global_freq):
+        n_batches = len(np.unique(hist_data))
+        if n_batches > 2:
+            raise ValueError("Should be only two clusters for this metric")
+        frequency = np.mean(hist_data == 1)
+        if frequency == 0 or frequency == 1:
+            return 0
+        return -(
+            frequency * np.log(frequency / global_freq)
+            + (1 - frequency) * np.log((1 - frequency) / (1 - global_freq))
+        )
+
+    # Cap by the number of cells; `shape[0]` works for dense and sparse input.
+    n_neighbors = min(n_neighbors, latent_space.shape[0] - 1)
+    nne = NearestNeighbors(n_neighbors=1 + n_neighbors, n_jobs=8)
+    nne.fit(latent_space)
+    kmatrix = nne.kneighbors_graph(latent_space) - scipy.sparse.identity(
+        latent_space.shape[0]
+    )
+
+    global_freq = np.mean(batches)
+    print(global_freq)
+    score = 0
+    for t in range(n_pools):
+        indices = np.random.choice(
+            np.arange(latent_space.shape[0]), size=n_samples_per_pool
+        )
+        # Extract the neighbour graph of the sampled cells once per pool.
+        rows, cols = kmatrix[indices].nonzero()
+        score += np.mean(
+            [
+                neg_kl(batches[cols[rows == i]], global_freq)
+                for i in range(n_samples_per_pool)
+            ]
+        )
+    return score / float(n_pools)
+
+
+print("Calculate latent mixing metric")
+latent_mixing = entropy_batch_mixing(
+    latent_space=merged_adata.obsm['X_emb'],
+    batches=merged_adata.obs["batch"].values,
+    n_neighbors=par["n_neighbors"]
+)
+
+print("Write output")
+adata_out = ad.AnnData(
+    uns = {
+        "dataset_id": predict_adata.uns["dataset_id"],
+        "method_id" : predict_adata.uns["method_id"],
+        "metric_ids" : ["latent_mixing"],
+        "metric_values" : [latent_mixing]
+    }
+)
+
+adata_out.write_h5ad(par['output'], compression = "gzip")
\ No newline at end of file

From 5457a6c3f195ec0bf70db0dbf689544834d3915b Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 4 Jan 2023 16:19:12 +0100
Subject: [PATCH 23/42] add nmi metric

---
 .../metrics/nmi/config.vsh.yaml               | 24 +++++++
 src/joint_embedding/metrics/nmi/script.py     | 64 +++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 src/joint_embedding/metrics/nmi/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/nmi/script.py

diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml
new file mode 100644
index 0000000000..9f507916a4
--- /dev/null
+++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml
@@ -0,0 +1,24 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: nmi
+  namespace: joint_embedding/metrics
+  description: Normalised mutual information (NMI)
+  info:
+    metrics:
+      - id: nmi
+        label: NMI
+        description: Normalised mutual information (NMI)
+        min: 0
+        max: 1
+        maximize: True
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: python:3.10
+    setup:
+      - type: python
+        pip: [anndata>=0.8, scanpy, scib]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/nmi/script.py b/src/joint_embedding/metrics/nmi/script.py
new file mode 100644
index 0000000000..73ef901bd7
--- /dev/null
+++ b/src/joint_embedding/metrics/nmi/script.py
@@ -0,0 +1,64 @@
+import pprint
+import scanpy as sc
+import anndata as ad
+import scib
+
+
+## VIASH START
+par = dict(
+    input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+    input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+    output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.nmi.tsv",
+    debug=True
+)
+
+## VIASH END
+
+if par['debug']:
+    pprint.pprint(par)
+
+
+input_prediction = par['input_prediction']
+input_solution = par['input_solution']
+output = par['output']
+
+print("Read prediction anndata")
+adata = ad.read_h5ad(input_prediction)
+dataset_id = adata.uns['dataset_id']
+
+print("Read solution anndata")
+adata_solution = ad.read_h5ad(input_solution)
+
+print('Transfer obs annotations')
+adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names]
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names]
+
+print('Preprocessing')
+sc.pp.neighbors(adata, use_rep='X_emb')
+
+print('Clustering')
+scib.cl.opt_louvain(
+    adata,
+    label_key='cell_type',
+    cluster_key='cluster',
+    plot=False,
+    inplace=True,
+    force=True
+)
+
+print('Compute score')
+score = scib.me.nmi(adata, group1='cluster', group2='cell_type')
+
+# store adata with metrics
+print("Create output object")
+out = ad.AnnData(
+    uns=dict(
+        dataset_id=adata.uns['dataset_id'],
+        method_id=adata.uns['method_id'],
+        metric_ids=['nmi'],
+        metric_values=[score]
+    )
+)
+
+print("Write output to h5ad file")
+out.write(output, compression='gzip')
\ No newline at end of file

From 6d50fc49b95bc5df9be17e2f21c9a79e1cf57b4d Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 4 Jan 2023 16:58:39 +0100
Subject: [PATCH 24/42] add rfoob metric

---
 .../metrics/rfoob/config.vsh.yaml             | 46 +++++++++++++
 src/joint_embedding/metrics/rfoob/script.R    | 68 +++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 src/joint_embedding/metrics/rfoob/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/rfoob/script.R

diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml
new file mode 100644
index 0000000000..700eb49716
--- /dev/null
+++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml
@@ -0,0 +1,46 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: rfoob
+  namespace: joint_embedding/metrics
+  description: Calculating basic metrics for the joint embedding task.
+  info:
+    metrics:
+      - id: rfoob_celltype_accuracy
+        label: rfoob_celltype_accuracy
+        description:
+        min: 0
+        max: 1
+        maximize: True
+      - id: rfoob_pseudotimegex_rsq
+        label: rfoob_pseudotimegex_rsq
+        description:
+        min: -1
+        max: 1
+        maximize: true
+      - id: rfoob_pseudotimeadt_rsq
+        label: rfoob_pseudotimeadt_rsq
+        description:
+        min: -1
+        max: 1
+        maximize: True
+      - id: rfoob_batch_error
+        label: rfoob_batch_error
+        description:
+        min: 0
+        max: 1
+        maximize: True
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git ]
+      - type: python
+        pip: [ anndata>=0.8 ]
+      - type: r
+        cran: [  anndata, ranger, tidyverse, testthat]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
\ No newline at end of file
diff --git a/src/joint_embedding/metrics/rfoob/script.R b/src/joint_embedding/metrics/rfoob/script.R
new file mode 100644
index 0000000000..e237192fe3
--- /dev/null
+++ b/src/joint_embedding/metrics/rfoob/script.R
@@ -0,0 +1,68 @@
+cat("Load dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+library(testthat, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+
+## VIASH START
+par <- list(
+  input_solution = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+  input_prediction = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+  output = "scores.h5ad"
+)
+## VIASH END
+
+cat("Read solution h5ad\n")
+ad_sol <- anndata::read_h5ad(par$input_solution)
+
+cat("Read prediction h5ad\n")
+expect_true(
+  grepl("\\.h5ad$", par$input_prediction),
+  info = "Prediction file should be an h5ad file"
+)
+ad_pred <-
+  tryCatch({
+    anndata::read_h5ad(par$input_prediction)
+  }, error = function(e) {
+    stop(paste0("Can't open prediction h5ad file. Detailed error message:\n", e$message))
+  })
+expect_true(
+  ad_sol$uns$dataset_id == ad_pred$uns$dataset_id
+)
+
+cat("Calculating metrics\n")
+df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_CELL_TYPE = ad_sol$obs[["cell_type"]])
+rf1 <- ranger::ranger(SOLUTION_CELL_TYPE ~ ., df)
+
+df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs$pseudotime_order_GEX)
+df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE]
+rf2 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df)
+
+colname <- colnames(ad_sol$obs)[grepl("pseudotime_order_A.*", colnames(ad_sol$obs))]
+df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs[[colname]])
+df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE]
+rf3 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df)
+
+df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_BATCH = ad_sol$obs$batch)
+rf4 <- ranger::ranger(SOLUTION_BATCH ~ ., df)
+
+metric_values <- c(
+  rfoob_celltype_accuracy = 1 - rf1$prediction.error,
+  rfoob_pseudotimegex_rsq = rf2$r.squared,
+  rfoob_pseudotimeadt_rsq = rf3$r.squared,
+  rfoob_batch_error = rf4$prediction.error
+)
+
+cat("Create output object\n")
+out <- anndata::AnnData(
+  shape = c(0, 0),
+  uns = list(
+    dataset_id = ad_pred$uns$dataset_id,
+    method_id = ad_pred$uns$method_id,
+    metric_ids = names(metric_values),
+    metric_values = metric_values
+  )
+)
+
+cat("Write output to h5ad file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")

From 82ae20eedc4160221af634f81433532acd166c29 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 4 Jan 2023 17:06:08 +0100
Subject: [PATCH 25/42] add ti_cons metric

---
 .../metrics/ti_cons/config.vsh.yaml           | 36 +++++++++
 src/joint_embedding/metrics/ti_cons/script.py | 81 +++++++++++++++++++
 2 files changed, 117 insertions(+)
 create mode 100644 src/joint_embedding/metrics/ti_cons/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/ti_cons/script.py

diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
new file mode 100644
index 0000000000..e260a94010
--- /dev/null
+++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
@@ -0,0 +1,36 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: ti_cons
+  namespace: joint_embedding/metrics
+  description: Trajectory inference conservation score
+  info:
+    metrics:
+      - id: ti_cons_RNA
+        label: ti_cons_RNA
+        description: Trajectory conservation score (scib) for the GEX pseudotime order.
+        min: 0
+        max: 1
+        maximize: true
+      - id: ti_cons_ADT_ATAC
+        label: ti_cons_ADT_ATAC
+        description: Trajectory conservation score (scib) for the ATAC or ADT pseudotime order.
+        min: 0
+        max: 1
+        maximize: true
+      - id: ti_cons_mean
+        label: ti_cons_mean
+        description: Mean of the ti_cons_RNA and ti_cons_ADT_ATAC scores.
+        min: 0
+        max: 1
+        maximize: true
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: python:3.10
+    setup:
+      - type: python
+        pip: [anndata>=0.8, scib]
+  - type: nextflow
+    directives: [ midmem, lowtime, midcpu ]
diff --git a/src/joint_embedding/metrics/ti_cons/script.py b/src/joint_embedding/metrics/ti_cons/script.py
new file mode 100644
index 0000000000..1d04067024
--- /dev/null
+++ b/src/joint_embedding/metrics/ti_cons/script.py
@@ -0,0 +1,81 @@
+
+
+print('Importing libraries')
+import pprint
+import numpy as np
+import scanpy as sc
+import anndata as ad
+import scib
+
+## VIASH START
+par = dict(
+    input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+    input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+    output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad",
+    debug=True
+)
+## VIASH END
+
+if par['debug']:
+    pprint.pprint(par)
+
+OUTPUT_TYPE = 'graph'
+METRIC = 'ti_cons'
+
+input_prediction = par['input_prediction']
+input_solution = par['input_solution']
+output = par['output']
+
+print("Read prediction anndata")
+adata = ad.read_h5ad(input_prediction)
+dataset_id = adata.uns['dataset_id']
+
+print("Read solution anndata")
+adata_solution = ad.read_h5ad(input_solution)
+
+print('Transfer obs annotations')
+adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names]
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names]
+adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT'
+
+print('Preprocessing')
+sc.pp.neighbors(adata, use_rep='X_emb')
+
+print('Compute scores')
+obs_keys = adata_solution.obs_keys()
+
+if 'pseudotime_order_GEX' in obs_keys:
+    score_rna = scib.me.trajectory_conservation(
+        adata_pre=adata_solution,
+        adata_post=adata,
+        label_key='cell_type',
+        pseudotime_key='pseudotime_order_GEX'
+    )
+else:
+    score_rna = np.nan
+
+if adt_atac_trajectory in obs_keys:
+    score_adt_atac = scib.me.trajectory_conservation(
+        adata_pre=adata_solution,
+        adata_post=adata,
+        label_key='cell_type',
+        pseudotime_key=adt_atac_trajectory
+    )
+else:
+    score_adt_atac = np.nan
+
+score_mean = (score_rna + score_adt_atac) / 2
+
+# store adata with metrics
+print("Create output object")
+out = ad.AnnData(
+    uns=dict(
+        dataset_id=adata.uns['dataset_id'],
+        method_id=adata.uns['method_id'],
+        metric_ids=['ti_cons_RNA', 'ti_cons_ADT_ATAC', 'ti_cons_mean'],
+        metric_values=[score_rna, score_adt_atac, score_mean]
+    )
+)
+
+print("Write output to h5ad file")
+out.write(output, compression='gzip')
\ No newline at end of file

From acfb631ba0859ae1d8cf3bfb14c986dda3a1a59d Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 4 Jan 2023 17:10:08 +0100
Subject: [PATCH 26/42] add ti_cons_batch metric

---
 .../metrics/ti_cons_batch/config.vsh.yaml     | 36 ++++++++
 .../metrics/ti_cons_batch/script.py           | 87 +++++++++++++++++++
 2 files changed, 123 insertions(+)
 create mode 100644 src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
 create mode 100644 src/joint_embedding/metrics/ti_cons_batch/script.py

diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
new file mode 100644
index 0000000000..515366b45b
--- /dev/null
+++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
@@ -0,0 +1,36 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: ti_cons_batch
+  namespace: joint_embedding/metrics
+  description: Trajectory inference conservation score per batch
+  info:
+    metrics:
+      - id: ti_cons_batch_RNA
+        label: ti_cons_batch_RNA
+        description: Per-batch trajectory conservation score (scib) for the GEX pseudotime order.
+        min: 0
+        max: 1
+        maximize: true
+      - id: ti_cons_batch_ADT_ATAC
+        label: ti_cons_batch_ADT_ATAC
+        description: Per-batch trajectory conservation score (scib) for the ATAC or ADT pseudotime order.
+        min: 0
+        max: 1
+        maximize: true
+      - id: ti_cons_batch_mean
+        label: ti_cons_batch_mean
+        description: Mean of the ti_cons_batch_RNA and ti_cons_batch_ADT_ATAC scores.
+        min: 0
+        max: 1
+        maximize: true
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: python:3.10
+    setup:
+      - type: python
+        pip: [anndata>=0.8, scib]
+  - type: nextflow
+    directives: [ midmem, lowtime, midcpu ]
diff --git a/src/joint_embedding/metrics/ti_cons_batch/script.py b/src/joint_embedding/metrics/ti_cons_batch/script.py
new file mode 100644
index 0000000000..5206b59610
--- /dev/null
+++ b/src/joint_embedding/metrics/ti_cons_batch/script.py
@@ -0,0 +1,87 @@
+import pprint
+import numpy as np
+import scanpy as sc
+import anndata as ad
+import scib
+
+## VIASH START
+par = dict(
+    input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+    input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+    output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons_batch.h5ad",
+    debug=True
+)
+## VIASH END
+
+if par['debug']:
+    pprint.pprint(par)
+
+OUTPUT_TYPE = 'graph'
+METRIC = 'ti_cons_batch'
+
+input_prediction = par['input_prediction']
+input_solution = par['input_solution']
+output = par['output']
+
+print("Read prediction anndata")
+adata = ad.read_h5ad(input_prediction)
+dataset_id = adata.uns['dataset_id']
+
+print("Read solution anndata")
+adata_solution = ad.read_h5ad(input_solution)
+
+print('Transfer obs annotations')
+adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names]
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names]
+adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT'
+
+print('Preprocessing')
+sc.pp.neighbors(adata, use_rep='X_emb')
+
+print('Compute scores')
+obs_keys = adata_solution.obs_keys()
+
+if 'pseudotime_order_GEX' in obs_keys:
+    score_rna = scib.me.trajectory_conservation(
+        adata_pre=adata_solution,
+        adata_post=adata,
+        label_key='cell_type',
+        batch_key='batch',
+        pseudotime_key='pseudotime_order_GEX'
+    )
+else:
+    score_rna = np.nan
+
+if adt_atac_trajectory in obs_keys:
+    score_adt_atac = scib.me.trajectory_conservation(
+        adata_pre=adata_solution,
+        adata_post=adata,
+        label_key='cell_type',
+        batch_key='batch',
+        pseudotime_key=adt_atac_trajectory
+    )
+else:
+    score_adt_atac = np.nan
+
+score_mean = (score_rna + score_adt_atac) / 2
+
+# store adata with metrics
+print("Create output object")
+out = ad.AnnData(
+    uns=dict(
+        dataset_id=adata.uns['dataset_id'],
+        method_id=adata.uns['method_id'],
+        metric_ids=['ti_cons_batch_RNA', 'ti_cons_batch_ADT_ATAC', 'ti_cons_batch_mean'],
+        metric_values=[score_rna, score_adt_atac, score_mean]
+    )
+)
+
+print("Write output to h5ad file")
+out.write(output, compression='gzip')
+
+# # store score as tsv
+# with open(output, 'w') as file:
+#     header = ['dataset', 'output_type', 'metric', 'value']
+#     entry = [dataset_id, OUTPUT_TYPE, METRIC, score]
+#     file.write('\t'.join(header) + '\n')
+#     file.write('\t'.join([str(x) for x in entry]))

From 71ae0e9430a570ba0ad7077dc8819b7a5df41e99 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Thu, 5 Jan 2023 14:45:18 +0100
Subject: [PATCH 27/42] add metric unit test

---
 src/joint_embedding/api/comp_metric.yaml      | 161 +++++++++---------
 .../methods/lmds/config.vsh.yaml              |   1 -
 .../metrics/ari/config.vsh.yaml               |   3 +
 .../metrics/asw_batch/config.vsh.yaml         |   3 +
 .../metrics/asw_label/config.vsh.yaml         |   3 +
 .../metrics/cc_cons/config.vsh.yaml           |   3 +
 .../metrics/check_format/config.vsh.yaml      |   3 +
 .../graph_connectivity/config.vsh.yaml        |   3 +
 .../metrics/latent_mixing/config.vsh.yaml     |   3 +
 .../metrics/nmi/config.vsh.yaml               |   3 +
 .../metrics/rfoob/config.vsh.yaml             |   3 +
 .../metrics/ti_cons/config.vsh.yaml           |   3 +
 .../metrics/ti_cons_batch/config.vsh.yaml     |   3 +
 13 files changed, 118 insertions(+), 77 deletions(-)

diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml
index 813d865dc2..bde43a316f 100644
--- a/src/joint_embedding/api/comp_metric.yaml
+++ b/src/joint_embedding/api/comp_metric.yaml
@@ -10,79 +10,88 @@ functionality:
     - name: --debug
       type: boolean_true
       description: Verbose output for debugging.
-  # test_resources:
-  #   - path: ../../../../resources_test
-  #   - type: python_script
-  #     path: generic_test.py
-      # text: |
-      #   from os import path
-      #   import subprocess
-      #   import anndata as ad
-      #   import pandas as pd
-
-      #   # define some filenames
-       
-      #   input_prediction_path =  "resources_test/common/joint_embedding/prediction.h5ad",
-      #   input_solution_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.solution.h5ad",
-      #   output_path = "output.h5ad"
-      #   meta_path = resources_dir + '/metric_meta.tsv'
-
-      #   cmd = [
-      #       meta['executable'],
-      #       "--input_prediction", input_prediction_path,
-      #       "--input_solution", input_solution_path,
-      #       "--output", output_path
-      #   ]
-
-      #   print(">> Running script as test")
-      #   out = subprocess.run(cmd, check=True, capture_output=True, text=True)
-
-      #   print("> Checking whether output files were created")
-      #   assert path.exists(testpar['output'])
-
-      #   print("> Reading h5ad files")
-      #   input_prediction = ad.read_h5ad(testpar['input_prediction'])
-      #   input_solution = ad.read_h5ad(testpar['input_solution'])
-      #   output = ad.read_h5ad(testpar['output'])
-
-      #   metric_meta = pd.read_csv(
-      #     meta_path, 
-      #     delimiter="\t",
-      #     header=0,
-      #     dtype={ 'metric_id': str, 'metric_min': float, 'metric_max': float, 'metric_higherisbetter': bool }
-      #   )
-
-      #   print("> Checking contents of metric_meta.tsv")
-      #   assert 'metric_id' in metric_meta
-      #   assert 'metric_min' in metric_meta
-      #   assert 'metric_max' in metric_meta
-      #   assert 'metric_higherisbetter' in metric_meta
-
-      #   print("> Checking .uns['dataset_id']")
-      #   assert 'dataset_id' in output.uns
-      #   assert output.uns['dataset_id'] == input_prediction.uns['dataset_id']
-
-      #   print("> Checking .uns['method_id']")
-      #   assert 'method_id' in output.uns
-      #   assert output.uns['method_id'] == input_prediction.uns['method_id']
-
-      #   print("> Checking .uns['metric_ids']")
-      #   assert 'metric_ids' in output.uns
-      #   assert set(output.uns['metric_ids']) == set(metric_meta.metric_id)
-
-      #   print("> Checking .uns['metric_values']")
-      #   assert 'metric_values' in output.uns
-      #   assert output.uns['metric_ids'].size == output.uns['metric_values'].size
-
-      #   # merge with metric_meta to see if metric_value lies within the expected range
-      #   output_uns = pd.DataFrame({
-      #     'metric_id': output.uns['metric_ids'], 
-      #     'metric_value': output.uns['metric_values']
-      #   })
-
-      #   scores = metric_meta.merge(output_uns, on="metric_id")
-
-      #   assert all(scores.metric_value >= scores.metric_min)
-      #   assert all(scores.metric_value <= scores.metric_max)
-
-      #   print("> Test succeeded!")
+  test_resources:
+    - path: ../../../../resources_test
+    - type: python_script
+      path: generic_test.py
+      text: |
+        from os import path
+        import subprocess
+        import anndata as ad
+        import pandas as pd
+        import yaml
+
+        ## VIASH START
+        # This code block will be replaced by viash at runtime.
+        par = {
+          "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+          "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
+          "output": "output.h5ad"
+        }
+        meta = { 'functionality_name': 'foo' }
+
+        ## VIASH END
+
+        input_prediction_path = "resources_test/common/joint_embedding/cite_random_prediction.h5ad"
+        input_solution_path = "resources_test/common/joint_embedding/cite_solution.h5ad"
+        output_path = "output.h5ad"
+        # define some filenames
+        with open(meta["config"], "r") as file:
+                config = yaml.safe_load(file)
+
+        cmd = [
+            meta['executable'],
+            "--input_prediction", input_prediction_path,
+            "--input_solution", input_solution_path,
+            "--output", output_path
+        ]
+
+        print("> Running method", flush=True)
+        out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
+
+        print("> Checking whether output files were created", flush=True)
+        assert path.exists(output_path)
+
+        print("> Reading h5ad files", flush=True)
+        input_prediction = ad.read_h5ad(input_prediction_path)
+        input_solution = ad.read_h5ad(input_solution_path)
+        output = ad.read_h5ad(output_path)
+
+        # Create DF from metric config info
+        metric_info = config['functionality']['info']['metrics']
+        metric_meta = pd.DataFrame(metric_info)
+        metric_meta = metric_meta.astype({'id': str, 'label': str, 'description': str, 'min': float, 'max': float, 'maximize': bool})
+        print("> Checking contents of metric info", flush=True)
+        assert 'id' in metric_meta
+        assert 'min' in metric_meta
+        assert 'max' in metric_meta
+        assert 'maximize' in metric_meta
+
+        print("> Checking .uns['dataset_id']", flush=True)
+        assert 'dataset_id' in output.uns
+        assert output.uns['dataset_id'] == input_prediction.uns['dataset_id']
+
+        print("> Checking .uns['method_id']", flush=True)
+        assert 'method_id' in output.uns
+        assert output.uns['method_id'] == input_prediction.uns['method_id']
+
+        print("> Checking .uns['metric_ids']", flush=True)
+        assert 'metric_ids' in output.uns
+        assert set(output.uns['metric_ids']) == set(metric_meta.id)
+
+        print("> Checking .uns['metric_values']", flush=True)
+        assert 'metric_values' in output.uns
+        assert output.uns['metric_ids'].size == output.uns['metric_values'].size
+
+        # merge with metric_meta to see if metric_value lies within the expected range
+        output_uns = pd.DataFrame({
+          'id': output.uns['metric_ids'], 
+          'value': output.uns['metric_values']
+        })
+
+        scores = metric_meta.merge(output_uns, on="id")
+
+        assert all(scores.value >= scores['min'])
+        assert all(scores.value <= scores['max'])
+
+        print("> Test succeeded!", flush=True)
diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml
index a0a3ef40af..d01b922fd7 100644
--- a/src/joint_embedding/methods/lmds/config.vsh.yaml
+++ b/src/joint_embedding/methods/lmds/config.vsh.yaml
@@ -24,7 +24,6 @@ platforms:
     setup:
       - type: r
         cran: [ anndata, lmds, tidyverse ]
-    test_setup:
       - type: apt
         packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
       - type: python
diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml
index 066a0bfe98..5097080a70 100644
--- a/src/joint_embedding/metrics/ari/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ari/config.vsh.yaml
@@ -20,5 +20,8 @@ platforms:
     setup:
       - type: python
         pip: [anndata>=0.8, scib, scanpy]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
index b0758506a5..24ab2503b5 100644
--- a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
+++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
@@ -20,5 +20,8 @@ platforms:
     setup:
       - type: python
         pip: [ anndata>=0.8, scanpy, scib]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
index 9b1f331cb2..74d7c24afe 100644
--- a/src/joint_embedding/metrics/asw_label/config.vsh.yaml
+++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
@@ -20,5 +20,8 @@ platforms:
     setup:
       - type: python
         pip: [anndata>=0.8, scib]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
index dc8b3ab1c9..86a624c632 100644
--- a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
+++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
@@ -20,5 +20,8 @@ platforms:
     setup:
       - type: python
         pip: [anndata>=0.8, scib]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ vhighmem, midtime, midcpu ]
diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml
index 1fae80d3aa..934af734e3 100644
--- a/src/joint_embedding/metrics/check_format/config.vsh.yaml
+++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml
@@ -30,5 +30,8 @@ platforms:
         packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
       - type: python
         pip: [ anndata>=0.8 ]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
index a575757ba6..8cbde10e3a 100644
--- a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
+++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
@@ -20,5 +20,8 @@ platforms:
     setup:
       - type: python
         pip: [anndata>=0.8, scib]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
index b65e2f8c66..2ef4049453 100644
--- a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
+++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
@@ -24,5 +24,8 @@ platforms:
     setup:
       - type: python
         pip: [anndata>=0.8, scikit-learn, scipy]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
\ No newline at end of file
diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml
index 9f507916a4..bf98b925de 100644
--- a/src/joint_embedding/metrics/nmi/config.vsh.yaml
+++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml
@@ -20,5 +20,8 @@ platforms:
     setup:
       - type: python
         pip: [anndata>=0.8, scib]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml
index 700eb49716..5f080b8b08 100644
--- a/src/joint_embedding/metrics/rfoob/config.vsh.yaml
+++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml
@@ -42,5 +42,8 @@ platforms:
         pip: [ anndata>=0.8 ]
       - type: r
         cran: [  anndata, ranger, tidyverse, testthat]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
\ No newline at end of file
diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
index e260a94010..e7d70b24b6 100644
--- a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
@@ -32,5 +32,8 @@ platforms:
     setup:
       - type: python
         pip: [anndata>=0.8, scib]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ midmem, lowtime, midcpu ]
diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
index 515366b45b..43aed327f6 100644
--- a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
@@ -32,5 +32,8 @@ platforms:
     setup:
       - type: python
         pip: [anndata>=0.8, scib]
+    test_setup:
+      - type: python
+        pip: [ pyyaml ]
   - type: nextflow
     directives: [ midmem, lowtime, midcpu ]

From ed38c115bc0ab522959b33aa3866fe3d2feb9834 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Thu, 5 Jan 2023 21:24:41 +0100
Subject: [PATCH 28/42] add task_info.yaml

---
 src/joint_embedding/docs/task_info.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 src/joint_embedding/docs/task_info.yaml

diff --git a/src/joint_embedding/docs/task_info.yaml b/src/joint_embedding/docs/task_info.yaml
new file mode 100644
index 0000000000..6565c0c292
--- /dev/null
+++ b/src/joint_embedding/docs/task_info.yaml
@@ -0,0 +1,16 @@
+task_id: joint_embedding
+task_name: Joint Embedding
+v1_url: neurips2021_multimodal_viash/src/joint_embedding/readme.md
+v1_commit: 0f8eae583444ba3f71c3083b860cc34b9ecb2fa2
+short_description: Learning of an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation).
+description: |
+  The functioning of organs, tissues, and whole organisms is determined by the interplay of cells. 
+  Cells are characterised into broad types, which in turn can take on different states. Here, a cell 
+  state is made up of the sum of all processes that are occurring within the cell. We can gain insight 
+  into the state of a cell by different types of measurements: e.g., RNA expression, protein abundance, 
+  or chromatin conformation. Combining this information to describe cellular heterogeneity requires the 
+  formation of joint embeddings generated from this multimodal data. These embeddings must account for 
+  and remove possible batch effects between different measurement batches. The reward for methods that 
+  can achieve this is great: a highly resolved description of the underlying biological state of a cell 
+  that determines its function, how it interacts with other cells, and thus the cell’s role in the
+  functioning of the whole tissue.
\ No newline at end of file

From b6d5bbdcfb5c7aed7ad3858df47006876f6edb3b Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 6 Jan 2023 21:37:57 +0100
Subject: [PATCH 29/42] create NF workflow

---
 .../methods/mnn/config.vsh.yaml               |   8 +-
 .../methods/newwave/config.vsh.yaml           |   8 +-
 .../methods/pca/config.vsh.yaml               |   8 +-
 src/joint_embedding/methods/totalvi/script.py |  10 +-
 .../methods/umap/config.vsh.yaml              |  10 +-
 .../resources_scripts/mask_datasets.sh        |  64 ++++++++
 .../resources_scripts/run_benchmarks.sh       |  74 +++++++++
 .../resources_test_scripts/bmmc_cite.sh       |  57 +++++++
 .../workflows/run/config.vsh.yaml             |  26 +++
 src/joint_embedding/workflows/run/main.nf     | 152 ++++++++++++++++++
 .../workflows/run/nextflow.config             |  14 ++
 11 files changed, 419 insertions(+), 12 deletions(-)
 create mode 100644 src/joint_embedding/resources_scripts/mask_datasets.sh
 create mode 100644 src/joint_embedding/resources_scripts/run_benchmarks.sh
 create mode 100644 src/joint_embedding/resources_test_scripts/bmmc_cite.sh
 create mode 100644 src/joint_embedding/workflows/run/config.vsh.yaml
 create mode 100644 src/joint_embedding/workflows/run/main.nf
 create mode 100644 src/joint_embedding/workflows/run/nextflow.config

diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml
index 8759ade4e0..486beb9301 100644
--- a/src/joint_embedding/methods/mnn/config.vsh.yaml
+++ b/src/joint_embedding/methods/mnn/config.vsh.yaml
@@ -16,9 +16,15 @@ functionality:
       path: script.R
 platforms:
   - type: docker
-    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    image: eddelbuettel/r2u:22.04
     setup:
+      - type: r
+        cran: [ anndata, lmds, tidyverse, bioconductor]
       - type: r
         bioc: [ SingleCellExperiment, batchelor, proxyC ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [anndata>=0.8]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml
index c327dcfa8c..2da0df3a4b 100644
--- a/src/joint_embedding/methods/newwave/config.vsh.yaml
+++ b/src/joint_embedding/methods/newwave/config.vsh.yaml
@@ -25,9 +25,15 @@ functionality:
       path: script.R
 platforms:
   - type: docker
-    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    image: eddelbuettel/r2u:22.04
     setup:
+      - type: r
+        cran: [ anndata, lmds, tidyverse, bioconductor]
       - type: r
         bioc: [ SingleCellExperiment, NewWave, proxyC ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [anndata>=0.8]
   - type: nextflow
     directives: [ highmem, hightime, highcpu ]
diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml
index cb4d24642f..967de6da6e 100644
--- a/src/joint_embedding/methods/pca/config.vsh.yaml
+++ b/src/joint_embedding/methods/pca/config.vsh.yaml
@@ -21,9 +21,13 @@ functionality:
       path: script.R
 platforms:
   - type: docker
-    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    image: eddelbuettel/r2u:22.04
     setup:
       - type: r
-        packages: [ irlba, proxyC ]
+        cran: [ anndata, lmds, tidyverse, bioconductor, irlba, proxyC]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [anndata>=0.8]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py
index b47400816d..0c22dee08f 100644
--- a/src/joint_embedding/methods/totalvi/script.py
+++ b/src/joint_embedding/methods/totalvi/script.py
@@ -17,12 +17,12 @@
 }
 ## VIASH END
 
-print("Load and prepare data")
+print("Load and prepare data", flush=True)
 adata_mod1 = anndata.read_h5ad(par['input_mod1'])
 adata_mod2 = anndata.read_h5ad(par['input_mod2'])
 adata_mod1.obsm['protein_expression'] = adata_mod2.X.toarray()
 
-print('Select highly variable genes')
+print('Select highly variable genes', flush=True)
 sc.pp.highly_variable_genes(
     adata_mod1,
     n_top_genes=par['hvg_number'],
@@ -31,18 +31,18 @@
     subset=True
 )
 
-print("Set up model")
+print("Set up model", flush=True)
 TOTALVI.setup_anndata(
     adata_mod1,
     batch_key="batch",
     protein_expression_obsm_key="protein_expression"
 )
 
-print('Train totalVI with', par['max_epochs'], 'epochs')
+print('Train totalVI with', par['max_epochs'], 'epochs', flush=True)
 vae = TOTALVI(adata_mod1, latent_distribution="normal")
 vae.train(max_epochs = par['max_epochs'])
 
-print("Postprocessing and saving output")
+print("Postprocessing and saving output", flush=True)
 adata_out = anndata.AnnData(
     X=vae.get_latent_representation(),
     obs=adata_mod1.obs[['batch']],
diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml
index f8b27fdaec..4b10222d1c 100644
--- a/src/joint_embedding/methods/umap/config.vsh.yaml
+++ b/src/joint_embedding/methods/umap/config.vsh.yaml
@@ -1,6 +1,6 @@
 __merge__: ../../api/comp_method.yaml
 functionality:
-  name: umam
+  name: umap
   namespace: joint_embedding/methods
   version: dev
   description: UMAP dimensionality reduction on the Euclidean distance.
@@ -33,9 +33,13 @@ functionality:
       path: script.R
 platforms:
   - type: docker
-    image: dataintuitive/randpy:r4.0_py3.8_bioc3.12
+    image: eddelbuettel/r2u:22.04
     setup:
       - type: r
-        packages: [ uwot, irlba, proxyC ]
+        cran: [ anndata, lmds, tidyverse, irlba, proxyC, uwot]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [anndata>=0.8]
   - type: nextflow
     directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/joint_embedding/resources_scripts/mask_datasets.sh b/src/joint_embedding/resources_scripts/mask_datasets.sh
new file mode 100644
index 0000000000..dfb1295fb7
--- /dev/null
+++ b/src/joint_embedding/resources_scripts/mask_datasets.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+COMMON_DATASETS="resources/datasets/openproblems_v1"
+OUTPUT_DIR="resources/joint_embedding/datasets/openproblems_v1"
+
+mkdir -p "$OUTPUT_DIR"
+
+params_file="$OUTPUT_DIR/params.yaml"
+
+if [ ! -f "$params_file" ]; then
+  python << HERE
+import anndata as ad
+import glob
+import yaml
+
+h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad")
+
+# this task doesn't use normalizations
+# NOTE(review): mask_dataset is called with --input_mod1/--input_mod2 in
+# resources_test_scripts/bmmc_cite.sh; map the single 'input' below onto those.
+param_list = {}
+
+for h5ad_file in h5ad_files:
+  print(f"Checking {h5ad_file}")
+  adata = ad.read_h5ad(h5ad_file, backed=True)
+  if "counts" in adata.layers:
+    dataset_id = adata.uns["dataset_id"].replace("/", ".")
+    obj = {
+      'id': dataset_id,
+      'input': h5ad_file,
+      'dataset_id': dataset_id,
+    }
+    param_list[dataset_id] = obj
+
+output = {
+  "param_list": list(param_list.values()),
+  "seed": 123,
+  "output_mod1": "\$id.mod1.h5ad",
+  "output_mod2": "\$id.mod2.h5ad",
+  "output_solution": "\$id.solution.h5ad"
+}
+
+with open("$params_file", "w") as file:
+  yaml.dump(output, file)
+HERE
+fi
+
+export NXF_VER=22.04.5
+nextflow \
+  run . \
+  -main-script target/nextflow/joint_embedding/mask_dataset/main.nf \
+  -profile docker \
+  -resume \
+  -params-file "$params_file" \
+  --publish_dir "$OUTPUT_DIR"
+
+bin/tools/docker/nextflow/process_log/process_log \
+  --output "$OUTPUT_DIR/nextflow_log.tsv"
\ No newline at end of file
diff --git a/src/joint_embedding/resources_scripts/run_benchmarks.sh b/src/joint_embedding/resources_scripts/run_benchmarks.sh
new file mode 100644
index 0000000000..01d6ef92a5
--- /dev/null
+++ b/src/joint_embedding/resources_scripts/run_benchmarks.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+set -e
+
+export TOWER_WORKSPACE_ID=53907369739130
+
+DATASETS_DIR="resources/joint_embedding/datasets/openproblems_v1"
+OUTPUT_DIR="resources/joint_embedding/benchmarks/openproblems_v1"
+
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+
+params_file="$OUTPUT_DIR/params.yaml"
+
+if [ ! -f "$params_file" ]; then
+  python << HERE
+import yaml
+import os
+
+dataset_dir = "$DATASETS_DIR"
+output_dir = "$OUTPUT_DIR"
+
+# read masked datasets yaml
+with open(dataset_dir + "/params.yaml", "r") as file:
+  split_list = yaml.safe_load(file)
+datasets = split_list['param_list']
+
+# locate the mod1/mod2/solution files (assumes <id>.<part>.h5ad naming - TODO confirm against the masking step)
+param_list = []
+
+for dataset in datasets:
+  dataset_key = dataset["id"]
+  input_mod1 = dataset_dir + "/" + dataset_key + ".mod1.h5ad"
+  input_mod2 = dataset_dir + "/" + dataset_key + ".mod2.h5ad"
+  input_solution = dataset_dir + "/" + dataset_key + ".solution.h5ad"
+
+  if all(os.path.exists(path) for path in (input_mod1, input_mod2, input_solution)):
+    obj = {
+      'id': dataset_key,
+      'dataset_id': dataset["dataset_id"],
+      'input_mod1': input_mod1,
+      'input_mod2': input_mod2,
+      'input_solution': input_solution
+    }
+    param_list.append(obj)
+
+# write as output file
+output = {
+  "param_list": param_list,
+}
+
+with open(output_dir + "/params.yaml", "w") as file:
+  yaml.dump(output, file)
+HERE
+fi
+
+export NXF_VER=22.04.5
+nextflow \
+  run . \
+  -main-script src/joint_embedding/workflows/run/main.nf \
+  -profile docker \
+  -params-file "$params_file" \
+  --publish_dir "$OUTPUT_DIR" \
+  -with-tower
+
+bin/tools/docker/nextflow/process_log/process_log \
+  --output "$OUTPUT_DIR/nextflow_log.tsv"
\ No newline at end of file
diff --git a/src/joint_embedding/resources_test_scripts/bmmc_cite.sh b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh
new file mode 100644
index 0000000000..72967ef27b
--- /dev/null
+++ b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#
+#make sure the following command has been executed
+#bin/viash_build -q 'joint_embedding|common'
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad
+MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad
+DATASET_DIR=resources_test/joint_embedding/bmmc_cite
+
+if [ ! -f "$MOD_1_DATA" ] || [ ! -f "$MOD_2_DATA" ]; then
+    echo "Error! Could not find raw data"
+    exit 1
+fi
+
+mkdir -p "$DATASET_DIR"
+
+# mask dataset
+bin/viash run src/joint_embedding/mask_dataset/config.vsh.yaml -- \
+    --input_mod1 "$MOD_1_DATA" \
+    --input_mod2 "$MOD_2_DATA" \
+    --output_mod1 "$DATASET_DIR/cite_mod1.h5ad" \
+    --output_mod2 "$DATASET_DIR/cite_mod2.h5ad" \
+    --output_solution "$DATASET_DIR/cite_solution.h5ad"
+
+# run one method
+bin/viash run src/joint_embedding/methods/pca/config.vsh.yaml -- \
+    --input_mod1 "$DATASET_DIR/cite_mod1.h5ad" \
+    --input_mod2 "$DATASET_DIR/cite_mod2.h5ad" \
+    --output "$DATASET_DIR/pca.h5ad"
+
+# run one metric
+bin/viash run src/joint_embedding/metrics/ari/config.vsh.yaml -- \
+    --input_prediction "$DATASET_DIR/pca.h5ad" \
+    --input_solution "$DATASET_DIR/cite_solution.h5ad" \
+    --output "$DATASET_DIR/ari.h5ad"
+
+# run benchmark
+export NXF_VER=22.04.5
+
+bin/nextflow \
+  run . \
+  -main-script src/joint_embedding/workflows/run/main.nf \
+  -profile docker \
+  -resume \
+  --id bmmc_cite \
+  --dataset_id bmmc_cite \
+  --input_mod1 "$DATASET_DIR/cite_mod1.h5ad" \
+  --input_mod2 "$DATASET_DIR/cite_mod2.h5ad" \
+  --input_solution "$DATASET_DIR/cite_solution.h5ad" \
+  --output scores.tsv \
+  --publish_dir "$DATASET_DIR/"
\ No newline at end of file
diff --git a/src/joint_embedding/workflows/run/config.vsh.yaml b/src/joint_embedding/workflows/run/config.vsh.yaml
new file mode 100644
index 0000000000..5f3e7800cf
--- /dev/null
+++ b/src/joint_embedding/workflows/run/config.vsh.yaml
@@ -0,0 +1,26 @@
+functionality:
+  name: run_benchmark
+  namespace: joint_embedding/workflows
+  argument_groups:
+    - name: Inputs
+      arguments:
+        - name: '--id'
+          type: string
+          description: The ID of the dataset
+          required: true
+        - name: '--input_mod1'
+          type: file  # todo: replace with includes
+        - name: '--input_mod2'
+          type: file  # todo: replace with includes
+        - name: '--input_solution'
+          type: file  # todo: replace with includes
+    - name: Outputs
+      arguments:
+        - name: '--output'
+          direction: output
+          type: file
+  resources:
+    - type: nextflow_script
+      path: main.nf
+platforms:
+  - type: nextflow
\ No newline at end of file
diff --git a/src/joint_embedding/workflows/run/main.nf b/src/joint_embedding/workflows/run/main.nf
new file mode 100644
index 0000000000..45ac7ad76d
--- /dev/null
+++ b/src/joint_embedding/workflows/run/main.nf
@@ -0,0 +1,152 @@
+nextflow.enable.dsl=2
+
+sourceDir = params.rootDir + "/src"
+targetDir = params.rootDir + "/target/nextflow"
+
+// import control methods
+include { random_embed } from "$targetDir/joint_embedding/control_methods/random_embed/main.nf"
+include { zeros_embed } from "$targetDir/joint_embedding/control_methods/zeros_embed/main.nf"
+
+// import methods
+include { lmds } from "$targetDir/joint_embedding/methods/lmds/main.nf"
+include { mnn } from "$targetDir/joint_embedding/methods/mnn/main.nf"
+include { newwave } from "$targetDir/joint_embedding/methods/newwave/main.nf"
+include { pca } from "$targetDir/joint_embedding/methods/pca/main.nf"
+include { totalvi } from "$targetDir/joint_embedding/methods/totalvi/main.nf"
+include { umap } from "$targetDir/joint_embedding/methods/umap/main.nf"
+
+// import metrics
+include { ari } from "$targetDir/joint_embedding/metrics/ari/main.nf"
+include { asw_batch } from "$targetDir/joint_embedding/metrics/asw_batch/main.nf"
+include { asw_label } from "$targetDir/joint_embedding/metrics/asw_label/main.nf"
+include { cc_cons } from "$targetDir/joint_embedding/metrics/cc_cons/main.nf"
+include { check_format } from "$targetDir/joint_embedding/metrics/check_format/main.nf"
+include { graph_connectivity } from "$targetDir/joint_embedding/metrics/graph_connectivity/main.nf"
+include { latent_mixing } from "$targetDir/joint_embedding/metrics/latent_mixing/main.nf"
+include { nmi } from "$targetDir/joint_embedding/metrics/nmi/main.nf"
+include { rfoob } from "$targetDir/joint_embedding/metrics/rfoob/main.nf"
+include { ti_cons } from "$targetDir/joint_embedding/metrics/ti_cons/main.nf"
+include { ti_cons_batch } from "$targetDir/joint_embedding/metrics/ti_cons_batch/main.nf"
+
+// tsv generation component
+include { extract_scores } from "$targetDir/common/extract_scores/main.nf"
+
+// import helper functions
+include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf"
+include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + "/wf_utils/DataflowHelper.nf"
+
+config = readConfig("$projectDir/config.vsh.yaml")
+
+// method name -> module lookup; NOTE(review): random_embed/zeros_embed are included above but not listed here - confirm intent
+methods = [ lmds, mnn, newwave, pca, totalvi, umap ]
+  .collectEntries { method_module ->
+    [ method_module.config.functionality.name, method_module ]
+  }
+
+workflow {
+  helpMessage(config)
+  // build the input channel from params + config and hand it to run_wf
+  viashChannel(params, config)
+    | run_wf
+}
+
+workflow run_wf {
+  take:
+  input_ch
+  // pipeline: split arguments -> add_methods -> run_methods -> run_metrics -> aggregate_results
+  main:
+  output_ch = input_ch
+
+    // split params for downstream components
+    | setWorkflowArguments(
+      method: ["input_mod1", "input_mod2"],
+      metric: ["input_solution"],
+      output: ["output"]
+    )
+
+    // multiply events by the number of methods
+    | add_methods
+
+    // run methods
+    | getWorkflowArguments(key: "method")
+    | run_methods
+
+    // construct tuples for metrics
+    | pmap{ id, file, passthrough ->
+      // derive unique ids from output filenames; the dot is escaped so only a literal ".output" suffix is stripped
+      def newId = file.getName().replaceAll("\\.output.*", "")
+      // combine prediction with solution
+      def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ]
+      [ newId, newData, passthrough ]
+    }
+
+    // run metrics
+    | getWorkflowArguments(key: "metric")
+    | run_metrics
+
+    // convert to tsv
+    | aggregate_results
+
+  emit:
+  output_ch
+}
+
+workflow add_methods {
+  take: input_ch
+  main:
+  // pair every method name with every incoming event
+  output_ch = Channel.fromList(methods.keySet())
+    | combine(input_ch)
+    // re-key each event as "<event id>.<method name>" and record the method in its data map
+    | pmap{ method_name, event_id, event_data ->
+      def combined_id = event_id + "." + method_name
+      def method_data = event_data + [method_id: method_name]
+      method_data.remove("id")
+      [combined_id, method_data]
+    }
+  emit: output_ch
+}
+
+workflow run_methods {
+  take: input_ch
+  main:
+    // route each event to the module whose name equals the event's method_id
+    method_chs = methods.collect { method_name, method_module ->
+        input_ch
+          | filter{ event -> event[1].method_id == method_name }
+          | method_module
+      }
+    // merge the per-method channels back into a single channel
+    output_ch = method_chs[0].mix(*method_chs.drop(1))
+
+  emit: output_ch
+}
+
+workflow run_metrics {
+  take: input_ch
+  main:
+  // feed every event to each metric component, then merge the resulting channels into one
+  output_ch = input_ch
+    | (ari & asw_batch & asw_label & cc_cons & check_format & graph_connectivity & latent_mixing & nmi & rfoob & ti_cons & ti_cons_batch)
+    | mix
+
+  emit: output_ch
+}
+
+workflow aggregate_results {
+  take: input_ch
+  main:
+  // collect all metric events into one "combined" event and pass it to extract_scores with publishing enabled
+  output_ch = input_ch
+    | toSortedList
+    | filter{ events -> events.size() > 0 }
+    | map{ events ->
+      [ "combined", events.collect{ event -> event[1] } ] + events[0].drop(2)
+    }
+    | getWorkflowArguments(key: "output")
+    | extract_scores.run(
+        auto: [ publish: true ]
+    )
+
+  emit: output_ch
+}
\ No newline at end of file
diff --git a/src/joint_embedding/workflows/run/nextflow.config b/src/joint_embedding/workflows/run/nextflow.config
new file mode 100644
index 0000000000..ea674ffa07
--- /dev/null
+++ b/src/joint_embedding/workflows/run/nextflow.config
@@ -0,0 +1,14 @@
+manifest {
+  name = 'joint_embedding/workflows/run'
+  mainScript = 'main.nf'
+  nextflowVersion = '!>=22.04.5'
+  description = 'Multi modality - joint embedding'
+}
+// rootDir: four directory levels above projectDir, as a normalized absolute path
+params {
+  rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
+}
+
+// pull in the shared profile and label definitions from src/wf_utils under rootDir
+includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config")
+includeConfig("${params.rootDir}/src/wf_utils/labels.config")
\ No newline at end of file

From 99b05244058a33b2db8beb617e03c9e773e7c834 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 6 Jan 2023 21:52:18 +0100
Subject: [PATCH 30/42] update changelog

---
 CHANGELOG.md | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a55a57a969..8a50fd2171 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -163,3 +163,66 @@
 * Higher dimensional data used to obtain the metrics is calculated from test data instead of the whole dataset. So far test and train data contain the same counts values, but this may change eventually.
 
 * Test data is used instead of the whole dataset in control (baseline) methods.
+
+
+## Multi modality - Joint Embedding
+
+### New functinality
+
+* `api/anndata_*`: Created a file format specifications for the h5ad files throughout the pipeline.
+
+* `api/comp_*`: Created an api definition for the mask, method and metric components.
+
+* `mask_dataset`: Added a component for masking raw datasets into task-ready dataset objects.
+
+* `resources_test/joint_embedding/pancreas` with `src/joint_embedding/resources_test_scripts/pancreas.sh`.
+  
+### neurips 2021 migration
+
+* `control_methods/random_embed`: Migrated from v1. Extracted from baseline method `dummy_random`.
+
+* `control_methods/zeros_embed`: Migrated from v1. Extracted from baseline method `dummy_zeros`.
+
+* `methods/lmds`: Migrated from v1.
+
+* `methods/mnn`: Migrated and adapted from v1.
+
+* `methods/newwave`: Migrated and adapted from v1.
+
+* `methods/pca`: Migrated from v1.
+
+* `methods/totalvi`: Migrated from v1.
+
+* `methods/umap`: Migrated from v1.
+
+* `metrics/ari`: Migrated from v1.
+  
+* `metrics/asw_batch`: Migrated from v1.
+
+* `metrics/asw_label`: Migrated from v1.
+
+* `metrics/cc_cons`: Migrated from v1.
+
+* `metrics/check_format`: Migrated from v1.
+
+* `metrics/graph_connectivity`: Migrated from v1.
+
+* `metrics/latent_mixing`: Migrated from v1.
+
+* `metrics/nmi`: Migrated from v1.
+
+* `metrics/rfoob`: Migrated from v1.
+
+* `metrics/ti_cons`: Migrated from v1.
+
+* `metrics/ti_cons_batch`: Migrated from v1.
+
+### changes from neurips 2021
+
+* Updated docker config from R script. Was using an old `anndata` package which was giving warnings
+
+* stores the output from the methods in `.obsm["X_emb"]` instead of `.X` in the `anndata`
+
+* `X_emb ` data is stored as a `Sparse Matrix`
+  
+* updated configs to latest `viash` 
\ No newline at end of file

From c8ae6017be1b3ca558ccdd237f81c159635edff2 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 6 Jan 2023 21:55:25 +0100
Subject: [PATCH 31/42] update changelog

---
 CHANGELOG.md | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8a50fd2171..8c075e9139 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -179,43 +179,43 @@
   
 ### neurips 2021 migration
 
-* `control_methods/random_embed`: Migrated from v1. Extracted from baseline method `dummy_random`.
+* `control_methods/random_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_random`.
 
-* `control_methods/zeros_embed`: Migrated from v1. Extracted from baseline method `dummy_zeros`.
+* `control_methods/zeros_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_zeros`.
 
-* `methods/lmds`: Migrated from v1.
+* `methods/lmds`: Migrated from neurips 2021.
 
-* `methods/mnn`: Migrated and adapted from v1.
+* `methods/mnn`: Migrated and adapted from neurips 2021.
 
-* `methods/newwave`: Migrated and adapted from v1.
+* `methods/newwave`: Migrated and adapted from neurips 2021.
 
-* `methods/pca`: Migrated from v1.
+* `methods/pca`: Migrated from neurips 2021.
 
-* `methods/totalvi`: Migrated from v1.
+* `methods/totalvi`: Migrated from neurips 2021.
 
-* `methods/umap`: Migrated from v1.
+* `methods/umap`: Migrated from neurips 2021.
 
-* `metrics/ari`: Migrated from v1.
+* `metrics/ari`: Migrated from neurips 2021.
   
-* `metrics/asw_batch`: Migrated from v1.
+* `metrics/asw_batch`: Migrated from neurips 2021.
 
-* `metrics/asw_label`: Migrated from v1.
+* `metrics/asw_label`: Migrated from neurips 2021.
 
-* `metrics/cc_cons`: Migrated from v1.
+* `metrics/cc_cons`: Migrated from neurips 2021.
 
-* `metrics/check_format`: Migrated from v1.
+* `metrics/check_format`: Migrated from neurips 2021.
 
-* `metrics/graph_connectivity`: Migrated from v1.
+* `metrics/graph_connectivity`: Migrated from neurips 2021.
 
-* `metrics/latent_mixing`: Migrated from v1.
+* `metrics/latent_mixing`: Migrated from neurips 2021.
 
-* `metrics/nmi`: Migrated from v1.
+* `metrics/nmi`: Migrated from neurips 2021.
 
-* `metrics/rfoob`: Migrated from v1.
+* `metrics/rfoob`: Migrated from neurips 2021.
 
-* `metrics/ti_cons`: Migrated from v1.
+* `metrics/ti_cons`: Migrated from neurips 2021.
 
-* `metrics/ti_cons_batch`: Migrated from v1.
+* `metrics/ti_cons_batch`: Migrated from neurips 2021.
 
 ### changes from neurips 2021
 

From 10f75d487bd0107686afd301029143621c3801ee Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 6 Jan 2023 21:55:47 +0100
Subject: [PATCH 32/42] fix typo in changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8c075e9139..ae98ed9906 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -167,7 +167,7 @@
 
 ## Multi modality - Joint Embedding
 
-### New functinality
+### New functionality
 
 * `api/anndata_*`: Created a file format specifications for the h5ad files throughout the pipeline.
 

From e0aef20fc956722ce102d9535c2f323a489e7eb4 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 6 Jan 2023 21:56:22 +0100
Subject: [PATCH 33/42] fix typo in changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae98ed9906..9c63253290 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -223,6 +223,6 @@
 
 * stores the output from the methods in `.obsm["X_emb"]` instead of `.X` in the `anndata`
 
-* `X_emb ` data is stored as a `Sparse Matrix`
+* `X_emb` data is stored as a `Sparse Matrix`
   
 * updated configs to latest `viash` 
\ No newline at end of file

From 8327637dbf339375118edd644d27983bda4f2378 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Mon, 9 Jan 2023 10:39:08 +0100
Subject: [PATCH 34/42] convert sparse matrix to array

---
 src/joint_embedding/metrics/cc_cons/script.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/joint_embedding/metrics/cc_cons/script.py b/src/joint_embedding/metrics/cc_cons/script.py
index a8741fb511..feecf45f12 100644
--- a/src/joint_embedding/metrics/cc_cons/script.py
+++ b/src/joint_embedding/metrics/cc_cons/script.py
@@ -30,6 +30,8 @@
                'G2M_score' not in adata_solution.obs_keys()
 
 print('Compute score')
+adata.obsm['X_emb'] = adata.obsm['X_emb'].toarray()
+
 score = scib.me.cell_cycle(
     adata_pre=adata_solution,
     adata_post=adata,

From 1b2dd900cb0f13de52e5a038e0ee1adf354879cb Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Mon, 9 Jan 2023 12:38:01 +0100
Subject: [PATCH 35/42] use denormalized counts data

---
 src/joint_embedding/methods/totalvi/script.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py
index 0c22dee08f..9b40e1f3d1 100644
--- a/src/joint_embedding/methods/totalvi/script.py
+++ b/src/joint_embedding/methods/totalvi/script.py
@@ -20,7 +20,7 @@
 print("Load and prepare data", flush=True)
 adata_mod1 = anndata.read_h5ad(par['input_mod1'])
 adata_mod2 = anndata.read_h5ad(par['input_mod2'])
-adata_mod1.obsm['protein_expression'] = adata_mod2.X.toarray()
+adata_mod1.obsm['protein_expression'] = adata_mod2.layers["counts"].toarray()
 
 print('Select highly variable genes', flush=True)
 sc.pp.highly_variable_genes(

From a8895dc2d28a84124759320b9c53e5da20912139 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 13 Jan 2023 16:54:23 +0100
Subject: [PATCH 36/42] fix directive labels

---
 .../control_methods/random_embed/config.vsh.yaml               | 3 ++-
 .../control_methods/zeros_embed/config.vsh.yaml                | 3 ++-
 src/joint_embedding/mask_dataset/config.vsh.yaml               | 3 ++-
 src/joint_embedding/methods/lmds/config.vsh.yaml               | 3 ++-
 src/joint_embedding/methods/mnn/config.vsh.yaml                | 3 ++-
 src/joint_embedding/methods/newwave/config.vsh.yaml            | 3 ++-
 src/joint_embedding/methods/pca/config.vsh.yaml                | 3 ++-
 src/joint_embedding/methods/totalvi/config.vsh.yaml            | 3 ++-
 src/joint_embedding/methods/umap/config.vsh.yaml               | 3 ++-
 src/joint_embedding/metrics/ari/config.vsh.yaml                | 3 ++-
 src/joint_embedding/metrics/asw_batch/config.vsh.yaml          | 3 ++-
 src/joint_embedding/metrics/asw_label/config.vsh.yaml          | 3 ++-
 src/joint_embedding/metrics/cc_cons/config.vsh.yaml            | 3 ++-
 src/joint_embedding/metrics/check_format/config.vsh.yaml       | 3 ++-
 src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml | 3 ++-
 src/joint_embedding/metrics/latent_mixing/config.vsh.yaml      | 3 ++-
 src/joint_embedding/metrics/nmi/config.vsh.yaml                | 3 ++-
 src/joint_embedding/metrics/rfoob/config.vsh.yaml              | 3 ++-
 src/joint_embedding/metrics/ti_cons/config.vsh.yaml            | 3 ++-
 src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml      | 3 ++-
 20 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
index cdc3061a8e..5eefbfcd1e 100644
--- a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
+++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
@@ -21,4 +21,5 @@ platforms:
       - type: python
         pip: [ anndata>=0.8, numpy , scipy]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml
index b2c95a79e2..32c2648284 100644
--- a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml
+++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml
@@ -21,4 +21,5 @@ platforms:
       - type: python
         pip: [ anndata, numpy, scipy ]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml
index d49464163d..872df1d9fa 100644
--- a/src/joint_embedding/mask_dataset/config.vsh.yaml
+++ b/src/joint_embedding/mask_dataset/config.vsh.yaml
@@ -27,4 +27,5 @@ platforms:
       - type: python
         pip: [ anndata>=0.8 ]
   - type: nextflow
-    directives: [ midmem, lowtime, lowcpu ]
+    directives: 
+      label: [ midmem, lowcpu ]
diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml
index d01b922fd7..8d11aee807 100644
--- a/src/joint_embedding/methods/lmds/config.vsh.yaml
+++ b/src/joint_embedding/methods/lmds/config.vsh.yaml
@@ -29,4 +29,5 @@ platforms:
       - type: python
         pip: [anndata>=0.8]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml
index 486beb9301..48b8403903 100644
--- a/src/joint_embedding/methods/mnn/config.vsh.yaml
+++ b/src/joint_embedding/methods/mnn/config.vsh.yaml
@@ -27,4 +27,5 @@ platforms:
       - type: python
         pip: [anndata>=0.8]      
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml
index 2da0df3a4b..151dee6363 100644
--- a/src/joint_embedding/methods/newwave/config.vsh.yaml
+++ b/src/joint_embedding/methods/newwave/config.vsh.yaml
@@ -36,4 +36,5 @@ platforms:
       - type: python
         pip: [anndata>=0.8]
   - type: nextflow
-    directives: [ highmem, hightime, highcpu ]
+    directives: 
+      label: [ highmem, highcpu ]
diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml
index 967de6da6e..a2187266ad 100644
--- a/src/joint_embedding/methods/pca/config.vsh.yaml
+++ b/src/joint_embedding/methods/pca/config.vsh.yaml
@@ -30,4 +30,5 @@ platforms:
       - type: python
         pip: [anndata>=0.8]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml b/src/joint_embedding/methods/totalvi/config.vsh.yaml
index 74116f9abe..e418b21bf5 100644
--- a/src/joint_embedding/methods/totalvi/config.vsh.yaml
+++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml
@@ -27,7 +27,8 @@ platforms:
       - type: python
         pip: [ anndata>=0.8, scanpy, scikit-misc, scipy, scikit-learn, scvi-tools]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
 
 
 
diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml
index 4b10222d1c..ee9650ab95 100644
--- a/src/joint_embedding/methods/umap/config.vsh.yaml
+++ b/src/joint_embedding/methods/umap/config.vsh.yaml
@@ -42,4 +42,5 @@ platforms:
       - type: python
         pip: [anndata>=0.8]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml
index 5097080a70..69e01c5a26 100644
--- a/src/joint_embedding/metrics/ari/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ari/config.vsh.yaml
@@ -24,4 +24,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
index 24ab2503b5..5abd76fe8f 100644
--- a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
+++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
@@ -24,4 +24,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
index 74d7c24afe..9cf7ceac08 100644
--- a/src/joint_embedding/metrics/asw_label/config.vsh.yaml
+++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
@@ -24,4 +24,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
index 86a624c632..16c562c37c 100644
--- a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
+++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
@@ -24,4 +24,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ vhighmem, midtime, midcpu ]
+    directives: 
+      label: [ vhighmem, midcpu ]
diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml
index 934af734e3..b2d8a021b4 100644
--- a/src/joint_embedding/metrics/check_format/config.vsh.yaml
+++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml
@@ -34,4 +34,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
index 8cbde10e3a..8b6a3871d0 100644
--- a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
+++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
@@ -24,4 +24,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
index 2ef4049453..ed07c525eb 100644
--- a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
+++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
@@ -28,4 +28,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
\ No newline at end of file
+    directives: 
+      label: [ lowmem, lowcpu ]
\ No newline at end of file
diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml
index bf98b925de..cd49b342e8 100644
--- a/src/joint_embedding/metrics/nmi/config.vsh.yaml
+++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml
@@ -24,4 +24,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
+    directives: 
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml
index 5f080b8b08..a11a77d71f 100644
--- a/src/joint_embedding/metrics/rfoob/config.vsh.yaml
+++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml
@@ -46,4 +46,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ lowmem, lowtime, lowcpu ]
\ No newline at end of file
+    directives: 
+      label: [ lowmem, lowcpu ]
\ No newline at end of file
diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
index e7d70b24b6..cf153437db 100644
--- a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
@@ -36,4 +36,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ midmem, lowtime, midcpu ]
+    directives: 
+      label: [ midmem, midcpu ]
diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
index 43aed327f6..c0396b3516 100644
--- a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
@@ -36,4 +36,5 @@ platforms:
       - type: python
         pip: [ pyyaml ]
   - type: nextflow
-    directives: [ midmem, lowtime, midcpu ]
+    directives: 
+      label: [ midmem, midcpu ]

From a849f0bff7db704424ca8f712b096d38ae6862e8 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 13 Jan 2023 22:20:21 +0100
Subject: [PATCH 37/42] update configs to align with v1 metadata

---
 .../random_embed/config.vsh.yaml              |  2 +-
 .../zeros_embed/config.vsh.yaml               |  2 +-
 .../methods/lmds/config.vsh.yaml              |  4 ++--
 .../methods/mnn/config.vsh.yaml               |  2 +-
 .../methods/newwave/config.vsh.yaml           |  2 +-
 .../methods/pca/config.vsh.yaml               |  2 +-
 .../methods/totalvi/config.vsh.yaml           |  2 +-
 .../methods/umap/config.vsh.yaml              |  2 +-
 .../metrics/ari/config.vsh.yaml               |  6 ++---
 .../metrics/asw_batch/config.vsh.yaml         |  6 ++---
 .../metrics/asw_label/config.vsh.yaml         |  6 ++---
 .../metrics/cc_cons/config.vsh.yaml           |  6 ++---
 .../metrics/check_format/config.vsh.yaml      | 12 +++++-----
 .../graph_connectivity/config.vsh.yaml        |  6 ++---
 .../metrics/latent_mixing/config.vsh.yaml     |  5 ++--
 .../metrics/nmi/config.vsh.yaml               |  6 ++---
 .../metrics/rfoob/config.vsh.yaml             | 24 +++++++++----------
 .../metrics/ti_cons/config.vsh.yaml           | 18 +++++++-------
 .../metrics/ti_cons_batch/config.vsh.yaml     | 18 +++++++-------
 19 files changed, 66 insertions(+), 65 deletions(-)

diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
index 5eefbfcd1e..15b0c745e9 100644
--- a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
+++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
@@ -5,7 +5,7 @@ functionality:
   description: Generate a random embedding from a normal distribution.
   info:
     type: negative_control
-    label: Normal Dist.
+    method_name: Normal Dist.
   arguments:
     - name: "--n_dims"
       type: "integer"
diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml
index 32c2648284..2cd0a9b71f 100644
--- a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml
+++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml
@@ -5,7 +5,7 @@ functionality:
   description: Generate an embedding containing only zero values.
   info:
     type: negative_control
-    label: zeros_embed
+    method_name: zeros_embed
   arguments:
     - name: "--n_dims"
       type: "integer"
diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml
index 8d11aee807..440172d748 100644
--- a/src/joint_embedding/methods/lmds/config.vsh.yaml
+++ b/src/joint_embedding/methods/lmds/config.vsh.yaml
@@ -1,11 +1,11 @@
-__merge__: ../../api/comp_method.yaml
+__merge__: ../../api/comp_method.yaml
 functionality:
   name: lmds
   namespace: joint_embedding/methods
   description: Landmark MDS dimensionality reduction on the Spearman distance.
   info:
     type: method
-    label: "LMDS"
+    method_name: "LMDS"
   arguments:
     - name: "--distance_method"
       type: "string"
diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml
index 48b8403903..30dc728fd9 100644
--- a/src/joint_embedding/methods/mnn/config.vsh.yaml
+++ b/src/joint_embedding/methods/mnn/config.vsh.yaml
@@ -5,7 +5,7 @@ functionality:
   description: Mutual nearest neighbors correction followed by PCA.
   info:
     type: method
-    label: "MNN"
+    method_name: "MNN"
   arguments:
     - name: "--hvg_sel"
       type: "integer"
diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml
index 151dee6363..0939d7b6f7 100644
--- a/src/joint_embedding/methods/newwave/config.vsh.yaml
+++ b/src/joint_embedding/methods/newwave/config.vsh.yaml
@@ -5,7 +5,7 @@ functionality:
   description: Concatenated NewWave.
   info:
     type: method
-    label: "NewWave"
+    method_name: "NewWave"
     doi: "10.1101/2021.08.02.453487"
   arguments:
     - name: "--maxiter"
diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml
index a2187266ad..860a48c300 100644
--- a/src/joint_embedding/methods/pca/config.vsh.yaml
+++ b/src/joint_embedding/methods/pca/config.vsh.yaml
@@ -5,7 +5,7 @@ functionality:
   description: PCA dimensionality reduction.
   info:
     type: method
-    label: "PCA"
+    method_name: "PCA"
   authors:
   arguments:
     - name: "--n_dims"
diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml b/src/joint_embedding/methods/totalvi/config.vsh.yaml
index e418b21bf5..5b0969172a 100644
--- a/src/joint_embedding/methods/totalvi/config.vsh.yaml
+++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml
@@ -6,7 +6,7 @@ functionality:
   description: "totalVI: joint probabilistic modeling with Total Variational Inference"
   info:
     type: method
-    label: totalVI
+    method_name: totalVI
     doi: 10.1038/s41592-020-01050-x    
   arguments:
     - name: --hvg_number
diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml
index ee9650ab95..5ef2d3d11e 100644
--- a/src/joint_embedding/methods/umap/config.vsh.yaml
+++ b/src/joint_embedding/methods/umap/config.vsh.yaml
@@ -6,7 +6,7 @@ functionality:
   description: UMAP dimensionality reduction on the Euclidean distance.
   info:
     type: method
-    label: UMAP
+    method_name: UMAP
   arguments:
     - name: "--n_dims"
       type: "integer"
diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml
index 69e01c5a26..41ef532e5a 100644
--- a/src/joint_embedding/metrics/ari/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ari/config.vsh.yaml
@@ -5,9 +5,9 @@ functionality:
   description: Adjusted rand index (ARI)
   info:
     metrics:
-      - id: ari
-        label: ari
-        description: Adjusted rand index (ARI)
+      - metric_id: ari
+        metric_name: ari
+        metric_description: Adjusted rand index (ARI)
         min: 0
         max: 1
         maximize: true
diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
index 5abd76fe8f..e89b30dc90 100644
--- a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
+++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml
@@ -5,9 +5,9 @@ functionality:
   description: Average silhouette width (ASW) of batches per label
   info:
     metrics:
-      - id: asw_batch
-        label: asw_batch
-        description: Average silhouette width (ASW) of batches per label
+      - metric_id: asw_batch
+        metric_name: asw_batch
+        metric_description: Average silhouette width (ASW) of batches per label
         min: 0
         max: 1
         maximize: True
diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
index 9cf7ceac08..262b48d6f4 100644
--- a/src/joint_embedding/metrics/asw_label/config.vsh.yaml
+++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml
@@ -5,9 +5,9 @@ functionality:
   description: Average silhouette width (ASW) of labels
   info:
     metrics:
-      - id: asw_label
-        label: asw_label
-        description: Average silhouette width (ASW) of labels
+      - metric_id: asw_label
+        metric_name: asw_label
+        metric_description: Average silhouette width (ASW) of labels
         min: 0
         max: 1
         maximize: True
diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
index 16c562c37c..ba501a5e21 100644
--- a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
+++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml
@@ -5,9 +5,9 @@ functionality:
   description: Cell cycle conservation score
   info:
     metrics:
-      - id: cc_cons
-        label: cc_cons
-        description: Cell cycle conservation score
+      - metric_id: cc_cons
+        metric_name: cc_cons
+        metric_description: Cell cycle conservation score
         min: 0
         max: 1
         maximize: True
diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml
index b2d8a021b4..90294f01e6 100644
--- a/src/joint_embedding/metrics/check_format/config.vsh.yaml
+++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml
@@ -5,15 +5,15 @@ functionality:
   description: Checking whether the prediction of a method has the right format.
   info:
     metrics:
-      - id: finished
-        label: finished
-        description: 
+      - metric_id: finished
+        metric_name: finished
+        metric_description: check if metric finished
         min: 0
         max: 1
         maximize: true
-      - id: correct_format
-        label: correct_format
-        description: 
+      - metric_id: correct_format
+        metric_name: correct_format
+        metric_description: check if format is correct
         min: 0
         max: 1
         maximize: true
diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
index 8b6a3871d0..c79966893c 100644
--- a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
+++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml
@@ -5,9 +5,9 @@ functionality:
   description: Graph connectivity
   info:
     metrics:
-      - id: graph_conn
-        label: graph_conn
-        description: Graph connectivity
+      - metric_id: graph_conn
+        metric_name: graph_conn
+        metric_description: Graph connectivity
         min: 0
         max: 1
         maximize: True
diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
index ed07c525eb..bf989bfc20 100644
--- a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
+++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml
@@ -5,8 +5,9 @@ functionality:
   description: Calculate latent mixing metric for joint embedding task.
   info:
     metrics:
-      - id: latent_mixing
-        label: Latent mixing
+      - metric_id: latent_mixing
+        metric_name: Latent mixing
+        metric_description: Latent mixing metric for the joint embedding task
         min: -1
         max: 0
         maximize: True
diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml
index cd49b342e8..e83772d65e 100644
--- a/src/joint_embedding/metrics/nmi/config.vsh.yaml
+++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml
@@ -5,9 +5,9 @@ functionality:
   description: Normalised mutual information (NMI)
   info:
     metrics:
-      - id: nmi
-        label: NMI
-        description: Normalised mutual information (NMI)
+      - metric_id: nmi
+        metric_name: NMI
+        metric_description: Normalised mutual information (NMI)
         min: 0
         max: 1
         maximize: True
diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml
index a11a77d71f..099219de47 100644
--- a/src/joint_embedding/metrics/rfoob/config.vsh.yaml
+++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml
@@ -5,27 +5,27 @@ functionality:
   description: Calculating basic metrics for the joint embedding task.
   info:
     metrics:
-      - id: rfoob_celltype_accuracy
-        label: rfoob_celltype_accuracy
-        description:
+      - metric_id: rfoob_celltype_accuracy
+        metric_name: Rfoob Celltype Accuracy
+        metric_description: check the celltype accuracy
         min: 0
         max: 1
         maximize: True
-      - id: rfoob_pseudotimegex_rsq
-        label: rfoob_pseudotimegex_rsq
-        description:
+      - metric_id: rfoob_pseudotimegex_rsq
+        metric_name: rfoob pseudotimegex rsq
+        metric_description: rfoob pseudotimegex rsq
         min: -1
         max: 1
         maximize: true
-      - id: rfoob_pseudotimeadt_rsq
-        label: rfoob_pseudotimeadt_rsq
-        description:
+      - metric_id: rfoob_pseudotimeadt_rsq
+        metric_name: rfoob pseudotimeadt rsq
+        metric_description: rfoob pseudotimeadt rsq
         min: -1
         max: 1
         maximize: True
-      - id: rfoob_batch_error
-        label: rfoob_batch_error
-        description:
+      - metric_id: rfoob_batch_error
+        metric_name: rfoob batch error
+        metric_description: rfoob batch error
         min: 0
         max: 1
         maximize: True
diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
index cf153437db..68525b337b 100644
--- a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml
@@ -5,21 +5,21 @@ functionality:
   description: Trajectory inference conservation score
   info:
     metrics:
-      - id: ti_cons_RNA
-        label: ti_cons_RNA
-        description:
+      - metric_id: ti_cons_RNA
+        metric_name: ti cons RNA
+        metric_description: ti cons RNA
         min: 0
         max: 1
         maximize: True
-      - id: ti_cons_ADT_ATAC
-        label: ti_cons_ADT_ATAC
-        description:
+      - metric_id: ti_cons_ADT_ATAC
+        metric_name: ti cons ADT ATAC
+        metric_description: ti cons ADT ATAC
         min: 0
         max: 1
         maximize: True
-      - id: ti_cons_mean
-        label: ti_cons_mean
-        description:
+      - metric_id: ti_cons_mean
+        metric_name: ti cons mean
+        metric_description: ti cons mean
         min: 0
         max: 1
         maximize: True
diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
index c0396b3516..7139b8aadc 100644
--- a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
+++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml
@@ -5,21 +5,21 @@ functionality:
   description: Trajectory inference conservation score per batch
   info:
     metrics:
-      - id: ti_cons_batch_RNA
-        label: ti_cons_batch_RNA
-        description:
+      - metric_id: ti_cons_batch_RNA
+        metric_name: ti cons batch RNA
+        metric_description: Trajectory inference conservation score per batch (RNA)
         min: 0
         max: 1
         maximize: True
-      - id: ti_cons_batch_ADT_ATAC
-        label: ti_cons_batch_ADT_ATAC
-        description:
+      - metric_id: ti_cons_batch_ADT_ATAC
+        metric_name: ti cons batch ADT ATAC
+        metric_description: Trajectory inference conservation score per batch (ADT/ATAC)
         min: 0
         max: 1
         maximize: True
-      - id: ti_cons_batch_mean
-        label: ti_cons_batch_mean
-        description:
+      - metric_id: ti_cons_batch_mean
+        metric_name: ti cons batch mean
+        metric_description: Trajectory inference conservation score per batch (mean)
         min: 0
         max: 1
         maximize: True

From 399a316650102fbf227fa300fcceb0092c4798f1 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 13 Jan 2023 22:20:33 +0100
Subject: [PATCH 38/42] add readme

---
 src/joint_embedding/README.qmd | 263 +++++++++++++++++++++++++++++++++
 1 file changed, 263 insertions(+)
 create mode 100644 src/joint_embedding/README.qmd

diff --git a/src/joint_embedding/README.qmd b/src/joint_embedding/README.qmd
new file mode 100644
index 0000000000..d6fb56cae9
--- /dev/null
+++ b/src/joint_embedding/README.qmd
@@ -0,0 +1,263 @@
+---
+format: gfm
+toc: true
+---
+
+```{r setup, include=FALSE}
+library(tidyverse)
+library(rlang)
+
+strip_margin <- function(text, symbol = "\\|") {
+  str_replace_all(text, paste0("(\n?)[ \t]*", symbol), "\\1") 
+}
+
+dir <- "src/joint_embedding"
+# dir <- "."
+```
+
+# Joint Embedding
+
+## Task description
+```{r task description, echo=FALSE,warning=FALSE,error=FALSE,output='asis'}
+task_info <- yaml::yaml.load_file(paste0(dir,"/docs/task_info.yaml"))
+
+cat(task_info$description)
+
+```
+
+## Methods
+
+Methods for assigning labels from a reference dataset to a new dataset.
+
+```{r methods, echo=FALSE,warning=FALSE,error=FALSE,output="asis"}
+method_ns_list <- processx::run("viash", c("ns", "list", "-q", "methods", "--src", "."), wd = dir)
+method_configs <- yaml::yaml.load(method_ns_list$stdout)
+
+method_info <- map_df(method_configs, function(config) {
+  if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") return(NULL)
+  info <- as_tibble(config$functionality$info)
+  info$comp_yaml <- config$info$config
+  info$name <- config$functionality$name
+  info$namespace <- config$functionality$namespace
+  info$description <- config$functionality$description
+  info
+})
+
+method_info_view <- 
+  method_info %>%
+    arrange(type, label) %>%
+    transmute(
+      Name = paste0("[", method_name, "](", comp_yaml, ")"),
+      Type = type,
+      Description = description,
+    #   DOI = ifelse(!is.na(paper_doi), paste0("[link](https://doi.org/", paper_doi, ")"), ""),
+    #   URL = ifelse(!is.na(code_url), paste0("[link](", code_url, ")"), "")
+    )
+
+cat(paste(knitr::kable(method_info_view, format = 'pipe'), collapse = "\n"))
+```
+
+
+## Metrics
+
+Metrics for joint embedding aim to characterize how well each ...
+
+```{r metrics, echo=FALSE,warning=FALSE,error=FALSE,output="asis"}
+metric_ns_list <- processx::run("viash", c("ns", "list", "-q", "metrics", "--src", "."), wd = dir)
+metric_configs <- yaml::yaml.load(metric_ns_list$stdout)
+
+metric_info <- map_df(metric_configs, function(config) {
+  metric_info <- as_tibble(map_df(config$functionality$info$metrics, as.data.frame))
+  metric_info$comp_yaml <- config$info$config
+  metric_info$comp_name <- config$functionality$name
+  metric_info$comp_namespace <- config$functionality$namespace
+  metric_info
+})
+
+metric_info_view <- 
+  metric_info %>%
+    transmute(
+      Name = paste0("[", metric_name, "](", comp_yaml, ")"),
+      Description = paste0(description, " ", ifelse(maximize, "Higher is better.", "Lower is better.")),
+      Range = paste0("[", min, ", ", max, "]")
+    )
+
+cat(paste(knitr::kable(metric_info_view, format = 'pipe'), collapse = "\n"))
+```
+
+
+## Pipeline topology
+
+```{r data, include=FALSE}
+comp_yamls <- list.files(paste0(dir, "/api"), pattern = "comp_", full.names = TRUE)
+file_yamls <- list.files(paste0(dir, "/api"), pattern = "anndata_", full.names = TRUE)
+
+comp_file <- map_df(comp_yamls, function(yaml_file) {
+  conf <- yaml::read_yaml(yaml_file)
+
+  map_df(conf$functionality$arguments, function(arg) {
+    tibble(
+      comp_name = basename(yaml_file) %>% gsub("\\.yaml", "", .),
+      arg_name = str_replace_all(arg$name, "^-*", ""),
+      direction = arg$direction %||% "input",
+      file_name = basename(arg$`__merge__`) %>% gsub("\\.yaml", "", .)
+    )
+  })
+})
+
+comp_info <- map_df(comp_yamls, function(yaml_file) {
+  conf <- yaml::read_yaml(yaml_file)
+
+  tibble(
+    name = basename(yaml_file) %>% gsub("\\.yaml", "", .),
+    label = name %>% gsub("comp_", "", .) %>% gsub("_", " ", .)
+  )
+})
+
+
+file_info <- map_df(file_yamls, function(yaml_file) {
+  arg <- yaml::read_yaml(yaml_file)
+  
+  tibble(
+    name = basename(yaml_file) %>% gsub("\\.yaml", "", .),
+    description = arg$description,
+    short_description = arg$info$short_description,
+    example = arg$example,
+    label = name %>% gsub("anndata_", "", .) %>% gsub("_", " ", .)
+  )
+})
+
+file_slot <- map_df(file_yamls, function(yaml_file) {
+  arg <- yaml::read_yaml(yaml_file)
+
+  map2_df(names(arg$info$slots), arg$info$slots, function(group_name, slot) {
+    df <- map_df(slot, as.data.frame)
+    df$struct <- group_name
+    df$file_name = basename(yaml_file) %>% gsub("\\.yaml", "", .)
+    as_tibble(df)
+  })
+}) %>% 
+  mutate(multiple = multiple %|% FALSE)
+```
+
+```{r flow, echo=FALSE,warning=FALSE,error=FALSE}
+nodes <- bind_rows(
+  file_info %>%
+    transmute(id = name, label = str_to_title(label), is_comp = FALSE),
+  comp_info %>%
+    transmute(id = name, label = str_to_title(label), is_comp = TRUE)
+) %>%
+  mutate(str = paste0(
+    "  ",
+    id, 
+    ifelse(is_comp, "[/", "("), 
+    label,
+    ifelse(is_comp, "/]", ")")
+  ))
+edges <- bind_rows(
+  comp_file %>%
+    filter(direction == "input") %>%
+    transmute(
+      from = file_name,
+      to = comp_name,
+      arrow = "---"
+    ),
+  comp_file %>%
+    filter(direction == "output") %>%
+    transmute(
+      from = comp_name, 
+      to = file_name, 
+      arrow = "-->"
+    )
+) %>%
+  mutate(str = paste0("  ", from, arrow, to))
+
+# note: use ```{mermaid} instead of ```mermaid when rendering to html
+out_str <- strip_margin(glue::glue("
+  §```mermaid
+  §%%| column: screen-inset-shaded
+  §flowchart LR
+  §{paste(nodes$str, collapse = '\n')}
+  §{paste(edges$str, collapse = '\n')}
+  §```
+  §"), symbol = "§")
+knitr::asis_output(out_str)
+```
+
+## File format API
+
+```{r file_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"}
+for (file_name in file_info$name) {
+  arg_info <- file_info %>% filter(name == file_name)
+  sub_out <- file_slot %>% 
+    filter(file_name == !!file_name) %>% 
+    select(struct, name, type, description)
+
+  used_in <- comp_file %>%
+    filter(file_name == !!file_name) %>%
+    left_join(comp_info %>% select(comp_name = name, comp_label = label), by = "comp_name") %>%
+    mutate(str = paste0("* [", comp_label, "](#", comp_label, "): ", arg_name, " (as ", direction, ")")) %>%
+    pull(str)
+
+  example <- sub_out %>%
+    group_by(struct) %>%
+    summarise(
+      str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", "))
+    ) %>%
+    arrange(match(struct, c("obs", "var", "uns", "obsm", "obsp", "varm", "varp", "layers")))
+
+  example_str <- c("    AnnData object", paste0("     ", example$str))
+  
+  out_str <- strip_margin(glue::glue("
+    §### `{str_to_title(arg_info$label)}`
+    §
+    §{arg_info$description}
+    §
+    §Used in:
+    §
+    §{paste(used_in, collapse = '\n')}
+    §
+    §Slots:
+    §
+    §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')}
+    §
+    §Example:
+    §
+    §{paste(example_str, collapse = '\n')}
+    §
+    §"), symbol = "§")
+  cat(out_str)
+}
+```
+
+
+
+## Component API
+
+```{r comp_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"}
+# todo: add description
+# todo: add required info fields
+for (comp_name in comp_info$name) {
+  comp <- comp_info %>% filter(name == comp_name)
+  sub_out <- comp_file %>% 
+    filter(comp_name == !!comp_name) %>%
+    left_join(file_info %>% select(file_name = name, file_desc = description, file_sdesc = short_description, file_label = label), by = "file_name") %>%
+    transmute(
+      Name = paste0("`--", arg_name, "`"),
+      `File format` = paste0("[", str_to_title(file_label), "](#", file_label, ")"),
+      Direction = direction,
+      Description = file_sdesc
+    )
+  
+  out_str <- strip_margin(glue::glue("
+    §### `{str_to_title(comp$label)}`
+    §
+    §{ifelse(\"description\" %in% names(comp), comp$description, \"\")}
+    §
+    §Arguments:
+    §
+    §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')}
+    §"), symbol = "§")
+  cat(out_str)
+}
+```
\ No newline at end of file

From be3e17564f9a18ab380957976bdbaf276f0572e6 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Fri, 13 Jan 2023 22:25:07 +0100
Subject: [PATCH 39/42] update readme

---
 src/joint_embedding/README.qmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/joint_embedding/README.qmd b/src/joint_embedding/README.qmd
index d6fb56cae9..9d30030ce5 100644
--- a/src/joint_embedding/README.qmd
+++ b/src/joint_embedding/README.qmd
@@ -12,7 +12,7 @@ strip_margin <- function(text, symbol = "\\|") {
 }
 
 dir <- "src/joint_embedding"
-# dir <- "."
+dir <- "."
 ```
 
 # Joint Embedding

From 0bce13790b7600d477b2ecc3a979a316ffe3fd5f Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Tue, 24 Jan 2023 11:34:22 +0100
Subject: [PATCH 40/42] update task info and readme

---
 src/joint_embedding/README.md                 |  23 ++
 src/joint_embedding/README.qmd                | 263 ------------------
 .../{docs => api}/task_info.yaml              |   4 +-
 3 files changed, 24 insertions(+), 266 deletions(-)
 create mode 100644 src/joint_embedding/README.md
 delete mode 100644 src/joint_embedding/README.qmd
 rename src/joint_embedding/{docs => api}/task_info.yaml (79%)

diff --git a/src/joint_embedding/README.md b/src/joint_embedding/README.md
new file mode 100644
index 0000000000..7d9f65431c
--- /dev/null
+++ b/src/joint_embedding/README.md
@@ -0,0 +1,23 @@
+# Joint embedding
+
+Structure of this task:
+
+    src/joint_embedding
+    ├── api                          Interface specifications for components and datasets in this task
+    ├── control_methods              Baseline (random/ground truth) methods to compare methods against
+    ├── methods                      Methods to be benchmarked
+    ├── metrics                      Metrics used to quantify performance of methods
+    ├── README.md                    This file
+    ├── resources_scripts            Scripts to process the datasets
+    ├── resources_test_scripts       Scripts to process the test resources
+    ├── split_dataset                Component to prepare common datasets
+    └── workflows                    Pipelines to run the full benchmark
+
+Relevant links:
+
+* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/)
+
+* [Experimental results](https://openproblems-experimental.netlify.app/results/joint_embedding/)
+
+<!-- update this to openproblems.bio/guide when possible -->
+* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md)
diff --git a/src/joint_embedding/README.qmd b/src/joint_embedding/README.qmd
deleted file mode 100644
index 9d30030ce5..0000000000
--- a/src/joint_embedding/README.qmd
+++ /dev/null
@@ -1,263 +0,0 @@
----
-format: gfm
-toc: true
----
-
-```{r setup, include=FALSE}
-library(tidyverse)
-library(rlang)
-
-strip_margin <- function(text, symbol = "\\|") {
-  str_replace_all(text, paste0("(\n?)[ \t]*", symbol), "\\1") 
-}
-
-dir <- "src/joint_embedding"
-dir <- "."
-```
-
-# Joint Embedding
-
-## Task description
-```{r task description, echo=FALSE,warning=FALSE,error=FALSE,output='asis'}
-task_info <- yaml::yaml.load_file(paste0(dir,"/docs/task_info.yaml"))
-
-cat(task_info$description)
-
-```
-
-## Methods
-
-Methods for assigning labels from a reference dataset to a new dataset.
-
-```{r methods, echo=FALSE,warning=FALSE,error=FALSE,output="asis"}
-method_ns_list <- processx::run("viash", c("ns", "list", "-q", "methods", "--src", "."), wd = dir)
-method_configs <- yaml::yaml.load(method_ns_list$stdout)
-
-method_info <- map_df(method_configs, function(config) {
-  if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") return(NULL)
-  info <- as_tibble(config$functionality$info)
-  info$comp_yaml <- config$info$config
-  info$name <- config$functionality$name
-  info$namespace <- config$functionality$namespace
-  info$description <- config$functionality$description
-  info
-})
-
-method_info_view <- 
-  method_info %>%
-    arrange(type, label) %>%
-    transmute(
-      Name = paste0("[", method_name, "](", comp_yaml, ")"),
-      Type = type,
-      Description = description,
-    #   DOI = ifelse(!is.na(paper_doi), paste0("[link](https://doi.org/", paper_doi, ")"), ""),
-    #   URL = ifelse(!is.na(code_url), paste0("[link](", code_url, ")"), "")
-    )
-
-cat(paste(knitr::kable(method_info_view, format = 'pipe'), collapse = "\n"))
-```
-
-
-## Metrics
-
-Metrics for joint embedding aim to characterize how well each ...
-
-```{r metrics, echo=FALSE,warning=FALSE,error=FALSE,output="asis"}
-metric_ns_list <- processx::run("viash", c("ns", "list", "-q", "metrics", "--src", "."), wd = dir)
-metric_configs <- yaml::yaml.load(metric_ns_list$stdout)
-
-metric_info <- map_df(metric_configs, function(config) {
-  metric_info <- as_tibble(map_df(config$functionality$info$metrics, as.data.frame))
-  metric_info$comp_yaml <- config$info$config
-  metric_info$comp_name <- config$functionality$name
-  metric_info$comp_namespace <- config$functionality$namespace
-  metric_info
-})
-
-metric_info_view <- 
-  metric_info %>%
-    transmute(
-      Name = paste0("[", metric_name, "](", comp_yaml, ")"),
-      Description = paste0(description, " ", ifelse(maximize, "Higher is better.", "Lower is better.")),
-      Range = paste0("[", min, ", ", max, "]")
-    )
-
-cat(paste(knitr::kable(metric_info_view, format = 'pipe'), collapse = "\n"))
-```
-
-
-## Pipeline topology
-
-```{r data, include=FALSE}
-comp_yamls <- list.files(paste0(dir, "/api"), pattern = "comp_", full.names = TRUE)
-file_yamls <- list.files(paste0(dir, "/api"), pattern = "anndata_", full.names = TRUE)
-
-comp_file <- map_df(comp_yamls, function(yaml_file) {
-  conf <- yaml::read_yaml(yaml_file)
-
-  map_df(conf$functionality$arguments, function(arg) {
-    tibble(
-      comp_name = basename(yaml_file) %>% gsub("\\.yaml", "", .),
-      arg_name = str_replace_all(arg$name, "^-*", ""),
-      direction = arg$direction %||% "input",
-      file_name = basename(arg$`__merge__`) %>% gsub("\\.yaml", "", .)
-    )
-  })
-})
-
-comp_info <- map_df(comp_yamls, function(yaml_file) {
-  conf <- yaml::read_yaml(yaml_file)
-
-  tibble(
-    name = basename(yaml_file) %>% gsub("\\.yaml", "", .),
-    label = name %>% gsub("comp_", "", .) %>% gsub("_", " ", .)
-  )
-})
-
-
-file_info <- map_df(file_yamls, function(yaml_file) {
-  arg <- yaml::read_yaml(yaml_file)
-  
-  tibble(
-    name = basename(yaml_file) %>% gsub("\\.yaml", "", .),
-    description = arg$description,
-    short_description = arg$info$short_description,
-    example = arg$example,
-    label = name %>% gsub("anndata_", "", .) %>% gsub("_", " ", .)
-  )
-})
-
-file_slot <- map_df(file_yamls, function(yaml_file) {
-  arg <- yaml::read_yaml(yaml_file)
-
-  map2_df(names(arg$info$slots), arg$info$slots, function(group_name, slot) {
-    df <- map_df(slot, as.data.frame)
-    df$struct <- group_name
-    df$file_name = basename(yaml_file) %>% gsub("\\.yaml", "", .)
-    as_tibble(df)
-  })
-}) %>% 
-  mutate(multiple = multiple %|% FALSE)
-```
-
-```{r flow, echo=FALSE,warning=FALSE,error=FALSE}
-nodes <- bind_rows(
-  file_info %>%
-    transmute(id = name, label = str_to_title(label), is_comp = FALSE),
-  comp_info %>%
-    transmute(id = name, label = str_to_title(label), is_comp = TRUE)
-) %>%
-  mutate(str = paste0(
-    "  ",
-    id, 
-    ifelse(is_comp, "[/", "("), 
-    label,
-    ifelse(is_comp, "/]", ")")
-  ))
-edges <- bind_rows(
-  comp_file %>%
-    filter(direction == "input") %>%
-    transmute(
-      from = file_name,
-      to = comp_name,
-      arrow = "---"
-    ),
-  comp_file %>%
-    filter(direction == "output") %>%
-    transmute(
-      from = comp_name, 
-      to = file_name, 
-      arrow = "-->"
-    )
-) %>%
-  mutate(str = paste0("  ", from, arrow, to))
-
-# note: use ```{mermaid} instead of ```mermaid when rendering to html
-out_str <- strip_margin(glue::glue("
-  §```mermaid
-  §%%| column: screen-inset-shaded
-  §flowchart LR
-  §{paste(nodes$str, collapse = '\n')}
-  §{paste(edges$str, collapse = '\n')}
-  §```
-  §"), symbol = "§")
-knitr::asis_output(out_str)
-```
-
-## File format API
-
-```{r file_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"}
-for (file_name in file_info$name) {
-  arg_info <- file_info %>% filter(name == file_name)
-  sub_out <- file_slot %>% 
-    filter(file_name == !!file_name) %>% 
-    select(struct, name, type, description)
-
-  used_in <- comp_file %>%
-    filter(file_name == !!file_name) %>%
-    left_join(comp_info %>% select(comp_name = name, comp_label = label), by = "comp_name") %>%
-    mutate(str = paste0("* [", comp_label, "](#", comp_label, "): ", arg_name, " (as ", direction, ")")) %>%
-    pull(str)
-
-  example <- sub_out %>%
-    group_by(struct) %>%
-    summarise(
-      str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", "))
-    ) %>%
-    arrange(match(struct, c("obs", "var", "uns", "obsm", "obsp", "varm", "varp", "layers")))
-
-  example_str <- c("    AnnData object", paste0("     ", example$str))
-  
-  out_str <- strip_margin(glue::glue("
-    §### `{str_to_title(arg_info$label)}`
-    §
-    §{arg_info$description}
-    §
-    §Used in:
-    §
-    §{paste(used_in, collapse = '\n')}
-    §
-    §Slots:
-    §
-    §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')}
-    §
-    §Example:
-    §
-    §{paste(example_str, collapse = '\n')}
-    §
-    §"), symbol = "§")
-  cat(out_str)
-}
-```
-
-
-
-## Component API
-
-```{r comp_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"}
-# todo: add description
-# todo: add required info fields
-for (comp_name in comp_info$name) {
-  comp <- comp_info %>% filter(name == comp_name)
-  sub_out <- comp_file %>% 
-    filter(comp_name == !!comp_name) %>%
-    left_join(file_info %>% select(file_name = name, file_desc = description, file_sdesc = short_description, file_label = label), by = "file_name") %>%
-    transmute(
-      Name = paste0("`--", arg_name, "`"),
-      `File format` = paste0("[", str_to_title(file_label), "](#", file_label, ")"),
-      Direction = direction,
-      Description = file_sdesc
-    )
-  
-  out_str <- strip_margin(glue::glue("
-    §### `{str_to_title(comp$label)}`
-    §
-    §{ifelse(\"description\" %in% names(comp), comp$description, \"\")}
-    §
-    §Arguments:
-    §
-    §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')}
-    §"), symbol = "§")
-  cat(out_str)
-}
-```
\ No newline at end of file
diff --git a/src/joint_embedding/docs/task_info.yaml b/src/joint_embedding/api/task_info.yaml
similarity index 79%
rename from src/joint_embedding/docs/task_info.yaml
rename to src/joint_embedding/api/task_info.yaml
index 6565c0c292..79267651ce 100644
--- a/src/joint_embedding/docs/task_info.yaml
+++ b/src/joint_embedding/api/task_info.yaml
@@ -1,8 +1,6 @@
 task_id: joint_embedding
 task_name: Joint Embedding
-v1_url: neurips2021_multimodal_viash/src/joint_embedding/readme.md
-v1_commit: 0f8eae583444ba3f71c3083b860cc34b9ecb2fa2
-short_description: Learning of an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation).
+summary: Learning of an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation).
 description: |
   The functioning of organs, tissues, and whole organisms is determined by the interplay of cells. 
   Cells are characterised into broad types, which in turn can take on different states. Here, a cell 

From e7abed3185a3eb11b7b296425b04da3517bcd7bf Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Tue, 24 Jan 2023 15:40:37 +0100
Subject: [PATCH 41/42] update comp_metric

---
 src/joint_embedding/api/comp_metric.yaml | 26 ++++++++----------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml
index bde43a316f..76e3b17430 100644
--- a/src/joint_embedding/api/comp_metric.yaml
+++ b/src/joint_embedding/api/comp_metric.yaml
@@ -21,21 +21,11 @@ functionality:
         import pandas as pd
         import yaml
 
-        ## VIASH START
-        # This code block will be replaced by viash at runtime.
-        par = {
-          "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
-          "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad",
-          "output": "output.h5ad"
-        }
-        meta = { 'functionality_name': 'foo' }
-
-        ## VIASH END
-
         input_prediction_path = "resources_test/common/joint_embedding/cite_random_prediction.h5ad"
         input_solution_path = "resources_test/common/joint_embedding/cite_solution.h5ad"
         output_path = "output.h5ad"
-        # define some filenames
+
+        # load config yaml
         with open(meta["config"], "r") as file:
                 config = yaml.safe_load(file)
 
@@ -47,7 +37,7 @@ functionality:
         ]
 
         print("> Running method", flush=True)
-        out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
+        out = subprocess.run(cmd, stderr=subprocess.STDOUT, check=True)
 
         print("> Checking whether output files were created", flush=True)
         assert path.exists(output_path)
@@ -60,9 +50,9 @@ functionality:
         # Create DF from metric config info
         metric_info = config['functionality']['info']['metrics']
         metric_meta = pd.DataFrame(metric_info)
-        metric_meta = metric_meta.astype({'id': str, 'label': str, 'description': str, 'min': float, 'max': float, 'maximize': bool})
+        metric_meta = metric_meta.astype({'metric_id': str, 'metric_name': str, 'metric_description': str, 'min': float, 'max': float, 'maximize': bool})
         print("> Checking contents of metric info", flush=True)
-        assert 'id' in metric_meta
+        assert 'metric_id' in metric_meta
         assert 'min' in metric_meta
         assert 'max' in metric_meta
         assert 'maximize' in metric_meta
@@ -77,7 +67,7 @@ functionality:
 
         print("> Checking .uns['metric_ids']", flush=True)
         assert 'metric_ids' in output.uns
-        assert set(output.uns['metric_ids']) == set(metric_meta.id)
+        assert set(output.uns['metric_ids']) == set(metric_meta.metric_id)
 
         print("> Checking .uns['metric_values']", flush=True)
         assert 'metric_values' in output.uns
@@ -85,11 +75,11 @@ functionality:
 
         # merge with metric_meta to see if metric_value lies within the expected range
         output_uns = pd.DataFrame({
-          'id': output.uns['metric_ids'], 
+          'metric_id': output.uns['metric_ids'], 
           'value': output.uns['metric_values']
         })
 
-        scores = metric_meta.merge(output_uns, on="id")
+        scores = metric_meta.merge(output_uns, on="metric_id")
 
         assert all(scores.value >= scores['min'])
         assert all(scores.value <= scores['max'])

From f781da5f736bb4863bc6c5ce8e99262b28bfb460 Mon Sep 17 00:00:00 2001
From: KaiWaldrant <kai.waldrant@outlook.com>
Date: Wed, 25 Jan 2023 15:14:24 +0100
Subject: [PATCH 42/42] resolve personal comments

---
 src/joint_embedding/resources_scripts/mask_datasets.sh |  2 +-
 .../resources_scripts/run_benchmarks.sh                |  6 +++---
 .../resources_test_scripts/bmmc_cite.sh                | 10 +++++-----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/joint_embedding/resources_scripts/mask_datasets.sh b/src/joint_embedding/resources_scripts/mask_datasets.sh
index dfb1295fb7..b5a194e7b8 100644
--- a/src/joint_embedding/resources_scripts/mask_datasets.sh
+++ b/src/joint_embedding/resources_scripts/mask_datasets.sh
@@ -54,7 +54,7 @@ fi
 export NXF_VER=22.04.5
 nextflow \
   run . \
-  -main-script target/nextflow/denoising/split_dataset/main.nf \
+  -main-script target/nextflow/joint_embedding/split_dataset/main.nf \
   -profile docker \
   -resume \
   -params-file $params_file \
diff --git a/src/joint_embedding/resources_scripts/run_benchmarks.sh b/src/joint_embedding/resources_scripts/run_benchmarks.sh
index 01d6ef92a5..8a74b5a49a 100644
--- a/src/joint_embedding/resources_scripts/run_benchmarks.sh
+++ b/src/joint_embedding/resources_scripts/run_benchmarks.sh
@@ -10,8 +10,8 @@ set -e
 
 export TOWER_WORKSPACE_ID=53907369739130
 
-DATASETS_DIR="resources/denoising/datasets/openproblems_v1"
-OUTPUT_DIR="resources/denoising/benchmarks/openproblems_v1"
+DATASETS_DIR="resources/joint_embedding/datasets/openproblems_v1"
+OUTPUT_DIR="resources/joint_embedding/benchmarks/openproblems_v1"
 
 if [ ! -d "$OUTPUT_DIR" ]; then
   mkdir -p "$OUTPUT_DIR"
@@ -64,7 +64,7 @@ fi
 export NXF_VER=22.04.5
 nextflow \
   run . \
-  -main-script src/denoising/workflows/run/main.nf \
+  -main-script src/joint_embedding/workflows/run/main.nf \
   -profile docker \
   -params-file "$params_file" \
   --publish_dir "$OUTPUT_DIR" \
diff --git a/src/joint_embedding/resources_test_scripts/bmmc_cite.sh b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh
index 72967ef27b..5541edfcb5 100644
--- a/src/joint_embedding/resources_test_scripts/bmmc_cite.sh
+++ b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 #make sure the following command has been executed
-#bin/viash_build -q 'denoising|common'
+#bin/viash_build -q 'joint_embedding|common'
 
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
@@ -21,7 +21,7 @@ fi
 mkdir -p $DATASET_DIR
 
 # split dataset
-bin/viash run src/joint_embedding/mask_dataset/config.vsh.yaml -- \
+viash run src/joint_embedding/mask_dataset/config.vsh.yaml -- \
     --input_mod1 $MOD_1_DATA \
     --input_mod2 $MOD_2_DATA \
     --output_mod1 $DATASET_DIR/cite_mod1.h5ad \
@@ -29,13 +29,13 @@ bin/viash run src/joint_embedding/mask_dataset/config.vsh.yaml -- \
     --output_solution $DATASET_DIR/cite_solution.h5ad
 
 # run one method
-bin/viash run src/joint_embedding/methods/pca/config.vsh.yaml -- \
+viash run src/joint_embedding/methods/pca/config.vsh.yaml -- \
     --input_mod1 $DATASET_DIR/cite_mod1.h5ad \
     --input_mod2 $DATASET_DIR/cite_mod2.h5ad \
     --output $DATASET_DIR/pca.h5ad
 
 # run one metric
-bin/viash run src/joint_embedding/metrics/ari/config.vsh.yaml -- \
+viash run src/joint_embedding/metrics/ari/config.vsh.yaml -- \
     --input_prediction $DATASET_DIR/pca.h5ad \
     --input_solution $DATASET_DIR/cite_solution.h5ad \
     --output $DATASET_DIR/ari.h5ad
@@ -43,7 +43,7 @@ bin/viash run src/joint_embedding/metrics/ari/config.vsh.yaml -- \
 # run benchmark
 export NXF_VER=22.04.5
 
-bin/nextflow \
+nextflow \
   run . \
   -main-script src/joint_embedding/workflows/run/main.nf \
   -profile docker \