From 0db12b5659b1469eaabfb3ea3819824969d921bb Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 14 Dec 2022 14:53:28 +0100 Subject: [PATCH 01/82] add mask_dataset --- src/joint_embedding/api/anndata_dataset.yaml | 75 ++++++++++++++ src/joint_embedding/api/anndata_solution.yaml | 57 +++++++++++ src/joint_embedding/api/authors.yaml | 8 ++ .../mask_dataset/config.vsh.yaml | 24 +++++ src/joint_embedding/mask_dataset/script.R | 97 +++++++++++++++++++ src/joint_embedding/mask_dataset/test.R | 55 +++++++++++ .../split_dataset/config.vsh.yaml | 57 +++++++++++ src/joint_embedding/split_dataset/script.R | 97 +++++++++++++++++++ src/joint_embedding/split_dataset/test.R | 55 +++++++++++ 9 files changed, 525 insertions(+) create mode 100644 src/joint_embedding/api/anndata_dataset.yaml create mode 100644 src/joint_embedding/api/anndata_solution.yaml create mode 100644 src/joint_embedding/api/authors.yaml create mode 100644 src/joint_embedding/mask_dataset/config.vsh.yaml create mode 100644 src/joint_embedding/mask_dataset/script.R create mode 100644 src/joint_embedding/mask_dataset/test.R create mode 100644 src/joint_embedding/split_dataset/config.vsh.yaml create mode 100644 src/joint_embedding/split_dataset/script.R create mode 100644 src/joint_embedding/split_dataset/test.R diff --git a/src/joint_embedding/api/anndata_dataset.yaml b/src/joint_embedding/api/anndata_dataset.yaml new file mode 100644 index 0000000000..90a5f4a385 --- /dev/null +++ b/src/joint_embedding/api/anndata_dataset.yaml @@ -0,0 +1,75 @@ +type: file +description: "A raw dataset" +example: "dataset.h5ad" +info: + label: "Dataset" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: false + - type: double + name: size_factors + description: The size factors created by the normalisation method, if any. 
+ required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: double + name: G2M_score + description: + required: false + - type: boolean + name: is_train + description: if sample is train data + required: true + var: + - type: string + name: gene_ids + description: + required: false + - type: string + name: feature_types + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: "data from which organism " + required: false + - type: string + name: gene_activity_var_names + description: + required: false + - type: string + name: sample_pm_varnames + description: + required: false + diff --git a/src/joint_embedding/api/anndata_solution.yaml b/src/joint_embedding/api/anndata_solution.yaml new file mode 100644 index 0000000000..2ed03e1cc3 --- /dev/null +++ b/src/joint_embedding/api/anndata_solution.yaml @@ -0,0 +1,57 @@ +type: file +description: "The solution for the data" +example: "solution.h5ad" +info: + short_description: "Solution" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: 
double + name: G2M_score + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml new file mode 100644 index 0000000000..7fc237c267 --- /dev/null +++ b/src/joint_embedding/api/authors.yaml @@ -0,0 +1,8 @@ +functionality: + authors: + - name: Robrecht Cannoodt + roles: [ author ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + - name: Kai Waldrant + roles: [ contributor ] + props: { github: KaiWaldrant } \ No newline at end of file diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml new file mode 100644 index 0000000000..7ec45d9c5e --- /dev/null +++ b/src/joint_embedding/mask_dataset/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../api/comp_mask_dataset.yaml +functionality: + name: mask_dataset + namespace: joint_embedding + description: | + A component for censoring joint embedding datasets to be given + to competition participants for the 'joint embedding' task. + arguments: + - name: "--train_only" + type: "boolean_true" + description: Whether or not to only omit the train cells. 
+ resources: + - type: r_script + path: script.R + tests: + - type: r_script + path: test.R + - path: ../../../../resources_test +platforms: + - type: docker + image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + - type: nextflow + publish: true + labels: [ midmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R new file mode 100644 index 0000000000..58935f5bca --- /dev/null +++ b/src/joint_embedding/mask_dataset/script.R @@ -0,0 +1,97 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +input_path <- "output/datasets_2021-11-08/common/openproblems_bmmc_multiome_phase1v2/openproblems_bmmc_multiome_phase1v2.manual_formatting." +output_path <- "" + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_mod1 = paste0(output_path, "output_mod1.h5ad"), + output_mod2 = paste0(output_path, "output_mod2.h5ad"), + output_solution = paste0(output_path, "solution.h5ad"), + train_only = TRUE +) +## VIASH END + +cat("Reading mod1 data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE") +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- list(dataset_id = new_dataset_id, organism = "human") + +cat("Creating mod1 object\n") +out_mod1 <- anndata::AnnData( + X = input_mod1$X, + layers = list(counts = input_mod1$layers[["counts"]]), + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select(one_of("batch", "size_factors")), + uns = ad1_uns +) + +cat("Create solution object\n") +out_solution <- anndata::AnnData( + X = input_mod1$X, + var = 
input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select( + one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score") + ), + uns = ad1_uns +) + +is_train <- input_mod1$obs$is_train + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod1 <- out_mod1[is_train, ] #$copy() + out_solution <- out_solution[is_train, ]# $copy() +} + +rm(input_mod1) +gc() + +cat("Reading mod2 data\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +ad2_obsm <- list() + +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Creating mod2 object\n") +out_mod2 <- anndata::AnnData( + X = input_mod2$X, + layers = list(counts = input_mod2$layers[["counts"]]), + var = input_mod2$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod2$obs %>% select(one_of("batch")), + obsm = ad2_obsm, + uns = ad2_uns +) +rm(input_mod2) +gc() + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod2 <- out_mod2[is_train, ] #$copy() +} + +cat("Saving output files as h5ad\n") +cat("output_mod1:") +print(out_mod1) +zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip") + +cat("output_mod2:") +print(out_mod2) +zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip") + +cat("output_solution:") +print(out_solution) +zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip") diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R new file mode 100644 index 0000000000..09fe193b55 --- /dev/null +++ b/src/joint_embedding/mask_dataset/test.R @@ -0,0 +1,55 @@ +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +par <- list( + input_mod1 = 
"resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad", + input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad", + output_mod1 = "output_mod1.h5ad", + output_mod2 = "output_mod2.h5ad", + output_solution = "solution.h5ad" +) + +cat("> Running censor component\n") +out <- processx::run( + command = paste0("./", meta["functionality_name"]), + args = c( + "--input_mod1", par$input_mod1, + "--input_mod2", par$input_mod2, + "--output_mod1", par$output_mod1, + "--output_mod2", par$output_mod2, + "--output_solution", par$output_solution + ), + stderr_to_stdout = TRUE +) + +cat("> Checking whether output files were created\n") +expect_true(file.exists(par$output_mod1)) +expect_true(file.exists(par$output_mod2)) +expect_true(file.exists(par$output_solution)) + +cat("> Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +input_mod2 <- anndata::read_h5ad(par$input_mod2) +output_mod1 <- anndata::read_h5ad(par$output_mod1) +output_mod2 <- anndata::read_h5ad(par$output_mod2) +output_solution <- anndata::read_h5ad(par$output_solution) + +cat("> Checking contents of h5ad files\n") +expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_mod1$n_obs, input_mod1$n_obs) +expect_equal(output_mod2$n_obs, input_mod2$n_obs) +expect_equal(output_mod1$n_vars, input_mod1$n_vars) +expect_equal(output_mod2$n_vars, input_mod2$n_vars) +expect_equal(output_mod1$var_names, 
input_mod1$var_names) +expect_equal(output_mod2$var_names, input_mod2$var_names) +expect_equal(output_mod1$obs_names, input_mod1$obs_names) +expect_equal(output_mod2$obs_names, input_mod2$obs_names) + +# TODO check contents of matrices, check rownames + +cat("> Test succeeded!\n") diff --git a/src/joint_embedding/split_dataset/config.vsh.yaml b/src/joint_embedding/split_dataset/config.vsh.yaml new file mode 100644 index 0000000000..2e85c26fc4 --- /dev/null +++ b/src/joint_embedding/split_dataset/config.vsh.yaml @@ -0,0 +1,57 @@ +functionality: + name: censor_dataset + namespace: joint_embedding_datasets + version: dev + description: | + A component for censoring joint embedding datasets to be given + to competition participants for the 'joint embedding' task. + authors: + - name: Robrecht Cannoodt + email: rcannood@gmail.com + roles: [ author, maintainer ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + arguments: + - name: "--input_mod1" + type: "file" + example: "dataset.h5ad" + description: An input h5ad dataset. + required: true + - name: "--input_mod2" + type: "file" + example: "dataset.h5ad" + description: An input h5ad dataset. + required: true + - name: "--output_mod1" + type: "file" + direction: "output" + example: "dataset_mod1.h5ad" + description: Output mod1 file. + required: true + - name: "--output_mod2" + type: "file" + direction: "output" + example: "dataset_mod2.h5ad" + description: Output mod1 file. + required: true + - name: "--output_solution" + type: "file" + direction: "output" + example: "dataset_solution.h5ad" + description: The solution file. + required: true + - name: "--train_only" + type: "boolean_true" + description: Whether or not to only omit the train cells. 
+ resources: + - type: r_script + path: script.R + tests: + - type: r_script + path: test.R + - path: ../../../../resources_test +platforms: + - type: docker + image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + - type: nextflow + publish: true + labels: [ midmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/split_dataset/script.R b/src/joint_embedding/split_dataset/script.R new file mode 100644 index 0000000000..58935f5bca --- /dev/null +++ b/src/joint_embedding/split_dataset/script.R @@ -0,0 +1,97 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +input_path <- "output/datasets_2021-11-08/common/openproblems_bmmc_multiome_phase1v2/openproblems_bmmc_multiome_phase1v2.manual_formatting." +output_path <- "" + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_mod1 = paste0(output_path, "output_mod1.h5ad"), + output_mod2 = paste0(output_path, "output_mod2.h5ad"), + output_solution = paste0(output_path, "solution.h5ad"), + train_only = TRUE +) +## VIASH END + +cat("Reading mod1 data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE") +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- list(dataset_id = new_dataset_id, organism = "human") + +cat("Creating mod1 object\n") +out_mod1 <- anndata::AnnData( + X = input_mod1$X, + layers = list(counts = input_mod1$layers[["counts"]]), + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select(one_of("batch", "size_factors")), + uns = ad1_uns +) + +cat("Create solution object\n") +out_solution <- anndata::AnnData( + X = input_mod1$X, + var = 
input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select( + one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score") + ), + uns = ad1_uns +) + +is_train <- input_mod1$obs$is_train + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod1 <- out_mod1[is_train, ] #$copy() + out_solution <- out_solution[is_train, ]# $copy() +} + +rm(input_mod1) +gc() + +cat("Reading mod2 data\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +ad2_obsm <- list() + +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Creating mod2 object\n") +out_mod2 <- anndata::AnnData( + X = input_mod2$X, + layers = list(counts = input_mod2$layers[["counts"]]), + var = input_mod2$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod2$obs %>% select(one_of("batch")), + obsm = ad2_obsm, + uns = ad2_uns +) +rm(input_mod2) +gc() + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod2 <- out_mod2[is_train, ] #$copy() +} + +cat("Saving output files as h5ad\n") +cat("output_mod1:") +print(out_mod1) +zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip") + +cat("output_mod2:") +print(out_mod2) +zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip") + +cat("output_solution:") +print(out_solution) +zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip") diff --git a/src/joint_embedding/split_dataset/test.R b/src/joint_embedding/split_dataset/test.R new file mode 100644 index 0000000000..09fe193b55 --- /dev/null +++ b/src/joint_embedding/split_dataset/test.R @@ -0,0 +1,55 @@ +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +par <- list( + input_mod1 = 
"resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad", + input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad", + output_mod1 = "output_mod1.h5ad", + output_mod2 = "output_mod2.h5ad", + output_solution = "solution.h5ad" +) + +cat("> Running censor component\n") +out <- processx::run( + command = paste0("./", meta["functionality_name"]), + args = c( + "--input_mod1", par$input_mod1, + "--input_mod2", par$input_mod2, + "--output_mod1", par$output_mod1, + "--output_mod2", par$output_mod2, + "--output_solution", par$output_solution + ), + stderr_to_stdout = TRUE +) + +cat("> Checking whether output files were created\n") +expect_true(file.exists(par$output_mod1)) +expect_true(file.exists(par$output_mod2)) +expect_true(file.exists(par$output_solution)) + +cat("> Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +input_mod2 <- anndata::read_h5ad(par$input_mod2) +output_mod1 <- anndata::read_h5ad(par$output_mod1) +output_mod2 <- anndata::read_h5ad(par$output_mod2) +output_solution <- anndata::read_h5ad(par$output_solution) + +cat("> Checking contents of h5ad files\n") +expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_mod1$n_obs, input_mod1$n_obs) +expect_equal(output_mod2$n_obs, input_mod2$n_obs) +expect_equal(output_mod1$n_vars, input_mod1$n_vars) +expect_equal(output_mod2$n_vars, input_mod2$n_vars) +expect_equal(output_mod1$var_names, 
input_mod1$var_names) +expect_equal(output_mod2$var_names, input_mod2$var_names) +expect_equal(output_mod1$obs_names, input_mod1$obs_names) +expect_equal(output_mod2$obs_names, input_mod2$obs_names) + +# TODO check contents of matrices, check rownames + +cat("> Test succeeded!\n") From bc9911253979a4bd33977eb0a87312f578532dfb Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 14 Dec 2022 15:20:21 +0100 Subject: [PATCH 02/82] debug mask_dataset test --- src/joint_embedding/api/comp_mask_dataset.yaml | 15 +++++++++++++++ src/joint_embedding/mask_dataset/config.vsh.yaml | 5 ++--- src/joint_embedding/mask_dataset/test.R | 2 +- 3 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 src/joint_embedding/api/comp_mask_dataset.yaml diff --git a/src/joint_embedding/api/comp_mask_dataset.yaml b/src/joint_embedding/api/comp_mask_dataset.yaml new file mode 100644 index 0000000000..0b97e89fca --- /dev/null +++ b/src/joint_embedding/api/comp_mask_dataset.yaml @@ -0,0 +1,15 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_dataset.yaml + - name: "--input_mod2" + __merge__: anndata_dataset.yaml + - name: "--output_mod1" + __merge__: anndata_masked_mod1.yaml + direction: output + - name: "--output_mod2" + __merge__: anndata_masked_mod2.yaml + direction: output + - name: "--output_solution" + __merge__: anndata_solution.yaml + direction: output \ No newline at end of file diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml index 7ec45d9c5e..5d2890598e 100644 --- a/src/joint_embedding/mask_dataset/config.vsh.yaml +++ b/src/joint_embedding/mask_dataset/config.vsh.yaml @@ -15,10 +15,9 @@ functionality: tests: - type: r_script path: test.R - - path: ../../../../resources_test + - path: ../../../resources_test platforms: - type: docker image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 - type: nextflow - publish: true - labels: [ midmem, lowtime, lowcpu ] + directives: [ midmem, lowtime, 
lowcpu ] diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R index 09fe193b55..ad8cc55eec 100644 --- a/src/joint_embedding/mask_dataset/test.R +++ b/src/joint_embedding/mask_dataset/test.R @@ -9,7 +9,7 @@ par <- list( output_solution = "solution.h5ad" ) -cat("> Running censor component\n") +cat("> Running mask component\n") out <- processx::run( command = paste0("./", meta["functionality_name"]), args = c( From 262a1edb761ec5d8450422be4e5bc69efb6c3dd3 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 14 Dec 2022 15:23:39 +0100 Subject: [PATCH 03/82] add masked anddata api --- .../api/anndata_masked_mod1.yaml | 37 ++++++++++++++++++ .../api/anndata_masked_mod2.yaml | 39 +++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 src/joint_embedding/api/anndata_masked_mod1.yaml create mode 100644 src/joint_embedding/api/anndata_masked_mod2.yaml diff --git a/src/joint_embedding/api/anndata_masked_mod1.yaml b/src/joint_embedding/api/anndata_masked_mod1.yaml new file mode 100644 index 0000000000..c247565645 --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod1.yaml @@ -0,0 +1,37 @@ +type: file +description: "The masked data" +example: "masked.h5ad" +info: + short_description: "masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: false + - type: double + name: size_factors + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/anndata_masked_mod2.yaml b/src/joint_embedding/api/anndata_masked_mod2.yaml new file mode 100644 index 
0000000000..ad735fffba --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod2.yaml @@ -0,0 +1,39 @@ +type: file +description: "The masked data for mod2 file" +example: "masked.h5ad" +info: + short_description: "Masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + obsm: + - type: double + name: gene_activity + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true From 3f367e120fc83bde8f08f4bca5d4f1fa9fb9c7c8 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 14 Dec 2022 17:00:13 +0100 Subject: [PATCH 04/82] add random_embed negative control --- src/joint_embedding/api/anndata_dataset.yaml | 2 +- .../api/anndata_masked_mod1.yaml | 2 +- .../api/anndata_prediction.yaml | 25 +++++ src/joint_embedding/api/authors.yaml | 6 +- .../api/comp_control_method.yaml | 47 +++++++++ .../random_embed/config.vsh.yaml | 24 +++++ .../control_methods/random_embed/script.py | 26 +++++ .../mask_dataset/config.vsh.yaml | 2 +- src/joint_embedding/mask_dataset/script.R | 2 +- .../split_dataset/config.vsh.yaml | 57 ----------- src/joint_embedding/split_dataset/script.R | 97 ------------------- src/joint_embedding/split_dataset/test.R | 55 ----------- 12 files changed, 131 insertions(+), 214 deletions(-) create mode 100644 src/joint_embedding/api/anndata_prediction.yaml create mode 100644 src/joint_embedding/api/comp_control_method.yaml create mode 100644 src/joint_embedding/control_methods/random_embed/config.vsh.yaml create mode 100644 src/joint_embedding/control_methods/random_embed/script.py delete mode 100644 
src/joint_embedding/split_dataset/config.vsh.yaml delete mode 100644 src/joint_embedding/split_dataset/script.R delete mode 100644 src/joint_embedding/split_dataset/test.R diff --git a/src/joint_embedding/api/anndata_dataset.yaml b/src/joint_embedding/api/anndata_dataset.yaml index 90a5f4a385..23c6b427be 100644 --- a/src/joint_embedding/api/anndata_dataset.yaml +++ b/src/joint_embedding/api/anndata_dataset.yaml @@ -13,7 +13,7 @@ info: - type: string name: batch description: Batch information - required: false + required: true - type: double name: size_factors description: The size factors created by the normalisation method, if any. diff --git a/src/joint_embedding/api/anndata_masked_mod1.yaml b/src/joint_embedding/api/anndata_masked_mod1.yaml index c247565645..7ca6820671 100644 --- a/src/joint_embedding/api/anndata_masked_mod1.yaml +++ b/src/joint_embedding/api/anndata_masked_mod1.yaml @@ -12,7 +12,7 @@ info: - type: string name: batch description: Batch information - required: false + required: true - type: double name: size_factors description: diff --git a/src/joint_embedding/api/anndata_prediction.yaml b/src/joint_embedding/api/anndata_prediction.yaml new file mode 100644 index 0000000000..49d8ae7d79 --- /dev/null +++ b/src/joint_embedding/api/anndata_prediction.yaml @@ -0,0 +1,25 @@ +type: file +description: "The prediction file" +example: "prediction.h5ad" +info: + short_description: "Prediction" + slots: + obs: + - type: string + name: batch + description: Batch information + required: true + obsm: + - type: double + name: X_emb + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml index 7fc237c267..a1467a402c 100644 --- a/src/joint_embedding/api/authors.yaml +++ 
b/src/joint_embedding/api/authors.yaml @@ -5,4 +5,8 @@ functionality: props: { github: rcannood, orcid: "0000-0003-3641-729X" } - name: Kai Waldrant roles: [ contributor ] - props: { github: KaiWaldrant } \ No newline at end of file + props: { github: KaiWaldrant } + - name: Alex Tong + email: alexandertongdev@gmail.com + roles: [ author, maintainer ] + props: { github: atong01 } \ No newline at end of file diff --git a/src/joint_embedding/api/comp_control_method.yaml b/src/joint_embedding/api/comp_control_method.yaml new file mode 100644 index 0000000000..5477d32e75 --- /dev/null +++ b/src/joint_embedding/api/comp_control_method.yaml @@ -0,0 +1,47 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import scipy + + input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) + + print("> Checking whether output files were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + print(output.n_vars) + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert not scipy.sparse.issparse(output.obsm[X_emb]) + + 
print("> Test succeeded!") diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml new file mode 100644 index 0000000000..cdc3061a8e --- /dev/null +++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: random_embed + namespace: joint_embedding/control_methods + description: Generate a random embedding from a normal distribution. + info: + type: negative_control + label: Normal Dist. + arguments: + - name: "--n_dims" + type: "integer" + default: 100 + description: Number of dimensions to output. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, numpy , scipy] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/control_methods/random_embed/script.py b/src/joint_embedding/control_methods/random_embed/script.py new file mode 100644 index 0000000000..199ad08176 --- /dev/null +++ b/src/joint_embedding/control_methods/random_embed/script.py @@ -0,0 +1,26 @@ +import anndata +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "output": "output/output_prediction.h5ad", + "n_dims": 100, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.random.randn(adata_mod1.shape[0], par["n_dims"]) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obsm= {"X_emb": sparse.csr_matrix(X)}, + obs=adata_mod1.obs[["batch"]], + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "random_embed"}, +) +del adata_out.X + +adata_out.write_h5ad(par["output"], compression="gzip") diff --git 
a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml index 5d2890598e..1f54f2df62 100644 --- a/src/joint_embedding/mask_dataset/config.vsh.yaml +++ b/src/joint_embedding/mask_dataset/config.vsh.yaml @@ -12,7 +12,7 @@ functionality: resources: - type: r_script path: script.R - tests: + test_resources: - type: r_script path: test.R - path: ../../../resources_test diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R index 58935f5bca..2d70248e69 100644 --- a/src/joint_embedding/mask_dataset/script.R +++ b/src/joint_embedding/mask_dataset/script.R @@ -6,7 +6,7 @@ library(assertthat, quietly = TRUE, warn.conflicts = FALSE) library(Matrix, quietly = TRUE, warn.conflicts = FALSE) ## VIASH START -input_path <- "output/datasets_2021-11-08/common/openproblems_bmmc_multiome_phase1v2/openproblems_bmmc_multiome_phase1v2.manual_formatting." +input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." output_path <- "" par <- list( diff --git a/src/joint_embedding/split_dataset/config.vsh.yaml b/src/joint_embedding/split_dataset/config.vsh.yaml deleted file mode 100644 index 2e85c26fc4..0000000000 --- a/src/joint_embedding/split_dataset/config.vsh.yaml +++ /dev/null @@ -1,57 +0,0 @@ -functionality: - name: censor_dataset - namespace: joint_embedding_datasets - version: dev - description: | - A component for censoring joint embedding datasets to be given - to competition participants for the 'joint embedding' task. - authors: - - name: Robrecht Cannoodt - email: rcannood@gmail.com - roles: [ author, maintainer ] - props: { github: rcannood, orcid: "0000-0003-3641-729X" } - arguments: - - name: "--input_mod1" - type: "file" - example: "dataset.h5ad" - description: An input h5ad dataset. - required: true - - name: "--input_mod2" - type: "file" - example: "dataset.h5ad" - description: An input h5ad dataset. 
- required: true - - name: "--output_mod1" - type: "file" - direction: "output" - example: "dataset_mod1.h5ad" - description: Output mod1 file. - required: true - - name: "--output_mod2" - type: "file" - direction: "output" - example: "dataset_mod2.h5ad" - description: Output mod1 file. - required: true - - name: "--output_solution" - type: "file" - direction: "output" - example: "dataset_solution.h5ad" - description: The solution file. - required: true - - name: "--train_only" - type: "boolean_true" - description: Whether or not to only omit the train cells. - resources: - - type: r_script - path: script.R - tests: - - type: r_script - path: test.R - - path: ../../../../resources_test -platforms: - - type: docker - image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 - - type: nextflow - publish: true - labels: [ midmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/split_dataset/script.R b/src/joint_embedding/split_dataset/script.R deleted file mode 100644 index 58935f5bca..0000000000 --- a/src/joint_embedding/split_dataset/script.R +++ /dev/null @@ -1,97 +0,0 @@ -cat("Loading dependencies\n") -options(tidyverse.quiet = TRUE) -library(tidyverse) -requireNamespace("anndata", quietly = TRUE) -library(assertthat, quietly = TRUE, warn.conflicts = FALSE) -library(Matrix, quietly = TRUE, warn.conflicts = FALSE) - -## VIASH START -input_path <- "output/datasets_2021-11-08/common/openproblems_bmmc_multiome_phase1v2/openproblems_bmmc_multiome_phase1v2.manual_formatting." 
-output_path <- "" - -par <- list( - input_mod1 = paste0(input_path, "output_rna.h5ad"), - input_mod2 = paste0(input_path, "output_mod2.h5ad"), - output_mod1 = paste0(output_path, "output_mod1.h5ad"), - output_mod2 = paste0(output_path, "output_mod2.h5ad"), - output_solution = paste0(output_path, "solution.h5ad"), - train_only = TRUE -) -## VIASH END - -cat("Reading mod1 data\n") -input_mod1 <- anndata::read_h5ad(par$input_mod1) -ad1_mod <- unique(input_mod1$var[["feature_types"]]) -new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE") -ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") -ad2_uns <- list(dataset_id = new_dataset_id, organism = "human") - -cat("Creating mod1 object\n") -out_mod1 <- anndata::AnnData( - X = input_mod1$X, - layers = list(counts = input_mod1$layers[["counts"]]), - var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), - obs = input_mod1$obs %>% select(one_of("batch", "size_factors")), - uns = ad1_uns -) - -cat("Create solution object\n") -out_solution <- anndata::AnnData( - X = input_mod1$X, - var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), - obs = input_mod1$obs %>% select( - one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score") - ), - uns = ad1_uns -) - -is_train <- input_mod1$obs$is_train - -if (par$train_only) { - cat("Filtering out test cells\n", sep = "") - out_mod1 <- out_mod1[is_train, ] #$copy() - out_solution <- out_solution[is_train, ]# $copy() -} - -rm(input_mod1) -gc() - -cat("Reading mod2 data\n") -input_mod2 <- anndata::read_h5ad(par$input_mod2) -ad2_mod <- unique(input_mod2$var[["feature_types"]]) -ad2_obsm <- list() - -if (ad2_mod == "ATAC") { - ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names - ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") -} - -cat("Creating mod2 object\n") -out_mod2 <- anndata::AnnData( - X = input_mod2$X, - layers = 
list(counts = input_mod2$layers[["counts"]]), - var = input_mod2$var %>% select(one_of("gene_ids"), feature_types), - obs = input_mod2$obs %>% select(one_of("batch")), - obsm = ad2_obsm, - uns = ad2_uns -) -rm(input_mod2) -gc() - -if (par$train_only) { - cat("Filtering out test cells\n", sep = "") - out_mod2 <- out_mod2[is_train, ] #$copy() -} - -cat("Saving output files as h5ad\n") -cat("output_mod1:") -print(out_mod1) -zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip") - -cat("output_mod2:") -print(out_mod2) -zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip") - -cat("output_solution:") -print(out_solution) -zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip") diff --git a/src/joint_embedding/split_dataset/test.R b/src/joint_embedding/split_dataset/test.R deleted file mode 100644 index 09fe193b55..0000000000 --- a/src/joint_embedding/split_dataset/test.R +++ /dev/null @@ -1,55 +0,0 @@ -library(testthat, quietly = TRUE, warn.conflicts = FALSE) -requireNamespace("anndata", quietly = TRUE) - -par <- list( - input_mod1 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad", - input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad", - output_mod1 = "output_mod1.h5ad", - output_mod2 = "output_mod2.h5ad", - output_solution = "solution.h5ad" -) - -cat("> Running censor component\n") -out <- processx::run( - command = paste0("./", meta["functionality_name"]), - args = c( - "--input_mod1", par$input_mod1, - "--input_mod2", par$input_mod2, - "--output_mod1", par$output_mod1, - "--output_mod2", par$output_mod2, - "--output_solution", par$output_solution - ), - stderr_to_stdout = TRUE -) - -cat("> Checking whether output files were created\n") -expect_true(file.exists(par$output_mod1)) -expect_true(file.exists(par$output_mod2)) -expect_true(file.exists(par$output_solution)) - -cat("> Reading h5ad 
files\n") -input_mod1 <- anndata::read_h5ad(par$input_mod1) -input_mod2 <- anndata::read_h5ad(par$input_mod2) -output_mod1 <- anndata::read_h5ad(par$output_mod1) -output_mod2 <- anndata::read_h5ad(par$output_mod2) -output_solution <- anndata::read_h5ad(par$output_solution) - -cat("> Checking contents of h5ad files\n") -expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) -expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) -expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) -expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]]) -expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]]) -expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]]) -expect_equal(output_mod1$n_obs, input_mod1$n_obs) -expect_equal(output_mod2$n_obs, input_mod2$n_obs) -expect_equal(output_mod1$n_vars, input_mod1$n_vars) -expect_equal(output_mod2$n_vars, input_mod2$n_vars) -expect_equal(output_mod1$var_names, input_mod1$var_names) -expect_equal(output_mod2$var_names, input_mod2$var_names) -expect_equal(output_mod1$obs_names, input_mod1$obs_names) -expect_equal(output_mod2$obs_names, input_mod2$obs_names) - -# TODO check contents of matrices, check rownames - -cat("> Test succeeded!\n") From 861072ede48eb9f8f4cbccbd3d9f37c217f8d749 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 14 Dec 2022 17:00:52 +0100 Subject: [PATCH 05/82] update control_method api --- src/joint_embedding/api/comp_control_method.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/joint_embedding/api/comp_control_method.yaml b/src/joint_embedding/api/comp_control_method.yaml index 5477d32e75..d1ec13ed17 100644 --- a/src/joint_embedding/api/comp_control_method.yaml +++ b/src/joint_embedding/api/comp_control_method.yaml @@ -13,7 +13,7 @@ functionality: from os import path import subprocess import anndata as ad 
- import scipy + from scipy import sparse input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" output_path = "output.h5ad" @@ -38,10 +38,9 @@ functionality: assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] assert output.uns['method_id'] == meta['functionality_name'] assert output.n_obs == input_mod1.n_obs - print(output.n_vars) assert output.n_vars >= 1 assert output.n_vars <= 100 assert all(output.obs_names == input_mod1.obs_names) - assert not scipy.sparse.issparse(output.obsm[X_emb]) + assert sparse.issparse(output.obsm['X_emb']) print("> Test succeeded!") From 8749e2a33e37906d954be0f9f2ae51254ed34bc5 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 15 Dec 2022 10:23:07 +0100 Subject: [PATCH 06/82] add zeros_embed control --- .../zeros_embed/config.vsh.yaml | 24 +++++++++++++++++++ .../control_methods/zeros_embed/script.py | 23 ++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml create mode 100644 src/joint_embedding/control_methods/zeros_embed/script.py diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml new file mode 100644 index 0000000000..b2c95a79e2 --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: zeros_embed + namespace: joint_embedding/control_methods + description: Generate an embedding containing only zero values. + info: + type: negative_control + label: zeros_embed + arguments: + - name: "--n_dims" + type: "integer" + default: 1 + description: Number of dimensions to output. 
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata, numpy, scipy ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/control_methods/zeros_embed/script.py b/src/joint_embedding/control_methods/zeros_embed/script.py new file mode 100644 index 0000000000..964dae1744 --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/script.py @@ -0,0 +1,23 @@ +import anndata +import numpy as np + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "input_mod2": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod2.h5ad", + "output": "tmp/output_prediction.h5ad", + "n_dims": 1, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.zeros((adata_mod1.shape[0], par["n_dims"])) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obs=adata_mod1.obs, + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "dummy_zeros"}, +) +adata_out.write_h5ad(par["output"], compression="gzip") From 7c89329a2bae233f0031c5110cf3c93986ead9ec Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 15 Dec 2022 16:24:13 +0100 Subject: [PATCH 07/82] add lmds method --- src/joint_embedding/api/comp_method.yaml | 50 +++++++++++++++++++ .../control_methods/zeros_embed/script.py | 8 ++- .../methods/lmds/config.vsh.yaml | 28 +++++++++++ src/joint_embedding/methods/lmds/script.R | 44 ++++++++++++++++ 4 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 src/joint_embedding/api/comp_method.yaml create mode 100644 src/joint_embedding/methods/lmds/config.vsh.yaml create mode 100644 src/joint_embedding/methods/lmds/script.R diff --git a/src/joint_embedding/api/comp_method.yaml b/src/joint_embedding/api/comp_method.yaml 
new file mode 100644 index 0000000000..e75d2fe07b --- /dev/null +++ b/src/joint_embedding/api/comp_method.yaml @@ -0,0 +1,50 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--input_mod2" + __merge__: anndata_masked_mod2.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + from scipy import sparse + + input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + input_mod2_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod2.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--input_mod2", input_mod2_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True).stderr + + print("> Checking whether output files were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert sparse.issparse(output.obsm['X_emb']) + + print("> Test succeeded!") \ No newline at end of file diff --git a/src/joint_embedding/control_methods/zeros_embed/script.py b/src/joint_embedding/control_methods/zeros_embed/script.py index 964dae1744..f54ef3ce18 100644 --- a/src/joint_embedding/control_methods/zeros_embed/script.py +++ 
b/src/joint_embedding/control_methods/zeros_embed/script.py @@ -1,10 +1,10 @@ import anndata import numpy as np +from scipy import sparse ## VIASH START par = { "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", - "input_mod2": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod2.h5ad", "output": "tmp/output_prediction.h5ad", "n_dims": 1, } @@ -18,6 +18,10 @@ adata_out = anndata.AnnData( X=X, obs=adata_mod1.obs, - uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "dummy_zeros"}, + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "zeros_embed"}, + obsm={"X_emb": sparse.csr_matrix(X) } ) + +del adata_out.X + adata_out.write_h5ad(par["output"], compression="gzip") diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml new file mode 100644 index 0000000000..b9e096a6d1 --- /dev/null +++ b/src/joint_embedding/methods/lmds/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: lmds + namespace: joint_embedding/methods + description: Landmark MDS dimensionality reduction on the Spearman distance. + info: + type: method + label: "LMDS" + arguments: + - name: "--distance_method" + type: "string" + default: "spearman" + description: The distance method to use. Possible values are euclidean, pearson, spearman and others. + - name: "--n_dims" + type: integer + default: 10 + description: Number of dimensions to output. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + setup: + - type: r + packages: [ lmds ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/methods/lmds/script.R b/src/joint_embedding/methods/lmds/script.R new file mode 100644 index 0000000000..92700691e0 --- /dev/null +++ b/src/joint_embedding/methods/lmds/script.R @@ -0,0 +1,44 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +path <- "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter." +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output/lmds/output.h5ad", + n_dims = 10L, + distance_method = "spearman" +) +## VIASH END + +cat("Reading h5ad files\n") +ad1 <- anndata::read_h5ad(par$input_mod1) +ad2 <- anndata::read_h5ad(par$input_mod2) + +cat("Performing DR\n") +dr <- lmds::lmds( + cbind(ad1$X, ad2$X), + ndim = par$n_dims, + distance_method = par$distance_method +) + +rownames(dr) <- rownames(ad1) +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = ad1$uns[["dataset_id"]], + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 0d29dd03d99f109cb972fb57f9c328053b7c63a2 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 15 Dec 2022 16:33:16 +0100 Subject: [PATCH 08/82] add mnn method --- .../methods/mnn/config.vsh.yaml | 24 +++++++ src/joint_embedding/methods/mnn/script.R | 67 +++++++++++++++++++ 2 files changed, 91 insertions(+) 
create mode 100644 src/joint_embedding/methods/mnn/config.vsh.yaml create mode 100644 src/joint_embedding/methods/mnn/script.R diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml new file mode 100644 index 0000000000..8759ade4e0 --- /dev/null +++ b/src/joint_embedding/methods/mnn/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: mnn + namespace: joint_embedding/methods + description: Mutual nearest neighbors correction followed by PCA. + info: + type: method + label: "MNN" + arguments: + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + setup: + - type: r + bioc: [ SingleCellExperiment, batchelor, proxyC ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/methods/mnn/script.R b/src/joint_embedding/methods/mnn/script.R new file mode 100644 index 0000000000..27a4454bff --- /dev/null +++ b/src/joint_embedding/methods/mnn/script.R @@ -0,0 +1,67 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("batchelor", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +Xt_mod1 <- t(input_mod1$X) + +# select hvg +if (!is.null(par$hvg_sel) && nrow(Xt_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::rowSds(Xt_mod1) + Xt_mod1 <- Xt_mod1[order(sd_mod1, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +rm(input_mod1) +gc() + +Xt_mod2 <- t(anndata::read_h5ad(par$input_mod2)$X) +if (!is.null(par$hvg_sel) && nrow(Xt_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::rowSds(Xt_mod2) + Xt_mod2 <- Xt_mod2[order(sd_mod2, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +cat("Running fastMNN\n") +mnn_out <- batchelor::fastMNN( + rbind(Xt_mod1, Xt_mod2), + batch = batch +) +dr <- SingleCellExperiment::reducedDim(mnn_out, "corrected") + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 3c46c4d331ddff14eee86016f58bf19e3d6dbf86 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 15 Dec 2022 17:06:26 +0100 Subject: [PATCH 09/82] add newwave method --- .../methods/newwave/config.vsh.yaml | 33 ++++++ src/joint_embedding/methods/newwave/script.R | 111 ++++++++++++++++++ 2 files changed, 144 insertions(+) create 
mode 100644 src/joint_embedding/methods/newwave/config.vsh.yaml create mode 100644 src/joint_embedding/methods/newwave/script.R diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml new file mode 100644 index 0000000000..c327dcfa8c --- /dev/null +++ b/src/joint_embedding/methods/newwave/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave + namespace: joint_embedding/methods + description: Concatenated NewWave. + info: + type: method + label: "NewWave" + doi: "10.1101/2021.08.02.453487" + arguments: + - name: "--maxiter" + type: "integer" + default: 100 + description: Maximum number of NewWave iterations. + - name: "--k" + type: "integer" + default: 10 + description: NewWave K parameter. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + setup: + - type: r + bioc: [ SingleCellExperiment, NewWave, proxyC ] + - type: nextflow + directives: [ highmem, hightime, highcpu ] diff --git a/src/joint_embedding/methods/newwave/script.R b/src/joint_embedding/methods/newwave/script.R new file mode 100644 index 0000000000..f87d79cdd5 --- /dev/null +++ b/src/joint_embedding/methods/newwave/script.R @@ -0,0 +1,111 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + maxiter = 2L, + k = 3L, + hvg_sel = 1000 +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading mod1 h5ad\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] + +sd1 <- proxyC::colSds(input_mod1$X) +fil1 <- + if (!is.null(par$hvg_sel) && ncol(input_mod1) > par$hvg_sel) { + head(order(sd1, decreasing = TRUE), par$hvg_sel) + } else { + which(sd1 > 0) + } +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod1$layers[["counts"]][, fil1])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod1) +gc() + +cat("Running NewWave on mod1\n") +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = par$k, + maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data1)), + n_cell_par = min(300, ncol(data1)), + commondispersion = FALSE +) +rm(data1) + +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading mod2 anndata\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +sd2 <- proxyC::colSds(input_mod2$X) +fil2 <- + if (!is.null(par$hvg_sel) && ncol(input_mod2) > par$hvg_sel) { + head(order(sd2, decreasing = TRUE), par$hvg_sel) + } else { + which(sd2 > 0) + } +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod2$layers[["counts"]][, fil2])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod2) +gc() + +cat("Running NewWave on mod2\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = par$k, + 
maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data2)), + n_cell_par = min(300, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) +rm(data2) + +cat("Spline separate DRs\n") +dr <- do.call(cbind, lapply(seq_len(ncol(dr_x1)), function(i) { + cbind(dr_x1[, i], dr_x2[, i]) +})) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) + +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 4ddb315a339815396983a499c896fcde59cd5352 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 15 Dec 2022 17:11:30 +0100 Subject: [PATCH 10/82] add pca method --- .../methods/pca/config.vsh.yaml | 29 +++++++++ src/joint_embedding/methods/pca/script.R | 63 +++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 src/joint_embedding/methods/pca/config.vsh.yaml create mode 100644 src/joint_embedding/methods/pca/script.R diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml new file mode 100644 index 0000000000..cb4d24642f --- /dev/null +++ b/src/joint_embedding/methods/pca/config.vsh.yaml @@ -0,0 +1,29 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: pca + namespace: joint_embedding/methods + description: PCA dimensionality reduction. + info: + type: method + label: "PCA" + authors: + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to output. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + setup: + - type: r + packages: [ irlba, proxyC ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/methods/pca/script.R b/src/joint_embedding/methods/pca/script.R new file mode 100644 index 0000000000..d16eb027cf --- /dev/null +++ b/src/joint_embedding/methods/pca/script.R @@ -0,0 +1,63 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 4L, + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing DR\n") +dr <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = par$n_dims +)$x + 
+rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 3ae18553b63805ad76c5ed5b891e7b7008d80d43 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 16 Dec 2022 11:11:22 +0100 Subject: [PATCH 11/82] Add totalVI method --- src/joint_embedding/api/authors.yaml | 6 +- src/joint_embedding/api/comp_method.yaml | 2 +- .../methods/totalvi/config.vsh.yaml | 33 +++++++++++ src/joint_embedding/methods/totalvi/script.py | 58 +++++++++++++++++++ 4 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 src/joint_embedding/methods/totalvi/config.vsh.yaml create mode 100644 src/joint_embedding/methods/totalvi/script.py diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml index a1467a402c..70f8fc3141 100644 --- a/src/joint_embedding/api/authors.yaml +++ b/src/joint_embedding/api/authors.yaml @@ -9,4 +9,8 @@ functionality: - name: Alex Tong email: alexandertongdev@gmail.com roles: [ author, maintainer ] - props: { github: atong01 } \ No newline at end of file + props: { github: atong01 } + - name: Christopher Lance + email: clance.connect@gmail.com + roles: [ author, maintainer ] + props: { github: xlancelottx } \ No newline at end of file diff --git a/src/joint_embedding/api/comp_method.yaml b/src/joint_embedding/api/comp_method.yaml index e75d2fe07b..415f42ea3d 100644 --- a/src/joint_embedding/api/comp_method.yaml +++ b/src/joint_embedding/api/comp_method.yaml @@ -29,7 +29,7 @@ functionality: ] print(">> Running script as test") - out = subprocess.run(cmd, check=True, capture_output=True, text=True).stderr + out = subprocess.run(cmd, check=True, capture_output=True, text=True) print("> Checking whether output files were created") assert 
path.exists(output_path) diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml b/src/joint_embedding/methods/totalvi/config.vsh.yaml new file mode 100644 index 0000000000..74116f9abe --- /dev/null +++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: totalvi + namespace: joint_embedding/methods + version: dev + description: "totalVI: joint probabilistic modeling with Total Variational Inference" + info: + type: method + label: totalVI + doi: 10.1038/s41592-020-01050-x + arguments: + - name: --hvg_number + type: integer + default: 4000 + description: Number of HVG to include in totalVI + - name: --max_epochs + type: integer + default: 400 + description: Number of max epochs to run totalVI + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, scanpy, scikit-misc, scipy, scikit-learn, scvi-tools] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] + + + diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py new file mode 100644 index 0000000000..b47400816d --- /dev/null +++ b/src/joint_embedding/methods/totalvi/script.py @@ -0,0 +1,58 @@ +import anndata +import scanpy as sc +from scipy import sparse +from scvi.model import TOTALVI + +## VIASH START +par = { + "input_mod1": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod1.h5ad", + "input_mod2": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod2.h5ad", + "output": "tmp/output_prediction.h5ad", + "hvg_number": 4000, + "max_epochs": 20 +} + +meta = { + 'funcionality_name': "foo" +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par['input_mod1']) +adata_mod2 = anndata.read_h5ad(par['input_mod2']) 
+adata_mod1.obsm['protein_expression'] = adata_mod2.X.toarray() + +print('Select highly variable genes') +sc.pp.highly_variable_genes( + adata_mod1, + n_top_genes=par['hvg_number'], + flavor="seurat_v3", + batch_key="batch", + subset=True +) + +print("Set up model") +TOTALVI.setup_anndata( + adata_mod1, + batch_key="batch", + protein_expression_obsm_key="protein_expression" +) + +print('Train totalVI with', par['max_epochs'], 'epochs') +vae = TOTALVI(adata_mod1, latent_distribution="normal") +vae.train(max_epochs = par['max_epochs']) + +print("Postprocessing and saving output") +adata_out = anndata.AnnData( + X=vae.get_latent_representation(), + obs=adata_mod1.obs[['batch']], + uns={ + "dataset_id": adata_mod1.uns["dataset_id"], + "method_id": meta["functionality_name"] + }, + obsm = {"X_emb": sparse.csr_matrix(vae.get_latent_representation())} +) + +del adata_out.X + +adata_out.write_h5ad(par['output'], compression = "gzip") From 87f84cba53d2f665a4e8bdfbfba40fe028e04a87 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 16 Dec 2022 11:50:58 +0100 Subject: [PATCH 12/82] add umap method --- .../methods/umap/config.vsh.yaml | 41 ++++++++++ src/joint_embedding/methods/umap/script.R | 80 +++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 src/joint_embedding/methods/umap/config.vsh.yaml create mode 100644 src/joint_embedding/methods/umap/script.R diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml new file mode 100644 index 0000000000..f8b27fdaec --- /dev/null +++ b/src/joint_embedding/methods/umap/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: umam + namespace: joint_embedding/methods + version: dev + description: UMAP dimensionality reduction on the Euclidean distance. + info: + type: method + label: UMAP + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to output. 
+ - name: "--metric" + type: "string" + default: "euclidean" + description: The metric to use. Possible values are euclidean, cosine, manhattan. + - name: "--n_neighbors" + type: "integer" + default: 15 + description: Number of neighbor to use int he KNN. + - name: "--n_pcs" + type: "integer" + default: 50 + description: Number of principal components to use in the PCA step. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + setup: + - type: r + packages: [ uwot, irlba, proxyC ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/methods/umap/script.R b/src/joint_embedding/methods/umap/script.R new file mode 100644 index 0000000000..2f654be245 --- /dev/null +++ b/src/joint_embedding/methods/umap/script.R @@ -0,0 +1,80 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 10L, + n_neighbors = 15L, + metric = "euclidean", + n_pcs = 50L, + hvg_sel = 100L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing PCA\n") +X_pca <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = 100 +)$x + +cat("Performing UMap\n") +dr <- uwot::umap( + X_pca, + n_components = par$n_dims, + n_neighbors = par$n_neighbors, + metric = par$metric, + n_threads = n_cores, + nn_method = "annoy" +) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( + X_emb = as(dr, "CsparseMatrix") + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 7cc07bfb2f9f6687e59ad5f1c37291037cd68eb7 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 16 Dec 2022 14:48:20 
+0100 Subject: [PATCH 13/82] add metric ari --- src/joint_embedding/api/anndata_score.yaml | 25 +++++ src/joint_embedding/api/comp_metric.yaml | 93 +++++++++++++++++++ .../metrics/ari/config.vsh.yaml | 13 +++ src/joint_embedding/metrics/ari/script.py | 58 ++++++++++++ 4 files changed, 189 insertions(+) create mode 100644 src/joint_embedding/api/anndata_score.yaml create mode 100644 src/joint_embedding/api/comp_metric.yaml create mode 100644 src/joint_embedding/metrics/ari/config.vsh.yaml create mode 100644 src/joint_embedding/metrics/ari/script.py diff --git a/src/joint_embedding/api/anndata_score.yaml b/src/joint_embedding/api/anndata_score.yaml new file mode 100644 index 0000000000..bfe79f07cc --- /dev/null +++ b/src/joint_embedding/api/anndata_score.yaml @@ -0,0 +1,25 @@ +type: file +description: "Metric score file" +example: "output.h5ad" +info: + short_description: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." 
+ multiple: true + required: true diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml new file mode 100644 index 0000000000..a33b4268d7 --- /dev/null +++ b/src/joint_embedding/api/comp_metric.yaml @@ -0,0 +1,93 @@ +functionality: + arguments: + - name: --input_prediction + __merge__: anndata_prediction.yaml + - name: --input_solution + __merge__: anndata_solution.yaml + - name: --output + __merge__: anndata_score.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + + ## VIASH START + # This code block will be replaced by viash at runtime. + meta = { 'functionality_name': 'foo' } + meta_path = "src/joint_embedding/metrics/check_format/metric_meta_check_format.tsv" + ## VIASH END + + method_id = meta['functionality_name'] + command = "./" + method_id + + # define some filenames + testpar = { + "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + "output": "output.h5ad" + } + meta_path = resources_dir + '/metric_meta.tsv' + + print("> Running method") + out = subprocess.check_output([ + "./" + meta['functionality_name'], + "--input_prediction", testpar['input_prediction'], + "--input_solution", testpar['input_solution'], + "--output", testpar['output'] + ]).decode("utf-8") + + print("> Checking whether output files were created") + assert path.exists(testpar['output']) + + print("> Reading h5ad files") + input_prediction = ad.read_h5ad(testpar['input_prediction']) + input_solution = ad.read_h5ad(testpar['input_solution']) + output = ad.read_h5ad(testpar['output']) + + metric_meta = pd.read_csv( + meta_path, + delimiter="\t", + 
header=0, + dtype={ 'metric_id': str, 'metric_min': float, 'metric_max': float, 'metric_higherisbetter': bool } + ) + + print("> Checking contents of metric_meta.tsv") + assert 'metric_id' in metric_meta + assert 'metric_min' in metric_meta + assert 'metric_max' in metric_meta + assert 'metric_higherisbetter' in metric_meta + + print("> Checking .uns['dataset_id']") + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']") + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']") + assert 'metric_ids' in output.uns + assert set(output.uns['metric_ids']) == set(metric_meta.metric_id) + + print("> Checking .uns['metric_values']") + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if metric_value lies within the expected range + output_uns = pd.DataFrame({ + 'metric_id': output.uns['metric_ids'], + 'metric_value': output.uns['metric_values'] + }) + + scores = metric_meta.merge(output_uns, on="metric_id") + + assert all(scores.metric_value >= scores.metric_min) + assert all(scores.metric_value <= scores.metric_max) + + print("> Test succeeded!") diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml new file mode 100644 index 0000000000..12268747b4 --- /dev/null +++ b/src/joint_embedding/metrics/ari/config.vsh.yaml @@ -0,0 +1,13 @@ +__merge__: ../../api/comp_metrics.yaml +functionality: + name: ari + namespace: joint_embedding/metrics + description: Adjusted rand index (ARI) + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: mumichae/scib-base:1.0.0 + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/ari/script.py 
b/src/joint_embedding/metrics/ari/script.py new file mode 100644 index 0000000000..dc7c195f66 --- /dev/null +++ b/src/joint_embedding/metrics/ari/script.py @@ -0,0 +1,58 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ari.had", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.ari(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=["ari"], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') From caff25dc2a4b36f0fd91d752fbeb3647283846df Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 16 Dec 2022 14:48:52 +0100 Subject: [PATCH 14/82] update comp_metric --- src/joint_embedding/api/comp_metric.yaml | 37 ++++++++++-------------- 1 file changed, 16 insertions(+), 21 
deletions(-) diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml index a33b4268d7..1308ca03b8 100644 --- a/src/joint_embedding/api/comp_metric.yaml +++ b/src/joint_embedding/api/comp_metric.yaml @@ -7,6 +7,9 @@ functionality: - name: --output __merge__: anndata_score.yaml direction: output + - name: --debug + type: boolean_true + description: Verbose output for debugging. test_resources: - path: ../../../../resources_test - type: python_script @@ -17,30 +20,22 @@ functionality: import anndata as ad import pandas as pd - ## VIASH START - # This code block will be replaced by viash at runtime. - meta = { 'functionality_name': 'foo' } - meta_path = "src/joint_embedding/metrics/check_format/metric_meta_check_format.tsv" - ## VIASH END - - method_id = meta['functionality_name'] - command = "./" + method_id - # define some filenames - testpar = { - "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", - "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", - "output": "output.h5ad" - } + + input_prediction_path = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution_path = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output_path = "output.h5ad" meta_path = resources_dir + '/metric_meta.tsv' - print("> Running method") - out = subprocess.check_output([ - "./" + meta['functionality_name'], - "--input_prediction", testpar['input_prediction'], - "--input_solution", testpar['input_solution'], - "--output", testpar['output'] - ]).decode("utf-8") + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print(">> 
Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) print("> Checking whether output files were created") assert path.exists(testpar['output']) From f7e0e0b90073168b025f75e3bd3639d03676ae3c Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 16 Dec 2022 16:31:57 +0100 Subject: [PATCH 15/82] update ari metric --- src/joint_embedding/api/comp_metric.yaml | 120 +++++++++--------- .../metrics/ari/config.vsh.yaml | 15 ++- 2 files changed, 73 insertions(+), 62 deletions(-) diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml index 1308ca03b8..813d865dc2 100644 --- a/src/joint_embedding/api/comp_metric.yaml +++ b/src/joint_embedding/api/comp_metric.yaml @@ -10,79 +10,79 @@ functionality: - name: --debug type: boolean_true description: Verbose output for debugging. - test_resources: - - path: ../../../../resources_test - - type: python_script - path: generic_test.py - text: | - from os import path - import subprocess - import anndata as ad - import pandas as pd + # test_resources: + # - path: ../../../../resources_test + # - type: python_script + # path: generic_test.py + # text: | + # from os import path + # import subprocess + # import anndata as ad + # import pandas as pd - # define some filenames + # # define some filenames - input_prediction_path = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", - input_solution_path = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", - output_path = "output.h5ad" - meta_path = resources_dir + '/metric_meta.tsv' + # input_prediction_path = "resources_test/common/joint_embedding/prediction.h5ad", + # input_solution_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.solution.h5ad", + # output_path = "output.h5ad" + # meta_path = resources_dir + '/metric_meta.tsv' - cmd = [ - 
meta['executable'], - "--input_prediction", input_prediction_path, - "--input_solution", input_solution_path, - "--output", output_path - ] + # cmd = [ + # meta['executable'], + # "--input_prediction", input_prediction_path, + # "--input_solution", input_solution_path, + # "--output", output_path + # ] - print(">> Running script as test") - out = subprocess.run(cmd, check=True, capture_output=True, text=True) + # print(">> Running script as test") + # out = subprocess.run(cmd, check=True, capture_output=True, text=True) - print("> Checking whether output files were created") - assert path.exists(testpar['output']) + # print("> Checking whether output files were created") + # assert path.exists(testpar['output']) - print("> Reading h5ad files") - input_prediction = ad.read_h5ad(testpar['input_prediction']) - input_solution = ad.read_h5ad(testpar['input_solution']) - output = ad.read_h5ad(testpar['output']) + # print("> Reading h5ad files") + # input_prediction = ad.read_h5ad(testpar['input_prediction']) + # input_solution = ad.read_h5ad(testpar['input_solution']) + # output = ad.read_h5ad(testpar['output']) - metric_meta = pd.read_csv( - meta_path, - delimiter="\t", - header=0, - dtype={ 'metric_id': str, 'metric_min': float, 'metric_max': float, 'metric_higherisbetter': bool } - ) + # metric_meta = pd.read_csv( + # meta_path, + # delimiter="\t", + # header=0, + # dtype={ 'metric_id': str, 'metric_min': float, 'metric_max': float, 'metric_higherisbetter': bool } + # ) - print("> Checking contents of metric_meta.tsv") - assert 'metric_id' in metric_meta - assert 'metric_min' in metric_meta - assert 'metric_max' in metric_meta - assert 'metric_higherisbetter' in metric_meta + # print("> Checking contents of metric_meta.tsv") + # assert 'metric_id' in metric_meta + # assert 'metric_min' in metric_meta + # assert 'metric_max' in metric_meta + # assert 'metric_higherisbetter' in metric_meta - print("> Checking .uns['dataset_id']") - assert 'dataset_id' in output.uns - 
assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + # print("> Checking .uns['dataset_id']") + # assert 'dataset_id' in output.uns + # assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] - print("> Checking .uns['method_id']") - assert 'method_id' in output.uns - assert output.uns['method_id'] == input_prediction.uns['method_id'] + # print("> Checking .uns['method_id']") + # assert 'method_id' in output.uns + # assert output.uns['method_id'] == input_prediction.uns['method_id'] - print("> Checking .uns['metric_ids']") - assert 'metric_ids' in output.uns - assert set(output.uns['metric_ids']) == set(metric_meta.metric_id) + # print("> Checking .uns['metric_ids']") + # assert 'metric_ids' in output.uns + # assert set(output.uns['metric_ids']) == set(metric_meta.metric_id) - print("> Checking .uns['metric_values']") - assert 'metric_values' in output.uns - assert output.uns['metric_ids'].size == output.uns['metric_values'].size + # print("> Checking .uns['metric_values']") + # assert 'metric_values' in output.uns + # assert output.uns['metric_ids'].size == output.uns['metric_values'].size - # merge with metric_meta to see if metric_value lies within the expected range - output_uns = pd.DataFrame({ - 'metric_id': output.uns['metric_ids'], - 'metric_value': output.uns['metric_values'] - }) + # # merge with metric_meta to see if metric_value lies within the expected range + # output_uns = pd.DataFrame({ + # 'metric_id': output.uns['metric_ids'], + # 'metric_value': output.uns['metric_values'] + # }) - scores = metric_meta.merge(output_uns, on="metric_id") + # scores = metric_meta.merge(output_uns, on="metric_id") - assert all(scores.metric_value >= scores.metric_min) - assert all(scores.metric_value <= scores.metric_max) + # assert all(scores.metric_value >= scores.metric_min) + # assert all(scores.metric_value <= scores.metric_max) - print("> Test succeeded!") + # print("> Test succeeded!") diff --git 
a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml index 12268747b4..066a0bfe98 100644 --- a/src/joint_embedding/metrics/ari/config.vsh.yaml +++ b/src/joint_embedding/metrics/ari/config.vsh.yaml @@ -1,13 +1,24 @@ -__merge__: ../../api/comp_metrics.yaml +__merge__: ../../api/comp_metric.yaml functionality: name: ari namespace: joint_embedding/metrics description: Adjusted rand index (ARI) + info: + metrics: + - id: ari + label: ari + description: Adjusted rand index (ARI) + min: 0 + max: 1 + maximize: true resources: - type: python_script path: script.py platforms: - type: docker - image: mumichae/scib-base:1.0.0 + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib, scanpy] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] From 22c7f464f7e5a83c93f44098269f31658f71e97f Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 16 Dec 2022 16:40:46 +0100 Subject: [PATCH 16/82] add asw_batch metric --- .../metrics/asw_batch/config.vsh.yaml | 24 ++++++++ .../metrics/asw_batch/script.py | 55 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 src/joint_embedding/metrics/asw_batch/config.vsh.yaml create mode 100644 src/joint_embedding/metrics/asw_batch/script.py diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml new file mode 100644 index 0000000000..b0758506a5 --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_batch + namespace: joint_embedding/metrics + description: Average silhouette width (ASW) of batches per label + info: + metrics: + - id: asw_batch + label: asw_batch + description: Average silhouette width (ASW) of batches per label + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + 
pip: [ anndata>=0.8, scanpy, scib] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_batch/script.py b/src/joint_embedding/metrics/asw_batch/script.py new file mode 100644 index 0000000000..d111a71d0c --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/script.py @@ -0,0 +1,55 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_batch.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette_batch( + adata, + batch_key='batch', + group_key='cell_type', + embed='X_emb', + verbose=False +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_batch'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') From d7e03de572219ce00b24179b12823fb50b24db4f Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 16 Dec 2022 16:53:18 +0100 Subject: [PATCH 17/82] add 
asw_label metric --- .../metrics/asw_label/config.vsh.yaml | 24 +++++++++ .../metrics/asw_label/script.py | 49 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 src/joint_embedding/metrics/asw_label/config.vsh.yaml create mode 100644 src/joint_embedding/metrics/asw_label/script.py diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml new file mode 100644 index 0000000000..bee00b60d2 --- /dev/null +++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_label + namespace: joint_embedding_metrics + description: Average silhouette width (ASW) of labels + info: + metrics: + - id: asw_label + label: asw_label + description: Average silhouette width (ASW) of labels + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_label/script.py b/src/joint_embedding/metrics/asw_label/script.py new file mode 100644 index 0000000000..4d04092004 --- /dev/null +++ b/src/joint_embedding/metrics/asw_label/script.py @@ -0,0 +1,49 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_batch.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = 
par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette(adata, group_key='cell_type', embed='X_emb') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_label'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + From 1b47472afdd66614ad1a897c30491c3ebba82364 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 16 Dec 2022 21:59:31 +0100 Subject: [PATCH 18/82] add cc_cons metric --- .../metrics/asw_batch/script.py | 12 ++--- .../metrics/asw_label/config.vsh.yaml | 2 +- .../metrics/cc_cons/config.vsh.yaml | 24 +++++++++ src/joint_embedding/metrics/cc_cons/script.py | 54 +++++++++++++++++++ 4 files changed, 85 insertions(+), 7 deletions(-) create mode 100644 src/joint_embedding/metrics/cc_cons/config.vsh.yaml create mode 100644 src/joint_embedding/metrics/cc_cons/script.py diff --git a/src/joint_embedding/metrics/asw_batch/script.py b/src/joint_embedding/metrics/asw_batch/script.py index d111a71d0c..88e89c21a8 100644 --- a/src/joint_embedding/metrics/asw_batch/script.py +++ b/src/joint_embedding/metrics/asw_batch/script.py @@ -5,10 +5,10 @@ ## VIASH START par = dict( - input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", - input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", - 
output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_batch.tsv", - debug=True + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False ) ## VIASH END @@ -21,11 +21,11 @@ output = par['output'] print("Read prediction anndata") -adata = ad.read(input_prediction) +adata = ad.read_h5ad(input_prediction) dataset_id = adata.uns['dataset_id'] print("Read solution anndata") -adata_solution = ad.read(input_solution) +adata_solution = ad.read_h5ad(input_solution) print('Transfer obs annotations') adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml index bee00b60d2..9b1f331cb2 100644 --- a/src/joint_embedding/metrics/asw_label/config.vsh.yaml +++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml @@ -1,7 +1,7 @@ __merge__: ../../api/comp_metric.yaml functionality: name: asw_label - namespace: joint_embedding_metrics + namespace: joint_embedding/metrics description: Average silhouette width (ASW) of labels info: metrics: diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml new file mode 100644 index 0000000000..dc8b3ab1c9 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: cc_cons + namespace: joint_embedding/metrics + description: Cell cycle conservation score + info: + metrics: + - id: cc_cons + label: cc_cons + description: Cell cycle conservation score + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: 
[anndata>=0.8, scib] + - type: nextflow + directives: [ vhighmem, midtime, midcpu ] diff --git a/src/joint_embedding/metrics/cc_cons/script.py b/src/joint_embedding/metrics/cc_cons/script.py new file mode 100644 index 0000000000..a8741fb511 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/script.py @@ -0,0 +1,54 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False +) +## VIASH END + + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) +organism = adata_solution.uns['organism'] + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +recompute_cc = 'S_score' not in adata_solution.obs_keys() or \ + 'G2M_score' not in adata_solution.obs_keys() + +print('Compute score') +score = scib.me.cell_cycle( + adata_pre=adata_solution, + adata_post=adata, + batch_key='batch', + embed='X_emb', + recompute_cc=recompute_cc, + organism=organism +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns= { + "dataset_id":adata.uns['dataset_id'], + "method_id":adata.uns['method_id'], + "metric_ids":['cc_cons'], + "metric_values":[score], + } +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') From ea82ca5b3e1618453082bfe2b977ab2931c60f7d Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 4 Jan 2023 14:34:30 +0100 Subject: [PATCH 19/82] remove DI docker because of old anndata package --- 
src/joint_embedding/mask_dataset/config.vsh.yaml | 9 ++++++++- src/joint_embedding/mask_dataset/script.R | 2 +- src/joint_embedding/mask_dataset/test.R | 2 +- src/joint_embedding/methods/lmds/config.vsh.yaml | 9 +++++++-- src/joint_embedding/methods/lmds/script.R | 2 +- 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml index 1f54f2df62..d49464163d 100644 --- a/src/joint_embedding/mask_dataset/config.vsh.yaml +++ b/src/joint_embedding/mask_dataset/config.vsh.yaml @@ -18,6 +18,13 @@ functionality: - path: ../../../resources_test platforms: - type: docker - image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse , testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] - type: nextflow directives: [ midmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R index 2d70248e69..d417775314 100644 --- a/src/joint_embedding/mask_dataset/script.R +++ b/src/joint_embedding/mask_dataset/script.R @@ -1,7 +1,7 @@ cat("Loading dependencies\n") options(tidyverse.quiet = TRUE) library(tidyverse) -requireNamespace("anndata", quietly = TRUE) +library(anndata, warn.conflicts = FALSE) library(assertthat, quietly = TRUE, warn.conflicts = FALSE) library(Matrix, quietly = TRUE, warn.conflicts = FALSE) diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R index ad8cc55eec..88930e00d2 100644 --- a/src/joint_embedding/mask_dataset/test.R +++ b/src/joint_embedding/mask_dataset/test.R @@ -1,5 +1,5 @@ library(testthat, quietly = TRUE, warn.conflicts = FALSE) -requireNamespace("anndata", quietly = TRUE) +library(anndata, warn.conflicts = FALSE) par <- list( input_mod1 = 
"resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad", diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml index b9e096a6d1..a0a3ef40af 100644 --- a/src/joint_embedding/methods/lmds/config.vsh.yaml +++ b/src/joint_embedding/methods/lmds/config.vsh.yaml @@ -20,9 +20,14 @@ functionality: path: script.R platforms: - type: docker - image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + image: eddelbuettel/r2u:22.04 setup: - type: r - packages: [ lmds ] + cran: [ anndata, lmds, tidyverse ] + test_setup: + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/methods/lmds/script.R b/src/joint_embedding/methods/lmds/script.R index 92700691e0..7dda2a3c4c 100644 --- a/src/joint_embedding/methods/lmds/script.R +++ b/src/joint_embedding/methods/lmds/script.R @@ -1,7 +1,7 @@ cat("Loading dependencies\n") options(tidyverse.quiet = TRUE) library(tidyverse) -requireNamespace("anndata", quietly = TRUE) +library(anndata, warn.conflicts = FALSE) library(Matrix, warn.conflicts = FALSE, quietly = TRUE) ## VIASH START From 16ce7765320a8d6eb571dd4b24b8063239cd03be Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 4 Jan 2023 14:34:45 +0100 Subject: [PATCH 20/82] add check_format metric --- .../metrics/check_format/config.vsh.yaml | 34 +++++++++++ .../metrics/check_format/script.R | 61 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 src/joint_embedding/metrics/check_format/config.vsh.yaml create mode 100644 src/joint_embedding/metrics/check_format/script.R diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml new file mode 100644 index 0000000000..1fae80d3aa --- /dev/null +++ 
b/src/joint_embedding/metrics/check_format/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: check_format + namespace: joint_embedding/metrics + description: Checking whether the prediction of a method has the right format. + info: + metrics: + - id: finished + label: finished + description: + min: 0 + max: 1 + maximize: true + - id: correct_format + label: correct_format + description: + min: 0 + max: 1 + maximize: true + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/check_format/script.R b/src/joint_embedding/metrics/check_format/script.R new file mode 100644 index 0000000000..afe8ed10fd --- /dev/null +++ b/src/joint_embedding/metrics/check_format/script.R @@ -0,0 +1,61 @@ +cat("Load dependencies\n") +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) + +## VIASH START +task <- "joint_embedding" +par <- list( + input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad"), + input_prediction = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"), + output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad") +) +## VIASH END + +cat("Read prediction h5ad\n") +ad_sol <- read_h5ad(par$input_solution) + +cat("Checking solution h5ad\n") +correct_format <- tryCatch({ + # read prediction + ad_pred <- read_h5ad(par$input_prediction) + + # check dataset id + dataset_id <- ad_pred$uns[["dataset_id"]] + 
assert_that(dataset_id == ad_sol$uns[["dataset_id"]]) + + # check method id + method_id <- ad_pred$uns[["method_id"]] + assert_that( + is.character(method_id), + method_id != "" + ) + + # check X + assert_that( + ad_pred$n_obs == ad_sol$n_obs, + ad_pred$n_vars >= 1, + ad_pred$n_vars <= 100, + !is.null(ad_pred$obs_names), + all(ad_pred$obs_names == ad_sol$obs_names) + ) + + 1 +}, error = function(e) { + cat("ERROR: ", e$message, "\n", sep = "") + 0 +}) + + +cat("Create output object\n") +out <- AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = c("finished", "correct_format"), + metric_values = c(1, correct_format) + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 4bce62c0544e8718a4763ca13e977dd62588d356 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 4 Jan 2023 14:55:47 +0100 Subject: [PATCH 21/82] add graph connectivity metric --- src/joint_embedding/api/authors.yaml | 6 ++- .../graph_connectivity/config.vsh.yaml | 24 +++++++++ .../metrics/graph_connectivity/script.py | 53 +++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml create mode 100644 src/joint_embedding/metrics/graph_connectivity/script.py diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml index 70f8fc3141..fbac6221d1 100644 --- a/src/joint_embedding/api/authors.yaml +++ b/src/joint_embedding/api/authors.yaml @@ -13,4 +13,8 @@ functionality: - name: Christopher Lance email: clance.connect@gmail.com roles: [ author, maintainer ] - props: { github: xlancelottx } \ No newline at end of file + props: { github: xlancelottx } + - name: Michaela Mueller + email: mumichae@in.tum.de + roles: [ author, maintainer ] + props: { github: mumichae, orcid: "0000-0002-1401-1785" } \ No newline at end of file diff --git 
a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml new file mode 100644 index 0000000000..a575757ba6 --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: graph_connectivity + namespace: joint_embedding/metrics + description: Graph connectivity + info: + metrics: + - id: graph_conn + label: graph_conn + description: Graph connectivity + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/graph_connectivity/script.py b/src/joint_embedding/metrics/graph_connectivity/script.py new file mode 100644 index 0000000000..ab9089d74a --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/script.py @@ -0,0 +1,53 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.graph_conn.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = 
adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute score') +score = scib.me.graph_connectivity(adata, label_key='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns = { + 'dataset_id':adata.uns['dataset_id'], + 'method_id':adata.uns['method_id'], + 'metric_ids':['graph_conn'], + 'metric_values':[score] + } +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') From bdbdbfda1bbcad1e5ac40c86442184014ad2d4a0 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 4 Jan 2023 15:54:14 +0100 Subject: [PATCH 22/82] add latent mixing metric --- src/joint_embedding/api/authors.yaml | 6 +- .../metrics/latent_mixing/config.vsh.yaml | 28 ++++++ .../metrics/latent_mixing/script.py | 92 +++++++++++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 src/joint_embedding/metrics/latent_mixing/config.vsh.yaml create mode 100644 src/joint_embedding/metrics/latent_mixing/script.py diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml index fbac6221d1..f2e96c35fd 100644 --- a/src/joint_embedding/api/authors.yaml +++ b/src/joint_embedding/api/authors.yaml @@ -17,4 +17,8 @@ functionality: - name: Michaela Mueller email: mumichae@in.tum.de roles: [ author, maintainer ] - props: { github: mumichae, orcid: "0000-0002-1401-1785" } \ No newline at end of file + props: { github: mumichae, orcid: "0000-0002-1401-1785" } + - name: Ann Chen + email: ann.chen@czbiohub.org + roles: [ author, maintainer ] + props: { github: atchen} \ No newline at end of file diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml new file mode 100644 index 0000000000..b65e2f8c66 --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml @@ -0,0 
+1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: latent_mixing + namespace: joint_embedding/metrics + description: Calculate latent mixing metric for joint embedding task. + info: + metrics: + - id: latent_mixing + label: Latent mixing + min: -1 + max: 0 + maximize: True + arguments: + - name: "--n_neighbors" + type: integer + default: 100 + description: Number of neighbors for the entropy_batch_mixing metric. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scikit-learn, scipy] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/latent_mixing/script.py b/src/joint_embedding/metrics/latent_mixing/script.py new file mode 100644 index 0000000000..511e7f8d2a --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/script.py @@ -0,0 +1,92 @@ +import anndata as ad +import numpy as np +import scipy +from sklearn.neighbors import NearestNeighbors + +# VIASH START +par = { + "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + "output": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores_totalvi.h5ad", + "n_neighbors": 100 +} +# VIASH END + +print("Read input files") +predict_adata = ad.read_h5ad(par["input_prediction"]) +solution_adata = ad.read_h5ad(par["input_solution"]) + +print("Merge prediction with solution") +merged_adata = predict_adata.copy() + +batch_val = solution_adata.obs["batch"].astype(str) +batch_unique_values, batch_index = np.unique(batch_val, return_inverse=True) + +merged_adata.obs["batch"] = batch_index + +def entropy_batch_mixing( + latent_space, batches, 
n_neighbors=50, n_pools=50, n_samples_per_pool=100 +): + + def neg_kl(hist_data, global_freq): + n_batches = len(np.unique(hist_data)) + if n_batches > 2: + raise ValueError("Should be only two clusters for this metric") + frequency = np.mean(hist_data == 1) + if frequency == 0 or frequency == 1: + return 0 + return -( + frequency * np.log(frequency / global_freq) + + (1 - frequency) * np.log((1 - frequency) / (1 - global_freq)) + ) + + n_neighbors = min(n_neighbors, latent_space.getnnz() - 1) + nne = NearestNeighbors(n_neighbors=1 + n_neighbors, n_jobs=8) + nne.fit(latent_space) + kmatrix = nne.kneighbors_graph(latent_space) - scipy.sparse.identity( + latent_space.shape[0] + ) + + global_freq = np.mean(batches) + print(global_freq) + score = 0 + for t in range(n_pools): + indices = np.random.choice( + np.arange(latent_space.shape[0]), size=n_samples_per_pool + ) + score += np.mean( + [ + neg_kl( + batches[ # the batches of cell i's neighbors + kmatrix[indices].nonzero()[ + 1 + ][ # the neighbors of cell i (columns in row i) + kmatrix[indices].nonzero()[0] == i # the row of cell i + ] + ], + global_freq, + ) + for i in range(n_samples_per_pool) + ] + ) + return score / float(n_pools) + + +print("Calculate latent mixing metric") +latent_mixing = entropy_batch_mixing( + latent_space=merged_adata.obsm['X_emb'], + batches=merged_adata.obs["batch"].values, + n_neighbors=par["n_neighbors"] +) + +print("Write output") +adata_out = ad.AnnData( + uns = { + "dataset_id": predict_adata.uns["dataset_id"], + "method_id" : predict_adata.uns["method_id"], + "metric_ids" : ["latent_mixing"], + "metric_values" : [latent_mixing] + } +) + +adata_out.write_h5ad(par['output'], compression = "gzip") \ No newline at end of file From 5457a6c3f195ec0bf70db0dbf689544834d3915b Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 4 Jan 2023 16:19:12 +0100 Subject: [PATCH 23/82] add nmi metric --- .../metrics/nmi/config.vsh.yaml | 24 +++++++ src/joint_embedding/metrics/nmi/script.py | 64 
+++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 src/joint_embedding/metrics/nmi/config.vsh.yaml create mode 100644 src/joint_embedding/metrics/nmi/script.py diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml new file mode 100644 index 0000000000..9f507916a4 --- /dev/null +++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: nmi + namespace: joint_embedding/metrics + description: Normalised mutual information (NMI) + info: + metrics: + - id: nmi + label: NMI + description: Normalised mutual information (NMI) + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/nmi/script.py b/src/joint_embedding/metrics/nmi/script.py new file mode 100644 index 0000000000..73ef901bd7 --- /dev/null +++ b/src/joint_embedding/metrics/nmi/script.py @@ -0,0 +1,64 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.nmi.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") 
+adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.nmi(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['nmi'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file From 6d50fc49b95bc5df9be17e2f21c9a79e1cf57b4d Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 4 Jan 2023 16:58:39 +0100 Subject: [PATCH 24/82] add rfoob metric --- .../metrics/rfoob/config.vsh.yaml | 46 +++++++++++++ src/joint_embedding/metrics/rfoob/script.R | 68 +++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 src/joint_embedding/metrics/rfoob/config.vsh.yaml create mode 100644 src/joint_embedding/metrics/rfoob/script.R diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml new file mode 100644 index 0000000000..700eb49716 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml @@ -0,0 +1,46 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: rfoob + namespace: joint_embedding/metrics + description: Calculating basic metrics for the joint embedding task. 
+ info: + metrics: + - id: rfoob_celltype_accuracy + label: rfoob_celltype_accuracy + description: + min: 0 + max: 1 + maximize: True + - id: rfoob_pseudotimegex_rsq + label: rfoob_pseudotimegex_rsq + description: + min: -1 + max: 1 + maximize: true + - id: rfoob_pseudotimeadt_rsq + label: rfoob_pseudotimeadt_rsq + description: + min: -1 + max: 1 + maximize: True + - id: rfoob_batch_error + label: rfoob_batch_error + description: + min: 0 + max: 1 + maximize: True + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git ] + - type: python + pip: [ anndata>=0.8 ] + - type: r + cran: [ anndata, ranger, tidyverse, testthat] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/rfoob/script.R b/src/joint_embedding/metrics/rfoob/script.R new file mode 100644 index 0000000000..e237192fe3 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/script.R @@ -0,0 +1,68 @@ +cat("Load dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_solution = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + input_prediction = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + output = "scores.h5ad" +) +## VIASH END + +cat("Read solution h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution) + +cat("Read prediction h5ad\n") +expect_true( + grepl("\\.h5ad$", par$input_prediction), + info = "Prediction file should be an h5ad file" +) +ad_pred <- + tryCatch({ + anndata::read_h5ad(par$input_prediction) + }, error = function(e) { + stop(paste0("Can't 
open prediction h5ad file. Detailed error message:\n", e$message)) + }) +expect_true( + ad_sol$uns$dataset_id == ad_pred$uns$dataset_id +) + +cat("Calculating metrics\n") +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_CELL_TYPE = ad_sol$obs[["cell_type"]]) +rf1 <- ranger::ranger(SOLUTION_CELL_TYPE ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs$pseudotime_order_GEX) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf2 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +colname <- colnames(ad_sol$obs)[grepl("pseudotime_order_A.*", colnames(ad_sol$obs))] +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs[[colname]]) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf3 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_BATCH = ad_sol$obs$batch) +rf4 <- ranger::ranger(SOLUTION_BATCH ~ ., df) + +metric_values <- c( + rfoob_celltype_accuracy = 1 - rf1$prediction.error, + rfoob_pseudotimegex_rsq = rf2$r.squared, + rfoob_pseudotimeadt_rsq = rf3$r.squared, + rfoob_batch_error = rf4$prediction.error +) + +cat("Create output object\n") +out <- anndata::AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = names(metric_values), + metric_values = metric_values + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 82ae20eedc4160221af634f81433532acd166c29 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 4 Jan 2023 17:06:08 +0100 Subject: [PATCH 25/82] add ti_cons metric --- .../metrics/ti_cons/config.vsh.yaml | 36 +++++++++ src/joint_embedding/metrics/ti_cons/script.py | 81 +++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 src/joint_embedding/metrics/ti_cons/config.vsh.yaml create mode 100644 
src/joint_embedding/metrics/ti_cons/script.py diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml new file mode 100644 index 0000000000..e260a94010 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml @@ -0,0 +1,36 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons + namespace: joint_embedding/metrics + description: Trajectory inference conservation score + info: + metrics: + - id: ti_cons_RNA + label: ti_cons_RNA + description: + min: 0 + max: 1 + maximize: True + - id: ti_cons_ADT_ATAC + label: ti_cons_ADT_ATAC + description: + min: 0 + max: 1 + maximize: True + - id: ti_cons_mean + label: ti_cons_mean + description: + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + - type: nextflow + directives: [ midmem, lowtime, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons/script.py b/src/joint_embedding/metrics/ti_cons/script.py new file mode 100644 index 0000000000..1d04067024 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons/script.py @@ -0,0 +1,81 @@ + + +print('Importing libraries') +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons' + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = 
par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_RNA', 'ti_cons_ADT_ATAC', 'ti_cons_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file From acfb631ba0859ae1d8cf3bfb14c986dda3a1a59d Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 4 Jan 2023 17:10:08 +0100 Subject: [PATCH 26/82] add ti_cons_batch metric --- .../metrics/ti_cons_batch/config.vsh.yaml | 36 ++++++++ .../metrics/ti_cons_batch/script.py | 87 +++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml create mode 100644 
src/joint_embedding/metrics/ti_cons_batch/script.py diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml new file mode 100644 index 0000000000..515366b45b --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml @@ -0,0 +1,36 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons_batch + namespace: joint_embedding/metrics + description: Trajectory inference conservation score per batch + info: + metrics: + - id: ti_cons_batch_RNA + label: ti_cons_batch_RNA + description: + min: 0 + max: 1 + maximize: True + - id: ti_cons_batch_ADT_ATAC + label: ti_cons_batch_ADT_ATAC + description: + min: 0 + max: 1 + maximize: True + - id: ti_cons_batch_mean + label: ti_cons_batch_mean + description: + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + - type: nextflow + directives: [ midmem, lowtime, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons_batch/script.py b/src/joint_embedding/metrics/ti_cons_batch/script.py new file mode 100644 index 0000000000..5206b59610 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/script.py @@ -0,0 +1,87 @@ +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons_batch' + +input_prediction = 
par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_batch_RNA', 'ti_cons_batch_ADT_ATAC', 'ti_cons_batch_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + +# # store score as tsv +# with open(output, 'w') as file: +# header = ['dataset', 'output_type', 'metric', 'value'] +# entry = [dataset_id, OUTPUT_TYPE, METRIC, score] +# file.write('\t'.join(header) + '\n') +# file.write('\t'.join([str(x) for x in entry])) From 71ae0e9430a570ba0ad7077dc8819b7a5df41e99 Mon Sep 17 00:00:00 
2001 From: KaiWaldrant Date: Thu, 5 Jan 2023 14:45:18 +0100 Subject: [PATCH 27/82] add metric unit test --- src/joint_embedding/api/comp_metric.yaml | 161 +++++++++--------- .../methods/lmds/config.vsh.yaml | 1 - .../metrics/ari/config.vsh.yaml | 3 + .../metrics/asw_batch/config.vsh.yaml | 3 + .../metrics/asw_label/config.vsh.yaml | 3 + .../metrics/cc_cons/config.vsh.yaml | 3 + .../metrics/check_format/config.vsh.yaml | 3 + .../graph_connectivity/config.vsh.yaml | 3 + .../metrics/latent_mixing/config.vsh.yaml | 3 + .../metrics/nmi/config.vsh.yaml | 3 + .../metrics/rfoob/config.vsh.yaml | 3 + .../metrics/ti_cons/config.vsh.yaml | 3 + .../metrics/ti_cons_batch/config.vsh.yaml | 3 + 13 files changed, 118 insertions(+), 77 deletions(-) diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml index 813d865dc2..bde43a316f 100644 --- a/src/joint_embedding/api/comp_metric.yaml +++ b/src/joint_embedding/api/comp_metric.yaml @@ -10,79 +10,88 @@ functionality: - name: --debug type: boolean_true description: Verbose output for debugging. 
- # test_resources: - # - path: ../../../../resources_test - # - type: python_script - # path: generic_test.py - # text: | - # from os import path - # import subprocess - # import anndata as ad - # import pandas as pd - - # # define some filenames - - # input_prediction_path = "resources_test/common/joint_embedding/prediction.h5ad", - # input_solution_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.solution.h5ad", - # output_path = "output.h5ad" - # meta_path = resources_dir + '/metric_meta.tsv' - - # cmd = [ - # meta['executable'], - # "--input_prediction", input_prediction_path, - # "--input_solution", input_solution_path, - # "--output", output_path - # ] - - # print(">> Running script as test") - # out = subprocess.run(cmd, check=True, capture_output=True, text=True) - - # print("> Checking whether output files were created") - # assert path.exists(testpar['output']) - - # print("> Reading h5ad files") - # input_prediction = ad.read_h5ad(testpar['input_prediction']) - # input_solution = ad.read_h5ad(testpar['input_solution']) - # output = ad.read_h5ad(testpar['output']) - - # metric_meta = pd.read_csv( - # meta_path, - # delimiter="\t", - # header=0, - # dtype={ 'metric_id': str, 'metric_min': float, 'metric_max': float, 'metric_higherisbetter': bool } - # ) - - # print("> Checking contents of metric_meta.tsv") - # assert 'metric_id' in metric_meta - # assert 'metric_min' in metric_meta - # assert 'metric_max' in metric_meta - # assert 'metric_higherisbetter' in metric_meta - - # print("> Checking .uns['dataset_id']") - # assert 'dataset_id' in output.uns - # assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] - - # print("> Checking .uns['method_id']") - # assert 'method_id' in output.uns - # assert output.uns['method_id'] == input_prediction.uns['method_id'] - - # print("> Checking .uns['metric_ids']") - # assert 'metric_ids' in output.uns - # assert set(output.uns['metric_ids']) == set(metric_meta.metric_id) 
- - # print("> Checking .uns['metric_values']") - # assert 'metric_values' in output.uns - # assert output.uns['metric_ids'].size == output.uns['metric_values'].size - - # # merge with metric_meta to see if metric_value lies within the expected range - # output_uns = pd.DataFrame({ - # 'metric_id': output.uns['metric_ids'], - # 'metric_value': output.uns['metric_values'] - # }) - - # scores = metric_meta.merge(output_uns, on="metric_id") - - # assert all(scores.metric_value >= scores.metric_min) - # assert all(scores.metric_value <= scores.metric_max) - - # print("> Test succeeded!") + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + import yaml + + ## VIASH START + # This code block will be replaced by viash at runtime. + par = { + "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + "output": "output.h5ad" + } + meta = { 'functionality_name': 'foo' } + + ## VIASH END + + input_prediction_path = "resources_test/common/joint_embedding/cite_random_prediction.h5ad" + input_solution_path = "resources_test/common/joint_embedding/cite_solution.h5ad" + output_path = "output.h5ad" + # define some filenames + with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print("> Running method", flush=True) + out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout + + print("> Checking whether output files were created", flush=True) + assert path.exists(output_path) + + print("> Reading h5ad files", flush=True) + 
input_prediction = ad.read_h5ad(input_prediction_path) + input_solution = ad.read_h5ad(input_solution_path) + output = ad.read_h5ad(output_path) + + # Create DF from metric config info + metric_info = config['functionality']['info']['metrics'] + metric_meta = pd.DataFrame(metric_info) + metric_meta = metric_meta.astype({'id': str, 'label': str, 'description': str, 'min': float, 'max': float, 'maximize': bool}) + print("> Checking contents of metric info", flush=True) + assert 'id' in metric_meta + assert 'min' in metric_meta + assert 'max' in metric_meta + assert 'maximize' in metric_meta + + print("> Checking .uns['dataset_id']", flush=True) + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']", flush=True) + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']", flush=True) + assert 'metric_ids' in output.uns + assert set(output.uns['metric_ids']) == set(metric_meta.id) + + print("> Checking .uns['metric_values']", flush=True) + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if metric_value lies within the expected range + output_uns = pd.DataFrame({ + 'id': output.uns['metric_ids'], + 'value': output.uns['metric_values'] + }) + + scores = metric_meta.merge(output_uns, on="id") + + assert all(scores.value >= scores['min']) + assert all(scores.value <= scores['max']) + + print("> Test succeeded!", flush=True) diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml index a0a3ef40af..d01b922fd7 100644 --- a/src/joint_embedding/methods/lmds/config.vsh.yaml +++ b/src/joint_embedding/methods/lmds/config.vsh.yaml @@ -24,7 +24,6 @@ platforms: setup: - type: r cran: [ anndata, lmds, tidyverse ] - test_setup: - type: apt packages: [ 
libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] - type: python diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml index 066a0bfe98..5097080a70 100644 --- a/src/joint_embedding/metrics/ari/config.vsh.yaml +++ b/src/joint_embedding/metrics/ari/config.vsh.yaml @@ -20,5 +20,8 @@ platforms: setup: - type: python pip: [anndata>=0.8, scib, scanpy] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml index b0758506a5..24ab2503b5 100644 --- a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml +++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml @@ -20,5 +20,8 @@ platforms: setup: - type: python pip: [ anndata>=0.8, scanpy, scib] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml index 9b1f331cb2..74d7c24afe 100644 --- a/src/joint_embedding/metrics/asw_label/config.vsh.yaml +++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml @@ -20,5 +20,8 @@ platforms: setup: - type: python pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml index dc8b3ab1c9..86a624c632 100644 --- a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml +++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml @@ -20,5 +20,8 @@ platforms: setup: - type: python pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ vhighmem, midtime, midcpu ] diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml 
b/src/joint_embedding/metrics/check_format/config.vsh.yaml index 1fae80d3aa..934af734e3 100644 --- a/src/joint_embedding/metrics/check_format/config.vsh.yaml +++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml @@ -30,5 +30,8 @@ platforms: packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] - type: python pip: [ anndata>=0.8 ] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml index a575757ba6..8cbde10e3a 100644 --- a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml +++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml @@ -20,5 +20,8 @@ platforms: setup: - type: python pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml index b65e2f8c66..2ef4049453 100644 --- a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml +++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml @@ -24,5 +24,8 @@ platforms: setup: - type: python pip: [anndata>=0.8, scikit-learn, scipy] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml index 9f507916a4..bf98b925de 100644 --- a/src/joint_embedding/metrics/nmi/config.vsh.yaml +++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml @@ -20,5 +20,8 @@ platforms: setup: - type: python pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git 
a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml index 700eb49716..5f080b8b08 100644 --- a/src/joint_embedding/metrics/rfoob/config.vsh.yaml +++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml @@ -42,5 +42,8 @@ platforms: pip: [ anndata>=0.8 ] - type: r cran: [ anndata, ranger, tidyverse, testthat] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml index e260a94010..e7d70b24b6 100644 --- a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml +++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml @@ -32,5 +32,8 @@ platforms: setup: - type: python pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ midmem, lowtime, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml index 515366b45b..43aed327f6 100644 --- a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml +++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml @@ -32,5 +32,8 @@ platforms: setup: - type: python pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] - type: nextflow directives: [ midmem, lowtime, midcpu ] From ed38c115bc0ab522959b33aa3866fe3d2feb9834 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 5 Jan 2023 21:24:41 +0100 Subject: [PATCH 28/82] add task_info.yaml --- src/joint_embedding/docs/task_info.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 src/joint_embedding/docs/task_info.yaml diff --git a/src/joint_embedding/docs/task_info.yaml b/src/joint_embedding/docs/task_info.yaml new file mode 100644 index 0000000000..6565c0c292 --- /dev/null +++ b/src/joint_embedding/docs/task_info.yaml @@ -0,0 +1,16 @@ +task_id: joint_embedding 
+task_name: Joint Embedding +v1_url: neurips2021_multimodal_viash/src/joint_embedding/readme.md +v1_commit: 0f8eae583444ba3f71c3083b860cc34b9ecb2fa2 +short_description: Learning of an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation). +description: | + The functioning of organs, tissues, and whole organisms is determined by the interplay of cells. + Cells are characterised into broad types, which in turn can take on different states. Here, a cell + state is made up of the sum of all processes that are occurring within the cell. We can gain insight + into the state of a cell by different types of measurements: e.g., RNA expression, protein abundance, + or chromatin conformation. Combining this information to describe cellular heterogeneity requires the + formation of joint embeddings generated from this multimodal data. These embeddings must account for + and remove possible batch effects between different measurement batches. The reward for methods that + can achieve this is great: a highly resolved description of the underlying biological state of a cell + that determines its function, how it interacts with other cells, and thus the cell’s role in the + functioning of the whole tissue.
\ No newline at end of file From b6d5bbdcfb5c7aed7ad3858df47006876f6edb3b Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 6 Jan 2023 21:37:57 +0100 Subject: [PATCH 29/82] create NF workflow --- .../methods/mnn/config.vsh.yaml | 8 +- .../methods/newwave/config.vsh.yaml | 8 +- .../methods/pca/config.vsh.yaml | 8 +- src/joint_embedding/methods/totalvi/script.py | 10 +- .../methods/umap/config.vsh.yaml | 10 +- .../resources_scripts/mask_datasets.sh | 64 ++++++++ .../resources_scripts/run_benchmarks.sh | 74 +++++++++ .../resources_test_scripts/bmmc_cite.sh | 57 +++++++ .../workflows/run/config.vsh.yaml | 26 +++ src/joint_embedding/workflows/run/main.nf | 152 ++++++++++++++++++ .../workflows/run/nextflow.config | 14 ++ 11 files changed, 419 insertions(+), 12 deletions(-) create mode 100644 src/joint_embedding/resources_scripts/mask_datasets.sh create mode 100644 src/joint_embedding/resources_scripts/run_benchmarks.sh create mode 100644 src/joint_embedding/resources_test_scripts/bmmc_cite.sh create mode 100644 src/joint_embedding/workflows/run/config.vsh.yaml create mode 100644 src/joint_embedding/workflows/run/main.nf create mode 100644 src/joint_embedding/workflows/run/nextflow.config diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml index 8759ade4e0..486beb9301 100644 --- a/src/joint_embedding/methods/mnn/config.vsh.yaml +++ b/src/joint_embedding/methods/mnn/config.vsh.yaml @@ -16,9 +16,15 @@ functionality: path: script.R platforms: - type: docker - image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + image: eddelbuettel/r2u:22.04 setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] - type: r bioc: [ SingleCellExperiment, batchelor, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git 
a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml index c327dcfa8c..2da0df3a4b 100644 --- a/src/joint_embedding/methods/newwave/config.vsh.yaml +++ b/src/joint_embedding/methods/newwave/config.vsh.yaml @@ -25,9 +25,15 @@ functionality: path: script.R platforms: - type: docker - image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + image: eddelbuettel/r2u:22.04 setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] - type: r bioc: [ SingleCellExperiment, NewWave, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] - type: nextflow directives: [ highmem, hightime, highcpu ] diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml index cb4d24642f..967de6da6e 100644 --- a/src/joint_embedding/methods/pca/config.vsh.yaml +++ b/src/joint_embedding/methods/pca/config.vsh.yaml @@ -21,9 +21,13 @@ functionality: path: script.R platforms: - type: docker - image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + image: eddelbuettel/r2u:22.04 setup: - type: r - packages: [ irlba, proxyC ] + cran: [ anndata, lmds, tidyverse, bioconductor, irlba, proxyC] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py index b47400816d..0c22dee08f 100644 --- a/src/joint_embedding/methods/totalvi/script.py +++ b/src/joint_embedding/methods/totalvi/script.py @@ -17,12 +17,12 @@ } ## VIASH END -print("Load and prepare data") +print("Load and prepare data", flush=True) adata_mod1 = anndata.read_h5ad(par['input_mod1']) adata_mod2 = anndata.read_h5ad(par['input_mod2']) adata_mod1.obsm['protein_expression'] = adata_mod2.X.toarray() 
-print('Select highly variable genes') +print('Select highly variable genes', flush=True) sc.pp.highly_variable_genes( adata_mod1, n_top_genes=par['hvg_number'], @@ -31,18 +31,18 @@ subset=True ) -print("Set up model") +print("Set up model", flush=True) TOTALVI.setup_anndata( adata_mod1, batch_key="batch", protein_expression_obsm_key="protein_expression" ) -print('Train totalVI with', par['max_epochs'], 'epochs') +print('Train totalVI with', par['max_epochs'], 'epochs', flush=True) vae = TOTALVI(adata_mod1, latent_distribution="normal") vae.train(max_epochs = par['max_epochs']) -print("Postprocessing and saving output") +print("Postprocessing and saving output", flush=True) adata_out = anndata.AnnData( X=vae.get_latent_representation(), obs=adata_mod1.obs[['batch']], diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml index f8b27fdaec..4b10222d1c 100644 --- a/src/joint_embedding/methods/umap/config.vsh.yaml +++ b/src/joint_embedding/methods/umap/config.vsh.yaml @@ -1,6 +1,6 @@ __merge__: ../../api/comp_method.yaml functionality: - name: umam + name: umap namespace: joint_embedding/methods version: dev description: UMAP dimensionality reduction on the Euclidean distance. 
@@ -33,9 +33,13 @@ functionality: path: script.R platforms: - type: docker - image: dataintuitive/randpy:r4.0_py3.8_bioc3.12 + image: eddelbuettel/r2u:22.04 setup: - type: r - packages: [ uwot, irlba, proxyC ] + cran: [ anndata, lmds, tidyverse, irlba, proxyC, uwot] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] - type: nextflow directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/joint_embedding/resources_scripts/mask_datasets.sh b/src/joint_embedding/resources_scripts/mask_datasets.sh new file mode 100644 index 0000000000..dfb1295fb7 --- /dev/null +++ b/src/joint_embedding/resources_scripts/mask_datasets.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +COMMON_DATASETS="resources/datasets/openproblems_v1" +OUTPUT_DIR="resources/joint_embedding/datasets/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! -f $params_file ]; then + python << HERE +import anndata as ad +import glob +import yaml + +h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad") + +# this task doesn't use normalizations +# +param_list = {} + +for h5ad_file in h5ad_files: + print(f"Checking {h5ad_file}") + adata = ad.read_h5ad(h5ad_file, backed=True) + if "counts" in adata.layers: + dataset_id = adata.uns["dataset_id"].replace("/", ".") + obj = { + 'id': dataset_id, + 'input': h5ad_file, + 'dataset_id': dataset_id, + } + param_list[dataset_id] = obj + +output = { + "param_list": list(param_list.values()), + "seed": 123, + "output_train": "\$id.train.h5ad", + "output_test": "\$id.test.h5ad" +} + +with open("$params_file", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . 
\ + -main-script target/nextflow/denoising/split_dataset/main.nf \ + -profile docker \ + -resume \ + -params-file $params_file \ + --publish_dir "$OUTPUT_DIR" + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_scripts/run_benchmarks.sh b/src/joint_embedding/resources_scripts/run_benchmarks.sh new file mode 100644 index 0000000000..01d6ef92a5 --- /dev/null +++ b/src/joint_embedding/resources_scripts/run_benchmarks.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources/denoising/datasets/openproblems_v1" +OUTPUT_DIR="resources/denoising/benchmarks/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! -f $params_file ]; then + python << HERE +import yaml +import os + +dataset_dir = "$DATASETS_DIR" +output_dir = "$OUTPUT_DIR" + +# read split datasets yaml +with open(dataset_dir + "/params.yaml", "r") as file: + split_list = yaml.safe_load(file) +datasets = split_list['param_list'] + +# figure out where train/test files were stored +param_list = [] + +for dataset in datasets: + id = dataset["id"] + input_train = dataset_dir + "/" + id + ".train.h5ad" + input_test = dataset_dir + "/" + id + ".test.h5ad" + + if os.path.exists(input_test): + obj = { + 'id': id, + 'id': id, + 'id': id, + 'dataset_id': dataset["dataset_id"], + 'input_train': input_train, + 'input_test': input_test + } + param_list.append(obj) + +# write as output file +output = { + "param_list": param_list, +} + +with open(output_dir + "/params.yaml", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . 
\ + -main-script src/denoising/workflows/run/main.nf \ + -profile docker \ + -params-file "$params_file" \ + --publish_dir "$OUTPUT_DIR" \ + -with-tower + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_test_scripts/bmmc_cite.sh b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh new file mode 100644 index 0000000000..72967ef27b --- /dev/null +++ b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +#make sure the following command has been executed +#bin/viash_build -q 'denoising|common' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad +MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad +DATASET_DIR=resources_test/joint_embedding/bmmc_cite + +if [ ! -f $MOD_1_DATA ]; then + echo "Error! 
Could not find raw data" + exit 1 +fi + +mkdir -p $DATASET_DIR + +# split dataset +bin/viash run src/joint_embedding/mask_dataset/config.vsh.yaml -- \ + --input_mod1 $MOD_1_DATA \ + --input_mod2 $MOD_2_DATA \ + --output_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --output_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output_solution $DATASET_DIR/cite_solution.h5ad + +# run one method +bin/viash run src/joint_embedding/methods/pca/config.vsh.yaml -- \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output $DATASET_DIR/pca.h5ad + +# run one metric +bin/viash run src/joint_embedding/metrics/ari/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/pca.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output $DATASET_DIR/ari.h5ad + +# run benchmark +export NXF_VER=22.04.5 + +bin/nextflow \ + run . \ + -main-script src/joint_embedding/workflows/run/main.nf \ + -profile docker \ + -resume \ + --id bmmc_cite \ + --dataset_id bmmc_site \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output scores.tsv \ + --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/config.vsh.yaml b/src/joint_embedding/workflows/run/config.vsh.yaml new file mode 100644 index 0000000000..5f3e7800cf --- /dev/null +++ b/src/joint_embedding/workflows/run/config.vsh.yaml @@ -0,0 +1,26 @@ +functionality: + name: "run_benchmark" + namespace: "joint_embedding/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: "string" + description: "The ID of the dataset" + required: true + - name: "--input_mod1" + type: "file" # todo: replace with includes + - name: "--input_mod2" + type: "file" # todo: replace with includes + - name: "--input_solution" + type: "file" # todo: replace with includes + - name: Outputs + arguments: + - name: "--output" + direction: "output" + type: file + 
resources: + - type: nextflow_script + path: main.nf +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/main.nf b/src/joint_embedding/workflows/run/main.nf new file mode 100644 index 0000000000..45ac7ad76d --- /dev/null +++ b/src/joint_embedding/workflows/run/main.nf @@ -0,0 +1,152 @@ +nextflow.enable.dsl=2 + +sourceDir = params.rootDir + "/src" +targetDir = params.rootDir + "/target/nextflow" + +// import control methods +include { random_embed } from "$targetDir/joint_embedding/control_methods/random_embed/main.nf" +include { zeros_embed } from "$targetDir/joint_embedding/control_methods/zeros_embed/main.nf" + +// import methods +include { lmds } from "$targetDir/joint_embedding/methods/lmds/main.nf" +include { mnn } from "$targetDir/joint_embedding/methods/mnn/main.nf" +include { newwave } from "$targetDir/joint_embedding/methods/newwave/main.nf" +include { pca } from "$targetDir/joint_embedding/methods/pca/main.nf" +include { totalvi } from "$targetDir/joint_embedding/methods/totalvi/main.nf" +include { umap } from "$targetDir/joint_embedding/methods/umap/main.nf" + +// import metrics +include { ari } from "$targetDir/joint_embedding/metrics/ari/main.nf" +include { asw_batch } from "$targetDir/joint_embedding/metrics/asw_batch/main.nf" +include { asw_label } from "$targetDir/joint_embedding/metrics/asw_label/main.nf" +include { cc_cons } from "$targetDir/joint_embedding/metrics/cc_cons/main.nf" +include { check_format } from "$targetDir/joint_embedding/metrics/check_format/main.nf" +include { graph_connectivity } from "$targetDir/joint_embedding/metrics/graph_connectivity/main.nf" +include { latent_mixing } from "$targetDir/joint_embedding/metrics/latent_mixing/main.nf" +include { nmi } from "$targetDir/joint_embedding/metrics/nmi/main.nf" +include { rfoob } from "$targetDir/joint_embedding/metrics/rfoob/main.nf" +include { ti_cons } from "$targetDir/joint_embedding/metrics/ti_cons/main.nf" +include { 
ti_cons_batch } from "$targetDir/joint_embedding/metrics/ti_cons_batch/main.nf" + +// tsv generation component +include { extract_scores } from "$targetDir/common/extract_scores/main.nf" + +// import helper functions +include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf" +include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + "/wf_utils/DataflowHelper.nf" + +config = readConfig("$projectDir/config.vsh.yaml") + +// construct a map of methods (id -> method_module) +methods = [ lmds, mnn, newwave, pca, totalvi, umap] + .collectEntries{method -> + [method.config.functionality.name, method] + } + +workflow { + helpMessage(config) + + viashChannel(params, config) + | run_wf +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // split params for downstream components + | setWorkflowArguments( + method: ["input_mod1", "input_mod2"], + metric: ["input_solution"], + output: ["output"] + ) + + // multiply events by the number of method + | add_methods + + // run methods + | getWorkflowArguments(key: "method") + | run_methods + + // construct tuples for metrics + | pmap{ id, file, passthrough -> + // derive unique ids from output filenames + def newId = file.getName().replaceAll(".output.*", "") + // combine prediction with solution + def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ] + [ newId, newData, passthrough ] + } + + // run metrics + | getWorkflowArguments(key: "metric") + | run_metrics + + // convert to tsv + | aggregate_results + + emit: + output_ch +} + +workflow add_methods { + take: input_ch + main: + output_ch = Channel.fromList(methods.keySet()) + | combine(input_ch) + + // generate combined id for method_id and dataset_id + | pmap{method_id, dataset_id, data -> + def new_id = dataset_id + "." 
+ method_id + def new_data = data.clone() + [method_id: method_id] + new_data.remove("id") + [new_id, new_data] + } + emit: output_ch +} + +workflow run_methods { + take: input_ch + main: + // generate one channel per method + method_chs = methods.collect { method_id, method_module -> + input_ch + | filter{it[1].method_id == method_id} + | method_module + } + // mix all results + output_ch = method_chs[0].mix(*method_chs.drop(1)) + + emit: output_ch +} + +workflow run_metrics { + take: input_ch + main: + + output_ch = input_ch + | (ari & asw_batch & asw_label & cc_cons & check_format & graph_connectivity & latent_mixing & nmi & rfoob & ti_cons & ti_cons_batch) + | mix + + emit: output_ch +} + +workflow aggregate_results { + take: input_ch + main: + + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ it -> + [ "combined", it.collect{ it[1] } ] + it[0].drop(2) + } + | getWorkflowArguments(key: "output") + | extract_scores.run( + auto: [ publish: true ] + ) + + emit: output_ch +} \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/nextflow.config b/src/joint_embedding/workflows/run/nextflow.config new file mode 100644 index 0000000000..ea674ffa07 --- /dev/null +++ b/src/joint_embedding/workflows/run/nextflow.config @@ -0,0 +1,14 @@ +manifest { + name = 'joint_embedding/workflows/run' + mainScript = 'main.nf' + nextflowVersion = '!>=22.04.5' + description = 'Multi modality - joint embedding' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config") +includeConfig("${params.rootDir}/src/wf_utils/labels.config") \ No newline at end of file From 99b05244058a33b2db8beb617e03c9e773e7c834 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 6 Jan 2023 21:52:18 +0100 Subject: [PATCH 30/82] update changelog --- CHANGELOG.md | 63 
++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a55a57a969..8a50fd2171 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -163,3 +163,66 @@ * Higher dimensional data used to obtain the metrics is calculated from test data instead of the whole dataset. So far test and train data contain the same counts values, but this may change eventually. * Test data is used instead of the whole dataset in control (baseline) methods. + + +## Multi modality - Joint Embedding + +### New functinality + +* `api/anndata_*`: Created a file format specifications for the h5ad files throughout the pipeline. + +* `api/comp_*`: Created an api definition for the mask, method and metric components. + +* `mask_dataset`: Added a component for masking raw datasets into task-ready dataset objects. + +* `resources_test/joint_embedding/pancreas` with `src/joint_embedding/resources_test_scripts/pancreas.sh`. + +### neurips 2021 migration + +* `control_methods/random_embed`: Migrated from v1. Extracted from baseline method `dummy_random`. + +* `control_methods/zeros_embed`: Migrated from v1. Extracted from baseline method `dummy_zeros`. + +* `methods/lmds`: Migrated from v1. + +* `methods/mnn`: Migrated and adapted from v1. + +* `methods/newwave`: Migrated and adapted from v1. + +* `methods/pca`: Migrated from v1. + +* `methods/totalvi`: Migrated from v1. + +* `methods/umap`: Migrated from v1. + +* `metrics/ari`: Migrated from v1. + +* `metrics/asw_batch`: Migrated from v1. + +* `metrics/asw_label`: Migrated from v1. + +* `metrics/cc_cons`: Migrated from v1. + +* `metrics/check_format`: Migrated from v1. + +* `metrics/graph_connectivity`: Migrated from v1. + +* `metrics/latent_mixing`: Migrated from v1. + +* `metrics/nmi`: Migrated from v1. + +* `metrics/rfoob`: Migrated from v1. + +* `metrics/ti_cons`: Migrated from v1. + +* `metrics/ti_cons_batch`: Migrated from v1. 
+ +### changes from neurips 2021 + +* Updated docker config from R script. Was using an old `anndata` package which was giving warnings + +* stores the output from the methods in `.obsm["X_emb"]` instead of `.X` in the `anndata` + +* `X_emb ` data is stored as a `Sparse Matrix` + +* updated configs to latest `viash` \ No newline at end of file From c8ae6017be1b3ca558ccdd237f81c159635edff2 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 6 Jan 2023 21:55:25 +0100 Subject: [PATCH 31/82] update changelog --- CHANGELOG.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a50fd2171..8c075e9139 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -179,43 +179,43 @@ ### neurips 2021 migration -* `control_methods/random_embed`: Migrated from v1. Extracted from baseline method `dummy_random`. +* `control_methods/random_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_random`. -* `control_methods/zeros_embed`: Migrated from v1. Extracted from baseline method `dummy_zeros`. +* `control_methods/zeros_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_zeros`. -* `methods/lmds`: Migrated from v1. +* `methods/lmds`: Migrated from neurips 2021. -* `methods/mnn`: Migrated and adapted from v1. +* `methods/mnn`: Migrated and adapted from neurips 2021. -* `methods/newwave`: Migrated and adapted from v1. +* `methods/newwave`: Migrated and adapted from neurips 2021. -* `methods/pca`: Migrated from v1. +* `methods/pca`: Migrated from neurips 2021. -* `methods/totalvi`: Migrated from v1. +* `methods/totalvi`: Migrated from neurips 2021. -* `methods/umap`: Migrated from v1. +* `methods/umap`: Migrated from neurips 2021. -* `metrics/ari`: Migrated from v1. +* `metrics/ari`: Migrated from neurips 2021. -* `metrics/asw_batch`: Migrated from v1. +* `metrics/asw_batch`: Migrated from neurips 2021. -* `metrics/asw_label`: Migrated from v1. 
+* `metrics/asw_label`: Migrated from neurips 2021. -* `metrics/cc_cons`: Migrated from v1. +* `metrics/cc_cons`: Migrated from neurips 2021. -* `metrics/check_format`: Migrated from v1. +* `metrics/check_format`: Migrated from neurips 2021. -* `metrics/graph_connectivity`: Migrated from v1. +* `metrics/graph_connectivity`: Migrated from neurips 2021. -* `metrics/latent_mixing`: Migrated from v1. +* `metrics/latent_mixing`: Migrated from neurips 2021. -* `metrics/nmi`: Migrated from v1. +* `metrics/nmi`: Migrated from neurips 2021. -* `metrics/rfoob`: Migrated from v1. +* `metrics/rfoob`: Migrated from neurips 2021. -* `metrics/ti_cons`: Migrated from v1. +* `metrics/ti_cons`: Migrated from neurips 2021. -* `metrics/ti_cons_batch`: Migrated from v1. +* `metrics/ti_cons_batch`: Migrated from neurips 2021. ### changes from neurips 2021 From 10f75d487bd0107686afd301029143621c3801ee Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 6 Jan 2023 21:55:47 +0100 Subject: [PATCH 32/82] fix typo in changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c075e9139..ae98ed9906 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -167,7 +167,7 @@ ## Multi modality - Joint Embedding -### New functinality +### New functionality * `api/anndata_*`: Created a file format specifications for the h5ad files throughout the pipeline. 
From e0aef20fc956722ce102d9535c2f323a489e7eb4 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 6 Jan 2023 21:56:22 +0100 Subject: [PATCH 33/82] fix typo in changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae98ed9906..9c63253290 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -223,6 +223,6 @@ * stores the output from the methods in `.obsm["X_emb"]` instead of `.X` in the `anndata` -* `X_emb ` data is stored as a `Sparse Matrix` +* `X_emb` data is stored as a `Sparse Matrix` * updated configs to latest `viash` \ No newline at end of file From 8327637dbf339375118edd644d27983bda4f2378 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Mon, 9 Jan 2023 10:39:08 +0100 Subject: [PATCH 34/82] convert sparse matrix to array --- src/joint_embedding/metrics/cc_cons/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/joint_embedding/metrics/cc_cons/script.py b/src/joint_embedding/metrics/cc_cons/script.py index a8741fb511..feecf45f12 100644 --- a/src/joint_embedding/metrics/cc_cons/script.py +++ b/src/joint_embedding/metrics/cc_cons/script.py @@ -30,6 +30,8 @@ 'G2M_score' not in adata_solution.obs_keys() print('Compute score') +adata.obsm['X_emb'] = adata.obsm['X_emb'].toarray() + score = scib.me.cell_cycle( adata_pre=adata_solution, adata_post=adata, From 1b2dd900cb0f13de52e5a038e0ee1adf354879cb Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Mon, 9 Jan 2023 12:38:01 +0100 Subject: [PATCH 35/82] use denormalized counts data --- src/joint_embedding/methods/totalvi/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py index 0c22dee08f..9b40e1f3d1 100644 --- a/src/joint_embedding/methods/totalvi/script.py +++ b/src/joint_embedding/methods/totalvi/script.py @@ -20,7 +20,7 @@ print("Load and prepare data", flush=True) adata_mod1 = anndata.read_h5ad(par['input_mod1']) 
adata_mod2 = anndata.read_h5ad(par['input_mod2']) -adata_mod1.obsm['protein_expression'] = adata_mod2.X.toarray() +adata_mod1.obsm['protein_expression'] = adata_mod2.layers["counts"].toarray() print('Select highly variable genes', flush=True) sc.pp.highly_variable_genes( From 3f02cbe6ea46c510fa743fe2aac7bc4277acfcf1 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 11:18:24 +0100 Subject: [PATCH 36/82] Add api yaml files --- src/match_modality/api/anndata_dataset.yaml | 52 +++++++++++++++++++ .../api/anndata_masked_mod.yaml | 46 ++++++++++++++++ .../api/anndata_masked_sol.yaml | 20 +++++++ src/match_modality/api/comp_mask_dataset.yaml | 40 ++++++++++++++ 4 files changed, 158 insertions(+) create mode 100644 src/match_modality/api/anndata_dataset.yaml create mode 100644 src/match_modality/api/anndata_masked_mod.yaml create mode 100644 src/match_modality/api/anndata_masked_sol.yaml create mode 100644 src/match_modality/api/comp_mask_dataset.yaml diff --git a/src/match_modality/api/anndata_dataset.yaml b/src/match_modality/api/anndata_dataset.yaml new file mode 100644 index 0000000000..3382dba2ff --- /dev/null +++ b/src/match_modality/api/anndata_dataset.yaml @@ -0,0 +1,52 @@ +type: file +description: An input h5ad dataset. +example: "dataset.h5ad" +info: + label: "Dataset" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors created by the normalisation method, if any. 
+ required: false + - type: boolean + name: is_train + description: if sample is train data + required: true + var: + - type: string + name: gene_ids + description: + required: false + - type: string + name: feature_types + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: "data from which organism " + required: false + - type: string + name: gene_activity_var_names + description: + required: true + obsm: + - type: double + name: gene_activity + description: + required: true + diff --git a/src/match_modality/api/anndata_masked_mod.yaml b/src/match_modality/api/anndata_masked_mod.yaml new file mode 100644 index 0000000000..31ef25d06e --- /dev/null +++ b/src/match_modality/api/anndata_masked_mod.yaml @@ -0,0 +1,46 @@ +type: file +description: "The masked data" +example: "masked.h5ad" +info: + short_description: "masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true + - type: string + name: gene_activity_var_names + description: + required: true + obsm: + - type: double + name: gene_activity + description: + required: true \ No newline at end of file diff --git a/src/match_modality/api/anndata_masked_sol.yaml b/src/match_modality/api/anndata_masked_sol.yaml new file mode 100644 index 0000000000..e486646e2f --- /dev/null +++ b/src/match_modality/api/anndata_masked_sol.yaml @@ -0,0 +1,20 @@ +type: file 
+description: "The masked solution data" +example: "masked.h5ad" +info: + short_description: "masked solution data" + slots: + obs: + - type: string + name: batch + description: Batch information + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: double + name: pairing_ix + description: number of samples + required: true diff --git a/src/match_modality/api/comp_mask_dataset.yaml b/src/match_modality/api/comp_mask_dataset.yaml new file mode 100644 index 0000000000..517b7f0f4d --- /dev/null +++ b/src/match_modality/api/comp_mask_dataset.yaml @@ -0,0 +1,40 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_dataset.yaml + - name: "--input_mod2" + __merge__: anndata_dataset.yaml + - name: "--output_train_mod1" + direction: "output" + example: "dataset_masked.h5ad" + description: "The masked shuffled train mod1 profiles." + __merge__: anndata_masked_mod.yaml + - name: "--output_train_mod2" + direction: "output" + example: "dataset_masked.h5ad" + description: "The masked shuffled train mod2 profiles." + __merge__: anndata_masked_mod.yaml + - name: "--output_train_sol" + direction: "output" + example: "dataset_solution.h5ad" + description: "The pairing of train mod1&mod2 profiles." + __merge__: anndata_masked_sol.yaml + - name: "--output_test_mod1" + direction: "output" + example: "dataset_masked.h5ad" + description: "The masked shuffled test mod1 profiles." + __merge__: anndata_masked_mod.yaml + - name: "--output_test_mod2" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_test_sol" + direction: "output" + __merge__: anndata_masked_sol.yaml + - name: "--knn" + type: "integer" + default: 20 + description: The KNN cutoff for computing nearest neighbors on the mod1 and mod2 profiles. + - name: "--seed" + type: "integer" + default: 1 + description: The seed. 
\ No newline at end of file From b64c9643af02c557fe9927de82d8c58e3070eb9d Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 11:18:55 +0100 Subject: [PATCH 37/82] add mask_dataset --- .../mask_dataset/config.vsh.yaml | 23 +++ src/match_modality/mask_dataset/script.R | 179 ++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 src/match_modality/mask_dataset/config.vsh.yaml create mode 100644 src/match_modality/mask_dataset/script.R diff --git a/src/match_modality/mask_dataset/config.vsh.yaml b/src/match_modality/mask_dataset/config.vsh.yaml new file mode 100644 index 0000000000..5dd18a45f9 --- /dev/null +++ b/src/match_modality/mask_dataset/config.vsh.yaml @@ -0,0 +1,23 @@ +_merge__: ../api/comp_mask_dataset.yaml +functionality: + name: mask_dataset + namespace: match_modality + description: | + Censor an existing dataset: obfuscate gene names, remove cell identities and + shuffle cells of modalities, for distribution to competitors. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse , testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + publish: true + directives: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/mask_dataset/script.R b/src/match_modality/mask_dataset/script.R new file mode 100644 index 0000000000..f41de56600 --- /dev/null +++ b/src/match_modality/mask_dataset/script.R @@ -0,0 +1,179 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +# input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+# input_path <- "output/datasets/common/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.manual_formatting." +# input_path <- "output/datasets/common/openproblems_bmmc_cite_phase1/openproblems_bmmc_cite_phase1.manual_formatting." +input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +output_path <- "output/multiome" +# output_path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset." +# output_path <- "output/datasets/match_modality/openproblems_bmmc_multiome_iid/openproblems_bmmc_multiome_iid.censor_dataset." +# dir.create(dirname(output_path), recursive = TRUE) + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_train_mod1 = paste0(output_path, "output_train_mod1.h5ad"), + output_train_mod2 = paste0(output_path, "output_train_mod2.h5ad"), + output_train_sol = paste0(output_path, "output_train_sol.h5ad"), + output_test_mod1 = paste0(output_path, "output_test_mod1.h5ad"), + output_test_mod2 = paste0(output_path, "output_test_mod2.h5ad"), + output_test_sol = paste0(output_path, "output_test_sol.h5ad"), + seed = 1L, + knn = 10L +) +## VIASH END + +set.seed(par$seed) + +subset_mats <- function(li, obs_filt, anonymize = FALSE) { + out <- list() + for (n in names(li)) { + mat <- li[[n]][obs_filt, , drop = FALSE] + if (anonymize) { + rownames(mat) <- paste0("cell_", seq_len(nrow(mat))) + } + out[[n]] <- mat + } + out +} + + +cat("Reading input data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_MM_", tolower(ad1_mod), "2", tolower(ad2_mod)) +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- 
list(dataset_id = new_dataset_id, organism = "human") +ad1_obsm <- list() +ad2_obsm <- list() + +if (ad1_mod == "ATAC") { + ad1_uns$gene_activity_var_names <- input_mod1$uns$gene_activity_var_names + ad1_obsm$gene_activity <- as(input_mod1$obsm$gene_activity, "CsparseMatrix") +} +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Shuffle train cells\n") +train_ix <- which(input_mod1$obs$is_train) %>% sort +train_mod2_ix <- sample.int(length(train_ix)) + +cat("Shuffle test cells\n") +test_ix <- which(!input_mod1$obs$is_train) %>% sort +test_mod2_ix <- sample.int(length(test_ix)) + +is_categorical <- function(x) is.character(x) || is.factor(x) +# relevel <- function(x) factor(as.character(x)) +relevel <- function(x) as.character(x) + +cat("Creating train objects\n") +mod1_var <- input_mod1$var %>% select(one_of("gene_ids", "feature_types")) +mod2_var <- input_mod2$var %>% select(one_of("gene_ids", "feature_types")) +train_obs1 <- input_mod1$obs[train_ix, , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +train_obs2 <- input_mod2$obs[train_ix[train_mod2_ix], , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +rownames(train_obs2) <- NULL +if (ncol(train_obs2) == 0) train_obs2 <- NULL +assert_that("size_factors" %in% colnames(train_obs1) != "size_factors" %in% colnames(train_obs2)) +assert_that(all(train_obs1$batch == train_obs2$batch[order(train_mod2_ix)])) + +output_train_mod1 <- anndata::AnnData( + X = input_mod1$X[train_ix, , drop = FALSE], + layers = subset_mats(input_mod1$layers, train_ix), + obsm = subset_mats(ad1_obsm, train_ix), + obs = train_obs1, + var = mod1_var, + uns = ad1_uns +) +output_train_mod2 <- anndata::AnnData( + X = input_mod2$X[train_ix[train_mod2_ix], , drop = FALSE] %>% + magrittr::set_rownames(., 
paste0("cell_", seq_len(nrow(.)))), + layers = subset_mats(input_mod2$layers, train_ix[train_mod2_ix], anonymize = TRUE), + obsm = subset_mats(ad2_obsm, train_ix[train_mod2_ix], anonymize = TRUE), + obs = train_obs2, + var = mod2_var, + uns = ad2_uns +) +assert_that(all(output_train_mod1$obs$batch == output_train_mod2$obs$batch[order(train_mod2_ix)])) + +cat("Create test objects\n") +test_obs1 <- input_mod1$obs[test_ix, , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +test_obs2 <- input_mod2$obs[test_ix[test_mod2_ix], , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +rownames(test_obs2) <- NULL +if (ncol(test_obs2) == 0) test_obs2 <- NULL +assert_that("size_factors" %in% colnames(train_obs1) != "size_factors" %in% colnames(train_obs2)) +assert_that(all(test_obs1$batch == test_obs2$batch[order(test_mod2_ix)])) + +output_test_mod1 <- anndata::AnnData( + X = input_mod1$X[test_ix, , drop = FALSE], + layers = subset_mats(input_mod1$layers, test_ix), + obsm = subset_mats(ad1_obsm, test_ix), + obs = test_obs1, + var = mod1_var, + uns = ad1_uns +) +output_test_mod2 <- anndata::AnnData( + X = input_mod2$X[test_ix[test_mod2_ix], , drop = FALSE] %>% + magrittr::set_rownames(., paste0("cell_", seq_len(nrow(.)))), + layers = subset_mats(input_mod2$layers, test_ix[test_mod2_ix], anonymize = TRUE), + obsm = subset_mats(ad2_obsm, test_ix[test_mod2_ix], anonymize = TRUE), + obs = test_obs2, + var = mod2_var, + uns = ad2_uns +) +assert_that(all(output_test_mod1$obs$batch == output_test_mod2$obs$batch[order(test_mod2_ix)])) + +cat("Create solution objects\n") + +train_sol_mat <- Matrix::sparseMatrix( + i = seq_along(train_mod2_ix), + j = order(train_mod2_ix), + x = rep(1, length(train_mod2_ix)) +) +output_train_sol <- anndata::AnnData( + X = train_sol_mat, + obs = input_mod1$obs[train_ix, , drop = FALSE] %>% select(one_of(c("batch"))) %>% mutate_if(is_categorical, relevel), + 
uns = list(dataset_id = new_dataset_id, pairing_ix = train_mod2_ix - 1) +) + +test_sol_mat <- Matrix::sparseMatrix( + i = seq_along(test_mod2_ix), + j = order(test_mod2_ix), + x = rep(1, length(test_mod2_ix)) +) +output_test_sol <- anndata::AnnData( + X = test_sol_mat, + obs = input_mod1$obs[test_ix, , drop = FALSE] %>% select(one_of(c("batch"))) %>% mutate_if(is_categorical, relevel), + uns = list(dataset_id = new_dataset_id, pairing_ix = test_mod2_ix - 1) +) + +# checks +# mean(rowSums(train_solknn > 0)) +# mean(rowSums(test_solknn > 0)) +# sum(train_solknn * train_sol_mat) == nrow(train_sol_mat) +# sum(test_solknn * test_sol_mat) == nrow(test_sol_mat) + +cat("Saving output files as h5ad\n") +zzz <- output_train_mod1$write_h5ad(par$output_train_mod1, compression = "gzip") +zzz <- output_train_mod2$write_h5ad(par$output_train_mod2, compression = "gzip") +zzz <- output_train_sol$write_h5ad(par$output_train_sol, compression = "gzip") +zzz <- output_test_mod1$write_h5ad(par$output_test_mod1, compression = "gzip") +zzz <- output_test_mod2$write_h5ad(par$output_test_mod2, compression = "gzip") +zzz <- output_test_sol$write_h5ad(par$output_test_sol, compression = "gzip") From 8bf8833f28888d9aac641cf748cf516bd7ed4c1b Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 15:08:11 +0100 Subject: [PATCH 38/82] add constant control method --- .../api/anndata_prediction.yaml | 15 +++++ src/match_modality/api/authors.yaml | 20 ++++++ .../api/comp_control_method.yaml | 63 +++++++++++++++++++ .../control_methods/constant/config.vsh.yaml | 27 ++++++++ .../control_methods/constant/script.R | 50 +++++++++++++++ 5 files changed, 175 insertions(+) create mode 100644 src/match_modality/api/anndata_prediction.yaml create mode 100644 src/match_modality/api/authors.yaml create mode 100644 src/match_modality/api/comp_control_method.yaml create mode 100644 src/match_modality/control_methods/constant/config.vsh.yaml create mode 100644 
src/match_modality/control_methods/constant/script.R diff --git a/src/match_modality/api/anndata_prediction.yaml b/src/match_modality/api/anndata_prediction.yaml new file mode 100644 index 0000000000..ca214d4963 --- /dev/null +++ b/src/match_modality/api/anndata_prediction.yaml @@ -0,0 +1,15 @@ +type: file +description: "The predicted pairing of test mod1&mod2 profiles." +example: "prediction.h5ad" +info: + short_description: "Prediction" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/match_modality/api/authors.yaml b/src/match_modality/api/authors.yaml new file mode 100644 index 0000000000..2dfeb9ca39 --- /dev/null +++ b/src/match_modality/api/authors.yaml @@ -0,0 +1,20 @@ +functionality: + authors: + - name: Robrecht Cannoodt + roles: [ author, contributor ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + - name: Kai Waldrant + roles: [ contributor ] + props: { github: KaiWaldrant } + - name: Michaela Mueller + email: mumichae@in.tum.de + roles: [ author, maintainer ] + props: { github: mumichae, orcid: "0000-0002-1401-1785" } + - name: Louise Deconinck + email: louise.deconinck@gmail.com + roles: [ author, maintainer ] + props: { github: LouiseDck, orcid: "" } + - name: Alejandro Granados + email: alejandro.granados@czbiohub.org + roles: [ author, maintainer ] + props: { github: agranado } \ No newline at end of file diff --git a/src/match_modality/api/comp_control_method.yaml b/src/match_modality/api/comp_control_method.yaml new file mode 100644 index 0000000000..a6969d1c79 --- /dev/null +++ b/src/match_modality/api/comp_control_method.yaml @@ -0,0 +1,63 @@ +functionality: + arguments: + - name: "--input_test_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--output" + direction: 
"output" + __merge__: anndata_prediction.yaml + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import numpy as np + from scipy.sparse import issparse + + # define some filenames + testpar = { + 'input_test_mod1': 'output/output_test_mod1.h5ad', + 'input_test_mod2': 'output/output_test_mod2.h5ad', + 'input_test_sol': 'output/output_test_sol.h5ad', + 'output': 'output/output_prediction.h5ad', + } + + print('> Running method', flush=True) + out = subprocess.run([ + meta['executable'], + '--input_test_mod1', testpar['input_test_mod1'], + '--input_test_mod2', testpar['input_test_mod2'], + '--output', testpar['output'] + ], + capture_output= True, + text= True + ).stderr + + print(out, flush=True) + + print('> Checking whether output files were created', flush=True) + assert path.exists(testpar['output']) + + print('> Reading h5ad files', flush=True) + ad_sol = ad.read_h5ad(testpar['input_test_sol']) + ad_pred = ad.read_h5ad(testpar['output']) + + print('> Checking dataset id', flush=True) + assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id'] + + print('> Checking method id', flush=True) + assert ad_pred.uns['method_id'] == meta['functionality_name'] + + print('> Checking X', flush=True) + assert issparse(ad_pred.X) + assert np.all([x >= 0 for x in ad_pred.X.nonzero()]), 'Values must be strictly non-negative.' + assert ad_pred.X.nonzero()[0].size <= 1000 * ad_sol.n_obs + assert ad_pred.n_obs == ad_sol.n_obs + assert ad_pred.n_vars == ad_sol.n_vars + assert np.isclose(ad_pred.X.sum(axis=1), 1, atol=1e-10).all(), 'All rows should sum to 1.' 
+ + print('> Test succeeded!', flush=True) diff --git a/src/match_modality/control_methods/constant/config.vsh.yaml b/src/match_modality/control_methods/constant/config.vsh.yaml new file mode 100644 index 0000000000..fd8c20d700 --- /dev/null +++ b/src/match_modality/control_methods/constant/config.vsh.yaml @@ -0,0 +1,27 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: constant + namespace: match_modality/control_methods + description: Returns constant weights between all mod1 profiles and the first 1000 mod2 profiles. + info: + type: negative_control + method_name: Constant + resources: + - type: r_script + path: script.R + # tests: + # - type: r_script + # path: ../../unit_tests/test_method.R + # - path: ../../../../resources_test +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/control_methods/constant/script.R b/src/match_modality/control_methods/constant/script.R new file mode 100644 index 0000000000..590d49f982 --- /dev/null +++ b/src/match_modality/control_methods/constant/script.R @@ -0,0 +1,50 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +par <- list( + input_test_mod1 = "output/output_test_mod1.h5ad", + input_test_mod2 = "output/output_test_mod2.h5ad", + output = "output/output_prediction.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + + +cat("Reading h5ad files\n") +# input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +# input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +# input_train_sol <- anndata::read_h5ad(par$input_train_sol) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1, backed 
= TRUE) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2, backed = TRUE) + +knn_df <- + expand.grid( + i = seq_len(nrow(input_test_mod1)), + j = seq_len(min(nrow(input_test_mod2), 1000)) + ) + +knn_mat <- + Matrix::sparseMatrix( + i = knn_df$i, + j = knn_df$j, + x = rep(1, nrow(knn_df)), + dims = list(nrow(input_test_mod1), nrow(input_test_mod2)) + ) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_test_mod1$uns[["dataset_id"]], + method_id = meta$functionality_name + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From db7ec21e9db16e2a9d4160a90b8555a0fd5ae833 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 15:33:25 +0100 Subject: [PATCH 39/82] add random_pairing control method --- src/match_modality/api/authors.yaml | 6 ++- .../api/comp_control_method.yaml | 1 + .../control_methods/constant/config.vsh.yaml | 4 -- .../random_pairing/config.vsh.yaml | 19 ++++++++ .../control_methods/random_pairing/script.py | 45 +++++++++++++++++++ 5 files changed, 70 insertions(+), 5 deletions(-) create mode 100644 src/match_modality/control_methods/random_pairing/config.vsh.yaml create mode 100644 src/match_modality/control_methods/random_pairing/script.py diff --git a/src/match_modality/api/authors.yaml b/src/match_modality/api/authors.yaml index 2dfeb9ca39..59c6ad211a 100644 --- a/src/match_modality/api/authors.yaml +++ b/src/match_modality/api/authors.yaml @@ -17,4 +17,8 @@ functionality: - name: Alejandro Granados email: alejandro.granados@czbiohub.org roles: [ author, maintainer ] - props: { github: agranado } \ No newline at end of file + props: { github: agranado } + - name: Alex Tong + email: alexandertongdev@gmail.com + roles: [ author, maintainer ] + props: { github: atong01 } \ No newline at end 
of file diff --git a/src/match_modality/api/comp_control_method.yaml b/src/match_modality/api/comp_control_method.yaml index a6969d1c79..1faa55f61e 100644 --- a/src/match_modality/api/comp_control_method.yaml +++ b/src/match_modality/api/comp_control_method.yaml @@ -33,6 +33,7 @@ functionality: '--input_test_mod2', testpar['input_test_mod2'], '--output', testpar['output'] ], + check=True, capture_output= True, text= True ).stderr diff --git a/src/match_modality/control_methods/constant/config.vsh.yaml b/src/match_modality/control_methods/constant/config.vsh.yaml index fd8c20d700..a674e4ecf0 100644 --- a/src/match_modality/control_methods/constant/config.vsh.yaml +++ b/src/match_modality/control_methods/constant/config.vsh.yaml @@ -9,10 +9,6 @@ functionality: resources: - type: r_script path: script.R - # tests: - # - type: r_script - # path: ../../unit_tests/test_method.R - # - path: ../../../../resources_test platforms: - type: docker image: eddelbuettel/r2u:22.04 diff --git a/src/match_modality/control_methods/random_pairing/config.vsh.yaml b/src/match_modality/control_methods/random_pairing/config.vsh.yaml new file mode 100644 index 0000000000..3218f32279 --- /dev/null +++ b/src/match_modality/control_methods/random_pairing/config.vsh.yaml @@ -0,0 +1,19 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: random_pairing + namespace: match_modality/control_methods + description: Generates random pairings weights drawn from a uniform distribution. 
+ info: + type: negative_control + method_name: Random Pairing + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, numpy, scikit-learn ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file diff --git a/src/match_modality/control_methods/random_pairing/script.py b/src/match_modality/control_methods/random_pairing/script.py new file mode 100644 index 0000000000..868b6ef31c --- /dev/null +++ b/src/match_modality/control_methods/random_pairing/script.py @@ -0,0 +1,45 @@ +import anndata as ad +import numpy as np +import scipy.sparse +from sklearn.preprocessing import normalize + +# VIASH START +par = { + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", +} + +meta = { + "functionality_name": "foo" +} +# VIASH END + +print("Load datasets") +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# determine number of values in array +num_values = min(1000, input_test_mod1.n_obs) * input_test_mod1.n_obs +indices = np.random.randint(input_test_mod1.n_obs**2, size=num_values) + +mat_x = np.random.rand(num_values) +mat_i = indices % input_test_mod1.n_obs +mat_j = (indices / input_test_mod1.n_obs).astype(int) +pairing_matrix = scipy.sparse.csr_matrix( + (mat_x, (mat_i, mat_j)), + shape=(input_test_mod1.n_obs, input_test_mod2.n_obs) +) + +# row normalise +prob_matrix = normalize(pairing_matrix, norm="l1") + +# Write out prediction +prediction = ad.AnnData( + X=prob_matrix, + uns={ + "method_id": meta["functionality_name"], + 
"dataset_id": input_test_mod1.uns["dataset_id"] + } +) +prediction.write_h5ad(par["output"]) From f850c10c65c5c15972a424b19a90a73d97ee5d6e Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 16:13:18 +0100 Subject: [PATCH 40/82] add semi_solution control method --- ..._masked_sol.yaml => anndata_solution.yaml} | 0 .../api/comp_control_method.yaml | 8 +++-- src/match_modality/api/comp_mask_dataset.yaml | 16 ++-------- .../semi_solution/config.vsh.yaml | 23 +++++++++++++++ .../control_methods/semi_solution/script.R | 29 +++++++++++++++++++ 5 files changed, 60 insertions(+), 16 deletions(-) rename src/match_modality/api/{anndata_masked_sol.yaml => anndata_solution.yaml} (100%) create mode 100644 src/match_modality/control_methods/semi_solution/config.vsh.yaml create mode 100644 src/match_modality/control_methods/semi_solution/script.R diff --git a/src/match_modality/api/anndata_masked_sol.yaml b/src/match_modality/api/anndata_solution.yaml similarity index 100% rename from src/match_modality/api/anndata_masked_sol.yaml rename to src/match_modality/api/anndata_solution.yaml diff --git a/src/match_modality/api/comp_control_method.yaml b/src/match_modality/api/comp_control_method.yaml index 1faa55f61e..8dff03a367 100644 --- a/src/match_modality/api/comp_control_method.yaml +++ b/src/match_modality/api/comp_control_method.yaml @@ -4,6 +4,8 @@ functionality: __merge__: anndata_masked_mod.yaml - name: "--input_test_mod2" __merge__: anndata_masked_mod.yaml + - name: "--input_test_sol" + __merge__: anndata_solution.yaml - name: "--output" direction: "output" __merge__: anndata_prediction.yaml @@ -31,14 +33,16 @@ functionality: meta['executable'], '--input_test_mod1', testpar['input_test_mod1'], '--input_test_mod2', testpar['input_test_mod2'], + '--input_test_sol', testpar['input_test_sol'], '--output', testpar['output'] ], check=True, capture_output= True, text= True - ).stderr + ) - print(out, flush=True) + # for troubleshooting: remove 'check=True' in 
subprocess.run above and comment out below print + # print(out.stderr, flush=True) print('> Checking whether output files were created', flush=True) assert path.exists(testpar['output']) diff --git a/src/match_modality/api/comp_mask_dataset.yaml b/src/match_modality/api/comp_mask_dataset.yaml index 517b7f0f4d..0737a499a0 100644 --- a/src/match_modality/api/comp_mask_dataset.yaml +++ b/src/match_modality/api/comp_mask_dataset.yaml @@ -5,31 +5,19 @@ functionality: - name: "--input_mod2" __merge__: anndata_dataset.yaml - name: "--output_train_mod1" - direction: "output" - example: "dataset_masked.h5ad" - description: "The masked shuffled train mod1 profiles." __merge__: anndata_masked_mod.yaml - name: "--output_train_mod2" - direction: "output" - example: "dataset_masked.h5ad" - description: "The masked shuffled train mod2 profiles." __merge__: anndata_masked_mod.yaml - name: "--output_train_sol" - direction: "output" - example: "dataset_solution.h5ad" - description: "The pairing of train mod1&mod2 profiles." - __merge__: anndata_masked_sol.yaml + __merge__: anndata_solution.yaml - name: "--output_test_mod1" - direction: "output" - example: "dataset_masked.h5ad" - description: "The masked shuffled test mod1 profiles." __merge__: anndata_masked_mod.yaml - name: "--output_test_mod2" direction: "output" __merge__: anndata_masked_mod.yaml - name: "--output_test_sol" direction: "output" - __merge__: anndata_masked_sol.yaml + __merge__: anndata_solution.yaml - name: "--knn" type: "integer" default: 20 diff --git a/src/match_modality/control_methods/semi_solution/config.vsh.yaml b/src/match_modality/control_methods/semi_solution/config.vsh.yaml new file mode 100644 index 0000000000..da39c07aab --- /dev/null +++ b/src/match_modality/control_methods/semi_solution/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: semi_solution + namespace: match_modality/control_methods + description: Returns the ground-truth pairing. 
+ info: + type: positive_control + method_name: Semi-solution + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/control_methods/semi_solution/script.R b/src/match_modality/control_methods/semi_solution/script.R new file mode 100644 index 0000000000..d36833b4b0 --- /dev/null +++ b/src/match_modality/control_methods/semi_solution/script.R @@ -0,0 +1,29 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_test_sol = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad", + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_test_sol <- anndata::read_h5ad(par$input_test_sol) + +# randomly fill in gold standard values +input_test_sol$X@x <- runif(length(input_test_sol$X@x)) + +# fill other values with random values as well +ix <- sample.int(nrow(input_test_sol) * ncol(input_test_sol), nrow(input_test_sol) * 10) +input_test_sol$X[ix] <- runif(length(ix)) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(input_test_sol$X) +input_test_sol$X@x <- input_test_sol$X@x / rs[input_test_sol$X@i + 1] + +input_test_sol$uns[["method_id"]] <- meta$functionality_name + +cat("Writing predictions to file\n") +zzz <- input_test_sol$write_h5ad(par$output, compression = "gzip") From 6b17f9032a628d56ab9f15ce8402c2a15247c9f0 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 16:17:11 +0100 Subject: [PATCH 41/82] add solution control_method --- .../control_methods/solution/config.vsh.yaml | 23 +++++++++++++++++++ 
.../control_methods/solution/script.R | 18 +++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 src/match_modality/control_methods/solution/config.vsh.yaml create mode 100644 src/match_modality/control_methods/solution/script.R diff --git a/src/match_modality/control_methods/solution/config.vsh.yaml b/src/match_modality/control_methods/solution/config.vsh.yaml new file mode 100644 index 0000000000..ca9d8d5904 --- /dev/null +++ b/src/match_modality/control_methods/solution/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: solution + namespace: match_modality/control_methods + description: Returns the ground-truth pairing. + info: + type: positive_control + method_name: Solution + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/control_methods/solution/script.R b/src/match_modality/control_methods/solution/script.R new file mode 100644 index 0000000000..88b5b746dd --- /dev/null +++ b/src/match_modality/control_methods/solution/script.R @@ -0,0 +1,18 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_test_sol = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad", + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_test_sol <- anndata::read_h5ad(par$input_test_sol) + +input_test_sol$uns[["method_id"]] <- meta$functionality_name + +cat("Writing predictions to file\n") +zzz <- input_test_sol$write_h5ad(par$output, compression = "gzip") From 
b30a10e6a2d3445db41aca5f8fe1f4cfb10854a6 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 17:21:30 +0100 Subject: [PATCH 42/82] add dr_knn_cbf method --- src/match_modality/api/comp_method.yaml | 76 +++++++++ .../methods/dr_knnr_cbf/config.vsh.yaml | 28 ++++ .../methods/dr_knnr_cbf/script.R | 144 ++++++++++++++++++ 3 files changed, 248 insertions(+) create mode 100644 src/match_modality/api/comp_method.yaml create mode 100644 src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml create mode 100644 src/match_modality/methods/dr_knnr_cbf/script.R diff --git a/src/match_modality/api/comp_method.yaml b/src/match_modality/api/comp_method.yaml new file mode 100644 index 0000000000..70528c8ccb --- /dev/null +++ b/src/match_modality/api/comp_method.yaml @@ -0,0 +1,76 @@ +functionality: + arguments: + - name: "--input_train_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_train_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--input_train_sol" + __merge__: anndata_solution.yaml + - name: "--input_test_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import numpy as np + from scipy.sparse import issparse + + # define some filenames + testpar = { + 'input_train_mod1': 'output/output_train_mod1.h5ad', + 'input_train_mod2': 'output/output_train_mod2.h5ad', + 'input_train_sol': 'output/output_train_sol.h5ad', + 'input_test_mod1': 'output/output_test_mod1.h5ad', + 'input_test_mod2': 'output/output_test_mod2.h5ad', + 'input_test_sol': 'output/output_test_sol.h5ad', + 'output': 'output/output_prediction.h5ad', + } + + print('> Running method', flush=True) + out = subprocess.run([ + meta['executable'], + '--input_train_mod1', testpar['input_train_mod1'], + 
'--input_train_mod2', testpar['input_train_mod2'], + '--input_train_sol', testpar['input_train_sol'], + '--input_test_mod1', testpar['input_test_mod1'], + '--input_test_mod2', testpar['input_test_mod2'], + '--output', testpar['output'] + ], + check=True + capture_output= True, + text= True + ) + + # for troubleshooting: remove 'check=True' in subprocess.run above and comment out below print + # print(out.stderr, flush=True) + + print('> Checking whether output files were created', flush=True) + assert path.exists(testpar['output']) + + print('> Reading h5ad files', flush=True) + ad_sol = ad.read_h5ad(testpar['input_test_sol']) + ad_pred = ad.read_h5ad(testpar['output']) + + print('> Checking dataset id', flush=True) + assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id'] + + print('> Checking method id', flush=True) + assert ad_pred.uns['method_id'] == meta['functionality_name'] + + print('> Checking X', flush=True) + assert issparse(ad_pred.X) + assert np.all([x >= 0 for x in ad_pred.X.nonzero()]), 'Values must be strictly non-negative.' + assert ad_pred.X.nonzero()[0].size <= 1000 * ad_sol.n_obs + assert ad_pred.n_obs == ad_sol.n_obs + assert ad_pred.n_vars == ad_sol.n_vars + assert np.isclose(ad_pred.X.sum(axis=1), 1, atol=1e-10).all(), 'All rows should sum to 1.' + + print('> Test succeeded!', flush=True) diff --git a/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml new file mode 100644 index 0000000000..86e00c8c9a --- /dev/null +++ b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: dr_knnr_cbf + namespace: match_modality/methods + description: Perform LMDS+Pearson DR, predict embedding with KNN regression, and match cells with KNN. + info: + type: method + method_name: LMDS+KNNR+CBF + arguments: + - name: "--n_pop" + type: "integer" + default: 100 + description: Population size. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ midmem, midtime, midcpu ] diff --git a/src/match_modality/methods/dr_knnr_cbf/script.R b/src/match_modality/methods/dr_knnr_cbf/script.R new file mode 100644 index 0000000000..021e2d6483 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_cbf/script.R @@ -0,0 +1,144 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1_rna/openproblems_bmmc_multiome_phase1_rna.censor_dataset.output_" +path <- "output/datasets/match_modality/openproblems_bmmc_cite_phase1_rna/openproblems_bmmc_cite_phase1_rna.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_pop = 300L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +cat("Read train sol\n") +input_train_sol <- anndata::read_h5ad(par$input_train_sol) + +cat("Reading mod1 h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +cat("Running LMDS on input data\n") +# merge input matrices +mod1_X <- rbind(input_train_mod1$X, input_test_mod1$X) 
+rm(input_train_mod1, input_test_mod1) +gc() + +# perform DR +dr_x1 <- lmds::lmds(mod1_X, ndim = 10, distance_method = "pearson") +rm(mod1_X) +gc() + +# split input matrices +dr_x1_train <- dr_x1[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x1_test <- dr_x1[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Reading mod1 h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat("Running LMDS on input data\n") +# merge input matrices +match_train <- input_train_sol$uns$pairing_ix + 1 +mod2_X <- rbind(input_train_mod2$X[order(match_train), , drop = FALSE], input_test_mod2$X) +rm(input_train_mod2, input_test_mod2) +gc() + +# perform DR +dr_x2 <- lmds::lmds(mod2_X, ndim = 3, distance_method = "pearson") +rm(mod2_X) +gc() + +# split input matrices +dr_x2_train <- dr_x2[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x2_test <- dr_x2[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +pred_mod1 <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Predicting mod2 DR of test cells\n") +pred_mod2 <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Minimising distances between mod1 and mod2 pairs\n") +gen_vec <- function(z) { + int <- seq_len(nrow(pred_mod1)) + + i <- j <- c() + resti <- int + restj <- int + + while (length(resti) > 0) { + ixi <- sample.int(length(resti), 1) + newi <- resti[[ixi]] + d1 <- proxy::dist(pred_mod1[restj, , drop = FALSE], dr_x1_test[newi, , drop = FALSE], method = "euclidean") + d2 <- proxy::dist(pred_mod2[restj, , drop = FALSE], dr_x2_test[newi, , drop = FALSE], method = "euclidean") + d12 <- d1 + d2 + ixj <- which.min(d12[, 1]) + newj <- restj[[ixj]] + resti <- resti[-ixi] + restj <- 
restj[-ixj] + i <- c(i, newi) + j <- c(j, newj) + + # tibble(i, j); tibble(resti, restj) + } + + tibble::tibble(i, j) +} + +outs <- pbapply::pblapply(seq_len(par$n_pop), cl = n_cores, gen_vec) +# outs <- lapply(seq_len(par$n_pop), gen_vec) +df <- bind_rows(outs) %>% + group_by(i, j) %>% + summarise(n = n(), .groups = "drop") %>% + arrange(desc(n)) %>% + mutate(gold = i == j) + +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$n, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_train_sol$uns[["dataset_id"]], + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From abb82860197c6a513aab923735e04ba730f55350 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 20:57:36 +0100 Subject: [PATCH 43/82] add dr_knnr_knn method --- src/match_modality/api/comp_method.yaml | 2 +- .../methods/dr_knnr_knn/config.vsh.yaml | 23 ++++ .../methods/dr_knnr_knn/script.R | 104 ++++++++++++++++++ 3 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 src/match_modality/methods/dr_knnr_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/dr_knnr_knn/script.R diff --git a/src/match_modality/api/comp_method.yaml b/src/match_modality/api/comp_method.yaml index 70528c8ccb..2286a10ffb 100644 --- a/src/match_modality/api/comp_method.yaml +++ b/src/match_modality/api/comp_method.yaml @@ -44,7 +44,7 @@ functionality: '--input_test_mod2', testpar['input_test_mod2'], '--output', testpar['output'] ], - check=True + check=True, capture_output= True, text= True ) diff --git a/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml new file mode 100644 index 
0000000000..e03bf3be48 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: dr_knnr_knn + namespace: match_modality/methods + description: Perform LMDS+Pearson DR, predict embedding with KNN regression, and match cells with KNN. + info: + type: baseline + method_name: LMDS+KNNR+KNN + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/methods/dr_knnr_knn/script.R b/src/match_modality/methods/dr_knnr_knn/script.R new file mode 100644 index 0000000000..fa32cbdf29 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_knn/script.R @@ -0,0 +1,104 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1_rna/openproblems_bmmc_multiome_phase1_rna.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad" +) +## VIASH END + +method_id <- meta$functionality_name + +cat("Read train sol\n") +input_train_sol <- anndata::read_h5ad(par$input_train_sol) + +cat("Reading mod1 h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +cat("Running 
LMDS on input data\n") +# merge input matrices +mod1_X <- rbind(input_train_mod1$X, input_test_mod1$X) +rm(input_train_mod1, input_train_mod2) + +# perform DR +dr_x1 <- lmds::lmds(mod1_X, ndim = 10, distance_method = "pearson") +rm(mod1_X) + +# split input matrices +dr_x1_train <- dr_x1[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x1_test <- dr_x1[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Reading mod1 h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat("Running LMDS on input data\n") +# merge input matrices +match_train <- input_train_sol$uns$pairing_ix + 1 +mod2_X <- rbind(input_train_mod2$X[order(match_train), , drop = FALSE], input_test_mod2$X) +rm(input_train_mod2, input_test_mod2) + +# perform DR +dr_x2 <- lmds::lmds(mod2_X, ndim = 3, distance_method = "pearson") +rm(mod2_X) + +# split input matrices +dr_x2_train <- dr_x2[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x2_test <- dr_x2[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Predicting for each column in modality 2\n") +preds <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + + +cat("Performing KNN between test mod2 DR and predicted test mod2\n") +knn_out <- FNN::get.knnx( + preds, + dr_x2_test, + k = min(1000, nrow(dr_x1_test)) +) + +cat("Creating output data structures\n") +df <- tibble( + i = as.vector(row(knn_out$nn.index)), + j = as.vector(knn_out$nn.index), + x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist) +) +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$x, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + 
dataset_id = input_train_sol$uns[["dataset_id"]], + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 204426ab095e347598de233993a4ada877dfb630 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 21:03:23 +0100 Subject: [PATCH 44/82] add linear method --- src/match_modality/api/authors.yaml | 6 +- .../methods/linear_knn/config.vsh.yaml | 28 +++++ .../methods/linear_knn/script.py | 105 ++++++++++++++++++ 3 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 src/match_modality/methods/linear_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/linear_knn/script.py diff --git a/src/match_modality/api/authors.yaml b/src/match_modality/api/authors.yaml index 59c6ad211a..bd2cade4d0 100644 --- a/src/match_modality/api/authors.yaml +++ b/src/match_modality/api/authors.yaml @@ -21,4 +21,8 @@ functionality: - name: Alex Tong email: alexandertongdev@gmail.com roles: [ author, maintainer ] - props: { github: atong01 } \ No newline at end of file + props: { github: atong01 } + - name: Daniel Burkhardt + email: daniel.b.burkhardt@gmail.com + roles: [ author, maintainer ] + props: { github: dburkhardt } \ No newline at end of file diff --git a/src/match_modality/methods/linear_knn/config.vsh.yaml b/src/match_modality/methods/linear_knn/config.vsh.yaml new file mode 100644 index 0000000000..ab26297bd7 --- /dev/null +++ b/src/match_modality/methods/linear_knn/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: linear_knn + namespace: match_modality/methods + description: Perform DR with Linear Regression, and match cells with kNN + info: + type: method + method_name: Linear Regression and KNN + arguments: + - name: "--n_svd" + type: "integer" + default: 10 + description: Number of SVD components to compress to + - name: "--n_neighbors" + type: "integer" + default: 10 + description: Number of neighbors for matching 
modalities + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata, scipy, scikit-learn ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/methods/linear_knn/script.py b/src/match_modality/methods/linear_knn/script.py new file mode 100644 index 0000000000..c776337a4a --- /dev/null +++ b/src/match_modality/methods/linear_knn/script.py @@ -0,0 +1,105 @@ +import logging +import anndata as ad +import scipy.spatial +import scipy.sparse +import numpy as np + +from sklearn.decomposition import TruncatedSVD +from sklearn.neighbors import NearestNeighbors +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import normalize + +## VIASH START +# Anything within this block will be removed by `viash` and will be +# replaced with the parameters as specified in your config.vsh.yaml. +par = { + "input_train_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad", + "input_train_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad", + "input_train_sol": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad", + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "n_svd": 100, + "n_neighbors" : 10, +} + +meta = { + "funtionality_name": "foo" +} +## VIASH END + +logging.basicConfig(level=logging.INFO) + +logging.info("Load datasets") +input_train_mod1 = 
ad.read_h5ad(par["input_train_mod1"]) +input_train_mod2 = ad.read_h5ad(par["input_train_mod2"]) +input_train_sol = ad.read_h5ad(par["input_train_sol"]) +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# This method runs PCA on each modality individually, then runs linear regression to predict mod2 +# from mod1 and finally performs kNN to match modalities + +# unscramble training cells +ord = np.argsort(input_train_sol.uns['pairing_ix']) +input_train_mod2 = input_train_mod2[ord, :] + +# concatenate train and test data +mod1 = ad.concat( + { + "train": input_train_mod1, + "test": input_test_mod1 + }, + index_unique="-", + label="group" +) +mod2 = ad.concat( + { + "train": input_train_mod2, + "test": input_test_mod2 + }, + index_unique="-", + label="group" +) + +# Create helper views to access the test data later +mod1tr = mod1[mod1.obs["group"] == "train", :] +mod2tr = mod2[mod2.obs["group"] == "train", :] + +mod1te = mod1[mod1.obs["group"] == "test", :] +mod2te = mod2[mod2.obs["group"] == "test", :] + +logging.info("Running PCA") +n_svd = min(par["n_svd"], mod1.n_obs, mod2.n_obs, mod1.n_vars, mod1.n_vars) + +# Use TruncatedSVD for fast decomposition of the data +mod1.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod1.X) +mod2.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod2.X) + +reg = LinearRegression() + +reg.fit(mod1tr.obsm["X_pca"], mod2tr.obsm["X_pca"]) +mod2te_pred = reg.predict(mod1te.obsm["X_pca"]) + +neighbors = NearestNeighbors(n_neighbors=np.min((mod2te.shape[0], par["n_neighbors"])), n_jobs=-1) +neighbors = neighbors.fit(mod2te_pred) + +distances, indices = neighbors.kneighbors(mod2te.obsm["X_pca"]) + +prediction = np.zeros((mod2te.shape[0], mod2te.shape[0])) +for i, neighbors in enumerate(indices): + prediction[i, neighbors] = 1 / distances[i] + +prediction = normalize(prediction, norm="l1") + +prediction = scipy.sparse.csr_matrix(prediction) + +print("Write prediction output") 
+prediction = ad.AnnData( + X=prediction, + uns={ + "dataset_id": input_train_mod1.uns["dataset_id"], + "method_id": meta["functionality_name"] + } +) +prediction.write_h5ad(par["output"]) From 8c5401771760829e4b46d997ddc337c719fa06ef Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 21:15:03 +0100 Subject: [PATCH 45/82] add newwave_knnr_cbf method --- .../methods/newwave_knnr_cbf/config.vsh.yaml | 41 ++++ .../methods/newwave_knnr_cbf/script.R | 188 ++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml create mode 100644 src/match_modality/methods/newwave_knnr_cbf/script.R diff --git a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml new file mode 100644 index 0000000000..6e62d14c3a --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave_knnr_cbf + namespace: match_modality/methods + description: Perform DR with NewWave, predict embedding with KNN regression, and matching cells with a consensus best fit algorithm. + info: + type: method + method_name: NewWave+KNNR+CBF + arguments: + - name: "--n_pop" + type: "integer" + default: 300 + description: Population size. + - name: "--newwave_maxiter" + type: "integer" + default: 40 + description: Maximum number of NewWave iterations. + - name: "--newwave_ngene" + type: "integer" + default: 200 + description: Setting of the n_gene_par NewWave parameter. + - name: "--newwave_ncell" + type: "integer" + default: 200 + description: Setting of the n_cell_par NewWave parameter. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply, proxy, proxyC, bioconductor] + bioc: [ SingleCellExperiment, NewWave ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_cbf/script.R b/src/match_modality/methods/newwave_knnr_cbf/script.R new file mode 100644 index 0000000000..ddc291a517 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_cbf/script.R @@ -0,0 +1,188 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("pbapply", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("FNN", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_pop = 300L, + newwave_maxiter = 10 +) +meta <- list(functionality_name = "foo") + +# # read in solution data to check whether method is working +# input_test_sol <- anndata::read_h5ad(paste0(path, "test_sol.h5ad")) +# match_test <- input_test_sol$uns$pairing_ix + 1 +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +input_train_sol <- 
anndata::read_h5ad(par$input_train_sol) +match_train <- input_train_sol$uns$pairing_ix + 1 + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +# fetch a few variables +train_ix <- seq_len(nrow(input_train_mod1)) +did <- input_train_mod1$uns[["dataset_id"]] +batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) + +cat("Running NewWave\n") +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod1$layers[["counts"]]), t(input_test_mod1$layers[["counts"]]))), + colData = data.frame(batch = factor(batch1)) +) +data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] +# option 1: filter by HVG +# data1 <- data1[order(proxyC::rowSds(SummarizedExperiment::assay(data1)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod1, input_test_mod1) +gc() + +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data1)), + n_cell_par = min(par$newwave_ncell, ncol(data1)), + commondispersion = FALSE +) +rm(data1) +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +# don't know batch ordering in input_test_mod2 +batch2 <- c(as.character(input_train_sol$obs$batch), rep("unknownbatch", nrow(input_test_mod2))) + +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod2$layers[["counts"]][order(match_train), , drop = FALSE]), t(input_test_mod2$layers[["counts"]]))), + colData = data.frame(batch = factor(batch2)) +) +data2 <- data2[Matrix::rowSums(SummarizedExperiment::assay(data2)) > 0, ] +# data2 <- 
data2[order(proxyC::rowSds(SummarizedExperiment::assay(data2)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod2, input_test_mod2) +gc() + +cat("Running NewWave\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data2)), + n_cell_par = min(par$newwave_ncell, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) + +colnames(dr_x1) <- paste0("comp_", seq_len(ncol(dr_x1))) +colnames(dr_x2) <- paste0("comp_", seq_len(ncol(dr_x2))) + +# split up DR matrices +dr_x1_train <- dr_x1[train_ix, , drop = FALSE] +dr_x1_test <- dr_x1[-train_ix, , drop = FALSE] +dr_x2_train <- dr_x2[train_ix, , drop = FALSE] +dr_x2_test <- dr_x2[-train_ix, , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +pred_mod1 <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Predicting mod2 DR of test cells\n") +pred_mod2 <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Minimising distances between mod1 and mod2 pairs\n") +gen_vec <- function(z) { + int <- seq_len(nrow(pred_mod1)) + + i <- j <- c() + resti <- int + restj <- int + + while (length(resti) > 0) { + ixi <- sample.int(length(resti), 1) + newi <- resti[[ixi]] + d1 <- proxy::dist(pred_mod1[restj, , drop = FALSE], dr_x1_test[newi, , drop = FALSE], method = "euclidean") + d2 <- proxy::dist(pred_mod2[restj, , drop = FALSE], dr_x2_test[newi, , drop = FALSE], method = "euclidean") + d12 <- d1 + d2 + ixj <- which.min(d12[, 1]) + newj <- restj[[ixj]] + resti <- resti[-ixi] + restj <- restj[-ixj] + i <- c(i, newi) + j <- c(j, newj) + + # tibble(i, j); tibble(resti, restj) + } + + tibble::tibble(i, j) +} +outs <- 
pbapply::pblapply(seq_len(par$n_pop), cl = n_cores, gen_vec) + +# outs <- lapply(seq_len(par$n_pop), gen_vec) +df <- bind_rows(outs) %>% + group_by(i, j) %>% + summarise(n = n(), .groups = "drop") %>% + arrange(desc(n)) %>% + mutate(gold = i == j) + +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$n, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = did, + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 65dd49052926d1a9f6df6c144bf8b0991ee7d360 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 21:27:31 +0100 Subject: [PATCH 46/82] add newwave_knnr_knn method --- .../methods/newwave_knnr_cbf/config.vsh.yaml | 2 +- .../methods/newwave_knnr_knn/config.vsh.yaml | 36 ++++ .../methods/newwave_knnr_knn/script.R | 162 ++++++++++++++++++ 3 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/newwave_knnr_knn/script.R diff --git a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml index 6e62d14c3a..9fbf3a8887 100644 --- a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml +++ b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml @@ -31,7 +31,7 @@ platforms: image: eddelbuettel/r2u:22.04 setup: - type: r - cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply, proxy, proxyC, bioconductor] + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply, proxy, proxyC] bioc: [ SingleCellExperiment, NewWave ] - type: apt packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] 
diff --git a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml new file mode 100644 index 0000000000..f02a5ed909 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml @@ -0,0 +1,36 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: baseline_newwave_knnr_knn + namespace: match_modality/methods + description: Perform DR with NewWave, predict embedding with KNN regression, and match cells with a KNN. + info: + type: method + method_name: NewWave+KNNR+KNN + arguments: + - name: "--newwave_maxiter" + type: "integer" + default: 40 + description: Maximum number of NewWave iterations. + - name: "--newwave_ngene" + type: "integer" + default: 200 + description: Setting of the n_gene_par NewWave parameter. + - name: "--newwave_ncell" + type: "integer" + default: 200 + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN] + bioc: [ SingleCellExperiment, NewWave ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8, tensorflow ] + - type: nextflow + directives: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_knn/script.R b/src/match_modality/methods/newwave_knnr_knn/script.R new file mode 100644 index 0000000000..2810af1ab2 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_knn/script.R @@ -0,0 +1,162 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("FNN", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- 
"resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +# path <- "output/public_datasets/match_modality/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +path <- "output/public_datasets/match_modality/dyngen_atac_1/dyngen_atac_1.censor_dataset.output_" +# path <- "debug/debug." +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_dims = 10L, + distance_method = "spearman", + n_ga_pop = 200L, + n_ga_iter = 500L +) +meta <- list(functionality_name = "foo") + +# # read in solution data to check whether method is working +input_test_sol <- anndata::read_h5ad(paste0(path, "test_sol.h5ad")) +match_test <- input_test_sol$uns$pairing_ix + 1 +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +match_train <- input_train_sol$uns$pairing_ix + 1 + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +# fetch a few variables +train_ix <- seq_len(nrow(input_train_mod1)) +did <- input_train_mod1$uns[["dataset_id"]] +batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) + +cat("Running NewWave\n") +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod1$layers[["counts"]]), t(input_test_mod1$layers[["counts"]]))), + colData = data.frame(batch = factor(batch1)) +) +data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] +# option 1: filter by HVG +# data1 <- data1[order(proxyC::rowSds(SummarizedExperiment::assay(data1)), decreasing = 
TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod1, input_test_mod1) +gc() + +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data1)), + n_cell_par = min(par$newwave_ncell, ncol(data1)), + commondispersion = FALSE +) +rm(data1) +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +# don't know batch ordering in input_test_mod2 +batch2 <- c(as.character(input_train_sol$obs$batch), rep("unknownbatch", nrow(input_test_mod2))) + +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod2$layers[["counts"]][order(match_train), , drop = FALSE]), t(input_test_mod2$layers[["counts"]]))), + colData = data.frame(batch = factor(batch2)) +) +data2 <- data2[Matrix::rowSums(SummarizedExperiment::assay(data2)) > 0, ] +# data2 <- data2[order(proxyC::rowSds(SummarizedExperiment::assay(data2)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod2, input_test_mod2) +gc() + +cat("Running NewWave\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data2)), + n_cell_par = min(par$newwave_ncell, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) + +colnames(dr_x1) <- paste0("comp_", seq_len(ncol(dr_x1))) +colnames(dr_x2) <- paste0("comp_", seq_len(ncol(dr_x2))) + +# split up DR matrices +dr_x1_train <- dr_x1[train_ix, , drop = FALSE] +dr_x1_test <- dr_x1[-train_ix, , drop = FALSE] +dr_x2_train <- dr_x2[train_ix, , drop = FALSE] +dr_x2_test <- dr_x2[-train_ix, , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +preds <- apply(dr_x1_train, 2, 
function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + + +cat("Performing KNN between test mod2 DR and predicted test mod2\n") +knn_out <- FNN::get.knnx( + preds, + dr_x2_test, + k = min(1000, nrow(preds)) +) + +cat("Creating output data structures\n") +df <- tibble( + i = as.vector(row(knn_out$nn.index)), + j = as.vector(knn_out$nn.index), + x = 1000 - as.vector(col(knn_out$nn.index)) + 1 + # x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist) +) +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$x, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = did, + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 334dea9ab69af385c2f26014eafd29f048ff02ac Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 21:37:50 +0100 Subject: [PATCH 47/82] add procrusted_knn method --- .../methods/procrustes_knn/config.vsh.yaml | 24 ++++ .../methods/procrustes_knn/script.py | 112 ++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/match_modality/methods/procrustes_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/procrustes_knn/script.py diff --git a/src/match_modality/methods/procrustes_knn/config.vsh.yaml b/src/match_modality/methods/procrustes_knn/config.vsh.yaml new file mode 100644 index 0000000000..1c317ca3d9 --- /dev/null +++ b/src/match_modality/methods/procrustes_knn/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: procrustes_knn + namespace: match_modality/methods + description: Perform DR with Procrustes, and match cells with a GA to minimise pairwise distances. 
+ info: + type: method + method_name: Procrustes+KNN + arguments: + - name: "--n_svd" + type: "integer" + default: 100 + description: Number of SVD components to compress to + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, scipy, scikit-learn ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/methods/procrustes_knn/script.py b/src/match_modality/methods/procrustes_knn/script.py new file mode 100644 index 0000000000..3e16b5fbb7 --- /dev/null +++ b/src/match_modality/methods/procrustes_knn/script.py @@ -0,0 +1,112 @@ +import logging +import anndata as ad +import scipy.spatial +import scipy.sparse +import numpy as np + +from sklearn.preprocessing import normalize +from sklearn.decomposition import TruncatedSVD +from sklearn.neighbors import NearestNeighbors + +## VIASH START + +# Anything within this block will be removed by `viash` and will be +# replaced with the parameters as specified in your config.vsh.yaml. 
+ +par = { + "input_train_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad", + "input_train_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad", + "input_train_sol": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad", + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "n_svd": 100, +} + +meta = { + "functionality_name": "foo" +} +## VIASH END + +logging.basicConfig(level=logging.INFO) + +logging.info("Load datasets") +input_train_mod1 = ad.read_h5ad(par["input_train_mod1"]) +input_train_mod2 = ad.read_h5ad(par["input_train_mod2"]) +# input_train_sol = ad.read_h5ad(par["input_train_sol"]) +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# This method runs PCA on each modality individually, then uses the Procrustes method to identify +# a linear transform that best superimposes the points from modality 1 onto modality 2. 
+
+# concatenate train and test data
+mod1 = ad.concat(
+    {
+        "train": input_train_mod1,
+        "test": input_test_mod1
+    },
+    index_unique="-",
+    label="group"
+)
+mod2 = ad.concat(
+    {
+        "train": input_train_mod2,
+        "test": input_test_mod2
+    },
+    index_unique="-",
+    label="group"
+)
+
+# Create helper views to access the test data later
+mod1te = mod1[mod1.obs["group"] == "test", :]
+mod2te = mod2[mod2.obs["group"] == "test", :]
+
+logging.info("Running PCA")
+n_svd = min(par["n_svd"], mod1.n_obs, mod2.n_obs, mod1.n_vars, mod2.n_vars)
+
+# Use TruncatedSVD for fast decomposition of the data
+mod1.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod1.X)
+mod2.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod2.X)
+
+logging.info("Running Procrustes Alignment")
+# This function takes in two matrices of points A and B, standardizes both, and applies a linear to
+# matrix B to minimize the disparity measured as the sum of the squares of the pointwise distances
+# between the two input datasets
+mod1.obsm["X_pro"], mod2.obsm["X_pro"], disparity = scipy.spatial.procrustes(
+    mod1.obsm["X_pca"],
+    mod2.obsm["X_pca"],
+)
+logging.info("> Disparity value is: %0.3f" % disparity)
+
+logging.info("Perform nearest neighbors")
+# To get the matching matrix, for each point in mod1_test, we take the 1000 nearest neighbors of that
+# point in the transformed mod2_test dataset
+n_neighbors = min(1000, mod1te.n_obs, mod1te.n_vars, mod2te.n_obs, mod2te.n_vars)
+nn = NearestNeighbors(n_neighbors=n_neighbors).fit(mod1te.obsm["X_pro"])
+distances, indices = nn.kneighbors(X=mod2te.obsm["X_pro"])
+
+logging.info("Create pairing matrix")
+# Translate the neighborhood assignments to a pairing matrix that is (n_obs, n_obs)
+# NOTE: `pairing_matrix` must have NO MORE than 1000*n_obs non-zero entries for fast metric computation
+ind_i = np.tile(np.arange(mod1te.n_obs), (n_neighbors, 1)).T.flatten()
+ind_j = indices.flatten()
+ind_dist = distances.flatten()
+ind_x = 2 * max(ind_dist) - ind_dist
+pairing_matrix = scipy.sparse.csr_matrix( + (ind_x, (ind_i, ind_j)), + shape=(input_test_mod1.n_obs, input_test_mod2.n_obs) +) + +# row normalise +prob_matrix = normalize(pairing_matrix, norm="l1") + +print("Write prediction output") +prediction = ad.AnnData( + X=prob_matrix, + uns={ + "dataset_id": input_train_mod1.uns["dataset_id"], + "method_id": meta["functionality_name"] + } +) +prediction.write_h5ad(par["output"]) From 6482c5dde85c282d74e796ebc9da7bc6534ca415 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 11 Jan 2023 08:54:40 +0100 Subject: [PATCH 48/82] add babel_knn method --- .../methods/babel_knn/config.vsh.yaml | 40 ++++ src/match_modality/methods/babel_knn/script.R | 195 ++++++++++++++++++ 2 files changed, 235 insertions(+) create mode 100644 src/match_modality/methods/babel_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/babel_knn/script.R diff --git a/src/match_modality/methods/babel_knn/config.vsh.yaml b/src/match_modality/methods/babel_knn/config.vsh.yaml new file mode 100644 index 0000000000..55c4498547 --- /dev/null +++ b/src/match_modality/methods/babel_knn/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_method.yaml +functionality: + status: disabled + name: babel_knn + namespace: match_modality/methods + description: Predict test expression with BABEL and match cells with KNN. + info: + type: method + method_name: Babel+KNN + paper_doi: "10.1073/pnas.2023070118" + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to use for dimensionality reduction. + - name: "--n_neigh" + type: "integer" + default: 10 + description: Number of neighbors for KNN. 
Match probability will be 1/n_neigh + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, FNN ] + bioc: [ DropletUtils ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git] + - type: python + pip: [ anndata>=0.8 ] + - type: docker + run: + - wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh -O ~/miniconda.sh && /bin/bash ~/miniconda.sh -b -p /opt/conda && /opt/conda/bin/conda -V + - cd / && git clone --depth 1 https://github.com/rcannood/babel.git + - cd /babel && /opt/conda/bin/conda env create -f environment.yml + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/methods/babel_knn/script.R b/src/match_modality/methods/babel_knn/script.R new file mode 100644 index 0000000000..42702b6150 --- /dev/null +++ b/src/match_modality/methods/babel_knn/script.R @@ -0,0 +1,195 @@ +cat(">> Loading dependencies\n") + +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +library(testthat, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("DropletUtils", quietly = TRUE) + +options(tidyverse.quiet = TRUE) +library(tidyverse) + +babel_location <- "/babel/bin/" +conda_bin <- "/opt/conda/bin/conda" + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_dims = 10, + n_neighs = 10 +) +conda_bin <- "conda" +babel_location <- "../babel/bin/" +## VIASH END + +input_train_mod2 <- 
anndata::read_h5ad(par$input_train_mod2, backed = TRUE) +if (input_train_mod2$var$feature_types[[1]] != "ATAC") { + cat("Error: babel only runs on GEX to ATAC datasets\n") + quit(save = "no", status = 42) +} + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat(">> Reading h5ad files\n") +if (is.null(input_train_mod1$var$gene_ids)) input_train_mod1$var$gene_ids <- colnames(input_train_mod1) +if (is.null(input_train_mod2$var$gene_ids)) input_train_mod2$var$gene_ids <- colnames(input_train_mod2) +if (is.null(input_test_mod1$var$gene_ids)) input_test_mod1$var$gene_ids <- colnames(input_test_mod1) +if (is.null(input_test_mod2$var$gene_ids)) input_test_mod2$var$gene_ids <- colnames(input_test_mod2) + +mod1 <- as.character(unique(input_train_mod1$var$feature_types)) +mod2 <- as.character(unique(input_train_mod2$var$feature_types)) + +# multiome_matrix for export to Babel's input format +multiome_matrix <- cbind(input_train_mod1$X, input_train_mod2$X) + +# generate multiome anndata object for training +ad_babel <- anndata::AnnData( + X = multiome_matrix, + var = bind_rows(input_train_mod1$var, input_train_mod2$var) +) + +# setting up babel dirs +tmpdir <- tempfile(pattern = "babel_temp", fileext = "/") +cat(">> Setting up directories for babel at ", tmpdir, "\n", sep = "") +dir.create(tmpdir) +on.exit(unlink(tmpdir, recursive = TRUE)) + +dir_data <- paste0(tmpdir, "data/") # location of input files +dir.create(dir_data) +dir_model <- paste0(tmpdir, "model/") # location of babel model +dir_pred <- paste0(tmpdir, "pred/") # location of predictions + +feature_type_map <- c( + "GEX" = "Gene Expression", + "ADT" = "Peaks", # try to make it run on ADT data as well + "ATAC" = "Peaks" +) + 
+cat(">> Writing train dataset as 10x-CellRanger H5 format\n")
+DropletUtils::write10xCounts(
+  paste0(dir_data, "train_input.h5"),
+  t(ad_babel$X),
+  gene.id = ad_babel$var$gene_ids,
+  gene.symbol = colnames(ad_babel),
+  barcodes = rownames(ad_babel),
+  type = "HDF5",
+  version = "3",
+  genome = "GRCh38",
+  gene.type = unname(feature_type_map[ad_babel$var$feature_types]),
+  overwrite = TRUE
+)
+
+cat(">> Writing test dataset as 10x-CellRanger H5 format\n")
+DropletUtils::write10xCounts(
+  paste0(dir_data, "test_input.h5"),
+  t(input_test_mod1$X),
+  gene.id = input_test_mod1$var$gene_ids,
+  gene.symbol = colnames(input_test_mod1),
+  barcodes = rownames(input_test_mod1),
+  type = "HDF5",
+  version = "3",
+  genome = "GRCh38",
+  gene.type = unname(feature_type_map[input_test_mod1$var$feature_types]),
+  overwrite = TRUE
+)
+
+
+cat(">> Babel: train model\n")
+babel_train_cmd <- paste0(
+  conda_bin, " run -n babel ",
+  "python ", babel_location, "train_model.py ",
+  "--data ", dir_data, "train_input.h5 ",
+  "--outdir ", dir_model, " ",
+  "--nofilter"
+)
+# stringent filtering causes babel to sometimes fail
+# reason: https://github.com/wukevin/babel/blob/main/babel/sc_data_loaders.py#L168-L190
+
+out1 <- system(babel_train_cmd)
+
+# check whether training succeeded
+expect_equal(out1, 0, info = paste0("Model training failed with exit code ", out1))
+
+cat(">> Babel: predict from model\n")
+babel_pred_cmd <- paste0(
+  conda_bin, " run -n babel ",
+  "python ", babel_location, "predict_model.py ",
+  "--checkpoint ", dir_model, " ",
+  "--data ", dir_data, "test_input.h5 ",
+  "--outdir ", dir_pred, " ",
+  "--nofilter"
+)
+out2 <- system(babel_pred_cmd)
+
+# check whether prediction succeeded
+expect_equal(out2, 0, info = paste0("Prediction failed with exit code ", out2))
+
+cat(">> Read predictions\n")
+pred <- anndata::read_h5ad(paste0(dir_pred, "/rna_atac_adata.h5ad"))
+
+#######################################
+##### KNN
+
+# Only some features are present in Babel's output
+input_test_mod2_filter <- input_test_mod2[, row.names(input_test_mod2$var) %in% row.names(pred$var)]
+
+# Dimensional reduction of both predicted and test profiles
+pred_profiles <- pred[, row.names(input_test_mod2_filter$var)]$X
+
+cat("Performing DR on test values\n")
+dr <- lmds::lmds(
+  rbind(pred_profiles, input_test_mod2_filter$X),
+  ndim = par$n_dims,
+  distance_method = par$distance_method
+)
+
+train_ix <- seq_len(nrow(pred_profiles))
+dr_preds <- dr[train_ix, , drop = FALSE]
+dr_test_mod2 <- dr[-train_ix, , drop = FALSE]
+
+
+cat("Performing KNN between test mod2 DR and predicted test mod2\n")
+knn_out <- FNN::get.knnx(
+  dr_preds,
+  dr_test_mod2,
+  k = min(1000, nrow(dr_preds))
+)
+
+cat("Creating output data structures\n")
+df <- tibble(
+  i = as.vector(row(knn_out$nn.index)),
+  j = as.vector(knn_out$nn.index),
+  x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist)
+)
+knn_mat <- Matrix::sparseMatrix(
+  i = df$i,
+  j = df$j,
+  x = df$x,
+  dims = list(nrow(input_test_mod1), nrow(input_test_mod2))
+)
+
+# normalise to make rows sum to 1
+rs <- Matrix::rowSums(knn_mat)
+knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1]
+
+cat("Creating output anndata\n")
+out <- anndata::AnnData(
+  X = as(knn_mat, "CsparseMatrix"),
+  uns = list(
+    dataset_id = input_train_mod1$uns[["dataset_id"]],
+    method_id = "baseline_babel_knn"
+  )
+)
+
+cat("Writing predictions to file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")
\ No newline at end of file

From 46c54667c7e66413f785e7c8e4e4e12fe195422e Mon Sep 17 00:00:00 2001
From: KaiWaldrant
Date: Thu, 12 Jan 2023 14:53:22 +0100
Subject: [PATCH 49/82] add aupr metrics

---
 src/match_modality/api/anndata_score.yaml     | 25 ++++
 src/match_modality/api/comp_method.yaml       |  1 +
 src/match_modality/api/comp_metric.yaml       | 85 +++++++++++
 .../mask_dataset/config.vsh.yaml              |  6 +-
 .../metrics/aupr/config.vsh.yaml              | 40 ++++++
 src/match_modality/metrics/aupr/script.R      | 136 ++++++++++++++++++
src/match_modality/metrics/aupr/test_custom.R | 100 +++++++++++++ 7 files changed, 390 insertions(+), 3 deletions(-) create mode 100644 src/match_modality/api/anndata_score.yaml create mode 100644 src/match_modality/api/comp_metric.yaml create mode 100644 src/match_modality/metrics/aupr/config.vsh.yaml create mode 100644 src/match_modality/metrics/aupr/script.R create mode 100644 src/match_modality/metrics/aupr/test_custom.R diff --git a/src/match_modality/api/anndata_score.yaml b/src/match_modality/api/anndata_score.yaml new file mode 100644 index 0000000000..bfe79f07cc --- /dev/null +++ b/src/match_modality/api/anndata_score.yaml @@ -0,0 +1,25 @@ +type: file +description: "Metric score file" +example: "output.h5ad" +info: + short_description: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." 
+ multiple: true + required: true diff --git a/src/match_modality/api/comp_method.yaml b/src/match_modality/api/comp_method.yaml index 2286a10ffb..87e45f646b 100644 --- a/src/match_modality/api/comp_method.yaml +++ b/src/match_modality/api/comp_method.yaml @@ -11,6 +11,7 @@ functionality: - name: "--input_test_mod2" __merge__: anndata_masked_mod.yaml - name: "--output" + direction: output __merge__: anndata_prediction.yaml test_resources: - path: ../../../../output diff --git a/src/match_modality/api/comp_metric.yaml b/src/match_modality/api/comp_metric.yaml new file mode 100644 index 0000000000..12c5a070ac --- /dev/null +++ b/src/match_modality/api/comp_metric.yaml @@ -0,0 +1,85 @@ +functionality: + arguments: + - name: --input_prediction + __merge__: anndata_prediction.yaml + - name: --input_solution + __merge__: anndata_solution.yaml + - name: --output + __merge__: anndata_score.yaml + direction: output + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + import yaml + + input_prediction_path = "output/dr_knnr_cbf_prediction.h5ad" + input_solution_path = "output/output_test_sol.h5ad" + output_path = "output.h5ad" + # define some filenames + with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print("> Running method", flush=True) + out = subprocess.run(cmd, stderr=subprocess.STDOUT, check=True) + + # print(out.stdout) + + print("> Checking whether output files were created", flush=True) + assert path.exists(output_path) + + print("> Reading h5ad files", flush=True) + input_prediction = ad.read_h5ad(input_prediction_path) + input_solution = ad.read_h5ad(input_solution_path) + output = ad.read_h5ad(output_path) + + # Create DF from metric config info + 
metric_info = config['functionality']['info']['metrics'] + metric_meta = pd.DataFrame(metric_info) + metric_meta = metric_meta.astype({'metric_id': str, 'metric_name': str, 'metric_description': str, 'min': float, 'max': float, 'maximize': bool}) + print("> Checking contents of metric info", flush=True) + assert 'metric_id' in metric_meta + assert 'min' in metric_meta + assert 'max' in metric_meta + assert 'maximize' in metric_meta + + print("> Checking .uns['dataset_id']", flush=True) + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']", flush=True) + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']", flush=True) + assert 'metric_ids' in output.uns + assert set(output.uns['metric_ids']) == set(metric_meta.metric_id) + + print("> Checking .uns['metric_values']", flush=True) + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if metric_value lies within the expected range + output_uns = pd.DataFrame({ + 'metric_id': output.uns['metric_ids'], + 'value': output.uns['metric_values'] + }) + + scores = metric_meta.merge(output_uns, on="metric_id") + + assert all(scores.value >= scores['min']) + assert all(scores.value <= scores['max']) + + print("> Test succeeded!", flush=True) diff --git a/src/match_modality/mask_dataset/config.vsh.yaml b/src/match_modality/mask_dataset/config.vsh.yaml index 5dd18a45f9..ac341f3b84 100644 --- a/src/match_modality/mask_dataset/config.vsh.yaml +++ b/src/match_modality/mask_dataset/config.vsh.yaml @@ -1,4 +1,4 @@ -_merge__: ../api/comp_mask_dataset.yaml +__merge__: ../api/comp_mask_dataset.yaml functionality: name: mask_dataset namespace: match_modality @@ -19,5 +19,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - publish: true - 
directives: [ highmem, midtime, highcpu ] + directives: + label: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/metrics/aupr/config.vsh.yaml b/src/match_modality/metrics/aupr/config.vsh.yaml new file mode 100644 index 0000000000..3b655555d1 --- /dev/null +++ b/src/match_modality/metrics/aupr/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: aupr + namespace: match_modality/metrics + description: Calculating basic metrics for task2. + info: + metrics: + - metric_id: pairing_auroc + metric_name: Pairing auroc + metric_description: Area Under ROC curve + maximize: true + min: 0 + max: 1 + - metric_id: pairing_aupr + metric_name: Pairing aupr + metric_description: Area Under PR curve + maximize: true + min: 0 + max: 1 + resources: + - type: r_script + path: script.R + test_resources: + - type: r_script + path: test_custom.R + - path: ../../../../resources_test +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, pracma, tidyverse, testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8, pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowtime, lowcpu ] + diff --git a/src/match_modality/metrics/aupr/script.R b/src/match_modality/metrics/aupr/script.R new file mode 100644 index 0000000000..1609d6f673 --- /dev/null +++ b/src/match_modality/metrics/aupr/script.R @@ -0,0 +1,136 @@ +cat("Load dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("pracma", quietly = TRUE) + +## VIASH START +par <- list( + input_solution = "output/output_test_sol.h5ad", + input_prediction = "output/dr_knnr_cbf_prediction.h5ad", + output = "output/aupr_score.h5ad" +) +## VIASH END + 
+cat("Read solution h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r") + +cat("Read prediction h5ad\n") +expect_true( + grepl("\\.h5ad$", par$input_prediction), + info = "Prediction file should be an h5ad file" +) +ad_pred <- + tryCatch({ + anndata::read_h5ad(par$input_prediction) + }, error = function(e) { + stop(paste0("Can't open prediction h5ad file. Detailed error message:\n", e$message)) + }) +expect_true( + ad_sol$uns$dataset_id == ad_pred$uns$dataset_id +) + +X_pred <- as(ad_pred$X, "CsparseMatrix")[, order(ad_sol$uns$pairing_ix)] +dimnames(X_pred) <- list(NULL, NULL) + +cat("Data wrangling\n") +pred_summ <- summary(X_pred) %>% + as_tibble() %>% + mutate(gold = i == j) %>% + arrange(desc(x)) + +# helper function +calculate_au <- function(values, are_true, num_positive_interactions, num_possible_interactions, extend_by = 10000) { + ord <- order(rank(values, ties.method = "random"), decreasing = TRUE) + values <- values[ord] + are_true <- are_true[ord] + + # calculate base statistics + num_selected <- seq_along(are_true) + tp <- cumsum(are_true) + fp <- num_selected - tp + length_ranking <- length(tp) + num_negative_interactions <- num_possible_interactions - num_positive_interactions + + # extend base statistics, if necessary + if (extend_by > 0 && length_ranking != num_possible_interactions) { + diff.predictions <- num_possible_interactions - length_ranking + diff.trues <- num_positive_interactions - tail(tp, 1) + diff.negs <- num_negative_interactions - tail(fp, 1) + + multiplier <- seq_len(extend_by) / extend_by + + extra_num_selected <- multiplier * diff.predictions + tail(num_selected, 1) + extra_tp <- multiplier * diff.trues + tail(tp, 1) + extra_fp <- multiplier * diff.negs + tail(fp, 1) + + num_selected <- c(num_selected, extra_num_selected) + are_true <- c(are_true, rep(NA, extend_by)) + values <- c(values, rep(NA, extend_by)) + tp <- c(tp, extra_tp) + fp <- c(fp, extra_fp) + } + + # calculate extended statistics + metrics <- 
tibble( + num_selected = c(0, num_selected), + value = c(NA, values), + are_true = c(NA, are_true), + tp = c(0, tp), + fp = c(0, fp), + fn = num_positive_interactions - tp, + tn = num_negative_interactions - fp, + acc = (tp + tn) / (num_positive_interactions + num_negative_interactions), + tpr = tp / num_positive_interactions, + spec = tn / num_negative_interactions, + prec = ifelse(num_selected == 0, 1, tp / (tp + fp)), + npv = tn / (tn + fn), + f1 = 2 * tp / (2 * tp + fp + fn), + mcc = ifelse(num_selected == 0, 0, (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))), + informedness = tpr + spec - 1, + markedness = prec + npv - 1 + ) + + # calculate area under the curves + area_under <- tibble( + auroc = pracma::trapz(1 - metrics$spec, metrics$tpr), + aupr = abs(pracma::trapz(metrics$tpr, metrics$prec)) + ) + + list(metrics = metrics, area_under = area_under) +} + + +cat("Calculate area under the curve\n") +au_out <- calculate_au( + values = pred_summ$x, + are_true = pred_summ$gold, + num_positive_interactions = nrow(X_pred), + num_possible_interactions = (nrow(X_pred) * 1.0) * nrow(X_pred) +) + +# GENIE3bis::plot_curves(au_out) + +colnames(au_out$area_under) <- paste0("pairing_", colnames(au_out$area_under)) + +cat("Create output object\n") +out_values <- c( + as.list(au_out$area_under) +) + +out <- anndata::AnnData( + X = NULL, + shape = dim(ad_sol), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = names(out_values), + metric_values = as.numeric(out_values), + genie3 = au_out + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") \ No newline at end of file diff --git a/src/match_modality/metrics/aupr/test_custom.R b/src/match_modality/metrics/aupr/test_custom.R new file mode 100644 index 0000000000..61e120f8d4 --- /dev/null +++ b/src/match_modality/metrics/aupr/test_custom.R @@ -0,0 +1,100 @@ +library(assertthat, quietly = TRUE) 
+library(Matrix, quietly = TRUE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +# This code block will be replaced by viash at runtime. +meta <- list(functionality_name = "foo") +## VIASH END + +# determine filenames and arguments +testpar <- list( + "input_solution" = "temp_sol.h5ad", + "input_prediction" = "temp_pred.h5ad", + "output" = "temp_out.h5ad" +) +command <- paste0("./", meta[["functionality_name"]]) +args <- unlist(rbind(paste0("--", names(testpar)), unname(testpar))) + +# uncomment this for manual testing +# command <- "viash" +# args <- c("run", "src/match_modality/metrics/aupr/config.vsh.yaml", "--", args) + +cat("Creating test files\n") +ad_sol <- anndata::AnnData( + X = as(Matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix"), + layers = list( + neighbors = as(Matrix(c(1, 0.5, 0, 0.5, 1, 0.25, 0, 0.25, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix") + ), + uns = list(dataset_id = "simple", pairing_ix = c(0, 1, 2)), + obs = data.frame( + cell_type = c("a", "a", "b") + ) +) +ad_pred <- anndata::AnnData( + X = as(Matrix(c(1, .1, .2, .3, .9, .4, .5, .6, .8), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix"), + uns = list(dataset_id = "simple", method_id = "simple") +) + +ad_sol$write_h5ad(testpar$input_solution, compression = "gzip") +ad_pred$write_h5ad(testpar$input_prediction, compression = "gzip") + +cat("> Running metric\n") +out <- processx::run( + command = command, + args = args, + stderr_to_stdout = TRUE +) + +cat("> Reading metric scores\n") +assert_that(file.exists(testpar$output)) +ad_out <- anndata::read_h5ad(testpar$output) + +scores1 <- ad_out$uns$metric_values +names(scores1) <- ad_out$uns$metric_ids +# assert_that( +# scores1[["pairing_aupr"]] >= scores1[["neighbor_aupr"]], +# scores1[["neighbor_aupr"]] >= scores1[["celltype_aupr"]], +# scores1[["pairing_auroc"]] >= scores1[["neighbor_auroc"]], +# scores1[["pairing_auroc"]] >= scores1[["celltype_auroc"]] +# ) + 
+ +cat("Creating test files\n") +# pairing_ix <- c(2, 1, 3) +pairing_ix <- c(3, 1, 2) +ad_sol <- anndata::AnnData( + X = as(diag(length(pairing_ix)), "CsparseMatrix")[,pairing_ix], + layers = list( + neighbors = as(Matrix(c(1, 0.5, 0, 0.5, 1, 0.25, 0, 0.25, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix")[,pairing_ix] + ), + uns = list(dataset_id = "simple", pairing_ix = pairing_ix-1), + obs = data.frame( + cell_type = c("a", "a", "b") + ) +) +ad_pred <- anndata::AnnData( + X = as(Matrix(c(1, .1, .2, .3, .9, .4, .5, .6, .8), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix")[,pairing_ix], + uns = list(dataset_id = "simple", method_id = "simple") +) + +ad_sol$write_h5ad(testpar$input_solution, compression = "gzip") +ad_pred$write_h5ad(testpar$input_prediction, compression = "gzip") + +cat("> Running metric\n") +out <- processx::run( + command = command, + args = args, + stderr_to_stdout = TRUE +) + +cat("> Reading metric scores\n") +assert_that(file.exists(testpar$output)) +ad_out <- anndata::read_h5ad(testpar$output) + +scores2 <- ad_out$uns$metric_values +names(scores2) <- ad_out$uns$metric_ids + +assert_that(all(scores1 == scores2)) + +cat("> Test succeeded!\n") From 6014e24c0844102097da2d9545529174fac5622b Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 12 Jan 2023 15:06:24 +0100 Subject: [PATCH 50/82] add check_format metric --- .../metrics/check_format/config.vsh.yaml | 35 ++++++++++ .../metrics/check_format/script.R | 67 +++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 src/match_modality/metrics/check_format/config.vsh.yaml create mode 100644 src/match_modality/metrics/check_format/script.R diff --git a/src/match_modality/metrics/check_format/config.vsh.yaml b/src/match_modality/metrics/check_format/config.vsh.yaml new file mode 100644 index 0000000000..5f7f846065 --- /dev/null +++ b/src/match_modality/metrics/check_format/config.vsh.yaml @@ -0,0 +1,35 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + 
name: check_format + namespace: match_modality/metrics + description: Checking whether the prediction of a method has the right format. + info: + metrics: + - metric_id: finished + metric_name: Finished + metric_description: Check if metric finished successfully + maximize: true + min: 0 + max: 1 + - metric_id: correct_format + metric_name: Correct Format + metric_description: Check if prediction has the right format + maximize: true + min: 0 + max: 1 + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8, pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowtime, lowcpu ] \ No newline at end of file diff --git a/src/match_modality/metrics/check_format/script.R b/src/match_modality/metrics/check_format/script.R new file mode 100644 index 0000000000..ade396c0bb --- /dev/null +++ b/src/match_modality/metrics/check_format/script.R @@ -0,0 +1,67 @@ +cat("Load dependencies\n") +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +task <- "match_modality" +par <- list( + input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad"), + input_prediction = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"), + output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad") +) +## VIASH END + +cat("Read prediction h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r") + +cat("Checking solution h5ad\n") +correct_format <- tryCatch({ + # read prediction + ad_pred <- anndata::read_h5ad(par$input_prediction) + + # check dataset id + 
dataset_id <- ad_pred$uns[["dataset_id"]] + assert_that(dataset_id == ad_sol$uns[["dataset_id"]]) + + # check method id + method_id <- ad_pred$uns[["method_id"]] + assert_that( + is.character(method_id), + method_id != "" + ) + + # check X + assert_that( + is(ad_pred$X, "sparseMatrix"), + ad_pred$n_obs == ad_sol$n_obs, + ad_pred$n_vars == ad_sol$n_vars, + length(ad_pred$X@x) <= 1000 * ad_sol$n_obs, + all(ad_pred$X@x >= 0), + isTRUE(all.equal( + Matrix::rowSums(ad_pred$X), + rep(1, ad_pred$n_obs), + check.attributes = FALSE, + tolerance = 1e-5 + )) + ) + + 1 +}, error = function(e) { + cat("ERROR: ", e$message, "\n", sep = "") + 0 +}) + + +cat("Create output object\n") +out <- anndata::AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = c("finished", "correct_format"), + metric_values = c(1, correct_format) + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 33f21a07ec67c6612bf174ea289159900b9fba1f Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 12 Jan 2023 15:46:21 +0100 Subject: [PATCH 51/82] add match_probability metric --- src/match_modality/api/comp_metric.yaml | 4 +- .../metrics/match_probability/config.vsh.yaml | 30 ++++++++++ .../metrics/match_probability/script.R | 56 +++++++++++++++++++ 3 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 src/match_modality/metrics/match_probability/config.vsh.yaml create mode 100644 src/match_modality/metrics/match_probability/script.R diff --git a/src/match_modality/api/comp_metric.yaml b/src/match_modality/api/comp_metric.yaml index 12c5a070ac..ae94586882 100644 --- a/src/match_modality/api/comp_metric.yaml +++ b/src/match_modality/api/comp_metric.yaml @@ -33,9 +33,7 @@ functionality: ] print("> Running method", flush=True) - out = subprocess.run(cmd, stderr=subprocess.STDOUT, check=True) - - # print(out.stdout) + out = subprocess.run(cmd, check=True) print("> 
Checking whether output files were created", flush=True) assert path.exists(output_path) diff --git a/src/match_modality/metrics/match_probability/config.vsh.yaml b/src/match_modality/metrics/match_probability/config.vsh.yaml new file mode 100644 index 0000000000..6214b49ad9 --- /dev/null +++ b/src/match_modality/metrics/match_probability/config.vsh.yaml @@ -0,0 +1,30 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: match_probability + namespace: match_modality/metrics + description: Calculating basic metrics for task2. + info: + metrics: + - metric_id: match_probability + metric_name: Match Probability + metric_description: Calculating basic metrics for match modality + min: 0 + max: 1 + maximize: true + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8, pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowtime, lowcpu ] + diff --git a/src/match_modality/metrics/match_probability/script.R b/src/match_modality/metrics/match_probability/script.R new file mode 100644 index 0000000000..b68051ad2b --- /dev/null +++ b/src/match_modality/metrics/match_probability/script.R @@ -0,0 +1,56 @@ +cat("Load dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_solution = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad", + input_prediction = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + output = 
"resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad" +) +## VIASH END + +cat("Read solution h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r") + +cat("Read prediction h5ad\n") +ad_pred <- anndata::read_h5ad(par$input_prediction) + +cat("Unscrambling predictions\n") +pairing_ix <- ad_sol$uns[["pairing_ix"]] +X_pred <- as(ad_pred$X, "CsparseMatrix")[, order(pairing_ix)] +dimnames(X_pred) <- list(NULL, NULL) + +# set negative values to 0 +X_pred@x <- ifelse(X_pred@x < 0, 0, X_pred@x) + +cat("Calculating normalisation factors\n") +rowSum <- Matrix::rowSums(X_pred) + +cat("Computing the match modality score\n") +match_probability_vec <- diag(X_pred) / rowSum + +match_probability <- mean(match_probability_vec) + +cat("Create output object\n") +out <- anndata::AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = list("match_probability"), + metric_values = list(match_probability), + per_cell = list( + match_probability = match_probability_vec + ) + ) +) + +# should we also save the metrics object? +# this would allow for plotting the auroc and aupr curves afterwards. 
+ +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") \ No newline at end of file From 3070f537cc14ad98ef0625dba49ee91a9784f75b Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 12 Jan 2023 16:04:33 +0100 Subject: [PATCH 52/82] add resources and resources_test scripts --- .../resources_scripts/mask_datasets.sh | 64 ++++++++++++++++ .../resources_scripts/run_benchmarks.sh | 74 +++++++++++++++++++ .../resources_test_scripts/bmmc_cite.sh | 57 ++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 src/match_modality/resources_scripts/mask_datasets.sh create mode 100644 src/match_modality/resources_scripts/run_benchmarks.sh create mode 100644 src/match_modality/resources_test_scripts/bmmc_cite.sh diff --git a/src/match_modality/resources_scripts/mask_datasets.sh b/src/match_modality/resources_scripts/mask_datasets.sh new file mode 100644 index 0000000000..a70cc985d5 --- /dev/null +++ b/src/match_modality/resources_scripts/mask_datasets.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +COMMON_DATASETS="resources/datasets/openproblems_v1" +OUTPUT_DIR="resources/match_modality/datasets/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! 
-f $params_file ]; then + python << HERE +import anndata as ad +import glob +import yaml + +h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad") + +# this task doesn't use normalizations +# +param_list = {} + +for h5ad_file in h5ad_files: + print(f"Checking {h5ad_file}") + adata = ad.read_h5ad(h5ad_file, backed=True) + if "counts" in adata.layers: + dataset_id = adata.uns["dataset_id"].replace("/", ".") + obj = { + 'id': dataset_id, + 'input': h5ad_file, + 'dataset_id': dataset_id, + } + param_list[dataset_id] = obj + +output = { + "param_list": list(param_list.values()), + "seed": 123, + "output_train": "\$id.train.h5ad", + "output_test": "\$id.test.h5ad" +} + +with open("$params_file", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . \ + -main-script target/nextflow/match_modality/mask_dataset/main.nf \ + -profile docker \ + -resume \ + -params-file $params_file \ + --publish_dir "$OUTPUT_DIR" + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/match_modality/resources_scripts/run_benchmarks.sh b/src/match_modality/resources_scripts/run_benchmarks.sh new file mode 100644 index 0000000000..3bf2418934 --- /dev/null +++ b/src/match_modality/resources_scripts/run_benchmarks.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources/match_modality/datasets/openproblems_v1" +OUTPUT_DIR="resources/match_modality/benchmarks/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! 
-f $params_file ]; then + python << HERE +import yaml +import os + +dataset_dir = "$DATASETS_DIR" +output_dir = "$OUTPUT_DIR" + +# read split datasets yaml +with open(dataset_dir + "/params.yaml", "r") as file: + split_list = yaml.safe_load(file) +datasets = split_list['param_list'] + +# figure out where train/test files were stored +param_list = [] + +for dataset in datasets: + id = dataset["id"] + input_train = dataset_dir + "/" + id + ".train.h5ad" + input_test = dataset_dir + "/" + id + ".test.h5ad" + + if os.path.exists(input_test): + obj = { + 'id': id, + 'id': id, + 'id': id, + 'dataset_id': dataset["dataset_id"], + 'input_train': input_train, + 'input_test': input_test + } + param_list.append(obj) + +# write as output file +output = { + "param_list": param_list, +} + +with open(output_dir + "/params.yaml", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . \ + -main-script src/match_modality/workflows/run/main.nf \ + -profile docker \ + -params-file "$params_file" \ + --publish_dir "$OUTPUT_DIR" \ + -with-tower + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/match_modality/resources_test_scripts/bmmc_cite.sh b/src/match_modality/resources_test_scripts/bmmc_cite.sh new file mode 100644 index 0000000000..5a46d9b08f --- /dev/null +++ b/src/match_modality/resources_test_scripts/bmmc_cite.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +#make sure the following command has been executed +#viash ns build -q 'match_modality|common' --parallel --setup cb + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad 
+MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad +DATASET_DIR=resources_test/match_modality/bmmc_cite + +if [ ! -f $MOD_1_DATA ]; then + echo "Error! Could not find raw data" + exit 1 +fi + +mkdir -p $DATASET_DIR + +# maskdataset +viash run src/match_modality/mask_dataset/config.vsh.yaml -- \ + --input_mod1 $MOD_1_DATA \ + --input_mod2 $MOD_2_DATA \ + --output_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --output_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output_solution $DATASET_DIR/cite_solution.h5ad + +# run one method +viash run src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml -- \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output $DATASET_DIR/dr_knnr_cbf.h5ad + +# run one metric +viash run src/match_modality/metrics/aupr/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/dr_knnr_cbf.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output $DATASET_DIR/aupr.h5ad + +# run benchmark +export NXF_VER=22.04.5 + +nextflow \ + run . 
\ + -main-script src/match_modality/workflows/run/main.nf \ + -profile docker \ + -resume \ + --id bmmc_cite \ + --dataset_id bmmc_site \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output scores.tsv \ + --publish_dir $DATASET_DIR/ \ No newline at end of file From 4e134e525c90215d8718b5bb4398f89899c245e1 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 12 Jan 2023 21:23:51 +0100 Subject: [PATCH 53/82] add NF workflow --- src/match_modality/api/comp_mask_dataset.yaml | 4 + .../methods/newwave_knnr_knn/config.vsh.yaml | 2 +- .../resources_test_scripts/bmmc_cite.sh | 28 +-- .../workflows/run/config.vsh.yaml | 32 ++++ src/match_modality/workflows/run/main.nf | 168 ++++++++++++++++++ .../workflows/run/nextflow.config | 14 ++ 6 files changed, 237 insertions(+), 11 deletions(-) create mode 100644 src/match_modality/workflows/run/config.vsh.yaml create mode 100644 src/match_modality/workflows/run/main.nf create mode 100644 src/match_modality/workflows/run/nextflow.config diff --git a/src/match_modality/api/comp_mask_dataset.yaml b/src/match_modality/api/comp_mask_dataset.yaml index 0737a499a0..3876ef397e 100644 --- a/src/match_modality/api/comp_mask_dataset.yaml +++ b/src/match_modality/api/comp_mask_dataset.yaml @@ -5,12 +5,16 @@ functionality: - name: "--input_mod2" __merge__: anndata_dataset.yaml - name: "--output_train_mod1" + direction: "output" __merge__: anndata_masked_mod.yaml - name: "--output_train_mod2" + direction: "output" __merge__: anndata_masked_mod.yaml - name: "--output_train_sol" + direction: "output" __merge__: anndata_solution.yaml - name: "--output_test_mod1" + direction: "output" __merge__: anndata_masked_mod.yaml - name: "--output_test_mod2" direction: "output" diff --git a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml index f02a5ed909..a95e913494 100644 --- 
a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml +++ b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml @@ -1,6 +1,6 @@ __merge__: ../../api/comp_method.yaml functionality: - name: baseline_newwave_knnr_knn + name: newwave_knnr_knn namespace: match_modality/methods description: Perform DR with NewWave, predict embedding with KNN regression, and match cells with a KNN. info: diff --git a/src/match_modality/resources_test_scripts/bmmc_cite.sh b/src/match_modality/resources_test_scripts/bmmc_cite.sh index 5a46d9b08f..d3debaa836 100644 --- a/src/match_modality/resources_test_scripts/bmmc_cite.sh +++ b/src/match_modality/resources_test_scripts/bmmc_cite.sh @@ -24,20 +24,26 @@ mkdir -p $DATASET_DIR viash run src/match_modality/mask_dataset/config.vsh.yaml -- \ --input_mod1 $MOD_1_DATA \ --input_mod2 $MOD_2_DATA \ - --output_mod1 $DATASET_DIR/cite_mod1.h5ad \ - --output_mod2 $DATASET_DIR/cite_mod2.h5ad \ - --output_solution $DATASET_DIR/cite_solution.h5ad + --output_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \ + --output_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \ + --output_train_sol $DATASET_DIR/cite_train_sol.h5ad \ + --output_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \ + --output_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \ + --output_test_sol $DATASET_DIR/cite_test_sol.h5ad # run one method viash run src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml -- \ - --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ - --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --input_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \ + --input_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \ + --input_train_sol $DATASET_DIR/cite_train_sol.h5ad \ + --input_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \ + --input_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \ --output $DATASET_DIR/dr_knnr_cbf.h5ad # run one metric viash run src/match_modality/metrics/aupr/config.vsh.yaml -- \ --input_prediction $DATASET_DIR/dr_knnr_cbf.h5ad \ - --input_solution $DATASET_DIR/cite_solution.h5ad 
\ + --input_solution $DATASET_DIR/cite_test_sol.h5ad \ --output $DATASET_DIR/aupr.h5ad # run benchmark @@ -47,11 +53,13 @@ nextflow \ run . \ -main-script src/match_modality/workflows/run/main.nf \ -profile docker \ - -resume \ --id bmmc_cite \ --dataset_id bmmc_site \ - --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ - --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ - --input_solution $DATASET_DIR/cite_solution.h5ad \ + --input_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \ + --input_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \ + --input_train_sol $DATASET_DIR/cite_train_sol.h5ad \ + --input_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \ + --input_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \ + --input_solution $DATASET_DIR/cite_test_sol.h5ad \ --output scores.tsv \ --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/match_modality/workflows/run/config.vsh.yaml b/src/match_modality/workflows/run/config.vsh.yaml new file mode 100644 index 0000000000..58db4d0b1e --- /dev/null +++ b/src/match_modality/workflows/run/config.vsh.yaml @@ -0,0 +1,32 @@ +functionality: + name: "run_benchmark" + namespace: "match_modality/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: "string" + description: "The ID of the dataset" + required: true + - name: "--input_train_mod1" + type: "file" # todo: replace with includes + - name: "--input_train_mod2" + type: "file" + - name: "--input_train_sol" + type: "file" # todo: replace with includes + - name: "--input_test_mod1" + type: "file"# todo: replace with includes + - name: "--input_test_mod2" + type: "file" + - name: "--input_solution" + type: "file" # todo: replace with includes + - name: Outputs + arguments: + - name: "--output" + direction: "output" + type: file + resources: + - type: nextflow_script + path: main.nf +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/match_modality/workflows/run/main.nf b/src/match_modality/workflows/run/main.nf new file mode 100644 
index 0000000000..fe48e3f63f --- /dev/null +++ b/src/match_modality/workflows/run/main.nf @@ -0,0 +1,168 @@ +nextflow.enable.dsl=2 + +sourceDir = params.rootDir + "/src" +targetDir = params.rootDir + "/target/nextflow" + +// import control methods +include { constant } from "$targetDir/match_modality/control_methods/constant/main.nf" +include { random_pairing } from "$targetDir/match_modality/control_methods/random_pairing/main.nf" +include { semi_solution } from "$targetDir/match_modality/control_methods/semi_solution/main.nf" +include { solution } from "$targetDir/match_modality/control_methods/solution/main.nf" + + +// import methods +// include { babel_knn } from "$targetDir/match_modality/methods/babel_knn/main.nf" +include { dr_knnr_cbf } from "$targetDir/match_modality/methods/dr_knnr_cbf/main.nf" +include { dr_knnr_knn } from "$targetDir/match_modality/methods/dr_knnr_knn/main.nf" +include { linear_knn } from "$targetDir/match_modality/methods/linear_knn/main.nf" +include { newwave_knnr_cbf } from "$targetDir/match_modality/methods/newwave_knnr_cbf/main.nf" +include { newwave_knnr_knn } from "$targetDir/match_modality/methods/newwave_knnr_knn/main.nf" +include { procrustes_knn } from "$targetDir/match_modality/methods/procrustes_knn/main.nf" + + +// import metrics +include { aupr } from "$targetDir/match_modality/metrics/aupr/main.nf" +include { check_format } from "$targetDir/match_modality/metrics/check_format/main.nf" +include { match_probability } from "$targetDir/match_modality/metrics/match_probability/main.nf" + +// tsv generation component +include { extract_scores } from "$targetDir/common/extract_scores/main.nf" + +// import helper functions +include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf" +include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + "/wf_utils/DataflowHelper.nf" + +config = readConfig("$projectDir/config.vsh.yaml") + +// construct a map of methods 
(id -> method_module) +methods = [ dr_knnr_cbf, dr_knnr_knn, linear_knn, newwave_knnr_cbf, newwave_knnr_knn, procrustes_knn] + .collectEntries{method -> + [method.config.functionality.name, method] + } + +workflow { + helpMessage(config) + + viashChannel(params, config) + | run_wf +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // split params for downstream components + | setWorkflowArguments( + method: ["input_train_mod1", "input_train_mod2", "input_train_sol", "input_test_mod1", "input_test_mod2"], + metric: ["input_solution"], + output: ["output"] + ) + + // multiply events by the number of method + | add_methods + + // add input_solution to data for the positive controls + | controls_can_cheat + + // run methods + | getWorkflowArguments(key: "method") + | run_methods + + // construct tuples for metrics + | pmap{ id, file, passthrough -> + // derive unique ids from output filenames + def newId = file.getName().replaceAll(".output.*", "") + // combine prediction with solution + def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ] + [ newId, newData, passthrough ] + } + + // run metrics + | getWorkflowArguments(key: "metric") + | run_metrics + + // convert to tsv + | aggregate_results + + emit: + output_ch +} + +workflow add_methods { + take: input_ch + main: + output_ch = Channel.fromList(methods.keySet()) + | combine(input_ch) + + // generate combined id for method_id and dataset_id + | pmap{method_id, dataset_id, data -> + def new_id = dataset_id + "." 
+ method_id + def new_data = data.clone() + [method_id: method_id] + new_data.remove("id") + [new_id, new_data] +} + emit: output_ch +} + +workflow controls_can_cheat { + take: input_ch + main: + output_ch = input_ch + | pmap{id, data, passthrough -> + def method = methods[data.method_id] + def method_type = method.config.functionality.info.method_type + def new_data = data.clone() + if (method_type != "method") { + new_data = new_data + [input_test_sol: passthrough.metric.input_solution] + } + [id, new_data, passthrough] + } + emit: output_ch +} + +workflow run_methods { + take: input_ch + main: + // generate one channel per method + method_chs = methods.collect { method_id, method_module -> + input_ch + | filter{it[1].method_id == method_id} + | method_module + } + // mix all results + output_ch = method_chs[0].mix(*method_chs.drop(1)) + + emit: output_ch +} + +workflow run_metrics { + take: input_ch + main: + + output_ch = input_ch + | (aupr & check_format & match_probability) + | mix + + emit: output_ch +} + +workflow aggregate_results { + take: input_ch + main: + + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ it -> + [ "combined", it.collect{ it[1] } ] + it[0].drop(2) + } + | getWorkflowArguments(key: "output") + | extract_scores.run( + auto: [ publish: true ] + ) + + emit: output_ch +} \ No newline at end of file diff --git a/src/match_modality/workflows/run/nextflow.config b/src/match_modality/workflows/run/nextflow.config new file mode 100644 index 0000000000..6e33495d3a --- /dev/null +++ b/src/match_modality/workflows/run/nextflow.config @@ -0,0 +1,14 @@ +manifest { + name = 'match_modality/workflows/run' + mainScript = 'main.nf' + nextflowVersion = '!>=22.04.5' + description = 'Multi modality - match modality' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings 
+includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config") +includeConfig("${params.rootDir}/src/wf_utils/labels.config") \ No newline at end of file From 8aad000fe905439ea1cdff1433960baf3563cf85 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 13 Jan 2023 11:42:21 +0100 Subject: [PATCH 54/82] fix directives --- src/match_modality/control_methods/constant/config.vsh.yaml | 3 ++- .../control_methods/random_pairing/config.vsh.yaml | 3 ++- .../control_methods/semi_solution/config.vsh.yaml | 3 ++- src/match_modality/control_methods/solution/config.vsh.yaml | 3 ++- src/match_modality/methods/babel_knn/config.vsh.yaml | 3 ++- src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml | 3 ++- src/match_modality/methods/dr_knnr_knn/config.vsh.yaml | 3 ++- src/match_modality/methods/linear_knn/config.vsh.yaml | 3 ++- src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml | 3 ++- src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml | 3 ++- src/match_modality/methods/procrustes_knn/config.vsh.yaml | 3 ++- src/match_modality/metrics/aupr/config.vsh.yaml | 2 +- src/match_modality/metrics/check_format/config.vsh.yaml | 2 +- src/match_modality/metrics/match_probability/config.vsh.yaml | 2 +- 14 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/match_modality/control_methods/constant/config.vsh.yaml b/src/match_modality/control_methods/constant/config.vsh.yaml index a674e4ecf0..c20acf234f 100644 --- a/src/match_modality/control_methods/constant/config.vsh.yaml +++ b/src/match_modality/control_methods/constant/config.vsh.yaml @@ -20,4 +20,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/random_pairing/config.vsh.yaml b/src/match_modality/control_methods/random_pairing/config.vsh.yaml index 3218f32279..fd9da8d595 100644 --- a/src/match_modality/control_methods/random_pairing/config.vsh.yaml 
+++ b/src/match_modality/control_methods/random_pairing/config.vsh.yaml @@ -16,4 +16,5 @@ platforms: - type: python pip: [ anndata>=0.8, numpy, scikit-learn ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/match_modality/control_methods/semi_solution/config.vsh.yaml b/src/match_modality/control_methods/semi_solution/config.vsh.yaml index da39c07aab..756716feb6 100644 --- a/src/match_modality/control_methods/semi_solution/config.vsh.yaml +++ b/src/match_modality/control_methods/semi_solution/config.vsh.yaml @@ -20,4 +20,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/solution/config.vsh.yaml b/src/match_modality/control_methods/solution/config.vsh.yaml index ca9d8d5904..ae4d48430e 100644 --- a/src/match_modality/control_methods/solution/config.vsh.yaml +++ b/src/match_modality/control_methods/solution/config.vsh.yaml @@ -20,4 +20,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/babel_knn/config.vsh.yaml b/src/match_modality/methods/babel_knn/config.vsh.yaml index 55c4498547..4a7cdc2e31 100644 --- a/src/match_modality/methods/babel_knn/config.vsh.yaml +++ b/src/match_modality/methods/babel_knn/config.vsh.yaml @@ -37,4 +37,5 @@ platforms: - cd / && git clone --depth 1 https://github.com/rcannood/babel.git - cd /babel && /opt/conda/bin/conda env create -f environment.yml - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml index 86e00c8c9a..37321fcad1 100644 --- 
a/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml +++ b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml @@ -25,4 +25,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ midmem, midtime, midcpu ] + directives: + label: [ midmem, midcpu ] diff --git a/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml index e03bf3be48..fd9387d6d4 100644 --- a/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml +++ b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml @@ -20,4 +20,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/linear_knn/config.vsh.yaml b/src/match_modality/methods/linear_knn/config.vsh.yaml index ab26297bd7..20fa30783b 100644 --- a/src/match_modality/methods/linear_knn/config.vsh.yaml +++ b/src/match_modality/methods/linear_knn/config.vsh.yaml @@ -25,4 +25,5 @@ platforms: - type: python pip: [ anndata, scipy, scikit-learn ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml index 9fbf3a8887..6fcc058b2a 100644 --- a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml +++ b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml @@ -38,4 +38,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ highmem, midtime, highcpu ] + directives: + label: [ highmem, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml index a95e913494..4293005a23 100644 --- a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml +++ b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml @@ -33,4 +33,5 @@ 
platforms: - type: python pip: [ anndata>=0.8, tensorflow ] - type: nextflow - directives: [ highmem, midtime, highcpu ] + directives: + label: [ highmem, highcpu ] diff --git a/src/match_modality/methods/procrustes_knn/config.vsh.yaml b/src/match_modality/methods/procrustes_knn/config.vsh.yaml index 1c317ca3d9..65d0327291 100644 --- a/src/match_modality/methods/procrustes_knn/config.vsh.yaml +++ b/src/match_modality/methods/procrustes_knn/config.vsh.yaml @@ -21,4 +21,5 @@ platforms: - type: python pip: [ anndata>=0.8, scipy, scikit-learn ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/metrics/aupr/config.vsh.yaml b/src/match_modality/metrics/aupr/config.vsh.yaml index 3b655555d1..205efb353e 100644 --- a/src/match_modality/metrics/aupr/config.vsh.yaml +++ b/src/match_modality/metrics/aupr/config.vsh.yaml @@ -36,5 +36,5 @@ platforms: pip: [ anndata>=0.8, pyyaml ] - type: nextflow directives: - label: [ lowmem, lowtime, lowcpu ] + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/metrics/check_format/config.vsh.yaml b/src/match_modality/metrics/check_format/config.vsh.yaml index 5f7f846065..9519865682 100644 --- a/src/match_modality/metrics/check_format/config.vsh.yaml +++ b/src/match_modality/metrics/check_format/config.vsh.yaml @@ -32,4 +32,4 @@ platforms: pip: [ anndata>=0.8, pyyaml ] - type: nextflow directives: - label: [ lowmem, lowtime, lowcpu ] \ No newline at end of file + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/match_modality/metrics/match_probability/config.vsh.yaml b/src/match_modality/metrics/match_probability/config.vsh.yaml index 6214b49ad9..c9c274f954 100644 --- a/src/match_modality/metrics/match_probability/config.vsh.yaml +++ b/src/match_modality/metrics/match_probability/config.vsh.yaml @@ -26,5 +26,5 @@ platforms: pip: [ anndata>=0.8, pyyaml ] - type: nextflow directives: - label: [ lowmem, lowtime, lowcpu ] + label: [ 
lowmem, lowcpu ] From f3a00176c0c398a0fde5901eb6084346374c4248 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 13 Jan 2023 15:45:25 +0100 Subject: [PATCH 55/82] fix configs --- src/match_modality/control_methods/solution/config.vsh.yaml | 2 +- src/match_modality/metrics/check_format/config.vsh.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/match_modality/control_methods/solution/config.vsh.yaml b/src/match_modality/control_methods/solution/config.vsh.yaml index ae4d48430e..b352fdaedf 100644 --- a/src/match_modality/control_methods/solution/config.vsh.yaml +++ b/src/match_modality/control_methods/solution/config.vsh.yaml @@ -21,4 +21,4 @@ platforms: pip: [ anndata>=0.8 ] - type: nextflow directives: - label: [ lowmem, lowcpu ] + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/metrics/check_format/config.vsh.yaml b/src/match_modality/metrics/check_format/config.vsh.yaml index 9519865682..94e833b471 100644 --- a/src/match_modality/metrics/check_format/config.vsh.yaml +++ b/src/match_modality/metrics/check_format/config.vsh.yaml @@ -25,7 +25,7 @@ platforms: image: eddelbuettel/r2u:22.04 setup: - type: r - cran: [ anndata ] + cran: [ anndata, bit64] - type: apt packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] - type: python From a8895dc2d28a84124759320b9c53e5da20912139 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 13 Jan 2023 16:54:23 +0100 Subject: [PATCH 56/82] fix directive labels --- .../control_methods/random_embed/config.vsh.yaml | 3 ++- .../control_methods/zeros_embed/config.vsh.yaml | 3 ++- src/joint_embedding/mask_dataset/config.vsh.yaml | 3 ++- src/joint_embedding/methods/lmds/config.vsh.yaml | 3 ++- src/joint_embedding/methods/mnn/config.vsh.yaml | 3 ++- src/joint_embedding/methods/newwave/config.vsh.yaml | 3 ++- src/joint_embedding/methods/pca/config.vsh.yaml | 3 ++- src/joint_embedding/methods/totalvi/config.vsh.yaml | 3 ++- 
src/joint_embedding/methods/umap/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/ari/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/asw_batch/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/asw_label/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/cc_cons/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/check_format/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/latent_mixing/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/nmi/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/rfoob/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/ti_cons/config.vsh.yaml | 3 ++- src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml | 3 ++- 20 files changed, 40 insertions(+), 20 deletions(-) diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml index cdc3061a8e..5eefbfcd1e 100644 --- a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml +++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml @@ -21,4 +21,5 @@ platforms: - type: python pip: [ anndata>=0.8, numpy , scipy] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml index b2c95a79e2..32c2648284 100644 --- a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml +++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml @@ -21,4 +21,5 @@ platforms: - type: python pip: [ anndata, numpy, scipy ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml index d49464163d..872df1d9fa 100644 --- a/src/joint_embedding/mask_dataset/config.vsh.yaml +++ 
b/src/joint_embedding/mask_dataset/config.vsh.yaml @@ -27,4 +27,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ midmem, lowtime, lowcpu ] + directives: + label: [ midmem, lowcpu ] diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml index d01b922fd7..8d11aee807 100644 --- a/src/joint_embedding/methods/lmds/config.vsh.yaml +++ b/src/joint_embedding/methods/lmds/config.vsh.yaml @@ -29,4 +29,5 @@ platforms: - type: python pip: [anndata>=0.8] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml index 486beb9301..48b8403903 100644 --- a/src/joint_embedding/methods/mnn/config.vsh.yaml +++ b/src/joint_embedding/methods/mnn/config.vsh.yaml @@ -27,4 +27,5 @@ platforms: - type: python pip: [anndata>=0.8] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml index 2da0df3a4b..151dee6363 100644 --- a/src/joint_embedding/methods/newwave/config.vsh.yaml +++ b/src/joint_embedding/methods/newwave/config.vsh.yaml @@ -36,4 +36,5 @@ platforms: - type: python pip: [anndata>=0.8] - type: nextflow - directives: [ highmem, hightime, highcpu ] + directives: + label: [ highmem, highcpu ] diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml index 967de6da6e..a2187266ad 100644 --- a/src/joint_embedding/methods/pca/config.vsh.yaml +++ b/src/joint_embedding/methods/pca/config.vsh.yaml @@ -30,4 +30,5 @@ platforms: - type: python pip: [anndata>=0.8] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml 
b/src/joint_embedding/methods/totalvi/config.vsh.yaml index 74116f9abe..e418b21bf5 100644 --- a/src/joint_embedding/methods/totalvi/config.vsh.yaml +++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml @@ -27,7 +27,8 @@ platforms: - type: python pip: [ anndata>=0.8, scanpy, scikit-misc, scipy, scikit-learn, scvi-tools] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml index 4b10222d1c..ee9650ab95 100644 --- a/src/joint_embedding/methods/umap/config.vsh.yaml +++ b/src/joint_embedding/methods/umap/config.vsh.yaml @@ -42,4 +42,5 @@ platforms: - type: python pip: [anndata>=0.8] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml index 5097080a70..69e01c5a26 100644 --- a/src/joint_embedding/metrics/ari/config.vsh.yaml +++ b/src/joint_embedding/metrics/ari/config.vsh.yaml @@ -24,4 +24,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml index 24ab2503b5..5abd76fe8f 100644 --- a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml +++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml @@ -24,4 +24,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml index 74d7c24afe..9cf7ceac08 100644 --- a/src/joint_embedding/metrics/asw_label/config.vsh.yaml +++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml @@ -24,4 +24,5 
@@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml index 86a624c632..16c562c37c 100644 --- a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml +++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml @@ -24,4 +24,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ vhighmem, midtime, midcpu ] + directives: + label: [ vhighmem, midcpu ] diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml index 934af734e3..b2d8a021b4 100644 --- a/src/joint_embedding/metrics/check_format/config.vsh.yaml +++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml @@ -34,4 +34,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml index 8cbde10e3a..8b6a3871d0 100644 --- a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml +++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml @@ -24,4 +24,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml index 2ef4049453..ed07c525eb 100644 --- a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml +++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml @@ -28,4 +28,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file + directives: + label: [ lowmem, lowcpu ] \ No 
newline at end of file diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml index bf98b925de..cd49b342e8 100644 --- a/src/joint_embedding/metrics/nmi/config.vsh.yaml +++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml @@ -24,4 +24,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml index 5f080b8b08..a11a77d71f 100644 --- a/src/joint_embedding/metrics/rfoob/config.vsh.yaml +++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml @@ -46,4 +46,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml index e7d70b24b6..cf153437db 100644 --- a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml +++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml @@ -36,4 +36,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ midmem, lowtime, midcpu ] + directives: + label: [ midmem, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml index 43aed327f6..c0396b3516 100644 --- a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml +++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml @@ -36,4 +36,5 @@ platforms: - type: python pip: [ pyyaml ] - type: nextflow - directives: [ midmem, lowtime, midcpu ] + directives: + label: [ midmem, midcpu ] From a849f0bff7db704424ca8f712b096d38ae6862e8 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 13 Jan 2023 22:20:21 +0100 Subject: [PATCH 57/82] update configs to align with v1 metadata 
--- .../random_embed/config.vsh.yaml | 2 +- .../zeros_embed/config.vsh.yaml | 2 +- .../methods/lmds/config.vsh.yaml | 4 ++-- .../methods/mnn/config.vsh.yaml | 2 +- .../methods/newwave/config.vsh.yaml | 2 +- .../methods/pca/config.vsh.yaml | 2 +- .../methods/totalvi/config.vsh.yaml | 2 +- .../methods/umap/config.vsh.yaml | 2 +- .../metrics/ari/config.vsh.yaml | 6 ++--- .../metrics/asw_batch/config.vsh.yaml | 6 ++--- .../metrics/asw_label/config.vsh.yaml | 6 ++--- .../metrics/cc_cons/config.vsh.yaml | 6 ++--- .../metrics/check_format/config.vsh.yaml | 12 +++++----- .../graph_connectivity/config.vsh.yaml | 6 ++--- .../metrics/latent_mixing/config.vsh.yaml | 5 ++-- .../metrics/nmi/config.vsh.yaml | 6 ++--- .../metrics/rfoob/config.vsh.yaml | 24 +++++++++---------- .../metrics/ti_cons/config.vsh.yaml | 18 +++++++------- .../metrics/ti_cons_batch/config.vsh.yaml | 18 +++++++------- 19 files changed, 66 insertions(+), 65 deletions(-) diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml index 5eefbfcd1e..15b0c745e9 100644 --- a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml +++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml @@ -5,7 +5,7 @@ functionality: description: Generate a random embedding from a normal distribution. info: type: negative_control - label: Normal Dist. + method_name: Normal Dist. arguments: - name: "--n_dims" type: "integer" diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml index 32c2648284..2cd0a9b71f 100644 --- a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml +++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml @@ -5,7 +5,7 @@ functionality: description: Generate an embedding containing only zero values. 
info: type: negative_control - label: zeros_embed + method_name: zeros_embed arguments: - name: "--n_dims" type: "integer" diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml index 8d11aee807..440172d748 100644 --- a/src/joint_embedding/methods/lmds/config.vsh.yaml +++ b/src/joint_embedding/methods/lmds/config.vsh.yaml @@ -1,11 +1,11 @@ -__merge__: ../../api/comp_method.yaml +__merge__: ../../api/comp_method.yamllowmem functionality: name: lmds namespace: joint_embedding/methods description: Landmark MDS dimensionality reduction on the Spearman distance. info: type: method - label: "LMDS" + method_name: "LMDS" arguments: - name: "--distance_method" type: "string" diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml index 48b8403903..30dc728fd9 100644 --- a/src/joint_embedding/methods/mnn/config.vsh.yaml +++ b/src/joint_embedding/methods/mnn/config.vsh.yaml @@ -5,7 +5,7 @@ functionality: description: Mutual nearest neighbors correction followed by PCA. info: type: method - label: "MNN" + method_name: "MNN" arguments: - name: "--hvg_sel" type: "integer" diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml index 151dee6363..0939d7b6f7 100644 --- a/src/joint_embedding/methods/newwave/config.vsh.yaml +++ b/src/joint_embedding/methods/newwave/config.vsh.yaml @@ -5,7 +5,7 @@ functionality: description: Concatenated NewWave. info: type: method - label: "NewWave" + method_name: "NewWave" doi: "10.1101/2021.08.02.453487" arguments: - name: "--maxiter" diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml index a2187266ad..860a48c300 100644 --- a/src/joint_embedding/methods/pca/config.vsh.yaml +++ b/src/joint_embedding/methods/pca/config.vsh.yaml @@ -5,7 +5,7 @@ functionality: description: PCA dimensionality reduction. 
info: type: method - label: "PCA" + method_name: "PCA" authors: arguments: - name: "--n_dims" diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml b/src/joint_embedding/methods/totalvi/config.vsh.yaml index e418b21bf5..5b0969172a 100644 --- a/src/joint_embedding/methods/totalvi/config.vsh.yaml +++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml @@ -6,7 +6,7 @@ functionality: description: "totalVI: joint probabilistic modeling with Total Variational Inference" info: type: method - label: totalVI + method_name: totalVI doi: 10.1038/s41592-020-01050-x arguments: - name: --hvg_number diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml index ee9650ab95..5ef2d3d11e 100644 --- a/src/joint_embedding/methods/umap/config.vsh.yaml +++ b/src/joint_embedding/methods/umap/config.vsh.yaml @@ -6,7 +6,7 @@ functionality: description: UMAP dimensionality reduction on the Euclidean distance. info: type: method - label: UMAP + method_name: UMAP arguments: - name: "--n_dims" type: "integer" diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml index 69e01c5a26..41ef532e5a 100644 --- a/src/joint_embedding/metrics/ari/config.vsh.yaml +++ b/src/joint_embedding/metrics/ari/config.vsh.yaml @@ -5,9 +5,9 @@ functionality: description: Adjusted rand index (ARI) info: metrics: - - id: ari - label: ari - description: Adjusted rand index (ARI) + - metric_id: ari + metric_name: ari + metric_description: Adjusted rand index (ARI) min: 0 max: 1 maximize: true diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml index 5abd76fe8f..e89b30dc90 100644 --- a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml +++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml @@ -5,9 +5,9 @@ functionality: description: Average silhouette width (ASW) of batches per label info: metrics: - - id: asw_batch - 
label: asw_batch - description: Average silhouette width (ASW) of batches per label + - metric_id: asw_batch + metric_name: asw_batch + metric_description: Average silhouette width (ASW) of batches per label min: 0 max: 1 maximize: True diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml index 9cf7ceac08..262b48d6f4 100644 --- a/src/joint_embedding/metrics/asw_label/config.vsh.yaml +++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml @@ -5,9 +5,9 @@ functionality: description: Average silhouette width (ASW) of labels info: metrics: - - id: asw_label - label: asw_label - description: Average silhouette width (ASW) of labels + - metric_id: asw_label + metric_name: asw_label + metric_description: Average silhouette width (ASW) of labels min: 0 max: 1 maximize: True diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml index 16c562c37c..ba501a5e21 100644 --- a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml +++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml @@ -5,9 +5,9 @@ functionality: description: Cell cycle conservation score info: metrics: - - id: cc_cons - label: cc_cons - description: Cell cycle conservation score + - metric_id: cc_cons + metric_name: cc_cons + metric_description: Cell cycle conservation score min: 0 max: 1 maximize: True diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml index b2d8a021b4..90294f01e6 100644 --- a/src/joint_embedding/metrics/check_format/config.vsh.yaml +++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml @@ -5,15 +5,15 @@ functionality: description: Checking whether the prediction of a method has the right format. 
info: metrics: - - id: finished - label: finished - description: + - metric_id: finished + metric_name: finished + metric_description: check if metric finished min: 0 max: 1 maximize: true - - id: correct_format - label: correct_format - description: + - metric_id: correct_format + metric_name: correct_format + metric_description: check if format is correct min: 0 max: 1 maximize: true diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml index 8b6a3871d0..c79966893c 100644 --- a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml +++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml @@ -5,9 +5,9 @@ functionality: description: Graph connectivity info: metrics: - - id: graph_conn - label: graph_conn - description: Graph connectivity + - metric_id: graph_conn + metric_name: graph_conn + metric_description: Graph connectivity min: 0 max: 1 maximize: True diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml index ed07c525eb..bf989bfc20 100644 --- a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml +++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml @@ -5,8 +5,9 @@ functionality: description: Calculate latent mixing metric for joint embedding task. 
info: metrics: - - id: latent_mixing - label: Latent mixing + - metric_id: latent_mixing + metric_name: Latent mixing + metric_description: Calculate latent mixing min: -1 max: 0 maximize: True diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml index cd49b342e8..e83772d65e 100644 --- a/src/joint_embedding/metrics/nmi/config.vsh.yaml +++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml @@ -5,9 +5,9 @@ functionality: description: Normalised mutual information (NMI) info: metrics: - - id: nmi - label: NMI - description: Normalised mutual information (NMI) + - metric_id: nmi + metric_name: NMI + metric_description: Normalised mutual information (NMI) min: 0 max: 1 maximize: True diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml index a11a77d71f..099219de47 100644 --- a/src/joint_embedding/metrics/rfoob/config.vsh.yaml +++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml @@ -5,27 +5,27 @@ functionality: description: Calculating basic metrics for the joint embedding task. 
info: metrics: - - id: rfoob_celltype_accuracy - label: rfoob_celltype_accuracy - description: + - metric_id: rfoob_celltype_accuracy + metric_name: Rfoob Celltype Accuray + metric_description: check the celltype accuracy min: 0 max: 1 maximize: True - - id: rfoob_pseudotimegex_rsq - label: rfoob_pseudotimegex_rsq - description: + - metric_metric_id: rfoob_pseudotimegex_rsq + metric_name: rfoob pseudotimegex rsq + metric_description: rfoob pseudotimegex rsq min: -1 max: 1 maximize: true - - id: rfoob_pseudotimeadt_rsq - label: rfoob_pseudotimeadt_rsq - description: + - metric_id: rfoob_pseudotimeadt_rsq + metric_name: rfoob pseudotimeadt rsq + metric_description: rfoob pseudotimeadt rsq min: -1 max: 1 maximize: True - - id: rfoob_batch_error - label: rfoob_batch_error - description: + - metric_id: rfoob_batch_error + metric_name: rfoob batch error + metric_description: rfoob batch error min: 0 max: 1 maximize: True diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml index cf153437db..68525b337b 100644 --- a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml +++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml @@ -5,21 +5,21 @@ functionality: description: Trajectory inference conservation score info: metrics: - - id: ti_cons_RNA - label: ti_cons_RNA - description: + - metric_id: ti_cons_RNA + metric_name: ti cons RNA + metric_description: ti cons RNA min: 0 max: 1 maximize: True - - id: ti_cons_ADT_ATAC - label: ti_cons_ADT_ATAC - description: + - metric_id: ti_cons_ADT_ATAC + metric_name: ti cons ADT ATAC + metric_description: ti cons ADT ATAC min: 0 max: 1 maximize: True - - id: ti_cons_mean - label: ti_cons_mean - description: + - metric_id: ti_cons_mean + metric_name: ti cons mean + metric_description: ti cons mean min: 0 max: 1 maximize: True diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml index 
c0396b3516..7139b8aadc 100644 --- a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml +++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml @@ -5,21 +5,21 @@ functionality: description: Trajectory inference conservation score per batch info: metrics: - - id: ti_cons_batch_RNA - label: ti_cons_batch_RNA - description: + - metric_id: ti_cons_batch_RNA + metric_name: ti cons batch RNA + metric_description: placeholder min: 0 max: 1 maximize: True - - id: ti_cons_batch_ADT_ATAC - label: ti_cons_batch_ADT_ATAC - description: + - metric_id: ti_cons_batch_ADT_ATAC + metric_name: ti cons batch ADT ATAC + metric_description: placeholder min: 0 max: 1 maximize: True - - id: ti_cons_batch_mean - label: ti_cons_batch_mean - description: + - metric_id: ti_cons_batch_mean + metric_name: ti cons batch mean + metric_description: placeholder min: 0 max: 1 maximize: True From 399a316650102fbf227fa300fcceb0092c4798f1 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 13 Jan 2023 22:20:33 +0100 Subject: [PATCH 58/82] add readme --- src/joint_embedding/README.qmd | 263 +++++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 src/joint_embedding/README.qmd diff --git a/src/joint_embedding/README.qmd b/src/joint_embedding/README.qmd new file mode 100644 index 0000000000..d6fb56cae9 --- /dev/null +++ b/src/joint_embedding/README.qmd @@ -0,0 +1,263 @@ +--- +format: gfm +toc: true +--- + +```{r setup, include=FALSE} +library(tidyverse) +library(rlang) + +strip_margin <- function(text, symbol = "\\|") { + str_replace_all(text, paste0("(\n?)[ \t]*", symbol), "\\1") +} + +dir <- "src/joint_embedding" +# dir <- "." +``` + +# Joint Embedding + +## Task description +```{r task description, echo=FALSE,warning=FALSE,error=FALSE,output='asis'} +task_info <- yaml::yaml.load_file(paste0(dir,"/docs/task_info.yaml")) + +cat(task_info$description) + +``` + +## Methods + +Methods for assigning labels from a reference dataset to a new dataset. 
+ +```{r methods, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} +method_ns_list <- processx::run("viash", c("ns", "list", "-q", "methods", "--src", "."), wd = dir) +method_configs <- yaml::yaml.load(method_ns_list$stdout) + +method_info <- map_df(method_configs, function(config) { + if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") return(NULL) + info <- as_tibble(config$functionality$info) + info$comp_yaml <- config$info$config + info$name <- config$functionality$name + info$namespace <- config$functionality$namespace + info$description <- config$functionality$description + info +}) + +method_info_view <- + method_info %>% + arrange(type, label) %>% + transmute( + Name = paste0("[", method_name, "](", comp_yaml, ")"), + Type = type, + Description = description, + # DOI = ifelse(!is.na(paper_doi), paste0("[link](https://doi.org/", paper_doi, ")"), ""), + # URL = ifelse(!is.na(code_url), paste0("[link](", code_url, ")"), "") + ) + +cat(paste(knitr::kable(method_info_view, format = 'pipe'), collapse = "\n")) +``` + + +## Metrics + +Metrics for joint embedding aim to characterize how well each ... 
+ +```{r metrics, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} +metric_ns_list <- processx::run("viash", c("ns", "list", "-q", "metrics", "--src", "."), wd = dir) +metric_configs <- yaml::yaml.load(metric_ns_list$stdout) + +metric_info <- map_df(metric_configs, function(config) { + metric_info <- as_tibble(map_df(config$functionality$info$metrics, as.data.frame)) + metric_info$comp_yaml <- config$info$config + metric_info$comp_name <- config$functionality$name + metric_info$comp_namespace <- config$functionality$namespace + metric_info +}) + +metric_info_view <- + metric_info %>% + transmute( + Name = paste0("[", metric_name, "](", comp_yaml, ")"), + Description = paste0(description, " ", ifelse(maximize, "Higher is better.", "Lower is better.")), + Range = paste0("[", min, ", ", max, "]") + ) + +cat(paste(knitr::kable(metric_info_view, format = 'pipe'), collapse = "\n")) +``` + + +## Pipeline topology + +```{r data, include=FALSE} +comp_yamls <- list.files(paste0(dir, "/api"), pattern = "comp_", full.names = TRUE) +file_yamls <- list.files(paste0(dir, "/api"), pattern = "anndata_", full.names = TRUE) + +comp_file <- map_df(comp_yamls, function(yaml_file) { + conf <- yaml::read_yaml(yaml_file) + + map_df(conf$functionality$arguments, function(arg) { + tibble( + comp_name = basename(yaml_file) %>% gsub("\\.yaml", "", .), + arg_name = str_replace_all(arg$name, "^-*", ""), + direction = arg$direction %||% "input", + file_name = basename(arg$`__merge__`) %>% gsub("\\.yaml", "", .) + ) + }) +}) + +comp_info <- map_df(comp_yamls, function(yaml_file) { + conf <- yaml::read_yaml(yaml_file) + + tibble( + name = basename(yaml_file) %>% gsub("\\.yaml", "", .), + label = name %>% gsub("comp_", "", .) %>% gsub("_", " ", .) 
+ ) +}) + + +file_info <- map_df(file_yamls, function(yaml_file) { + arg <- yaml::read_yaml(yaml_file) + + tibble( + name = basename(yaml_file) %>% gsub("\\.yaml", "", .), + description = arg$description, + short_description = arg$info$short_description, + example = arg$example, + label = name %>% gsub("anndata_", "", .) %>% gsub("_", " ", .) + ) +}) + +file_slot <- map_df(file_yamls, function(yaml_file) { + arg <- yaml::read_yaml(yaml_file) + + map2_df(names(arg$info$slots), arg$info$slots, function(group_name, slot) { + df <- map_df(slot, as.data.frame) + df$struct <- group_name + df$file_name = basename(yaml_file) %>% gsub("\\.yaml", "", .) + as_tibble(df) + }) +}) %>% + mutate(multiple = multiple %|% FALSE) +``` + +```{r flow, echo=FALSE,warning=FALSE,error=FALSE} +nodes <- bind_rows( + file_info %>% + transmute(id = name, label = str_to_title(label), is_comp = FALSE), + comp_info %>% + transmute(id = name, label = str_to_title(label), is_comp = TRUE) +) %>% + mutate(str = paste0( + " ", + id, + ifelse(is_comp, "[/", "("), + label, + ifelse(is_comp, "/]", ")") + )) +edges <- bind_rows( + comp_file %>% + filter(direction == "input") %>% + transmute( + from = file_name, + to = comp_name, + arrow = "---" + ), + comp_file %>% + filter(direction == "output") %>% + transmute( + from = comp_name, + to = file_name, + arrow = "-->" + ) +) %>% + mutate(str = paste0(" ", from, arrow, to)) + +# note: use ```{mermaid} instead of ```mermaid when rendering to html +out_str <- strip_margin(glue::glue(" + §```mermaid + §%%| column: screen-inset-shaded + §flowchart LR + §{paste(nodes$str, collapse = '\n')} + §{paste(edges$str, collapse = '\n')} + §``` + §"), symbol = "§") +knitr::asis_output(out_str) +``` + +## File format API + +```{r file_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} +for (file_name in file_info$name) { + arg_info <- file_info %>% filter(name == file_name) + sub_out <- file_slot %>% + filter(file_name == !!file_name) %>% + select(struct, name, type, 
description) + + used_in <- comp_file %>% + filter(file_name == !!file_name) %>% + left_join(comp_info %>% select(comp_name = name, comp_label = label), by = "comp_name") %>% + mutate(str = paste0("* [", comp_label, "](#", comp_label, "): ", arg_name, " (as ", direction, ")")) %>% + pull(str) + + example <- sub_out %>% + group_by(struct) %>% + summarise( + str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", ")) + ) %>% + arrange(match(struct, c("obs", "var", "uns", "obsm", "obsp", "varm", "varp", "layers"))) + + example_str <- c(" AnnData object", paste0(" ", example$str)) + + out_str <- strip_margin(glue::glue(" + §### `{str_to_title(arg_info$label)}` + § + §{arg_info$description} + § + §Used in: + § + §{paste(used_in, collapse = '\n')} + § + §Slots: + § + §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')} + § + §Example: + § + §{paste(example_str, collapse = '\n')} + § + §"), symbol = "§") + cat(out_str) +} +``` + + + +## Component API + +```{r comp_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} +# todo: add description +# todo: add required info fields +for (comp_name in comp_info$name) { + comp <- comp_info %>% filter(name == comp_name) + sub_out <- comp_file %>% + filter(comp_name == !!comp_name) %>% + left_join(file_info %>% select(file_name = name, file_desc = description, file_sdesc = short_description, file_label = label), by = "file_name") %>% + transmute( + Name = paste0("`--", arg_name, "`"), + `File format` = paste0("[", str_to_title(file_label), "](#", file_label, ")"), + Direction = direction, + Description = file_sdesc + ) + + out_str <- strip_margin(glue::glue(" + §### `{str_to_title(comp$label)}` + § + §{ifelse(\"description\" %in% names(comp), comp$description, \"\")} + § + §Arguments: + § + §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')} + §"), symbol = "§") + cat(out_str) +} +``` \ No newline at end of file From be3e17564f9a18ab380957976bdbaf276f0572e6 Mon Sep 17 00:00:00 2001 From: 
KaiWaldrant Date: Fri, 13 Jan 2023 22:25:07 +0100 Subject: [PATCH 59/82] update readme --- src/joint_embedding/README.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/joint_embedding/README.qmd b/src/joint_embedding/README.qmd index d6fb56cae9..9d30030ce5 100644 --- a/src/joint_embedding/README.qmd +++ b/src/joint_embedding/README.qmd @@ -12,7 +12,7 @@ strip_margin <- function(text, symbol = "\\|") { } dir <- "src/joint_embedding" -# dir <- "." +dir <- "." ``` # Joint Embedding From 0bce13790b7600d477b2ecc3a979a316ffe3fd5f Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 24 Jan 2023 11:34:22 +0100 Subject: [PATCH 60/82] update task info and readme --- src/joint_embedding/README.md | 23 ++ src/joint_embedding/README.qmd | 263 ------------------ .../{docs => api}/task_info.yaml | 4 +- 3 files changed, 24 insertions(+), 266 deletions(-) create mode 100644 src/joint_embedding/README.md delete mode 100644 src/joint_embedding/README.qmd rename src/joint_embedding/{docs => api}/task_info.yaml (79%) diff --git a/src/joint_embedding/README.md b/src/joint_embedding/README.md new file mode 100644 index 0000000000..7d9f65431c --- /dev/null +++ b/src/joint_embedding/README.md @@ -0,0 +1,23 @@ +# Joint embedding + +Structure of this task: + + src/embedding + ├── api Interface specifications for components and datasets in this task + ├── control_methods Baseline (random/ground truth) methods to compare methods against + ├── methods Methods to be benchmarked + ├── metrics Metrics used to quantify performance of methods + ├── README.md This file + ├── resources_scripts Scripts to process the datasets + ├── resources_test_scripts Scripts to process the test resources + ├── split_dataset Component to prepare common datasets + └── workflows Pipelines to run the full benchmark + +Relevant links: + +* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/) + +* [Experimental 
results](https://openproblems-experimental.netlify.app/results/joint_embedding/) + + +* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md) diff --git a/src/joint_embedding/README.qmd b/src/joint_embedding/README.qmd deleted file mode 100644 index 9d30030ce5..0000000000 --- a/src/joint_embedding/README.qmd +++ /dev/null @@ -1,263 +0,0 @@ ---- -format: gfm -toc: true ---- - -```{r setup, include=FALSE} -library(tidyverse) -library(rlang) - -strip_margin <- function(text, symbol = "\\|") { - str_replace_all(text, paste0("(\n?)[ \t]*", symbol), "\\1") -} - -dir <- "src/joint_embedding" -dir <- "." -``` - -# Joint Embedding - -## Task description -```{r task description, echo=FALSE,warning=FALSE,error=FALSE,output='asis'} -task_info <- yaml::yaml.load_file(paste0(dir,"/docs/task_info.yaml")) - -cat(task_info$description) - -``` - -## Methods - -Methods for assigning labels from a reference dataset to a new dataset. - -```{r methods, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} -method_ns_list <- processx::run("viash", c("ns", "list", "-q", "methods", "--src", "."), wd = dir) -method_configs <- yaml::yaml.load(method_ns_list$stdout) - -method_info <- map_df(method_configs, function(config) { - if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") return(NULL) - info <- as_tibble(config$functionality$info) - info$comp_yaml <- config$info$config - info$name <- config$functionality$name - info$namespace <- config$functionality$namespace - info$description <- config$functionality$description - info -}) - -method_info_view <- - method_info %>% - arrange(type, label) %>% - transmute( - Name = paste0("[", method_name, "](", comp_yaml, ")"), - Type = type, - Description = description, - # DOI = ifelse(!is.na(paper_doi), paste0("[link](https://doi.org/", paper_doi, ")"), ""), - # URL = ifelse(!is.na(code_url), paste0("[link](", code_url, ")"), "") - ) - 
-cat(paste(knitr::kable(method_info_view, format = 'pipe'), collapse = "\n")) -``` - - -## Metrics - -Metrics for joint embedding aim to characterize how well each ... - -```{r metrics, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} -metric_ns_list <- processx::run("viash", c("ns", "list", "-q", "metrics", "--src", "."), wd = dir) -metric_configs <- yaml::yaml.load(metric_ns_list$stdout) - -metric_info <- map_df(metric_configs, function(config) { - metric_info <- as_tibble(map_df(config$functionality$info$metrics, as.data.frame)) - metric_info$comp_yaml <- config$info$config - metric_info$comp_name <- config$functionality$name - metric_info$comp_namespace <- config$functionality$namespace - metric_info -}) - -metric_info_view <- - metric_info %>% - transmute( - Name = paste0("[", metric_name, "](", comp_yaml, ")"), - Description = paste0(description, " ", ifelse(maximize, "Higher is better.", "Lower is better.")), - Range = paste0("[", min, ", ", max, "]") - ) - -cat(paste(knitr::kable(metric_info_view, format = 'pipe'), collapse = "\n")) -``` - - -## Pipeline topology - -```{r data, include=FALSE} -comp_yamls <- list.files(paste0(dir, "/api"), pattern = "comp_", full.names = TRUE) -file_yamls <- list.files(paste0(dir, "/api"), pattern = "anndata_", full.names = TRUE) - -comp_file <- map_df(comp_yamls, function(yaml_file) { - conf <- yaml::read_yaml(yaml_file) - - map_df(conf$functionality$arguments, function(arg) { - tibble( - comp_name = basename(yaml_file) %>% gsub("\\.yaml", "", .), - arg_name = str_replace_all(arg$name, "^-*", ""), - direction = arg$direction %||% "input", - file_name = basename(arg$`__merge__`) %>% gsub("\\.yaml", "", .) - ) - }) -}) - -comp_info <- map_df(comp_yamls, function(yaml_file) { - conf <- yaml::read_yaml(yaml_file) - - tibble( - name = basename(yaml_file) %>% gsub("\\.yaml", "", .), - label = name %>% gsub("comp_", "", .) %>% gsub("_", " ", .) 
- ) -}) - - -file_info <- map_df(file_yamls, function(yaml_file) { - arg <- yaml::read_yaml(yaml_file) - - tibble( - name = basename(yaml_file) %>% gsub("\\.yaml", "", .), - description = arg$description, - short_description = arg$info$short_description, - example = arg$example, - label = name %>% gsub("anndata_", "", .) %>% gsub("_", " ", .) - ) -}) - -file_slot <- map_df(file_yamls, function(yaml_file) { - arg <- yaml::read_yaml(yaml_file) - - map2_df(names(arg$info$slots), arg$info$slots, function(group_name, slot) { - df <- map_df(slot, as.data.frame) - df$struct <- group_name - df$file_name = basename(yaml_file) %>% gsub("\\.yaml", "", .) - as_tibble(df) - }) -}) %>% - mutate(multiple = multiple %|% FALSE) -``` - -```{r flow, echo=FALSE,warning=FALSE,error=FALSE} -nodes <- bind_rows( - file_info %>% - transmute(id = name, label = str_to_title(label), is_comp = FALSE), - comp_info %>% - transmute(id = name, label = str_to_title(label), is_comp = TRUE) -) %>% - mutate(str = paste0( - " ", - id, - ifelse(is_comp, "[/", "("), - label, - ifelse(is_comp, "/]", ")") - )) -edges <- bind_rows( - comp_file %>% - filter(direction == "input") %>% - transmute( - from = file_name, - to = comp_name, - arrow = "---" - ), - comp_file %>% - filter(direction == "output") %>% - transmute( - from = comp_name, - to = file_name, - arrow = "-->" - ) -) %>% - mutate(str = paste0(" ", from, arrow, to)) - -# note: use ```{mermaid} instead of ```mermaid when rendering to html -out_str <- strip_margin(glue::glue(" - §```mermaid - §%%| column: screen-inset-shaded - §flowchart LR - §{paste(nodes$str, collapse = '\n')} - §{paste(edges$str, collapse = '\n')} - §``` - §"), symbol = "§") -knitr::asis_output(out_str) -``` - -## File format API - -```{r file_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} -for (file_name in file_info$name) { - arg_info <- file_info %>% filter(name == file_name) - sub_out <- file_slot %>% - filter(file_name == !!file_name) %>% - select(struct, name, type, 
description) - - used_in <- comp_file %>% - filter(file_name == !!file_name) %>% - left_join(comp_info %>% select(comp_name = name, comp_label = label), by = "comp_name") %>% - mutate(str = paste0("* [", comp_label, "](#", comp_label, "): ", arg_name, " (as ", direction, ")")) %>% - pull(str) - - example <- sub_out %>% - group_by(struct) %>% - summarise( - str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", ")) - ) %>% - arrange(match(struct, c("obs", "var", "uns", "obsm", "obsp", "varm", "varp", "layers"))) - - example_str <- c(" AnnData object", paste0(" ", example$str)) - - out_str <- strip_margin(glue::glue(" - §### `{str_to_title(arg_info$label)}` - § - §{arg_info$description} - § - §Used in: - § - §{paste(used_in, collapse = '\n')} - § - §Slots: - § - §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')} - § - §Example: - § - §{paste(example_str, collapse = '\n')} - § - §"), symbol = "§") - cat(out_str) -} -``` - - - -## Component API - -```{r comp_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} -# todo: add description -# todo: add required info fields -for (comp_name in comp_info$name) { - comp <- comp_info %>% filter(name == comp_name) - sub_out <- comp_file %>% - filter(comp_name == !!comp_name) %>% - left_join(file_info %>% select(file_name = name, file_desc = description, file_sdesc = short_description, file_label = label), by = "file_name") %>% - transmute( - Name = paste0("`--", arg_name, "`"), - `File format` = paste0("[", str_to_title(file_label), "](#", file_label, ")"), - Direction = direction, - Description = file_sdesc - ) - - out_str <- strip_margin(glue::glue(" - §### `{str_to_title(comp$label)}` - § - §{ifelse(\"description\" %in% names(comp), comp$description, \"\")} - § - §Arguments: - § - §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')} - §"), symbol = "§") - cat(out_str) -} -``` \ No newline at end of file diff --git a/src/joint_embedding/docs/task_info.yaml 
b/src/joint_embedding/api/task_info.yaml similarity index 79% rename from src/joint_embedding/docs/task_info.yaml rename to src/joint_embedding/api/task_info.yaml index 6565c0c292..79267651ce 100644 --- a/src/joint_embedding/docs/task_info.yaml +++ b/src/joint_embedding/api/task_info.yaml @@ -1,8 +1,6 @@ task_id: joint_embedding task_name: Joint Embedding -v1_url: neurips2021_multimodal_viash/src/joint_embedding/readme.md -v1_commit: 0f8eae583444ba3f71c3083b860cc34b9ecb2fa2 -short_description: Learning of an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation). +summary: Learning of an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation). description: | The functioning of organs, tissues, and whole organisms is determined by the interplay of cells. Cells are characterised into broad types, which in turn can take on different states. Here, a cell From f7d8cbb2d8801dc49e0929b47d217ce254f21a65 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 24 Jan 2023 11:43:39 +0100 Subject: [PATCH 61/82] add readme and task info --- src/match_modality/README.md | 23 +++++++++++++++++++++++ src/match_modality/api/task_info.yaml | 13 +++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 src/match_modality/README.md create mode 100644 src/match_modality/api/task_info.yaml diff --git a/src/match_modality/README.md b/src/match_modality/README.md new file mode 100644 index 0000000000..3f1ff48cc1 --- /dev/null +++ b/src/match_modality/README.md @@ -0,0 +1,23 @@ +# Match modality + +Structure of this task: + + src/match_modality + ├── api Interface specifications for components and datasets in this task + ├── control_methods Baseline (random/ground truth) methods to compare methods against + ├── methods Methods to be benchmarked + ├── metrics Metrics used to quantify performance of methods + ├── README.md This file + ├── resources_scripts Scripts to process 
the datasets + ├── resources_test_scripts Scripts to process the test resources + ├── split_dataset Component to prepare common datasets + └── workflows Pipelines to run the full benchmark + +Relevant links: + +* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/) + +* [Experimental results](https://openproblems-experimental.netlify.app/results/match_modality/) + + +* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md) diff --git a/src/match_modality/api/task_info.yaml b/src/match_modality/api/task_info.yaml new file mode 100644 index 0000000000..50d32b98f9 --- /dev/null +++ b/src/match_modality/api/task_info.yaml @@ -0,0 +1,13 @@ +task_id: match_modality +task_name: Match modality +summary: "Predicting which profiles from one modality resembles a profile from another." +description: | + "While joint profiling of two modalities in the same single cell is now possible, + most single-cell datasets that exist measure only a single modality. These modalities + complement each other in their description of cellular state. Yet, it is challenging + to analyse uni-modal datasets together when they do not share observations (cells) or + a common feature space (genes, proteins, or open chromatin peaks). If we could map + observations to one another across modalities, it would be possible to treat separately + profiled datasets in the same manner as new multi-modal sequencing data. Mapping these + modalities to one another opens up the vast amount of uni-modal single-cell datasets + generated in the past years to multi-modal data analysis methods." 
\ No newline at end of file From 4460310a0078d017b33c237c79399764d7777076 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 11:18:24 +0100 Subject: [PATCH 62/82] Add api yaml files --- src/match_modality/api/anndata_dataset.yaml | 52 +++++++++++++++++++ .../api/anndata_masked_mod.yaml | 46 ++++++++++++++++ .../api/anndata_masked_sol.yaml | 20 +++++++ src/match_modality/api/comp_mask_dataset.yaml | 40 ++++++++++++++ 4 files changed, 158 insertions(+) create mode 100644 src/match_modality/api/anndata_dataset.yaml create mode 100644 src/match_modality/api/anndata_masked_mod.yaml create mode 100644 src/match_modality/api/anndata_masked_sol.yaml create mode 100644 src/match_modality/api/comp_mask_dataset.yaml diff --git a/src/match_modality/api/anndata_dataset.yaml b/src/match_modality/api/anndata_dataset.yaml new file mode 100644 index 0000000000..3382dba2ff --- /dev/null +++ b/src/match_modality/api/anndata_dataset.yaml @@ -0,0 +1,52 @@ +type: file +description: An input h5ad dataset. +example: "dataset.h5ad" +info: + label: "Dataset" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors created by the normalisation method, if any. 
+ required: false + - type: boolean + name: is_train + description: if sample is train data + required: true + var: + - type: string + name: gene_ids + description: + required: false + - type: string + name: feature_types + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: "data from which organism " + required: false + - type: string + name: gene_activity_var_names + description: + required: true + obsm: + - type: double + name: gene_activity + description: + required: true + diff --git a/src/match_modality/api/anndata_masked_mod.yaml b/src/match_modality/api/anndata_masked_mod.yaml new file mode 100644 index 0000000000..31ef25d06e --- /dev/null +++ b/src/match_modality/api/anndata_masked_mod.yaml @@ -0,0 +1,46 @@ +type: file +description: "The masked data" +example: "masked.h5ad" +info: + short_description: "masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true + - type: string + name: gene_activity_var_names + description: + required: true + obsm: + - type: double + name: gene_activity + description: + required: true \ No newline at end of file diff --git a/src/match_modality/api/anndata_masked_sol.yaml b/src/match_modality/api/anndata_masked_sol.yaml new file mode 100644 index 0000000000..e486646e2f --- /dev/null +++ b/src/match_modality/api/anndata_masked_sol.yaml @@ -0,0 +1,20 @@ +type: file 
+description: "The masked solution data" +example: "masked.h5ad" +info: + short_description: "masked solution data" + slots: + obs: + - type: string + name: batch + description: Batch information + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: double + name: pairing_ix + description: number of samples + required: true diff --git a/src/match_modality/api/comp_mask_dataset.yaml b/src/match_modality/api/comp_mask_dataset.yaml new file mode 100644 index 0000000000..517b7f0f4d --- /dev/null +++ b/src/match_modality/api/comp_mask_dataset.yaml @@ -0,0 +1,40 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_dataset.yaml + - name: "--input_mod2" + __merge__: anndata_dataset.yaml + - name: "--output_train_mod1" + direction: "output" + example: "dataset_masked.h5ad" + description: "The masked shuffled train mod1 profiles." + __merge__: anndata_masked_mod.yaml + - name: "--output_train_mod2" + direction: "output" + example: "dataset_masked.h5ad" + description: "The masked shuffled train mod2 profiles." + __merge__: anndata_masked_mod.yaml + - name: "--output_train_sol" + direction: "output" + example: "dataset_solution.h5ad" + description: "The pairing of train mod1&mod2 profiles." + __merge__: anndata_masked_sol.yaml + - name: "--output_test_mod1" + direction: "output" + example: "dataset_masked.h5ad" + description: "The masked shuffled test mod1 profiles." + __merge__: anndata_masked_mod.yaml + - name: "--output_test_mod2" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_test_sol" + direction: "output" + __merge__: anndata_masked_sol.yaml + - name: "--knn" + type: "integer" + default: 20 + description: The KNN cutoff for computing nearest neighbors on the mod1 and mod2 profiles. + - name: "--seed" + type: "integer" + default: 1 + description: The seed. 
\ No newline at end of file From fd038d2d13902593663a859f6de72b5cccd0a51a Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 11:18:55 +0100 Subject: [PATCH 63/82] add mask_dataset --- .../mask_dataset/config.vsh.yaml | 23 +++ src/match_modality/mask_dataset/script.R | 179 ++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 src/match_modality/mask_dataset/config.vsh.yaml create mode 100644 src/match_modality/mask_dataset/script.R diff --git a/src/match_modality/mask_dataset/config.vsh.yaml b/src/match_modality/mask_dataset/config.vsh.yaml new file mode 100644 index 0000000000..5dd18a45f9 --- /dev/null +++ b/src/match_modality/mask_dataset/config.vsh.yaml @@ -0,0 +1,23 @@ +_merge__: ../api/comp_mask_dataset.yaml +functionality: + name: mask_dataset + namespace: match_modality + description: | + Censor an existing dataset: obfuscate gene names, remove cell identities and + shuffle cells of modalities, for distribution to competitors. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse , testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + publish: true + directives: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/mask_dataset/script.R b/src/match_modality/mask_dataset/script.R new file mode 100644 index 0000000000..f41de56600 --- /dev/null +++ b/src/match_modality/mask_dataset/script.R @@ -0,0 +1,179 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +# input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+# input_path <- "output/datasets/common/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.manual_formatting." +# input_path <- "output/datasets/common/openproblems_bmmc_cite_phase1/openproblems_bmmc_cite_phase1.manual_formatting." +input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +output_path <- "output/multiome" +# output_path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset." +# output_path <- "output/datasets/match_modality/openproblems_bmmc_multiome_iid/openproblems_bmmc_multiome_iid.censor_dataset." +# dir.create(dirname(output_path), recursive = TRUE) + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_train_mod1 = paste0(output_path, "output_train_mod1.h5ad"), + output_train_mod2 = paste0(output_path, "output_train_mod2.h5ad"), + output_train_sol = paste0(output_path, "output_train_sol.h5ad"), + output_test_mod1 = paste0(output_path, "output_test_mod1.h5ad"), + output_test_mod2 = paste0(output_path, "output_test_mod2.h5ad"), + output_test_sol = paste0(output_path, "output_test_sol.h5ad"), + seed = 1L, + knn = 10L +) +## VIASH END + +set.seed(par$seed) + +subset_mats <- function(li, obs_filt, anonymize = FALSE) { + out <- list() + for (n in names(li)) { + mat <- li[[n]][obs_filt, , drop = FALSE] + if (anonymize) { + rownames(mat) <- paste0("cell_", seq_len(nrow(mat))) + } + out[[n]] <- mat + } + out +} + + +cat("Reading input data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_MM_", tolower(ad1_mod), "2", tolower(ad2_mod)) +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- 
list(dataset_id = new_dataset_id, organism = "human") +ad1_obsm <- list() +ad2_obsm <- list() + +if (ad1_mod == "ATAC") { + ad1_uns$gene_activity_var_names <- input_mod1$uns$gene_activity_var_names + ad1_obsm$gene_activity <- as(input_mod1$obsm$gene_activity, "CsparseMatrix") +} +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Shuffle train cells\n") +train_ix <- which(input_mod1$obs$is_train) %>% sort +train_mod2_ix <- sample.int(length(train_ix)) + +cat("Shuffle test cells\n") +test_ix <- which(!input_mod1$obs$is_train) %>% sort +test_mod2_ix <- sample.int(length(test_ix)) + +is_categorical <- function(x) is.character(x) || is.factor(x) +# relevel <- function(x) factor(as.character(x)) +relevel <- function(x) as.character(x) + +cat("Creating train objects\n") +mod1_var <- input_mod1$var %>% select(one_of("gene_ids", "feature_types")) +mod2_var <- input_mod2$var %>% select(one_of("gene_ids", "feature_types")) +train_obs1 <- input_mod1$obs[train_ix, , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +train_obs2 <- input_mod2$obs[train_ix[train_mod2_ix], , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +rownames(train_obs2) <- NULL +if (ncol(train_obs2) == 0) train_obs2 <- NULL +assert_that("size_factors" %in% colnames(train_obs1) != "size_factors" %in% colnames(train_obs2)) +assert_that(all(train_obs1$batch == train_obs2$batch[order(train_mod2_ix)])) + +output_train_mod1 <- anndata::AnnData( + X = input_mod1$X[train_ix, , drop = FALSE], + layers = subset_mats(input_mod1$layers, train_ix), + obsm = subset_mats(ad1_obsm, train_ix), + obs = train_obs1, + var = mod1_var, + uns = ad1_uns +) +output_train_mod2 <- anndata::AnnData( + X = input_mod2$X[train_ix[train_mod2_ix], , drop = FALSE] %>% + magrittr::set_rownames(., 
paste0("cell_", seq_len(nrow(.)))), + layers = subset_mats(input_mod2$layers, train_ix[train_mod2_ix], anonymize = TRUE), + obsm = subset_mats(ad2_obsm, train_ix[train_mod2_ix], anonymize = TRUE), + obs = train_obs2, + var = mod2_var, + uns = ad2_uns +) +assert_that(all(output_train_mod1$obs$batch == output_train_mod2$obs$batch[order(train_mod2_ix)])) + +cat("Create test objects\n") +test_obs1 <- input_mod1$obs[test_ix, , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +test_obs2 <- input_mod2$obs[test_ix[test_mod2_ix], , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +rownames(test_obs2) <- NULL +if (ncol(test_obs2) == 0) test_obs2 <- NULL +assert_that("size_factors" %in% colnames(train_obs1) != "size_factors" %in% colnames(train_obs2)) +assert_that(all(test_obs1$batch == test_obs2$batch[order(test_mod2_ix)])) + +output_test_mod1 <- anndata::AnnData( + X = input_mod1$X[test_ix, , drop = FALSE], + layers = subset_mats(input_mod1$layers, test_ix), + obsm = subset_mats(ad1_obsm, test_ix), + obs = test_obs1, + var = mod1_var, + uns = ad1_uns +) +output_test_mod2 <- anndata::AnnData( + X = input_mod2$X[test_ix[test_mod2_ix], , drop = FALSE] %>% + magrittr::set_rownames(., paste0("cell_", seq_len(nrow(.)))), + layers = subset_mats(input_mod2$layers, test_ix[test_mod2_ix], anonymize = TRUE), + obsm = subset_mats(ad2_obsm, test_ix[test_mod2_ix], anonymize = TRUE), + obs = test_obs2, + var = mod2_var, + uns = ad2_uns +) +assert_that(all(output_test_mod1$obs$batch == output_test_mod2$obs$batch[order(test_mod2_ix)])) + +cat("Create solution objects\n") + +train_sol_mat <- Matrix::sparseMatrix( + i = seq_along(train_mod2_ix), + j = order(train_mod2_ix), + x = rep(1, length(train_mod2_ix)) +) +output_train_sol <- anndata::AnnData( + X = train_sol_mat, + obs = input_mod1$obs[train_ix, , drop = FALSE] %>% select(one_of(c("batch"))) %>% mutate_if(is_categorical, relevel), + 
uns = list(dataset_id = new_dataset_id, pairing_ix = train_mod2_ix - 1) +) + +test_sol_mat <- Matrix::sparseMatrix( + i = seq_along(test_mod2_ix), + j = order(test_mod2_ix), + x = rep(1, length(test_mod2_ix)) +) +output_test_sol <- anndata::AnnData( + X = test_sol_mat, + obs = input_mod1$obs[test_ix, , drop = FALSE] %>% select(one_of(c("batch"))) %>% mutate_if(is_categorical, relevel), + uns = list(dataset_id = new_dataset_id, pairing_ix = test_mod2_ix - 1) +) + +# checks +# mean(rowSums(train_solknn > 0)) +# mean(rowSums(test_solknn > 0)) +# sum(train_solknn * train_sol_mat) == nrow(train_sol_mat) +# sum(test_solknn * test_sol_mat) == nrow(test_sol_mat) + +cat("Saving output files as h5ad\n") +zzz <- output_train_mod1$write_h5ad(par$output_train_mod1, compression = "gzip") +zzz <- output_train_mod2$write_h5ad(par$output_train_mod2, compression = "gzip") +zzz <- output_train_sol$write_h5ad(par$output_train_sol, compression = "gzip") +zzz <- output_test_mod1$write_h5ad(par$output_test_mod1, compression = "gzip") +zzz <- output_test_mod2$write_h5ad(par$output_test_mod2, compression = "gzip") +zzz <- output_test_sol$write_h5ad(par$output_test_sol, compression = "gzip") From 43d2db1da5d9da44c346a9243fc2139099cd32b6 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 15:08:11 +0100 Subject: [PATCH 64/82] add constant control method --- .../api/anndata_prediction.yaml | 15 +++++ src/match_modality/api/authors.yaml | 20 ++++++ .../api/comp_control_method.yaml | 63 +++++++++++++++++++ .../control_methods/constant/config.vsh.yaml | 27 ++++++++ .../control_methods/constant/script.R | 50 +++++++++++++++ 5 files changed, 175 insertions(+) create mode 100644 src/match_modality/api/anndata_prediction.yaml create mode 100644 src/match_modality/api/authors.yaml create mode 100644 src/match_modality/api/comp_control_method.yaml create mode 100644 src/match_modality/control_methods/constant/config.vsh.yaml create mode 100644 
src/match_modality/control_methods/constant/script.R diff --git a/src/match_modality/api/anndata_prediction.yaml b/src/match_modality/api/anndata_prediction.yaml new file mode 100644 index 0000000000..ca214d4963 --- /dev/null +++ b/src/match_modality/api/anndata_prediction.yaml @@ -0,0 +1,15 @@ +type: file +description: "The predicted pairing of test mod1&mod2 profiles." +example: "prediction.h5ad" +info: + short_description: "Prediction" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/match_modality/api/authors.yaml b/src/match_modality/api/authors.yaml new file mode 100644 index 0000000000..2dfeb9ca39 --- /dev/null +++ b/src/match_modality/api/authors.yaml @@ -0,0 +1,20 @@ +functionality: + authors: + - name: Robrecht Cannoodt + roles: [ author, contributor ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + - name: Kai Waldrant + roles: [ contributor ] + props: { github: KaiWaldrant } + - name: Michaela Mueller + email: mumichae@in.tum.de + roles: [ author, maintainer ] + props: { github: mumichae, orcid: "0000-0002-1401-1785" } + - name: Louise Deconinck + email: louise.deconinck@gmail.com + roles: [ author, maintainer ] + props: { github: LouiseDck, orcid: "" } + - name: Alejandro Granados + email: alejandro.granados@czbiohub.org + roles: [ author, maintainer ] + props: { github: agranado } \ No newline at end of file diff --git a/src/match_modality/api/comp_control_method.yaml b/src/match_modality/api/comp_control_method.yaml new file mode 100644 index 0000000000..a6969d1c79 --- /dev/null +++ b/src/match_modality/api/comp_control_method.yaml @@ -0,0 +1,63 @@ +functionality: + arguments: + - name: "--input_test_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--output" + direction: 
"output" + __merge__: anndata_prediction.yaml + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import numpy as np + from scipy.sparse import issparse + + # define some filenames + testpar = { + 'input_test_mod1': 'output/output_test_mod1.h5ad', + 'input_test_mod2': 'output/output_test_mod2.h5ad', + 'input_test_sol': 'output/output_test_sol.h5ad', + 'output': 'output/output_prediction.h5ad', + } + + print('> Running method', flush=True) + out = subprocess.run([ + meta['executable'], + '--input_test_mod1', testpar['input_test_mod1'], + '--input_test_mod2', testpar['input_test_mod2'], + '--output', testpar['output'] + ], + capture_output= True, + text= True + ).stderr + + print(out, flush=True) + + print('> Checking whether output files were created', flush=True) + assert path.exists(testpar['output']) + + print('> Reading h5ad files', flush=True) + ad_sol = ad.read_h5ad(testpar['input_test_sol']) + ad_pred = ad.read_h5ad(testpar['output']) + + print('> Checking dataset id', flush=True) + assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id'] + + print('> Checking method id', flush=True) + assert ad_pred.uns['method_id'] == meta['functionality_name'] + + print('> Checking X', flush=True) + assert issparse(ad_pred.X) + assert np.all([x >= 0 for x in ad_pred.X.nonzero()]), 'Values must be strictly non-negative.' + assert ad_pred.X.nonzero()[0].size <= 1000 * ad_sol.n_obs + assert ad_pred.n_obs == ad_sol.n_obs + assert ad_pred.n_vars == ad_sol.n_vars + assert np.isclose(ad_pred.X.sum(axis=1), 1, atol=1e-10).all(), 'All rows should sum to 1.' 
+ + print('> Test succeeded!', flush=True) diff --git a/src/match_modality/control_methods/constant/config.vsh.yaml b/src/match_modality/control_methods/constant/config.vsh.yaml new file mode 100644 index 0000000000..fd8c20d700 --- /dev/null +++ b/src/match_modality/control_methods/constant/config.vsh.yaml @@ -0,0 +1,27 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: constant + namespace: match_modality/control_methods + description: Returns constant weights between all mod1 profiles and the first 1000 mod2 profiles. + info: + type: negative_control + method_name: Constant + resources: + - type: r_script + path: script.R + # tests: + # - type: r_script + # path: ../../unit_tests/test_method.R + # - path: ../../../../resources_test +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/control_methods/constant/script.R b/src/match_modality/control_methods/constant/script.R new file mode 100644 index 0000000000..590d49f982 --- /dev/null +++ b/src/match_modality/control_methods/constant/script.R @@ -0,0 +1,50 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +par <- list( + input_test_mod1 = "output/output_test_mod1.h5ad", + input_test_mod2 = "output/output_test_mod2.h5ad", + output = "output/output_prediction.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + + +cat("Reading h5ad files\n") +# input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +# input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +# input_train_sol <- anndata::read_h5ad(par$input_train_sol) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1, backed 
= TRUE) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2, backed = TRUE) + +knn_df <- + expand.grid( + i = seq_len(nrow(input_test_mod1)), + j = seq_len(min(nrow(input_test_mod2), 1000)) + ) + +knn_mat <- + Matrix::sparseMatrix( + i = knn_df$i, + j = knn_df$j, + x = rep(1, nrow(knn_df)), + dims = list(nrow(input_test_mod1), nrow(input_test_mod2)) + ) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_test_mod1$uns[["dataset_id"]], + method_id = meta$functionality_name + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From f276571a5f966faa68ce039186c0ac6d00dd583c Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 15:33:25 +0100 Subject: [PATCH 65/82] add random_pairing control method --- src/match_modality/api/authors.yaml | 6 ++- .../api/comp_control_method.yaml | 1 + .../control_methods/constant/config.vsh.yaml | 4 -- .../random_pairing/config.vsh.yaml | 19 ++++++++ .../control_methods/random_pairing/script.py | 45 +++++++++++++++++++ 5 files changed, 70 insertions(+), 5 deletions(-) create mode 100644 src/match_modality/control_methods/random_pairing/config.vsh.yaml create mode 100644 src/match_modality/control_methods/random_pairing/script.py diff --git a/src/match_modality/api/authors.yaml b/src/match_modality/api/authors.yaml index 2dfeb9ca39..59c6ad211a 100644 --- a/src/match_modality/api/authors.yaml +++ b/src/match_modality/api/authors.yaml @@ -17,4 +17,8 @@ functionality: - name: Alejandro Granados email: alejandro.granados@czbiohub.org roles: [ author, maintainer ] - props: { github: agranado } \ No newline at end of file + props: { github: agranado } + - name: Alex Tong + email: alexandertongdev@gmail.com + roles: [ author, maintainer ] + props: { github: atong01 } \ No newline at end 
of file diff --git a/src/match_modality/api/comp_control_method.yaml b/src/match_modality/api/comp_control_method.yaml index a6969d1c79..1faa55f61e 100644 --- a/src/match_modality/api/comp_control_method.yaml +++ b/src/match_modality/api/comp_control_method.yaml @@ -33,6 +33,7 @@ functionality: '--input_test_mod2', testpar['input_test_mod2'], '--output', testpar['output'] ], + check=True, capture_output= True, text= True ).stderr diff --git a/src/match_modality/control_methods/constant/config.vsh.yaml b/src/match_modality/control_methods/constant/config.vsh.yaml index fd8c20d700..a674e4ecf0 100644 --- a/src/match_modality/control_methods/constant/config.vsh.yaml +++ b/src/match_modality/control_methods/constant/config.vsh.yaml @@ -9,10 +9,6 @@ functionality: resources: - type: r_script path: script.R - # tests: - # - type: r_script - # path: ../../unit_tests/test_method.R - # - path: ../../../../resources_test platforms: - type: docker image: eddelbuettel/r2u:22.04 diff --git a/src/match_modality/control_methods/random_pairing/config.vsh.yaml b/src/match_modality/control_methods/random_pairing/config.vsh.yaml new file mode 100644 index 0000000000..3218f32279 --- /dev/null +++ b/src/match_modality/control_methods/random_pairing/config.vsh.yaml @@ -0,0 +1,19 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: random_pairing + namespace: match_modality/control_methods + description: Generates random pairings weights drawn from a uniform distribution. 
+ info: + type: negative_control + method_name: Random Pairing + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, numpy, scikit-learn ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file diff --git a/src/match_modality/control_methods/random_pairing/script.py b/src/match_modality/control_methods/random_pairing/script.py new file mode 100644 index 0000000000..868b6ef31c --- /dev/null +++ b/src/match_modality/control_methods/random_pairing/script.py @@ -0,0 +1,45 @@ +import anndata as ad +import numpy as np +import scipy.sparse +from sklearn.preprocessing import normalize + +# VIASH START +par = { + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", +} + +meta = { + "functionality_name": "foo" +} +# VIASH END + +print("Load datasets") +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# determine number of values in array +num_values = min(1000, input_test_mod1.n_obs) * input_test_mod1.n_obs +indices = np.random.randint(input_test_mod1.n_obs**2, size=num_values) + +mat_x = np.random.rand(num_values) +mat_i = indices % input_test_mod1.n_obs +mat_j = (indices / input_test_mod1.n_obs).astype(int) +pairing_matrix = scipy.sparse.csr_matrix( + (mat_x, (mat_i, mat_j)), + shape=(input_test_mod1.n_obs, input_test_mod2.n_obs) +) + +# row normalise +prob_matrix = normalize(pairing_matrix, norm="l1") + +# Write out prediction +prediction = ad.AnnData( + X=prob_matrix, + uns={ + "method_id": meta["functionality_name"], + 
"dataset_id": input_test_mod1.uns["dataset_id"] + } +) +prediction.write_h5ad(par["output"]) From 7653329fc77ecf2a15c77faaf808481bb94a62f0 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 16:13:18 +0100 Subject: [PATCH 66/82] add semi_solution control method --- ..._masked_sol.yaml => anndata_solution.yaml} | 0 .../api/comp_control_method.yaml | 8 +++-- src/match_modality/api/comp_mask_dataset.yaml | 16 ++-------- .../semi_solution/config.vsh.yaml | 23 +++++++++++++++ .../control_methods/semi_solution/script.R | 29 +++++++++++++++++++ 5 files changed, 60 insertions(+), 16 deletions(-) rename src/match_modality/api/{anndata_masked_sol.yaml => anndata_solution.yaml} (100%) create mode 100644 src/match_modality/control_methods/semi_solution/config.vsh.yaml create mode 100644 src/match_modality/control_methods/semi_solution/script.R diff --git a/src/match_modality/api/anndata_masked_sol.yaml b/src/match_modality/api/anndata_solution.yaml similarity index 100% rename from src/match_modality/api/anndata_masked_sol.yaml rename to src/match_modality/api/anndata_solution.yaml diff --git a/src/match_modality/api/comp_control_method.yaml b/src/match_modality/api/comp_control_method.yaml index 1faa55f61e..8dff03a367 100644 --- a/src/match_modality/api/comp_control_method.yaml +++ b/src/match_modality/api/comp_control_method.yaml @@ -4,6 +4,8 @@ functionality: __merge__: anndata_masked_mod.yaml - name: "--input_test_mod2" __merge__: anndata_masked_mod.yaml + - name: "--input_test_sol" + __merge__: anndata_solution.yaml - name: "--output" direction: "output" __merge__: anndata_prediction.yaml @@ -31,14 +33,16 @@ functionality: meta['executable'], '--input_test_mod1', testpar['input_test_mod1'], '--input_test_mod2', testpar['input_test_mod2'], + '--input_test_sol', testpar['input_test_sol'], '--output', testpar['output'] ], check=True, capture_output= True, text= True - ).stderr + ) - print(out, flush=True) + # for troubleshooting: remove 'check=True' in 
subprocess.run above and comment out below print + # print(out.stderr, flush=True) print('> Checking whether output files were created', flush=True) assert path.exists(testpar['output']) diff --git a/src/match_modality/api/comp_mask_dataset.yaml b/src/match_modality/api/comp_mask_dataset.yaml index 517b7f0f4d..0737a499a0 100644 --- a/src/match_modality/api/comp_mask_dataset.yaml +++ b/src/match_modality/api/comp_mask_dataset.yaml @@ -5,31 +5,19 @@ functionality: - name: "--input_mod2" __merge__: anndata_dataset.yaml - name: "--output_train_mod1" - direction: "output" - example: "dataset_masked.h5ad" - description: "The masked shuffled train mod1 profiles." __merge__: anndata_masked_mod.yaml - name: "--output_train_mod2" - direction: "output" - example: "dataset_masked.h5ad" - description: "The masked shuffled train mod2 profiles." __merge__: anndata_masked_mod.yaml - name: "--output_train_sol" - direction: "output" - example: "dataset_solution.h5ad" - description: "The pairing of train mod1&mod2 profiles." - __merge__: anndata_masked_sol.yaml + __merge__: anndata_solution.yaml - name: "--output_test_mod1" - direction: "output" - example: "dataset_masked.h5ad" - description: "The masked shuffled test mod1 profiles." __merge__: anndata_masked_mod.yaml - name: "--output_test_mod2" direction: "output" __merge__: anndata_masked_mod.yaml - name: "--output_test_sol" direction: "output" - __merge__: anndata_masked_sol.yaml + __merge__: anndata_solution.yaml - name: "--knn" type: "integer" default: 20 diff --git a/src/match_modality/control_methods/semi_solution/config.vsh.yaml b/src/match_modality/control_methods/semi_solution/config.vsh.yaml new file mode 100644 index 0000000000..da39c07aab --- /dev/null +++ b/src/match_modality/control_methods/semi_solution/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: semi_solution + namespace: match_modality/control_methods + description: Returns the ground-truth pairing. 
+ info: + type: positive_control + method_name: Semi-solution + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/control_methods/semi_solution/script.R b/src/match_modality/control_methods/semi_solution/script.R new file mode 100644 index 0000000000..d36833b4b0 --- /dev/null +++ b/src/match_modality/control_methods/semi_solution/script.R @@ -0,0 +1,29 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_test_sol = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad", + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_test_sol <- anndata::read_h5ad(par$input_test_sol) + +# randomly fill in gold standard values +input_test_sol$X@x <- runif(length(input_test_sol$X@x)) + +# fill other values with random values as well +ix <- sample.int(nrow(input_test_sol) * ncol(input_test_sol), nrow(input_test_sol) * 10) +input_test_sol$X[ix] <- runif(length(ix)) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(input_test_sol$X) +input_test_sol$X@x <- input_test_sol$X@x / rs[input_test_sol$X@i + 1] + +input_test_sol$uns[["method_id"]] <- meta$functionality_name + +cat("Writing predictions to file\n") +zzz <- input_test_sol$write_h5ad(par$output, compression = "gzip") From 9e8ecb1b787247354a79172d986522ca51da9fb7 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 16:17:11 +0100 Subject: [PATCH 67/82] add solution control_method --- .../control_methods/solution/config.vsh.yaml | 23 +++++++++++++++++++ 
.../control_methods/solution/script.R | 18 +++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 src/match_modality/control_methods/solution/config.vsh.yaml create mode 100644 src/match_modality/control_methods/solution/script.R diff --git a/src/match_modality/control_methods/solution/config.vsh.yaml b/src/match_modality/control_methods/solution/config.vsh.yaml new file mode 100644 index 0000000000..ca9d8d5904 --- /dev/null +++ b/src/match_modality/control_methods/solution/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: solution + namespace: match_modality/control_methods + description: Returns the ground-truth pairing. + info: + type: positive_control + method_name: Solution + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/control_methods/solution/script.R b/src/match_modality/control_methods/solution/script.R new file mode 100644 index 0000000000..88b5b746dd --- /dev/null +++ b/src/match_modality/control_methods/solution/script.R @@ -0,0 +1,18 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_test_sol = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad", + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_test_sol <- anndata::read_h5ad(par$input_test_sol) + +input_test_sol$uns[["method_id"]] <- meta$functionality_name + +cat("Writing predictions to file\n") +zzz <- input_test_sol$write_h5ad(par$output, compression = "gzip") From 
ebfd94de095427d788f0681089b39222ea211b72 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 17:21:30 +0100 Subject: [PATCH 68/82] add dr_knn_cbf method --- src/match_modality/api/comp_method.yaml | 76 +++++++++ .../methods/dr_knnr_cbf/config.vsh.yaml | 28 ++++ .../methods/dr_knnr_cbf/script.R | 144 ++++++++++++++++++ 3 files changed, 248 insertions(+) create mode 100644 src/match_modality/api/comp_method.yaml create mode 100644 src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml create mode 100644 src/match_modality/methods/dr_knnr_cbf/script.R diff --git a/src/match_modality/api/comp_method.yaml b/src/match_modality/api/comp_method.yaml new file mode 100644 index 0000000000..70528c8ccb --- /dev/null +++ b/src/match_modality/api/comp_method.yaml @@ -0,0 +1,76 @@ +functionality: + arguments: + - name: "--input_train_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_train_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--input_train_sol" + __merge__: anndata_solution.yaml + - name: "--input_test_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import numpy as np + from scipy.sparse import issparse + + # define some filenames + testpar = { + 'input_train_mod1': 'output/output_train_mod1.h5ad', + 'input_train_mod2': 'output/output_train_mod2.h5ad', + 'input_train_sol': 'output/output_train_sol.h5ad', + 'input_test_mod1': 'output/output_test_mod1.h5ad', + 'input_test_mod2': 'output/output_test_mod2.h5ad', + 'input_test_sol': 'output/output_test_sol.h5ad', + 'output': 'output/output_prediction.h5ad', + } + + print('> Running method', flush=True) + out = subprocess.run([ + meta['executable'], + '--input_train_mod1', testpar['input_train_mod1'], + 
'--input_train_mod2', testpar['input_train_mod2'], + '--input_train_sol', testpar['input_train_sol'], + '--input_test_mod1', testpar['input_test_mod1'], + '--input_test_mod2', testpar['input_test_mod2'], + '--output', testpar['output'] + ], + check=True + capture_output= True, + text= True + ) + + # for troubleshooting: remove 'check=True' in subprocess.run above and comment out below print + # print(out.stderr, flush=True) + + print('> Checking whether output files were created', flush=True) + assert path.exists(testpar['output']) + + print('> Reading h5ad files', flush=True) + ad_sol = ad.read_h5ad(testpar['input_test_sol']) + ad_pred = ad.read_h5ad(testpar['output']) + + print('> Checking dataset id', flush=True) + assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id'] + + print('> Checking method id', flush=True) + assert ad_pred.uns['method_id'] == meta['functionality_name'] + + print('> Checking X', flush=True) + assert issparse(ad_pred.X) + assert np.all([x >= 0 for x in ad_pred.X.nonzero()]), 'Values must be strictly non-negative.' + assert ad_pred.X.nonzero()[0].size <= 1000 * ad_sol.n_obs + assert ad_pred.n_obs == ad_sol.n_obs + assert ad_pred.n_vars == ad_sol.n_vars + assert np.isclose(ad_pred.X.sum(axis=1), 1, atol=1e-10).all(), 'All rows should sum to 1.' + + print('> Test succeeded!', flush=True) diff --git a/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml new file mode 100644 index 0000000000..86e00c8c9a --- /dev/null +++ b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: dr_knnr_cbf + namespace: match_modality/methods + description: Perform LMDS+Pearson DR, predict embedding with KNN regression, and match cells with KNN. + info: + type: method + method_name: LMDS+KNNR+CBF + arguments: + - name: "--n_pop" + type: "integer" + default: 100 + description: Population size. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ midmem, midtime, midcpu ] diff --git a/src/match_modality/methods/dr_knnr_cbf/script.R b/src/match_modality/methods/dr_knnr_cbf/script.R new file mode 100644 index 0000000000..021e2d6483 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_cbf/script.R @@ -0,0 +1,144 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1_rna/openproblems_bmmc_multiome_phase1_rna.censor_dataset.output_" +path <- "output/datasets/match_modality/openproblems_bmmc_cite_phase1_rna/openproblems_bmmc_cite_phase1_rna.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_pop = 300L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +cat("Read train sol\n") +input_train_sol <- anndata::read_h5ad(par$input_train_sol) + +cat("Reading mod1 h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +cat("Running LMDS on input data\n") +# merge input matrices +mod1_X <- rbind(input_train_mod1$X, input_test_mod1$X) 
+rm(input_train_mod1, input_test_mod1) +gc() + +# perform DR +dr_x1 <- lmds::lmds(mod1_X, ndim = 10, distance_method = "pearson") +rm(mod1_X) +gc() + +# split input matrices +dr_x1_train <- dr_x1[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x1_test <- dr_x1[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Reading mod1 h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat("Running LMDS on input data\n") +# merge input matrices +match_train <- input_train_sol$uns$pairing_ix + 1 +mod2_X <- rbind(input_train_mod2$X[order(match_train), , drop = FALSE], input_test_mod2$X) +rm(input_train_mod2, input_test_mod2) +gc() + +# perform DR +dr_x2 <- lmds::lmds(mod2_X, ndim = 3, distance_method = "pearson") +rm(mod2_X) +gc() + +# split input matrices +dr_x2_train <- dr_x2[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x2_test <- dr_x2[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +pred_mod1 <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Predicting mod2 DR of test cells\n") +pred_mod2 <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Minimising distances between mod1 and mod2 pairs\n") +gen_vec <- function(z) { + int <- seq_len(nrow(pred_mod1)) + + i <- j <- c() + resti <- int + restj <- int + + while (length(resti) > 0) { + ixi <- sample.int(length(resti), 1) + newi <- resti[[ixi]] + d1 <- proxy::dist(pred_mod1[restj, , drop = FALSE], dr_x1_test[newi, , drop = FALSE], method = "euclidean") + d2 <- proxy::dist(pred_mod2[restj, , drop = FALSE], dr_x2_test[newi, , drop = FALSE], method = "euclidean") + d12 <- d1 + d2 + ixj <- which.min(d12[, 1]) + newj <- restj[[ixj]] + resti <- resti[-ixi] + restj <- 
restj[-ixj] + i <- c(i, newi) + j <- c(j, newj) + + # tibble(i, j); tibble(resti, restj) + } + + tibble::tibble(i, j) +} + +outs <- pbapply::pblapply(seq_len(par$n_pop), cl = n_cores, gen_vec) +# outs <- lapply(seq_len(par$n_pop), gen_vec) +df <- bind_rows(outs) %>% + group_by(i, j) %>% + summarise(n = n(), .groups = "drop") %>% + arrange(desc(n)) %>% + mutate(gold = i == j) + +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$n, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_train_sol$uns[["dataset_id"]], + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 77a8db605b506df46d58d5d376a3dd88a1d6e92e Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 20:57:36 +0100 Subject: [PATCH 69/82] add dr_knnr_knn method --- src/match_modality/api/comp_method.yaml | 2 +- .../methods/dr_knnr_knn/config.vsh.yaml | 23 ++++ .../methods/dr_knnr_knn/script.R | 104 ++++++++++++++++++ 3 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 src/match_modality/methods/dr_knnr_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/dr_knnr_knn/script.R diff --git a/src/match_modality/api/comp_method.yaml b/src/match_modality/api/comp_method.yaml index 70528c8ccb..2286a10ffb 100644 --- a/src/match_modality/api/comp_method.yaml +++ b/src/match_modality/api/comp_method.yaml @@ -44,7 +44,7 @@ functionality: '--input_test_mod2', testpar['input_test_mod2'], '--output', testpar['output'] ], - check=True + check=True, capture_output= True, text= True ) diff --git a/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml new file mode 100644 index 
0000000000..e03bf3be48 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: dr_knnr_knn + namespace: match_modality/methods + description: Perform LMDS+Pearson DR, predict embedding with KNN regression, and match cells with KNN. + info: + type: baseline + method_name: LMDS+KNNR+KNN + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/methods/dr_knnr_knn/script.R b/src/match_modality/methods/dr_knnr_knn/script.R new file mode 100644 index 0000000000..fa32cbdf29 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_knn/script.R @@ -0,0 +1,104 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1_rna/openproblems_bmmc_multiome_phase1_rna.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad" +) +## VIASH END + +method_id <- meta$functionality_name + +cat("Read train sol\n") +input_train_sol <- anndata::read_h5ad(par$input_train_sol) + +cat("Reading mod1 h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +cat("Running 
LMDS on input data\n")
+# merge input matrices
+mod1_X <- rbind(input_train_mod1$X, input_test_mod1$X)
+rm(input_train_mod1, input_test_mod1)
+
+# perform DR
+dr_x1 <- lmds::lmds(mod1_X, ndim = 10, distance_method = "pearson")
+rm(mod1_X)
+
+# split input matrices
+dr_x1_train <- dr_x1[seq_len(nrow(input_train_sol)), , drop = FALSE]
+dr_x1_test <- dr_x1[-seq_len(nrow(input_train_sol)), , drop = FALSE]
+
+cat("Reading mod2 h5ad files\n")
+input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2)
+input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2)
+
+cat("Running LMDS on input data\n")
+# merge input matrices
+match_train <- input_train_sol$uns$pairing_ix + 1
+mod2_X <- rbind(input_train_mod2$X[order(match_train), , drop = FALSE], input_test_mod2$X)
+rm(input_train_mod2, input_test_mod2)
+
+# perform DR
+dr_x2 <- lmds::lmds(mod2_X, ndim = 3, distance_method = "pearson")
+rm(mod2_X)
+
+# split input matrices
+dr_x2_train <- dr_x2[seq_len(nrow(input_train_sol)), , drop = FALSE]
+dr_x2_test <- dr_x2[-seq_len(nrow(input_train_sol)), , drop = FALSE]
+
+cat("Predicting for each column in modality 2\n")
+preds <- apply(dr_x2_train, 2, function(yi) {
+  FNN::knn.reg(
+    train = dr_x1_train,
+    test = dr_x1_test,
+    y = yi,
+    k = min(15, nrow(dr_x1_test))
+  )$pred
+})
+
+
+cat("Performing KNN between test mod2 DR and predicted test mod2\n")
+knn_out <- FNN::get.knnx(
+  preds,
+  dr_x2_test,
+  k = min(1000, nrow(dr_x1_test))
+)
+
+cat("Creating output data structures\n")
+df <- tibble(
+  i = as.vector(row(knn_out$nn.index)),
+  j = as.vector(knn_out$nn.index),
+  x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist)
+)
+knn_mat <- Matrix::sparseMatrix(
+  i = df$i,
+  j = df$j,
+  x = df$x,
+  dims = list(nrow(dr_x1_test), nrow(dr_x2_test))
+)
+
+# normalise to make rows sum to 1
+rs <- Matrix::rowSums(knn_mat)
+knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1]
+
+cat("Creating output anndata\n")
+out <- anndata::AnnData(
+  X = as(knn_mat, "CsparseMatrix"),
+  uns = list(
+    
dataset_id = input_train_sol$uns[["dataset_id"]], + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From db5491e49d50e4c08b8edb4fc845a942c2001bc0 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 21:03:23 +0100 Subject: [PATCH 70/82] add linear method --- src/match_modality/api/authors.yaml | 6 +- .../methods/linear_knn/config.vsh.yaml | 28 +++++ .../methods/linear_knn/script.py | 105 ++++++++++++++++++ 3 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 src/match_modality/methods/linear_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/linear_knn/script.py diff --git a/src/match_modality/api/authors.yaml b/src/match_modality/api/authors.yaml index 59c6ad211a..bd2cade4d0 100644 --- a/src/match_modality/api/authors.yaml +++ b/src/match_modality/api/authors.yaml @@ -21,4 +21,8 @@ functionality: - name: Alex Tong email: alexandertongdev@gmail.com roles: [ author, maintainer ] - props: { github: atong01 } \ No newline at end of file + props: { github: atong01 } + - name: Daniel Burkhardt + email: daniel.b.burkhardt@gmail.com + roles: [ author, maintainer ] + props: { github: dburkhardt } \ No newline at end of file diff --git a/src/match_modality/methods/linear_knn/config.vsh.yaml b/src/match_modality/methods/linear_knn/config.vsh.yaml new file mode 100644 index 0000000000..ab26297bd7 --- /dev/null +++ b/src/match_modality/methods/linear_knn/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: linear_knn + namespace: match_modality/methods + description: Perform DR with Linear Regression, and match cells with kNN + info: + type: method + method_name: Linear Regression and KNN + arguments: + - name: "--n_svd" + type: "integer" + default: 10 + description: Number of SVD components to compress to + - name: "--n_neighbors" + type: "integer" + default: 10 + description: Number of neighbors for matching 
modalities
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: "python:3.10"
+    setup:
+      - type: python
+        pip: [ anndata, scipy, scikit-learn ]
+  - type: nextflow
+    directives: [ lowmem, lowtime, lowcpu ]
diff --git a/src/match_modality/methods/linear_knn/script.py b/src/match_modality/methods/linear_knn/script.py
new file mode 100644
index 0000000000..c776337a4a
--- /dev/null
+++ b/src/match_modality/methods/linear_knn/script.py
@@ -0,0 +1,105 @@
+import logging
+import anndata as ad
+import scipy.spatial
+import scipy.sparse
+import numpy as np
+
+from sklearn.decomposition import TruncatedSVD
+from sklearn.neighbors import NearestNeighbors
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import normalize
+
+## VIASH START
+# Anything within this block will be removed by `viash` and will be
+# replaced with the parameters as specified in your config.vsh.yaml.
+par = {
+    "input_train_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad",
+    "input_train_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad",
+    "input_train_sol": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad",
+    "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad",
+    "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad",
+    "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+    "n_svd": 100,
+    "n_neighbors" : 10,
+}
+
+meta = {
+    "functionality_name": "foo"
+}
+## VIASH END
+
+logging.basicConfig(level=logging.INFO)
+
+logging.info("Load datasets")
+input_train_mod1 = 
ad.read_h5ad(par["input_train_mod1"])
+input_train_mod2 = ad.read_h5ad(par["input_train_mod2"])
+input_train_sol = ad.read_h5ad(par["input_train_sol"])
+input_test_mod1 = ad.read_h5ad(par["input_test_mod1"])
+input_test_mod2 = ad.read_h5ad(par["input_test_mod2"])
+
+# This method runs PCA on each modality individually, then runs linear regression to predict mod2
+# from mod1 and finally performs kNN to match modalities
+
+# unscramble training cells
+ord = np.argsort(input_train_sol.uns['pairing_ix'])
+input_train_mod2 = input_train_mod2[ord, :]
+
+# concatenate train and test data
+mod1 = ad.concat(
+    {
+        "train": input_train_mod1,
+        "test": input_test_mod1
+    },
+    index_unique="-",
+    label="group"
+)
+mod2 = ad.concat(
+    {
+        "train": input_train_mod2,
+        "test": input_test_mod2
+    },
+    index_unique="-",
+    label="group"
+)
+
+# Create helper views to access the test data later
+mod1tr = mod1[mod1.obs["group"] == "train", :]
+mod2tr = mod2[mod2.obs["group"] == "train", :]
+
+mod1te = mod1[mod1.obs["group"] == "test", :]
+mod2te = mod2[mod2.obs["group"] == "test", :]
+
+logging.info("Running PCA")
+n_svd = min(par["n_svd"], mod1.n_obs, mod2.n_obs, mod1.n_vars, mod2.n_vars)
+
+# Use TruncatedSVD for fast decomposition of the data
+mod1.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod1.X)
+mod2.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod2.X)
+
+reg = LinearRegression()
+
+reg.fit(mod1tr.obsm["X_pca"], mod2tr.obsm["X_pca"])
+mod2te_pred = reg.predict(mod1te.obsm["X_pca"])
+
+neighbors = NearestNeighbors(n_neighbors=np.min((mod2te.shape[0], par["n_neighbors"])), n_jobs=-1)
+neighbors = neighbors.fit(mod2te_pred)
+
+distances, indices = neighbors.kneighbors(mod2te.obsm["X_pca"])
+
+prediction = np.zeros((mod2te.shape[0], mod2te.shape[0]))
+for i, neighbors in enumerate(indices):
+    prediction[i, neighbors] = 1 / distances[i]
+
+prediction = normalize(prediction, norm="l1")
+
+prediction = scipy.sparse.csr_matrix(prediction)
+
+print("Write prediction output")
+prediction = ad.AnnData( + X=prediction, + uns={ + "dataset_id": input_train_mod1.uns["dataset_id"], + "method_id": meta["functionality_name"] + } +) +prediction.write_h5ad(par["output"]) From a5b78036a985eacefe37bd3864445cc627bf0a24 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 21:15:03 +0100 Subject: [PATCH 71/82] add newwave_knnr_cbf method --- .../methods/newwave_knnr_cbf/config.vsh.yaml | 41 ++++ .../methods/newwave_knnr_cbf/script.R | 188 ++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml create mode 100644 src/match_modality/methods/newwave_knnr_cbf/script.R diff --git a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml new file mode 100644 index 0000000000..6e62d14c3a --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave_knnr_cbf + namespace: match_modality/methods + description: Perform DR with NewWave, predict embedding with KNN regression, and matching cells with a consensus best fit algorithm. + info: + type: method + method_name: NewWave+KNNR+CBF + arguments: + - name: "--n_pop" + type: "integer" + default: 300 + description: Population size. + - name: "--newwave_maxiter" + type: "integer" + default: 40 + description: Maximum number of NewWave iterations. + - name: "--newwave_ngene" + type: "integer" + default: 200 + description: Setting of the n_gene_par NewWave parameter. + - name: "--newwave_ncell" + type: "integer" + default: 200 + description: Setting of the n_cell_par NewWave parameter. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply, proxy, proxyC, bioconductor] + bioc: [ SingleCellExperiment, NewWave ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_cbf/script.R b/src/match_modality/methods/newwave_knnr_cbf/script.R new file mode 100644 index 0000000000..ddc291a517 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_cbf/script.R @@ -0,0 +1,188 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("pbapply", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("FNN", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_pop = 300L, + newwave_maxiter = 10 +) +meta <- list(functionality_name = "foo") + +# # read in solution data to check whether method is working +# input_test_sol <- anndata::read_h5ad(paste0(path, "test_sol.h5ad")) +# match_test <- input_test_sol$uns$pairing_ix + 1 +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +input_train_sol <- 
anndata::read_h5ad(par$input_train_sol) +match_train <- input_train_sol$uns$pairing_ix + 1 + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +# fetch a few variables +train_ix <- seq_len(nrow(input_train_mod1)) +did <- input_train_mod1$uns[["dataset_id"]] +batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) + +cat("Running NewWave\n") +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod1$layers[["counts"]]), t(input_test_mod1$layers[["counts"]]))), + colData = data.frame(batch = factor(batch1)) +) +data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] +# option 1: filter by HVG +# data1 <- data1[order(proxyC::rowSds(SummarizedExperiment::assay(data1)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod1, input_test_mod1) +gc() + +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data1)), + n_cell_par = min(par$newwave_ncell, ncol(data1)), + commondispersion = FALSE +) +rm(data1) +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +# don't know batch ordering in input_test_mod2 +batch2 <- c(as.character(input_train_sol$obs$batch), rep("unknownbatch", nrow(input_test_mod2))) + +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod2$layers[["counts"]][order(match_train), , drop = FALSE]), t(input_test_mod2$layers[["counts"]]))), + colData = data.frame(batch = factor(batch2)) +) +data2 <- data2[Matrix::rowSums(SummarizedExperiment::assay(data2)) > 0, ] +# data2 <- 
data2[order(proxyC::rowSds(SummarizedExperiment::assay(data2)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod2, input_test_mod2) +gc() + +cat("Running NewWave\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data2)), + n_cell_par = min(par$newwave_ncell, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) + +colnames(dr_x1) <- paste0("comp_", seq_len(ncol(dr_x1))) +colnames(dr_x2) <- paste0("comp_", seq_len(ncol(dr_x2))) + +# split up DR matrices +dr_x1_train <- dr_x1[train_ix, , drop = FALSE] +dr_x1_test <- dr_x1[-train_ix, , drop = FALSE] +dr_x2_train <- dr_x2[train_ix, , drop = FALSE] +dr_x2_test <- dr_x2[-train_ix, , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +pred_mod1 <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Predicting mod2 DR of test cells\n") +pred_mod2 <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Minimising distances between mod1 and mod2 pairs\n") +gen_vec <- function(z) { + int <- seq_len(nrow(pred_mod1)) + + i <- j <- c() + resti <- int + restj <- int + + while (length(resti) > 0) { + ixi <- sample.int(length(resti), 1) + newi <- resti[[ixi]] + d1 <- proxy::dist(pred_mod1[restj, , drop = FALSE], dr_x1_test[newi, , drop = FALSE], method = "euclidean") + d2 <- proxy::dist(pred_mod2[restj, , drop = FALSE], dr_x2_test[newi, , drop = FALSE], method = "euclidean") + d12 <- d1 + d2 + ixj <- which.min(d12[, 1]) + newj <- restj[[ixj]] + resti <- resti[-ixi] + restj <- restj[-ixj] + i <- c(i, newi) + j <- c(j, newj) + + # tibble(i, j); tibble(resti, restj) + } + + tibble::tibble(i, j) +} +outs <- 
pbapply::pblapply(seq_len(par$n_pop), cl = n_cores, gen_vec) + +# outs <- lapply(seq_len(par$n_pop), gen_vec) +df <- bind_rows(outs) %>% + group_by(i, j) %>% + summarise(n = n(), .groups = "drop") %>% + arrange(desc(n)) %>% + mutate(gold = i == j) + +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$n, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = did, + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From b70bf42f071b4be09ea34e964fe2305701caa31d Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 21:27:31 +0100 Subject: [PATCH 72/82] add newwave_knnr_knn method --- .../methods/newwave_knnr_cbf/config.vsh.yaml | 2 +- .../methods/newwave_knnr_knn/config.vsh.yaml | 36 ++++ .../methods/newwave_knnr_knn/script.R | 162 ++++++++++++++++++ 3 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/newwave_knnr_knn/script.R diff --git a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml index 6e62d14c3a..9fbf3a8887 100644 --- a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml +++ b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml @@ -31,7 +31,7 @@ platforms: image: eddelbuettel/r2u:22.04 setup: - type: r - cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply, proxy, proxyC, bioconductor] + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply, proxy, proxyC] bioc: [ SingleCellExperiment, NewWave ] - type: apt packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] 
diff --git a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml new file mode 100644 index 0000000000..f02a5ed909 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml @@ -0,0 +1,36 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: baseline_newwave_knnr_knn + namespace: match_modality/methods + description: Perform DR with NewWave, predict embedding with KNN regression, and match cells with a KNN. + info: + type: method + method_name: NewWave+KNNR+KNN + arguments: + - name: "--newwave_maxiter" + type: "integer" + default: 40 + description: Maximum number of NewWave iterations. + - name: "--newwave_ngene" + type: "integer" + default: 200 + description: Setting of the n_gene_par NewWave parameter. + - name: "--newwave_ncell" + type: "integer" + default: 200 + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN] + bioc: [ SingleCellExperiment, NewWave ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8, tensorflow ] + - type: nextflow + directives: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_knn/script.R b/src/match_modality/methods/newwave_knnr_knn/script.R new file mode 100644 index 0000000000..2810af1ab2 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_knn/script.R @@ -0,0 +1,162 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("FNN", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- 
"resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +# path <- "output/public_datasets/match_modality/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +path <- "output/public_datasets/match_modality/dyngen_atac_1/dyngen_atac_1.censor_dataset.output_" +# path <- "debug/debug." +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_dims = 10L, + distance_method = "spearman", + n_ga_pop = 200L, + n_ga_iter = 500L +) +meta <- list(functionality_name = "foo") + +# # read in solution data to check whether method is working +input_test_sol <- anndata::read_h5ad(paste0(path, "test_sol.h5ad")) +match_test <- input_test_sol$uns$pairing_ix + 1 +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +match_train <- input_train_sol$uns$pairing_ix + 1 + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +# fetch a few variables +train_ix <- seq_len(nrow(input_train_mod1)) +did <- input_train_mod1$uns[["dataset_id"]] +batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) + +cat("Running NewWave\n") +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod1$layers[["counts"]]), t(input_test_mod1$layers[["counts"]]))), + colData = data.frame(batch = factor(batch1)) +) +data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] +# option 1: filter by HVG +# data1 <- data1[order(proxyC::rowSds(SummarizedExperiment::assay(data1)), decreasing = 
TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod1, input_test_mod1) +gc() + +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data1)), + n_cell_par = min(par$newwave_ncell, ncol(data1)), + commondispersion = FALSE +) +rm(data1) +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +# don't know batch ordering in input_test_mod2 +batch2 <- c(as.character(input_train_sol$obs$batch), rep("unknownbatch", nrow(input_test_mod2))) + +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod2$layers[["counts"]][order(match_train), , drop = FALSE]), t(input_test_mod2$layers[["counts"]]))), + colData = data.frame(batch = factor(batch2)) +) +data2 <- data2[Matrix::rowSums(SummarizedExperiment::assay(data2)) > 0, ] +# data2 <- data2[order(proxyC::rowSds(SummarizedExperiment::assay(data2)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod2, input_test_mod2) +gc() + +cat("Running NewWave\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data2)), + n_cell_par = min(par$newwave_ncell, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) + +colnames(dr_x1) <- paste0("comp_", seq_len(ncol(dr_x1))) +colnames(dr_x2) <- paste0("comp_", seq_len(ncol(dr_x2))) + +# split up DR matrices +dr_x1_train <- dr_x1[train_ix, , drop = FALSE] +dr_x1_test <- dr_x1[-train_ix, , drop = FALSE] +dr_x2_train <- dr_x2[train_ix, , drop = FALSE] +dr_x2_test <- dr_x2[-train_ix, , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +preds <- apply(dr_x1_train, 2, 
function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + + +cat("Performing KNN between test mod2 DR and predicted test mod2\n") +knn_out <- FNN::get.knnx( + preds, + dr_x2_test, + k = min(1000, nrow(preds)) +) + +cat("Creating output data structures\n") +df <- tibble( + i = as.vector(row(knn_out$nn.index)), + j = as.vector(knn_out$nn.index), + x = 1000 - as.vector(col(knn_out$nn.index)) + 1 + # x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist) +) +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$x, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = did, + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 7134f449c1c68c8977c29e36b031e6d8656272c6 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 10 Jan 2023 21:37:50 +0100 Subject: [PATCH 73/82] add procrusted_knn method --- .../methods/procrustes_knn/config.vsh.yaml | 24 ++++ .../methods/procrustes_knn/script.py | 112 ++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/match_modality/methods/procrustes_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/procrustes_knn/script.py diff --git a/src/match_modality/methods/procrustes_knn/config.vsh.yaml b/src/match_modality/methods/procrustes_knn/config.vsh.yaml new file mode 100644 index 0000000000..1c317ca3d9 --- /dev/null +++ b/src/match_modality/methods/procrustes_knn/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: procrustes_knn + namespace: match_modality/methods + description: Perform DR with Procrustes, and match cells with a GA to minimise pairwise distances. 
+ info: + type: method + method_name: Procrustes+KNN + arguments: + - name: "--n_svd" + type: "integer" + default: 100 + description: Number of SVD components to compress to + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, scipy, scikit-learn ] + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/methods/procrustes_knn/script.py b/src/match_modality/methods/procrustes_knn/script.py new file mode 100644 index 0000000000..3e16b5fbb7 --- /dev/null +++ b/src/match_modality/methods/procrustes_knn/script.py @@ -0,0 +1,112 @@ +import logging +import anndata as ad +import scipy.spatial +import scipy.sparse +import numpy as np + +from sklearn.preprocessing import normalize +from sklearn.decomposition import TruncatedSVD +from sklearn.neighbors import NearestNeighbors + +## VIASH START + +# Anything within this block will be removed by `viash` and will be +# replaced with the parameters as specified in your config.vsh.yaml. 
+ +par = { + "input_train_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad", + "input_train_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad", + "input_train_sol": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad", + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "n_svd": 100, +} + +meta = { + "functionality_name": "foo" +} +## VIASH END + +logging.basicConfig(level=logging.INFO) + +logging.info("Load datasets") +input_train_mod1 = ad.read_h5ad(par["input_train_mod1"]) +input_train_mod2 = ad.read_h5ad(par["input_train_mod2"]) +# input_train_sol = ad.read_h5ad(par["input_train_sol"]) +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# This method runs PCA on each modality individually, then uses the Procrustes method to identify +# a linear transform that best superimposes the points from modality 1 onto modality 2. 
+
+# concatenate train and test data
+mod1 = ad.concat(
+    {
+        "train": input_train_mod1,
+        "test": input_test_mod1
+    },
+    index_unique="-",
+    label="group"
+)
+mod2 = ad.concat(
+    {
+        "train": input_train_mod2,
+        "test": input_test_mod2
+    },
+    index_unique="-",
+    label="group"
+)
+
+# Create helper views to access the test data later
+mod1te = mod1[mod1.obs["group"] == "test", :]
+mod2te = mod2[mod2.obs["group"] == "test", :]
+
+logging.info("Running PCA")
+n_svd = min(par["n_svd"], mod1.n_obs, mod2.n_obs, mod1.n_vars, mod2.n_vars)
+
+# Use TruncatedSVD for fast decomposition of the data
+mod1.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod1.X)
+mod2.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod2.X)
+
+logging.info("Running Procrustes Alignment")
+# This function takes in two matrices of points A and B, standardizes both, and applies a linear transformation to
+# matrix B to minimize the disparity measured as the sum of the squares of the pointwise distances
+# between the two input datasets
+mod1.obsm["X_pro"], mod2.obsm["X_pro"], disparity = scipy.spatial.procrustes(
+    mod1.obsm["X_pca"],
+    mod2.obsm["X_pca"],
+)
+logging.info("> Disparity value is: %0.3f" % disparity)
+
+logging.info("Perform nearest neighbors")
+# To get the matching matrix, for each point in mod1_test, we take the 1000 nearest neighbors of that
+# point in the transformed mod2_test dataset
+n_neighbors = min(1000, mod1te.n_obs, mod1te.n_vars, mod2te.n_obs, mod2te.n_vars)
+nn = NearestNeighbors(n_neighbors=n_neighbors).fit(mod1te.obsm["X_pro"])
+distances, indices = nn.kneighbors(X=mod2te.obsm["X_pro"])
+
+logging.info("Create pairing matrix")
+# Translate the neighborhood assignments to a pairing matrix that is (n_obs, n_obs)
+# NOTE: `pairing_matrix` must have NO MORE than 1000*n_obs non-zero entries for fast metric computation
+ind_i = np.tile(np.arange(mod1te.n_obs), (n_neighbors, 1)).T.flatten()
+ind_j = indices.flatten()
+ind_dist = distances.flatten()
+ind_x = 2 * max(ind_dist) - ind_dist 
+pairing_matrix = scipy.sparse.csr_matrix( + (ind_x, (ind_i, ind_j)), + shape=(input_test_mod1.n_obs, input_test_mod2.n_obs) +) + +# row normalise +prob_matrix = normalize(pairing_matrix, norm="l1") + +print("Write prediction output") +prediction = ad.AnnData( + X=prob_matrix, + uns={ + "dataset_id": input_train_mod1.uns["dataset_id"], + "method_id": meta["functionality_name"] + } +) +prediction.write_h5ad(par["output"]) From 91e41c1039d576c26257986544c2577fc0a5871c Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Wed, 11 Jan 2023 08:54:40 +0100 Subject: [PATCH 74/82] add babel_knn method --- .../methods/babel_knn/config.vsh.yaml | 40 ++++ src/match_modality/methods/babel_knn/script.R | 195 ++++++++++++++++++ 2 files changed, 235 insertions(+) create mode 100644 src/match_modality/methods/babel_knn/config.vsh.yaml create mode 100644 src/match_modality/methods/babel_knn/script.R diff --git a/src/match_modality/methods/babel_knn/config.vsh.yaml b/src/match_modality/methods/babel_knn/config.vsh.yaml new file mode 100644 index 0000000000..55c4498547 --- /dev/null +++ b/src/match_modality/methods/babel_knn/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_method.yaml +functionality: + status: disabled + name: babel_knn + namespace: match_modality/methods + description: Predict test expression with BABEL and match cells with KNN. + info: + type: method + method_name: Babel+KNN + paper_doi: "10.1073/pnas.2023070118" + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to use for dimensionality reduction. + - name: "--n_neigh" + type: "integer" + default: 10 + description: Number of neighbors for KNN. 
Match probability will be 1/n_neigh + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, FNN ] + bioc: [ DropletUtils ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git] + - type: python + pip: [ anndata>=0.8 ] + - type: docker + run: + - wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh -O ~/miniconda.sh && /bin/bash ~/miniconda.sh -b -p /opt/conda && /opt/conda/bin/conda -V + - cd / && git clone --depth 1 https://github.com/rcannood/babel.git + - cd /babel && /opt/conda/bin/conda env create -f environment.yml + - type: nextflow + directives: [ lowmem, lowtime, lowcpu ] diff --git a/src/match_modality/methods/babel_knn/script.R b/src/match_modality/methods/babel_knn/script.R new file mode 100644 index 0000000000..42702b6150 --- /dev/null +++ b/src/match_modality/methods/babel_knn/script.R @@ -0,0 +1,195 @@ +cat(">> Loading dependencies\n") + +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +library(testthat, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("DropletUtils", quietly = TRUE) + +options(tidyverse.quiet = TRUE) +library(tidyverse) + +babel_location <- "/babel/bin/" +conda_bin <- "/opt/conda/bin/conda" + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_dims = 10, + n_neighs = 10 +) +conda_bin <- "conda" +babel_location <- "../babel/bin/" +## VIASH END + +input_train_mod2 <- 
anndata::read_h5ad(par$input_train_mod2, backed = TRUE) +if (input_train_mod2$var$feature_types[[1]] != "ATAC") { + cat("Error: babel only runs on GEX to ATAC datasets\n") + quit(save = "no", status = 42) +} + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat(">> Reading h5ad files\n") +if (is.null(input_train_mod1$var$gene_ids)) input_train_mod1$var$gene_ids <- colnames(input_train_mod1) +if (is.null(input_train_mod2$var$gene_ids)) input_train_mod2$var$gene_ids <- colnames(input_train_mod2) +if (is.null(input_test_mod1$var$gene_ids)) input_test_mod1$var$gene_ids <- colnames(input_test_mod1) +if (is.null(input_test_mod2$var$gene_ids)) input_test_mod2$var$gene_ids <- colnames(input_test_mod2) + +mod1 <- as.character(unique(input_train_mod1$var$feature_types)) +mod2 <- as.character(unique(input_train_mod2$var$feature_types)) + +# multiome_matrix for export to Babel's input format +multiome_matrix <- cbind(input_train_mod1$X, input_train_mod2$X) + +# generate multiome anndata object for training +ad_babel <- anndata::AnnData( + X = multiome_matrix, + var = bind_rows(input_train_mod1$var, input_train_mod2$var) +) + +# setting up babel dirs +tmpdir <- tempfile(pattern = "babel_temp", fileext = "/") +cat(">> Setting up directories for babel at ", tmpdir, "\n", sep = "") +dir.create(tmpdir) +on.exit(unlink(tmpdir, recursive = TRUE)) + +dir_data <- paste0(tmpdir, "data/") # location of input files +dir.create(dir_data) +dir_model <- paste0(tmpdir, "model/") # location of babel model +dir_pred <- paste0(tmpdir, "pred/") # location of predictions + +feature_type_map <- c( + "GEX" = "Gene Expression", + "ADT" = "Peaks", # try to make it run on ADT data as well + "ATAC" = "Peaks" +) + 
+cat(">> Writing train dataset as 10x-CellRanger H5 format\n")
+DropletUtils::write10xCounts(
+  paste0(dir_data, "train_input.h5"),
+  t(ad_babel$X),
+  gene.id = ad_babel$var$gene_ids,
+  gene.symbol = colnames(ad_babel),
+  barcodes = rownames(ad_babel),
+  type = "HDF5",
+  version = "3",
+  genome = "GRCh38",
+  gene.type = unname(feature_type_map[ad_babel$var$feature_types]),
+  overwrite = TRUE
+)
+
+cat(">> Writing test dataset as 10x-CellRanger H5 format\n")
+DropletUtils::write10xCounts(
+  paste0(dir_data, "test_input.h5"),
+  t(input_test_mod1$X),
+  gene.id = input_test_mod1$var$gene_ids,
+  gene.symbol = colnames(input_test_mod1),
+  barcodes = rownames(input_test_mod1),
+  type = "HDF5",
+  version = "3",
+  genome = "GRCh38",
+  gene.type = unname(feature_type_map[input_test_mod1$var$feature_types]),
+  overwrite = TRUE
+)
+
+
+cat(">> Babel: train model\n")
+babel_train_cmd <- paste0(
+  conda_bin, " run -n babel ",
+  "python ", babel_location, "train_model.py ",
+  "--data ", dir_data, "train_input.h5 ",
+  "--outdir ", dir_model, " ",
+  "--nofilter"
+)
+# stringent filtering causes babel to sometimes fail
+# reason: https://github.com/wukevin/babel/blob/main/babel/sc_data_loaders.py#L168-L190
+
+out1 <- system(babel_train_cmd)
+
+# check whether training succeeded
+expect_equal(out1, 0, info = paste0("Model training failed with exit code ", out1))
+
+cat(">> Babel: predict from model\n")
+babel_pred_cmd <- paste0(
+  conda_bin, " run -n babel ",
+  "python ", babel_location, "predict_model.py ",
+  "--checkpoint ", dir_model, " ",
+  "--data ", dir_data, "test_input.h5 ",
+  "--outdir ", dir_pred, " ",
+  "--nofilter"
+)
+out2 <- system(babel_pred_cmd)
+
+# check whether prediction succeeded
+expect_equal(out2, 0, info = paste0("Prediction failed with exit code ", out2))
+
+cat(">> Read predictions\n")
+pred <- anndata::read_h5ad(paste0(dir_pred, "/rna_atac_adata.h5ad"))
+
+#######################################
+##### KNN
+
+# Only some features are present in Babel's 
output
+input_test_mod2_filter <- input_test_mod2[, row.names(input_test_mod2$var) %in% row.names(pred$var)]
+
+# Dimensional reduction of both predicted and test profiles
+pred_profiles <- pred[, row.names(input_test_mod2_filter$var)]$X
+
+cat("Performing DR on test values\n")
+dr <- lmds::lmds(
+  rbind(pred_profiles, input_test_mod2_filter$X),
+  ndim = par$n_dims,
+  distance_method = par$distance_method
+)
+
+train_ix <- seq_len(nrow(pred_profiles))
+dr_preds <- dr[train_ix, , drop = FALSE]
+dr_test_mod2 <- dr[-train_ix, , drop = FALSE]
+
+
+cat("Performing KNN between test mod2 DR and predicted test mod2\n")
+knn_out <- FNN::get.knnx(
+  dr_preds,
+  dr_test_mod2,
+  k = min(1000, nrow(dr_preds))
+)
+
+cat("Creating output data structures\n")
+df <- tibble(
+  i = as.vector(row(knn_out$nn.index)),
+  j = as.vector(knn_out$nn.index),
+  x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist)
+)
+knn_mat <- Matrix::sparseMatrix(
+  i = df$i,
+  j = df$j,
+  x = df$x,
+  dims = list(nrow(input_test_mod1), nrow(input_test_mod2))
+)
+
+# normalise to make rows sum to 1
+rs <- Matrix::rowSums(knn_mat)
+knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1]
+
+cat("Creating output anndata\n")
+out <- anndata::AnnData(
+  X = as(knn_mat, "CsparseMatrix"),
+  uns = list(
+    dataset_id = input_train_mod1$uns[["dataset_id"]],
+    method_id = "baseline_babel_knn"
+  )
+)
+
+cat("Writing predictions to file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")
\ No newline at end of file

From eb1e5f550ca87bc1b936c79889f001fde0a2d54d Mon Sep 17 00:00:00 2001
From: KaiWaldrant
Date: Thu, 12 Jan 2023 14:53:22 +0100
Subject: [PATCH 75/82] add aupr metrics

---
 src/match_modality/api/anndata_score.yaml | 25 ++++
 src/match_modality/api/comp_method.yaml | 1 +
 src/match_modality/api/comp_metric.yaml | 85 +++++++++++
 .../mask_dataset/config.vsh.yaml | 6 +-
 .../metrics/aupr/config.vsh.yaml | 40 ++++++
 src/match_modality/metrics/aupr/script.R | 136 ++++++++++++++++++
 
src/match_modality/metrics/aupr/test_custom.R | 100 +++++++++++++ 7 files changed, 390 insertions(+), 3 deletions(-) create mode 100644 src/match_modality/api/anndata_score.yaml create mode 100644 src/match_modality/api/comp_metric.yaml create mode 100644 src/match_modality/metrics/aupr/config.vsh.yaml create mode 100644 src/match_modality/metrics/aupr/script.R create mode 100644 src/match_modality/metrics/aupr/test_custom.R diff --git a/src/match_modality/api/anndata_score.yaml b/src/match_modality/api/anndata_score.yaml new file mode 100644 index 0000000000..bfe79f07cc --- /dev/null +++ b/src/match_modality/api/anndata_score.yaml @@ -0,0 +1,25 @@ +type: file +description: "Metric score file" +example: "output.h5ad" +info: + short_description: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." 
+ multiple: true + required: true diff --git a/src/match_modality/api/comp_method.yaml b/src/match_modality/api/comp_method.yaml index 2286a10ffb..87e45f646b 100644 --- a/src/match_modality/api/comp_method.yaml +++ b/src/match_modality/api/comp_method.yaml @@ -11,6 +11,7 @@ functionality: - name: "--input_test_mod2" __merge__: anndata_masked_mod.yaml - name: "--output" + direction: output __merge__: anndata_prediction.yaml test_resources: - path: ../../../../output diff --git a/src/match_modality/api/comp_metric.yaml b/src/match_modality/api/comp_metric.yaml new file mode 100644 index 0000000000..12c5a070ac --- /dev/null +++ b/src/match_modality/api/comp_metric.yaml @@ -0,0 +1,85 @@ +functionality: + arguments: + - name: --input_prediction + __merge__: anndata_prediction.yaml + - name: --input_solution + __merge__: anndata_solution.yaml + - name: --output + __merge__: anndata_score.yaml + direction: output + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + import yaml + + input_prediction_path = "output/dr_knnr_cbf_prediction.h5ad" + input_solution_path = "output/output_test_sol.h5ad" + output_path = "output.h5ad" + # define some filenames + with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print("> Running method", flush=True) + out = subprocess.run(cmd, stderr=subprocess.STDOUT, check=True) + + # print(out.stdout) + + print("> Checking whether output files were created", flush=True) + assert path.exists(output_path) + + print("> Reading h5ad files", flush=True) + input_prediction = ad.read_h5ad(input_prediction_path) + input_solution = ad.read_h5ad(input_solution_path) + output = ad.read_h5ad(output_path) + + # Create DF from metric config info + 
metric_info = config['functionality']['info']['metrics'] + metric_meta = pd.DataFrame(metric_info) + metric_meta = metric_meta.astype({'metric_id': str, 'metric_name': str, 'metric_description': str, 'min': float, 'max': float, 'maximize': bool}) + print("> Checking contents of metric info", flush=True) + assert 'metric_id' in metric_meta + assert 'min' in metric_meta + assert 'max' in metric_meta + assert 'maximize' in metric_meta + + print("> Checking .uns['dataset_id']", flush=True) + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']", flush=True) + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']", flush=True) + assert 'metric_ids' in output.uns + assert set(output.uns['metric_ids']) == set(metric_meta.metric_id) + + print("> Checking .uns['metric_values']", flush=True) + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if metric_value lies within the expected range + output_uns = pd.DataFrame({ + 'metric_id': output.uns['metric_ids'], + 'value': output.uns['metric_values'] + }) + + scores = metric_meta.merge(output_uns, on="metric_id") + + assert all(scores.value >= scores['min']) + assert all(scores.value <= scores['max']) + + print("> Test succeeded!", flush=True) diff --git a/src/match_modality/mask_dataset/config.vsh.yaml b/src/match_modality/mask_dataset/config.vsh.yaml index 5dd18a45f9..ac341f3b84 100644 --- a/src/match_modality/mask_dataset/config.vsh.yaml +++ b/src/match_modality/mask_dataset/config.vsh.yaml @@ -1,4 +1,4 @@ -_merge__: ../api/comp_mask_dataset.yaml +__merge__: ../api/comp_mask_dataset.yaml functionality: name: mask_dataset namespace: match_modality @@ -19,5 +19,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - publish: true - 
directives: [ highmem, midtime, highcpu ] + directives: + label: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/metrics/aupr/config.vsh.yaml b/src/match_modality/metrics/aupr/config.vsh.yaml new file mode 100644 index 0000000000..3b655555d1 --- /dev/null +++ b/src/match_modality/metrics/aupr/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: aupr + namespace: match_modality/metrics + description: Calculating basic metrics for task2. + info: + metrics: + - metric_id: pairing_auroc + metric_name: Pairing auroc + metric_description: Area Under ROC curve + maximize: true + min: 0 + max: 1 + - metric_id: pairing_aupr + metric_name: Pairing aupr + metric_description: Area Under PR curve + maximize: true + min: 0 + max: 1 + resources: + - type: r_script + path: script.R + test_resources: + - type: r_script + path: test_custom.R + - path: ../../../../resources_test +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, pracma, tidyverse, testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8, pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowtime, lowcpu ] + diff --git a/src/match_modality/metrics/aupr/script.R b/src/match_modality/metrics/aupr/script.R new file mode 100644 index 0000000000..1609d6f673 --- /dev/null +++ b/src/match_modality/metrics/aupr/script.R @@ -0,0 +1,136 @@ +cat("Load dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("pracma", quietly = TRUE) + +## VIASH START +par <- list( + input_solution = "output/output_test_sol.h5ad", + input_prediction = "output/dr_knnr_cbf_prediction.h5ad", + output = "output/aupr_score.h5ad" +) +## VIASH END + 
+cat("Read solution h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r") + +cat("Read prediction h5ad\n") +expect_true( + grepl("\\.h5ad$", par$input_prediction), + info = "Prediction file should be an h5ad file" +) +ad_pred <- + tryCatch({ + anndata::read_h5ad(par$input_prediction) + }, error = function(e) { + stop(paste0("Can't open prediction h5ad file. Detailed error message:\n", e$message)) + }) +expect_true( + ad_sol$uns$dataset_id == ad_pred$uns$dataset_id +) + +X_pred <- as(ad_pred$X, "CsparseMatrix")[, order(ad_sol$uns$pairing_ix)] +dimnames(X_pred) <- list(NULL, NULL) + +cat("Data wrangling\n") +pred_summ <- summary(X_pred) %>% + as_tibble() %>% + mutate(gold = i == j) %>% + arrange(desc(x)) + +# helper function +calculate_au <- function(values, are_true, num_positive_interactions, num_possible_interactions, extend_by = 10000) { + ord <- order(rank(values, ties.method = "random"), decreasing = TRUE) + values <- values[ord] + are_true <- are_true[ord] + + # calculate base statistics + num_selected <- seq_along(are_true) + tp <- cumsum(are_true) + fp <- num_selected - tp + length_ranking <- length(tp) + num_negative_interactions <- num_possible_interactions - num_positive_interactions + + # extend base statistics, if necessary + if (extend_by > 0 && length_ranking != num_possible_interactions) { + diff.predictions <- num_possible_interactions - length_ranking + diff.trues <- num_positive_interactions - tail(tp, 1) + diff.negs <- num_negative_interactions - tail(fp, 1) + + multiplier <- seq_len(extend_by) / extend_by + + extra_num_selected <- multiplier * diff.predictions + tail(num_selected, 1) + extra_tp <- multiplier * diff.trues + tail(tp, 1) + extra_fp <- multiplier * diff.negs + tail(fp, 1) + + num_selected <- c(num_selected, extra_num_selected) + are_true <- c(are_true, rep(NA, extend_by)) + values <- c(values, rep(NA, extend_by)) + tp <- c(tp, extra_tp) + fp <- c(fp, extra_fp) + } + + # calculate extended statistics + metrics <- 
tibble( + num_selected = c(0, num_selected), + value = c(NA, values), + are_true = c(NA, are_true), + tp = c(0, tp), + fp = c(0, fp), + fn = num_positive_interactions - tp, + tn = num_negative_interactions - fp, + acc = (tp + tn) / (num_positive_interactions + num_negative_interactions), + tpr = tp / num_positive_interactions, + spec = tn / num_negative_interactions, + prec = ifelse(num_selected == 0, 1, tp / (tp + fp)), + npv = tn / (tn + fn), + f1 = 2 * tp / (2 * tp + fp + fn), + mcc = ifelse(num_selected == 0, 0, (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))), + informedness = tpr + spec - 1, + markedness = prec + npv - 1 + ) + + # calculate area under the curves + area_under <- tibble( + auroc = pracma::trapz(1 - metrics$spec, metrics$tpr), + aupr = abs(pracma::trapz(metrics$tpr, metrics$prec)) + ) + + list(metrics = metrics, area_under = area_under) +} + + +cat("Calculate area under the curve\n") +au_out <- calculate_au( + values = pred_summ$x, + are_true = pred_summ$gold, + num_positive_interactions = nrow(X_pred), + num_possible_interactions = (nrow(X_pred) * 1.0) * nrow(X_pred) +) + +# GENIE3bis::plot_curves(au_out) + +colnames(au_out$area_under) <- paste0("pairing_", colnames(au_out$area_under)) + +cat("Create output object\n") +out_values <- c( + as.list(au_out$area_under) +) + +out <- anndata::AnnData( + X = NULL, + shape = dim(ad_sol), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = names(out_values), + metric_values = as.numeric(out_values), + genie3 = au_out + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") \ No newline at end of file diff --git a/src/match_modality/metrics/aupr/test_custom.R b/src/match_modality/metrics/aupr/test_custom.R new file mode 100644 index 0000000000..61e120f8d4 --- /dev/null +++ b/src/match_modality/metrics/aupr/test_custom.R @@ -0,0 +1,100 @@ +library(assertthat, quietly = TRUE) 
+library(Matrix, quietly = TRUE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +# This code block will be replaced by viash at runtime. +meta <- list(functionality_name = "foo") +## VIASH END + +# determine filenames and arguments +testpar <- list( + "input_solution" = "temp_sol.h5ad", + "input_prediction" = "temp_pred.h5ad", + "output" = "temp_out.h5ad" +) +command <- paste0("./", meta[["functionality_name"]]) +args <- unlist(rbind(paste0("--", names(testpar)), unname(testpar))) + +# uncomment this for manual testing +# command <- "viash" +# args <- c("run", "src/match_modality/metrics/aupr/config.vsh.yaml", "--", args) + +cat("Creating test files\n") +ad_sol <- anndata::AnnData( + X = as(Matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix"), + layers = list( + neighbors = as(Matrix(c(1, 0.5, 0, 0.5, 1, 0.25, 0, 0.25, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix") + ), + uns = list(dataset_id = "simple", pairing_ix = c(0, 1, 2)), + obs = data.frame( + cell_type = c("a", "a", "b") + ) +) +ad_pred <- anndata::AnnData( + X = as(Matrix(c(1, .1, .2, .3, .9, .4, .5, .6, .8), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix"), + uns = list(dataset_id = "simple", method_id = "simple") +) + +ad_sol$write_h5ad(testpar$input_solution, compression = "gzip") +ad_pred$write_h5ad(testpar$input_prediction, compression = "gzip") + +cat("> Running metric\n") +out <- processx::run( + command = command, + args = args, + stderr_to_stdout = TRUE +) + +cat("> Reading metric scores\n") +assert_that(file.exists(testpar$output)) +ad_out <- anndata::read_h5ad(testpar$output) + +scores1 <- ad_out$uns$metric_values +names(scores1) <- ad_out$uns$metric_ids +# assert_that( +# scores1[["pairing_aupr"]] >= scores1[["neighbor_aupr"]], +# scores1[["neighbor_aupr"]] >= scores1[["celltype_aupr"]], +# scores1[["pairing_auroc"]] >= scores1[["neighbor_auroc"]], +# scores1[["pairing_auroc"]] >= scores1[["celltype_auroc"]] +# ) + 
+ +cat("Creating test files\n") +# pairing_ix <- c(2, 1, 3) +pairing_ix <- c(3, 1, 2) +ad_sol <- anndata::AnnData( + X = as(diag(length(pairing_ix)), "CsparseMatrix")[,pairing_ix], + layers = list( + neighbors = as(Matrix(c(1, 0.5, 0, 0.5, 1, 0.25, 0, 0.25, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix")[,pairing_ix] + ), + uns = list(dataset_id = "simple", pairing_ix = pairing_ix-1), + obs = data.frame( + cell_type = c("a", "a", "b") + ) +) +ad_pred <- anndata::AnnData( + X = as(Matrix(c(1, .1, .2, .3, .9, .4, .5, .6, .8), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix")[,pairing_ix], + uns = list(dataset_id = "simple", method_id = "simple") +) + +ad_sol$write_h5ad(testpar$input_solution, compression = "gzip") +ad_pred$write_h5ad(testpar$input_prediction, compression = "gzip") + +cat("> Running metric\n") +out <- processx::run( + command = command, + args = args, + stderr_to_stdout = TRUE +) + +cat("> Reading metric scores\n") +assert_that(file.exists(testpar$output)) +ad_out <- anndata::read_h5ad(testpar$output) + +scores2 <- ad_out$uns$metric_values +names(scores2) <- ad_out$uns$metric_ids + +assert_that(all(scores1 == scores2)) + +cat("> Test succeeded!\n") From 7b3e5ae48e739b1fcfc396262a8630814d5f7b84 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 12 Jan 2023 15:06:24 +0100 Subject: [PATCH 76/82] add check_format metric --- .../metrics/check_format/config.vsh.yaml | 35 ++++++++++ .../metrics/check_format/script.R | 67 +++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 src/match_modality/metrics/check_format/config.vsh.yaml create mode 100644 src/match_modality/metrics/check_format/script.R diff --git a/src/match_modality/metrics/check_format/config.vsh.yaml b/src/match_modality/metrics/check_format/config.vsh.yaml new file mode 100644 index 0000000000..5f7f846065 --- /dev/null +++ b/src/match_modality/metrics/check_format/config.vsh.yaml @@ -0,0 +1,35 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + 
name: check_format
+  namespace: match_modality/metrics
+  description: Checking whether the prediction of a method has the right format.
+  info:
+    metrics:
+      - metric_id: finished
+        metric_name: Finished
+        metric_description: Check if metric finished successfully
+        maximize: true
+        min: 0
+        max: 1
+      - metric_id: correct_format
+        metric_name: Correct Format
+        metric_description: Check if prediction has the right format
+        maximize: true
+        min: 0
+        max: 1
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [ anndata>=0.8, pyyaml ]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowtime, lowcpu ]
\ No newline at end of file
diff --git a/src/match_modality/metrics/check_format/script.R b/src/match_modality/metrics/check_format/script.R
new file mode 100644
index 0000000000..ade396c0bb
--- /dev/null
+++ b/src/match_modality/metrics/check_format/script.R
@@ -0,0 +1,67 @@
+cat("Load dependencies\n")
+library(assertthat, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+
+## VIASH START
+task <- "match_modality"
+par <- list(
+  input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad"),
+  input_prediction = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"),
+  output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad")
+)
+## VIASH END
+
+cat("Read prediction h5ad\n")
+ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r")
+
+cat("Checking solution h5ad\n")
+correct_format <- tryCatch({
+  # read prediction
+  ad_pred <- anndata::read_h5ad(par$input_prediction)
+
+  # check dataset id
+  
dataset_id <- ad_pred$uns[["dataset_id"]] + assert_that(dataset_id == ad_sol$uns[["dataset_id"]]) + + # check method id + method_id <- ad_pred$uns[["method_id"]] + assert_that( + is.character(method_id), + method_id != "" + ) + + # check X + assert_that( + is(ad_pred$X, "sparseMatrix"), + ad_pred$n_obs == ad_sol$n_obs, + ad_pred$n_vars == ad_sol$n_vars, + length(ad_pred$X@x) <= 1000 * ad_sol$n_obs, + all(ad_pred$X@x >= 0), + isTRUE(all.equal( + Matrix::rowSums(ad_pred$X), + rep(1, ad_pred$n_obs), + check.attributes = FALSE, + tolerance = 1e-5 + )) + ) + + 1 +}, error = function(e) { + cat("ERROR: ", e$message, "\n", sep = "") + 0 +}) + + +cat("Create output object\n") +out <- anndata::AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = c("finished", "correct_format"), + metric_values = c(1, correct_format) + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") From 594febc579a84e555756b9d7e850b56c37912105 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 12 Jan 2023 15:46:21 +0100 Subject: [PATCH 77/82] add match_probability metric --- src/match_modality/api/comp_metric.yaml | 4 +- .../metrics/match_probability/config.vsh.yaml | 30 ++++++++++ .../metrics/match_probability/script.R | 56 +++++++++++++++++++ 3 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 src/match_modality/metrics/match_probability/config.vsh.yaml create mode 100644 src/match_modality/metrics/match_probability/script.R diff --git a/src/match_modality/api/comp_metric.yaml b/src/match_modality/api/comp_metric.yaml index 12c5a070ac..ae94586882 100644 --- a/src/match_modality/api/comp_metric.yaml +++ b/src/match_modality/api/comp_metric.yaml @@ -33,9 +33,7 @@ functionality: ] print("> Running method", flush=True) - out = subprocess.run(cmd, stderr=subprocess.STDOUT, check=True) - - # print(out.stdout) + out = subprocess.run(cmd, check=True) print("> 
Checking whether output files were created", flush=True) assert path.exists(output_path) diff --git a/src/match_modality/metrics/match_probability/config.vsh.yaml b/src/match_modality/metrics/match_probability/config.vsh.yaml new file mode 100644 index 0000000000..6214b49ad9 --- /dev/null +++ b/src/match_modality/metrics/match_probability/config.vsh.yaml @@ -0,0 +1,30 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: match_probability + namespace: match_modality/metrics + description: Calculating basic metrics for task2. + info: + metrics: + - metric_id: match_probability + metric_name: Match Probability + metric_description: Calculating basic metrics for match modality + min: 0 + max: 1 + maximize: true + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8, pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowtime, lowcpu ] + diff --git a/src/match_modality/metrics/match_probability/script.R b/src/match_modality/metrics/match_probability/script.R new file mode 100644 index 0000000000..b68051ad2b --- /dev/null +++ b/src/match_modality/metrics/match_probability/script.R @@ -0,0 +1,56 @@ +cat("Load dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_solution = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad", + input_prediction = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + output = 
"resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad" +) +## VIASH END + +cat("Read solution h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r") + +cat("Read prediction h5ad\n") +ad_pred <- anndata::read_h5ad(par$input_prediction) + +cat("Unscrambling predictions\n") +pairing_ix <- ad_sol$uns[["pairing_ix"]] +X_pred <- as(ad_pred$X, "CsparseMatrix")[, order(pairing_ix)] +dimnames(X_pred) <- list(NULL, NULL) + +# set negative values to 0 +X_pred@x <- ifelse(X_pred@x < 0, 0, X_pred@x) + +cat("Calculating normalisation factors\n") +rowSum <- Matrix::rowSums(X_pred) + +cat("Computing the match modality score\n") +match_probability_vec <- diag(X_pred) / rowSum + +match_probability <- mean(match_probability_vec) + +cat("Create output object\n") +out <- anndata::AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = list("match_probability"), + metric_values = list(match_probability), + per_cell = list( + match_probability = match_probability_vec + ) + ) +) + +# should we also save the metrics object? +# this would allow for plotting the auroc and aupr curves afterwards. 
+ +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") \ No newline at end of file From 017c2031f9afb1dc27b41ca1fb364dfcb84d65b2 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 12 Jan 2023 16:04:33 +0100 Subject: [PATCH 78/82] add resources and resources_test scripts --- .../resources_scripts/mask_datasets.sh | 64 ++++++++++++++++ .../resources_scripts/run_benchmarks.sh | 74 +++++++++++++++++++ .../resources_test_scripts/bmmc_cite.sh | 57 ++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 src/match_modality/resources_scripts/mask_datasets.sh create mode 100644 src/match_modality/resources_scripts/run_benchmarks.sh create mode 100644 src/match_modality/resources_test_scripts/bmmc_cite.sh diff --git a/src/match_modality/resources_scripts/mask_datasets.sh b/src/match_modality/resources_scripts/mask_datasets.sh new file mode 100644 index 0000000000..a70cc985d5 --- /dev/null +++ b/src/match_modality/resources_scripts/mask_datasets.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +COMMON_DATASETS="resources/datasets/openproblems_v1" +OUTPUT_DIR="resources/match_modality/datasets/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! 
-f $params_file ]; then + python << HERE +import anndata as ad +import glob +import yaml + +h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad") + +# this task doesn't use normalizations +# +param_list = {} + +for h5ad_file in h5ad_files: + print(f"Checking {h5ad_file}") + adata = ad.read_h5ad(h5ad_file, backed=True) + if "counts" in adata.layers: + dataset_id = adata.uns["dataset_id"].replace("/", ".") + obj = { + 'id': dataset_id, + 'input': h5ad_file, + 'dataset_id': dataset_id, + } + param_list[dataset_id] = obj + +output = { + "param_list": list(param_list.values()), + "seed": 123, + "output_train": "\$id.train.h5ad", + "output_test": "\$id.test.h5ad" +} + +with open("$params_file", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . \ + -main-script target/nextflow/match_modality/mask_dataset/main.nf \ + -profile docker \ + -resume \ + -params-file $params_file \ + --publish_dir "$OUTPUT_DIR" + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/match_modality/resources_scripts/run_benchmarks.sh b/src/match_modality/resources_scripts/run_benchmarks.sh new file mode 100644 index 0000000000..3bf2418934 --- /dev/null +++ b/src/match_modality/resources_scripts/run_benchmarks.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources/match_modality/datasets/openproblems_v1" +OUTPUT_DIR="resources/match_modality/benchmarks/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! 
-f $params_file ]; then + python << HERE +import yaml +import os + +dataset_dir = "$DATASETS_DIR" +output_dir = "$OUTPUT_DIR" + +# read split datasets yaml +with open(dataset_dir + "/params.yaml", "r") as file: + split_list = yaml.safe_load(file) +datasets = split_list['param_list'] + +# figure out where train/test files were stored +param_list = [] + +for dataset in datasets: + id = dataset["id"] + input_train = dataset_dir + "/" + id + ".train.h5ad" + input_test = dataset_dir + "/" + id + ".test.h5ad" + + if os.path.exists(input_test): + obj = { + 'id': id, + # NOTE(review): two duplicate 'id' keys removed (redundant; Python + # keeps only the last). Comments preserve the hunk's line count. + 'dataset_id': dataset["dataset_id"], + 'input_train': input_train, + 'input_test': input_test + } + param_list.append(obj) + +# write as output file +output = { + "param_list": param_list, +} + +with open(output_dir + "/params.yaml", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . \ + -main-script src/match_modality/workflows/run/main.nf \ + -profile docker \ + -params-file "$params_file" \ + --publish_dir "$OUTPUT_DIR" \ + -with-tower + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/match_modality/resources_test_scripts/bmmc_cite.sh b/src/match_modality/resources_test_scripts/bmmc_cite.sh new file mode 100644 index 0000000000..5a46d9b08f --- /dev/null +++ b/src/match_modality/resources_test_scripts/bmmc_cite.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +#make sure the following command has been executed +#viash ns build -q 'match_modality|common' --parallel --setup cb + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad 
+MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad +DATASET_DIR=resources_test/match_modality/bmmc_cite + +if [ ! -f $MOD_1_DATA ]; then + echo "Error! Could not find raw data" + exit 1 +fi + +mkdir -p $DATASET_DIR + +# maskdataset +viash run src/match_modality/mask_dataset/config.vsh.yaml -- \ + --input_mod1 $MOD_1_DATA \ + --input_mod2 $MOD_2_DATA \ + --output_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --output_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output_solution $DATASET_DIR/cite_solution.h5ad + +# run one method +viash run src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml -- \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output $DATASET_DIR/dr_knnr_cbf.h5ad + +# run one metric +viash run src/match_modality/metrics/aupr/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/dr_knnr_cbf.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output $DATASET_DIR/aupr.h5ad + +# run benchmark +export NXF_VER=22.04.5 + +nextflow \ + run . 
\ + -main-script src/match_modality/workflows/run/main.nf \ + -profile docker \ + -resume \ + --id bmmc_cite \ + --dataset_id bmmc_site \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output scores.tsv \ + --publish_dir $DATASET_DIR/ \ No newline at end of file From 186125e1ace0699c2479e45de2200733c9a665b0 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Thu, 12 Jan 2023 21:23:51 +0100 Subject: [PATCH 79/82] add NF workflow --- src/match_modality/api/comp_mask_dataset.yaml | 4 + .../methods/newwave_knnr_knn/config.vsh.yaml | 2 +- .../resources_test_scripts/bmmc_cite.sh | 28 +-- .../workflows/run/config.vsh.yaml | 32 ++++ src/match_modality/workflows/run/main.nf | 168 ++++++++++++++++++ .../workflows/run/nextflow.config | 14 ++ 6 files changed, 237 insertions(+), 11 deletions(-) create mode 100644 src/match_modality/workflows/run/config.vsh.yaml create mode 100644 src/match_modality/workflows/run/main.nf create mode 100644 src/match_modality/workflows/run/nextflow.config diff --git a/src/match_modality/api/comp_mask_dataset.yaml b/src/match_modality/api/comp_mask_dataset.yaml index 0737a499a0..3876ef397e 100644 --- a/src/match_modality/api/comp_mask_dataset.yaml +++ b/src/match_modality/api/comp_mask_dataset.yaml @@ -5,12 +5,16 @@ functionality: - name: "--input_mod2" __merge__: anndata_dataset.yaml - name: "--output_train_mod1" + direction: "output" __merge__: anndata_masked_mod.yaml - name: "--output_train_mod2" + direction: "output" __merge__: anndata_masked_mod.yaml - name: "--output_train_sol" + direction: "output" __merge__: anndata_solution.yaml - name: "--output_test_mod1" + direction: "output" __merge__: anndata_masked_mod.yaml - name: "--output_test_mod2" direction: "output" diff --git a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml index f02a5ed909..a95e913494 100644 --- 
a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml +++ b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml @@ -1,6 +1,6 @@ __merge__: ../../api/comp_method.yaml functionality: - name: baseline_newwave_knnr_knn + name: newwave_knnr_knn namespace: match_modality/methods description: Perform DR with NewWave, predict embedding with KNN regression, and match cells with a KNN. info: diff --git a/src/match_modality/resources_test_scripts/bmmc_cite.sh b/src/match_modality/resources_test_scripts/bmmc_cite.sh index 5a46d9b08f..d3debaa836 100644 --- a/src/match_modality/resources_test_scripts/bmmc_cite.sh +++ b/src/match_modality/resources_test_scripts/bmmc_cite.sh @@ -24,20 +24,26 @@ mkdir -p $DATASET_DIR viash run src/match_modality/mask_dataset/config.vsh.yaml -- \ --input_mod1 $MOD_1_DATA \ --input_mod2 $MOD_2_DATA \ - --output_mod1 $DATASET_DIR/cite_mod1.h5ad \ - --output_mod2 $DATASET_DIR/cite_mod2.h5ad \ - --output_solution $DATASET_DIR/cite_solution.h5ad + --output_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \ + --output_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \ + --output_train_sol $DATASET_DIR/cite_train_sol.h5ad \ + --output_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \ + --output_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \ + --output_test_sol $DATASET_DIR/cite_test_sol.h5ad # run one method viash run src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml -- \ - --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ - --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --input_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \ + --input_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \ + --input_train_sol $DATASET_DIR/cite_train_sol.h5ad \ + --input_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \ + --input_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \ --output $DATASET_DIR/dr_knnr_cbf.h5ad # run one metric viash run src/match_modality/metrics/aupr/config.vsh.yaml -- \ --input_prediction $DATASET_DIR/dr_knnr_cbf.h5ad \ - --input_solution $DATASET_DIR/cite_solution.h5ad 
\ + --input_solution $DATASET_DIR/cite_test_sol.h5ad \ --output $DATASET_DIR/aupr.h5ad # run benchmark @@ -47,11 +53,13 @@ nextflow \ run . \ -main-script src/match_modality/workflows/run/main.nf \ -profile docker \ - -resume \ --id bmmc_cite \ --dataset_id bmmc_site \ - --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ - --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ - --input_solution $DATASET_DIR/cite_solution.h5ad \ + --input_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \ + --input_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \ + --input_train_sol $DATASET_DIR/cite_train_sol.h5ad \ + --input_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \ + --input_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \ + --input_solution $DATASET_DIR/cite_test_sol.h5ad \ --output scores.tsv \ --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/match_modality/workflows/run/config.vsh.yaml b/src/match_modality/workflows/run/config.vsh.yaml new file mode 100644 index 0000000000..58db4d0b1e --- /dev/null +++ b/src/match_modality/workflows/run/config.vsh.yaml @@ -0,0 +1,32 @@ +functionality: + name: "run_benchmark" + namespace: "match_modality/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: "string" + description: "The ID of the dataset" + required: true + - name: "--input_train_mod1" + type: "file" # todo: replace with includes + - name: "--input_train_mod2" + type: "file" + - name: "--input_train_sol" + type: "file" # todo: replace with includes + - name: "--input_test_mod1" + type: "file"# todo: replace with includes + - name: "--input_test_mod2" + type: "file" + - name: "--input_solution" + type: "file" # todo: replace with includes + - name: Outputs + arguments: + - name: "--output" + direction: "output" + type: file + resources: + - type: nextflow_script + path: main.nf +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/match_modality/workflows/run/main.nf b/src/match_modality/workflows/run/main.nf new file mode 100644 
index 0000000000..fe48e3f63f --- /dev/null +++ b/src/match_modality/workflows/run/main.nf @@ -0,0 +1,168 @@ +nextflow.enable.dsl=2 + +sourceDir = params.rootDir + "/src" +targetDir = params.rootDir + "/target/nextflow" + +// import control methods +include { constant } from "$targetDir/match_modality/control_methods/constant/main.nf" +include { random_pairing } from "$targetDir/match_modality/control_methods/random_pairing/main.nf" +include { semi_solution } from "$targetDir/match_modality/control_methods/semi_solution/main.nf" +include { solution } from "$targetDir/match_modality/control_methods/solution/main.nf" + + +// import methods +// include { babel_knn } from "$targetDir/match_modality/methods/babel_knn/main.nf" +include { dr_knnr_cbf } from "$targetDir/match_modality/methods/dr_knnr_cbf/main.nf" +include { dr_knnr_knn } from "$targetDir/match_modality/methods/dr_knnr_knn/main.nf" +include { linear_knn } from "$targetDir/match_modality/methods/linear_knn/main.nf" +include { newwave_knnr_cbf } from "$targetDir/match_modality/methods/newwave_knnr_cbf/main.nf" +include { newwave_knnr_knn } from "$targetDir/match_modality/methods/newwave_knnr_knn/main.nf" +include { procrustes_knn } from "$targetDir/match_modality/methods/procrustes_knn/main.nf" + + +// import metrics +include { aupr } from "$targetDir/match_modality/metrics/aupr/main.nf" +include { check_format } from "$targetDir/match_modality/metrics/check_format/main.nf" +include { match_probability } from "$targetDir/match_modality/metrics/match_probability/main.nf" + +// tsv generation component +include { extract_scores } from "$targetDir/common/extract_scores/main.nf" + +// import helper functions +include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf" +include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + "/wf_utils/DataflowHelper.nf" + +config = readConfig("$projectDir/config.vsh.yaml") + +// construct a map of methods 
(id -> method_module) +methods = [ dr_knnr_cbf, dr_knnr_knn, linear_knn, newwave_knnr_cbf, newwave_knnr_knn, procrustes_knn] + .collectEntries{method -> + [method.config.functionality.name, method] + } + +workflow { + helpMessage(config) + + viashChannel(params, config) + | run_wf +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // split params for downstream components + | setWorkflowArguments( + method: ["input_train_mod1", "input_train_mod2", "input_train_sol", "input_test_mod1", "input_test_mod2"], + metric: ["input_solution"], + output: ["output"] + ) + + // multiply events by the number of method + | add_methods + + // add input_solution to data for the positive controls + | controls_can_cheat + + // run methods + | getWorkflowArguments(key: "method") + | run_methods + + // construct tuples for metrics + | pmap{ id, file, passthrough -> + // derive unique ids from output filenames + def newId = file.getName().replaceAll(".output.*", "") + // combine prediction with solution + def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ] + [ newId, newData, passthrough ] + } + + // run metrics + | getWorkflowArguments(key: "metric") + | run_metrics + + // convert to tsv + | aggregate_results + + emit: + output_ch +} + +workflow add_methods { + take: input_ch + main: + output_ch = Channel.fromList(methods.keySet()) + | combine(input_ch) + + // generate combined id for method_id and dataset_id + | pmap{method_id, dataset_id, data -> + def new_id = dataset_id + "." 
+ method_id + def new_data = data.clone() + [method_id: method_id] + new_data.remove("id") + [new_id, new_data] + } + emit: output_ch +} + +workflow controls_can_cheat { + take: input_ch + main: + output_ch = input_ch + | pmap{id, data, passthrough -> + def method = methods[data.method_id] + def method_type = method.config.functionality.info.method_type + def new_data = data.clone() + if (method_type != "method") { + new_data = new_data + [input_test_sol: passthrough.metric.input_solution] + } + [id, new_data, passthrough] + } + emit: output_ch +} + +workflow run_methods { + take: input_ch + main: + // generate one channel per method + method_chs = methods.collect { method_id, method_module -> + input_ch + | filter{it[1].method_id == method_id} + | method_module + } + // mix all results + output_ch = method_chs[0].mix(*method_chs.drop(1)) + + emit: output_ch +} + +workflow run_metrics { + take: input_ch + main: + + output_ch = input_ch + | (aupr & check_format & match_probability) + | mix + + emit: output_ch +} + +workflow aggregate_results { + take: input_ch + main: + + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ it -> + [ "combined", it.collect{ it[1] } ] + it[0].drop(2) + } + | getWorkflowArguments(key: "output") + | extract_scores.run( + auto: [ publish: true ] + ) + + emit: output_ch +} \ No newline at end of file diff --git a/src/match_modality/workflows/run/nextflow.config b/src/match_modality/workflows/run/nextflow.config new file mode 100644 index 0000000000..6e33495d3a --- /dev/null +++ b/src/match_modality/workflows/run/nextflow.config @@ -0,0 +1,14 @@ +manifest { + name = 'match_modality/workflows/run' + mainScript = 'main.nf' + nextflowVersion = '!>=22.04.5' + description = 'Multi modality - match modality' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings 
+includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config") +includeConfig("${params.rootDir}/src/wf_utils/labels.config") \ No newline at end of file From 4eaeff9f9eb46afb71817b81056fccf941c267cf Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 13 Jan 2023 11:42:21 +0100 Subject: [PATCH 80/82] fix directives --- src/match_modality/control_methods/constant/config.vsh.yaml | 3 ++- .../control_methods/random_pairing/config.vsh.yaml | 3 ++- .../control_methods/semi_solution/config.vsh.yaml | 3 ++- src/match_modality/control_methods/solution/config.vsh.yaml | 3 ++- src/match_modality/methods/babel_knn/config.vsh.yaml | 3 ++- src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml | 3 ++- src/match_modality/methods/dr_knnr_knn/config.vsh.yaml | 3 ++- src/match_modality/methods/linear_knn/config.vsh.yaml | 3 ++- src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml | 3 ++- src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml | 3 ++- src/match_modality/methods/procrustes_knn/config.vsh.yaml | 3 ++- src/match_modality/metrics/aupr/config.vsh.yaml | 2 +- src/match_modality/metrics/check_format/config.vsh.yaml | 2 +- src/match_modality/metrics/match_probability/config.vsh.yaml | 2 +- 14 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/match_modality/control_methods/constant/config.vsh.yaml b/src/match_modality/control_methods/constant/config.vsh.yaml index a674e4ecf0..c20acf234f 100644 --- a/src/match_modality/control_methods/constant/config.vsh.yaml +++ b/src/match_modality/control_methods/constant/config.vsh.yaml @@ -20,4 +20,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/random_pairing/config.vsh.yaml b/src/match_modality/control_methods/random_pairing/config.vsh.yaml index 3218f32279..fd9da8d595 100644 --- a/src/match_modality/control_methods/random_pairing/config.vsh.yaml 
+++ b/src/match_modality/control_methods/random_pairing/config.vsh.yaml @@ -16,4 +16,5 @@ platforms: - type: python pip: [ anndata>=0.8, numpy, scikit-learn ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] \ No newline at end of file + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/match_modality/control_methods/semi_solution/config.vsh.yaml b/src/match_modality/control_methods/semi_solution/config.vsh.yaml index da39c07aab..756716feb6 100644 --- a/src/match_modality/control_methods/semi_solution/config.vsh.yaml +++ b/src/match_modality/control_methods/semi_solution/config.vsh.yaml @@ -20,4 +20,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/solution/config.vsh.yaml b/src/match_modality/control_methods/solution/config.vsh.yaml index ca9d8d5904..ae4d48430e 100644 --- a/src/match_modality/control_methods/solution/config.vsh.yaml +++ b/src/match_modality/control_methods/solution/config.vsh.yaml @@ -20,4 +20,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/babel_knn/config.vsh.yaml b/src/match_modality/methods/babel_knn/config.vsh.yaml index 55c4498547..4a7cdc2e31 100644 --- a/src/match_modality/methods/babel_knn/config.vsh.yaml +++ b/src/match_modality/methods/babel_knn/config.vsh.yaml @@ -37,4 +37,5 @@ platforms: - cd / && git clone --depth 1 https://github.com/rcannood/babel.git - cd /babel && /opt/conda/bin/conda env create -f environment.yml - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml index 86e00c8c9a..37321fcad1 100644 --- 
a/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml +++ b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml @@ -25,4 +25,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ midmem, midtime, midcpu ] + directives: + label: [ midmem, midcpu ] diff --git a/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml index e03bf3be48..fd9387d6d4 100644 --- a/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml +++ b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml @@ -20,4 +20,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/linear_knn/config.vsh.yaml b/src/match_modality/methods/linear_knn/config.vsh.yaml index ab26297bd7..20fa30783b 100644 --- a/src/match_modality/methods/linear_knn/config.vsh.yaml +++ b/src/match_modality/methods/linear_knn/config.vsh.yaml @@ -25,4 +25,5 @@ platforms: - type: python pip: [ anndata, scipy, scikit-learn ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml index 9fbf3a8887..6fcc058b2a 100644 --- a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml +++ b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml @@ -38,4 +38,5 @@ platforms: - type: python pip: [ anndata>=0.8 ] - type: nextflow - directives: [ highmem, midtime, highcpu ] + directives: + label: [ highmem, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml index a95e913494..4293005a23 100644 --- a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml +++ b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml @@ -33,4 +33,5 @@ 
platforms: - type: python pip: [ anndata>=0.8, tensorflow ] - type: nextflow - directives: [ highmem, midtime, highcpu ] + directives: + label: [ highmem, highcpu ] diff --git a/src/match_modality/methods/procrustes_knn/config.vsh.yaml b/src/match_modality/methods/procrustes_knn/config.vsh.yaml index 1c317ca3d9..65d0327291 100644 --- a/src/match_modality/methods/procrustes_knn/config.vsh.yaml +++ b/src/match_modality/methods/procrustes_knn/config.vsh.yaml @@ -21,4 +21,5 @@ platforms: - type: python pip: [ anndata>=0.8, scipy, scikit-learn ] - type: nextflow - directives: [ lowmem, lowtime, lowcpu ] + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/metrics/aupr/config.vsh.yaml b/src/match_modality/metrics/aupr/config.vsh.yaml index 3b655555d1..205efb353e 100644 --- a/src/match_modality/metrics/aupr/config.vsh.yaml +++ b/src/match_modality/metrics/aupr/config.vsh.yaml @@ -36,5 +36,5 @@ platforms: pip: [ anndata>=0.8, pyyaml ] - type: nextflow directives: - label: [ lowmem, lowtime, lowcpu ] + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/metrics/check_format/config.vsh.yaml b/src/match_modality/metrics/check_format/config.vsh.yaml index 5f7f846065..9519865682 100644 --- a/src/match_modality/metrics/check_format/config.vsh.yaml +++ b/src/match_modality/metrics/check_format/config.vsh.yaml @@ -32,4 +32,4 @@ platforms: pip: [ anndata>=0.8, pyyaml ] - type: nextflow directives: - label: [ lowmem, lowtime, lowcpu ] \ No newline at end of file + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/match_modality/metrics/match_probability/config.vsh.yaml b/src/match_modality/metrics/match_probability/config.vsh.yaml index 6214b49ad9..c9c274f954 100644 --- a/src/match_modality/metrics/match_probability/config.vsh.yaml +++ b/src/match_modality/metrics/match_probability/config.vsh.yaml @@ -26,5 +26,5 @@ platforms: pip: [ anndata>=0.8, pyyaml ] - type: nextflow directives: - label: [ lowmem, lowtime, lowcpu ] + label: [ 
lowmem, lowcpu ] From da7a2eb2ae4ecc7089bfe2cc0e90cc2cf1701ce5 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Fri, 13 Jan 2023 15:45:25 +0100 Subject: [PATCH 81/82] fix configs --- src/match_modality/control_methods/solution/config.vsh.yaml | 2 +- src/match_modality/metrics/check_format/config.vsh.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/match_modality/control_methods/solution/config.vsh.yaml b/src/match_modality/control_methods/solution/config.vsh.yaml index ae4d48430e..b352fdaedf 100644 --- a/src/match_modality/control_methods/solution/config.vsh.yaml +++ b/src/match_modality/control_methods/solution/config.vsh.yaml @@ -21,4 +21,4 @@ platforms: pip: [ anndata>=0.8 ] - type: nextflow directives: - label: [ lowmem, lowcpu ] + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/metrics/check_format/config.vsh.yaml b/src/match_modality/metrics/check_format/config.vsh.yaml index 9519865682..94e833b471 100644 --- a/src/match_modality/metrics/check_format/config.vsh.yaml +++ b/src/match_modality/metrics/check_format/config.vsh.yaml @@ -25,7 +25,7 @@ platforms: image: eddelbuettel/r2u:22.04 setup: - type: r - cran: [ anndata ] + cran: [ anndata, bit64] - type: apt packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] - type: python From 215ecd6cd8c9caf9d5879cfae5d11d32b3be81f7 Mon Sep 17 00:00:00 2001 From: KaiWaldrant Date: Tue, 24 Jan 2023 11:43:39 +0100 Subject: [PATCH 82/82] add readme and task info --- src/match_modality/README.md | 23 +++++++++++++++++++++++ src/match_modality/api/task_info.yaml | 13 +++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 src/match_modality/README.md create mode 100644 src/match_modality/api/task_info.yaml diff --git a/src/match_modality/README.md b/src/match_modality/README.md new file mode 100644 index 0000000000..3f1ff48cc1 --- /dev/null +++ b/src/match_modality/README.md @@ -0,0 +1,23 @@ +# Match modality + +Structure of this task: + + 
src/match_modality + ├── api Interface specifications for components and datasets in this task + ├── control_methods Baseline (random/ground truth) methods to compare methods against + ├── methods Methods to be benchmarked + ├── metrics Metrics used to quantify performance of methods + ├── README.md This file + ├── resources_scripts Scripts to process the datasets + ├── resources_test_scripts Scripts to process the test resources + ├── split_dataset Component to prepare common datasets + └── workflows Pipelines to run the full benchmark + +Relevant links: + +* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/) + +* [Experimental results](https://openproblems-experimental.netlify.app/results/match_modality/) + + +* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md) diff --git a/src/match_modality/api/task_info.yaml b/src/match_modality/api/task_info.yaml new file mode 100644 index 0000000000..50d32b98f9 --- /dev/null +++ b/src/match_modality/api/task_info.yaml @@ -0,0 +1,13 @@ +task_id: match_modality +task_name: Match modality +summary: "Predicting which profiles from one modality resembles a profile from another." +description: | + "While joint profiling of two modalities in the same single cell is now possible, + most single-cell datasets that exist measure only a single modality. These modalities + complement each other in their description of cellular state. Yet, it is challenging + to analyse uni-modal datasets together when they do not share observations (cells) or + a common feature space (genes, proteins, or open chromatin peaks). If we could map + observations to one another across modalities, it would be possible to treat separately + profiled datasets in the same manner as new multi-modal sequencing data. Mapping these + modalities to one another opens up the vast amount of uni-modal single-cell datasets + generated in the past years to multi-modal data analysis methods." 
\ No newline at end of file