diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa367f2377..1befb3b2e1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -179,3 +179,67 @@
 * `metrics/rmse` should be removed because RMSE metrics don't really make sense here.
 * `metrics/trustworthiness` should be removed because it is already included in `metrics/coranking`.
+
+
+## Multi-modality - Joint Embedding
+
+### New functionality
+
+* `api/anndata_*`: Created file format specifications for the h5ad files used throughout the pipeline.
+
+* `api/comp_*`: Created API definitions for the mask, method and metric components.
+
+* `mask_dataset`: Added a component for masking raw datasets into task-ready dataset objects.
+
+* Added `resources_test/joint_embedding/pancreas`, generated with `src/joint_embedding/resources_test_scripts/pancreas.sh`.
+
+### NeurIPS 2021 migration
+
+* `control_methods/random_embed`: Migrated from NeurIPS 2021. Extracted from baseline method `dummy_random`.
+
+* `control_methods/zeros_embed`: Migrated from NeurIPS 2021. Extracted from baseline method `dummy_zeros`.
+
+* `methods/lmds`: Migrated from NeurIPS 2021.
+
+* `methods/mnn`: Migrated and adapted from NeurIPS 2021.
+
+* `methods/newwave`: Migrated and adapted from NeurIPS 2021.
+
+* `methods/pca`: Migrated from NeurIPS 2021.
+
+* `methods/totalvi`: Migrated from NeurIPS 2021.
+
+* `methods/umap`: Migrated from NeurIPS 2021.
+
+* `metrics/ari`: Migrated from NeurIPS 2021.
+
+* `metrics/asw_batch`: Migrated from NeurIPS 2021.
+
+* `metrics/asw_label`: Migrated from NeurIPS 2021.
+
+* `metrics/cc_cons`: Migrated from NeurIPS 2021.
+
+* `metrics/check_format`: Migrated from NeurIPS 2021.
+
+* `metrics/graph_connectivity`: Migrated from NeurIPS 2021.
+
+* `metrics/latent_mixing`: Migrated from NeurIPS 2021.
+
+* `metrics/nmi`: Migrated from NeurIPS 2021.
+
+* `metrics/rfoob`: Migrated from NeurIPS 2021.
+
+* `metrics/ti_cons`: Migrated from NeurIPS 2021.
+
+* `metrics/ti_cons_batch`: Migrated from NeurIPS 2021.
+
+### Changes from NeurIPS 2021
+
+* Updated the Docker config used by the R scripts, which was using an old `anndata` package that produced warnings.
+
+* Methods now store their output in `.obsm["X_emb"]` of the `anndata` object instead of `.X`.
+
+* `X_emb` data is stored as a sparse matrix.
+
+
+* Updated configs to the latest `viash` version.
\ No newline at end of file
diff --git a/src/joint_embedding/README.md b/src/joint_embedding/README.md
new file mode 100644
index 0000000000..7d9f65431c
--- /dev/null
+++ b/src/joint_embedding/README.md
@@ -0,0 +1,23 @@
+# Joint embedding
+
+Structure of this task:
+
+    src/joint_embedding
+    ├── api                     Interface specifications for components and datasets in this task
+    ├── control_methods         Baseline (random/ground truth) methods to compare methods against
+    ├── mask_dataset            Component to mask common datasets for this task
+    ├── methods                 Methods to be benchmarked
+    ├── metrics                 Metrics used to quantify performance of methods
+    ├── README.md               This file
+    ├── resources_scripts       Scripts to process the datasets
+    ├── resources_test_scripts  Scripts to process the test resources
+    └── workflows               Pipelines to run the full benchmark
+
+Relevant links:
+
+* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/)
+
+* [Experimental results](https://openproblems-experimental.netlify.app/results/joint_embedding/)
+
+
+* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md)
diff --git a/src/joint_embedding/api/anndata_dataset.yaml b/src/joint_embedding/api/anndata_dataset.yaml
new file mode 100644
index 0000000000..23c6b427be
--- /dev/null
+++ b/src/joint_embedding/api/anndata_dataset.yaml
@@ -0,0 +1,75 @@
+type: file
+description: "A raw dataset"
+example: "dataset.h5ad"
+info:
+  label: "Dataset"
+  slots:
+    layers:
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: true
+      - type: double
+        name: size_factors
+        description: The size factors created by the normalisation method, if any.
+ required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: double + name: G2M_score + description: + required: false + - type: boolean + name: is_train + description: if sample is train data + required: true + var: + - type: string + name: gene_ids + description: + required: false + - type: string + name: feature_types + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: "data from which organism " + required: false + - type: string + name: gene_activity_var_names + description: + required: false + - type: string + name: sample_pm_varnames + description: + required: false + diff --git a/src/joint_embedding/api/anndata_masked_mod1.yaml b/src/joint_embedding/api/anndata_masked_mod1.yaml new file mode 100644 index 0000000000..7ca6820671 --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod1.yaml @@ -0,0 +1,37 @@ +type: file +description: "The masked data" +example: "masked.h5ad" +info: + short_description: "masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/anndata_masked_mod2.yaml b/src/joint_embedding/api/anndata_masked_mod2.yaml new file mode 100644 index 0000000000..ad735fffba --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod2.yaml @@ -0,0 +1,39 @@ +type: file +description: "The masked data for mod2 file" +example: "masked.h5ad" +info: + short_description: "Masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + obsm: + - type: double + name: gene_activity + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/anndata_prediction.yaml b/src/joint_embedding/api/anndata_prediction.yaml new file mode 100644 index 0000000000..49d8ae7d79 --- /dev/null +++ b/src/joint_embedding/api/anndata_prediction.yaml @@ -0,0 +1,25 @@ +type: file +description: "The prediction file" +example: "prediction.h5ad" +info: + short_description: "Prediction" + slots: + obs: + - type: string + name: batch + description: Batch information + required: true + obsm: + - type: double + name: X_emb + description: + required: true + uns: + - type: string + name: 
dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/joint_embedding/api/anndata_score.yaml b/src/joint_embedding/api/anndata_score.yaml new file mode 100644 index 0000000000..bfe79f07cc --- /dev/null +++ b/src/joint_embedding/api/anndata_score.yaml @@ -0,0 +1,25 @@ +type: file +description: "Metric score file" +example: "output.h5ad" +info: + short_description: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/joint_embedding/api/anndata_solution.yaml b/src/joint_embedding/api/anndata_solution.yaml new file mode 100644 index 0000000000..2ed03e1cc3 --- /dev/null +++ b/src/joint_embedding/api/anndata_solution.yaml @@ -0,0 +1,57 @@ +type: file +description: "The solution for the data" +example: "solution.h5ad" +info: + short_description: "Solution" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: double + name: G2M_score + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml new file mode 100644 index 0000000000..f2e96c35fd --- /dev/null +++ b/src/joint_embedding/api/authors.yaml @@ -0,0 +1,24 @@ +functionality: + authors: + - name: Robrecht Cannoodt + roles: [ author ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + - name: Kai Waldrant + roles: [ contributor ] + props: { github: KaiWaldrant } + - name: Alex Tong + email: alexandertongdev@gmail.com + roles: [ author, maintainer ] + props: { github: atong01 } + - name: Christopher Lance + email: clance.connect@gmail.com + roles: [ author, maintainer ] + props: { github: xlancelottx } + - name: Michaela Mueller + email: mumichae@in.tum.de + roles: [ author, maintainer ] + props: { github: mumichae, orcid: "0000-0002-1401-1785" } + - name: Ann Chen + email: ann.chen@czbiohub.org + roles: [ author, maintainer ] + props: { github: atchen} \ No newline at end of file diff --git a/src/joint_embedding/api/comp_control_method.yaml b/src/joint_embedding/api/comp_control_method.yaml new file mode 100644 index 0000000000..d1ec13ed17 --- /dev/null +++ 
b/src/joint_embedding/api/comp_control_method.yaml @@ -0,0 +1,46 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + from scipy import sparse + + input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) + + print("> Checking whether output files were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert sparse.issparse(output.obsm['X_emb']) + + print("> Test succeeded!") diff --git a/src/joint_embedding/api/comp_mask_dataset.yaml b/src/joint_embedding/api/comp_mask_dataset.yaml new file mode 100644 index 0000000000..0b97e89fca --- /dev/null +++ b/src/joint_embedding/api/comp_mask_dataset.yaml @@ -0,0 +1,15 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_dataset.yaml + - name: "--input_mod2" + __merge__: anndata_dataset.yaml + - name: "--output_mod1" + __merge__: anndata_masked_mod1.yaml + direction: output + - name: "--output_mod2" + __merge__: anndata_masked_mod2.yaml + direction: output + - name: "--output_solution" + __merge__: anndata_solution.yaml + direction: output \ No newline at end of file diff --git a/src/joint_embedding/api/comp_method.yaml b/src/joint_embedding/api/comp_method.yaml new file mode 100644 index 0000000000..415f42ea3d --- /dev/null +++ b/src/joint_embedding/api/comp_method.yaml @@ -0,0 +1,50 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--input_mod2" + __merge__: anndata_masked_mod2.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + from scipy import sparse + + input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + input_mod2_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod2.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--input_mod2", input_mod2_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) + + print("> Checking whether output files were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + 
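+      # These asserts encode the prediction contract shared by all methods:
+      # same cells as the input, between 1 and 100 embedding dimensions, and
+      # the embedding stored as a sparse matrix in obsm['X_emb'].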
assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert sparse.issparse(output.obsm['X_emb']) + + print("> Test succeeded!") \ No newline at end of file diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml new file mode 100644 index 0000000000..bde43a316f --- /dev/null +++ b/src/joint_embedding/api/comp_metric.yaml @@ -0,0 +1,97 @@ +functionality: + arguments: + - name: --input_prediction + __merge__: anndata_prediction.yaml + - name: --input_solution + __merge__: anndata_solution.yaml + - name: --output + __merge__: anndata_score.yaml + direction: output + - name: --debug + type: boolean_true + description: Verbose output for debugging. + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + import yaml + + ## VIASH START + # This code block will be replaced by viash at runtime. + par = { + "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + "output": "output.h5ad" + } + meta = { 'functionality_name': 'foo' } + + ## VIASH END + + input_prediction_path = "resources_test/common/joint_embedding/cite_random_prediction.h5ad" + input_solution_path = "resources_test/common/joint_embedding/cite_solution.h5ad" + output_path = "output.h5ad" + # define some filenames + with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print("> Running method", flush=True) + out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout + + print("> Checking whether output files were created", flush=True) + assert path.exists(output_path) + + print("> Reading h5ad files", flush=True) + input_prediction = ad.read_h5ad(input_prediction_path) + input_solution = ad.read_h5ad(input_solution_path) + output = ad.read_h5ad(output_path) + + # Create DF from metric config info + metric_info = config['functionality']['info']['metrics'] + metric_meta = pd.DataFrame(metric_info) + metric_meta = metric_meta.astype({'id': str, 'label': str, 'description': str, 'min': float, 'max': float, 'maximize': bool}) + print("> Checking contents of metric info", flush=True) + assert 'id' in metric_meta + assert 'min' in metric_meta + assert 'max' in metric_meta + assert 'maximize' in metric_meta + + print("> Checking .uns['dataset_id']", flush=True) + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']", flush=True) + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']", flush=True) + assert 'metric_ids' in output.uns + assert set(output.uns['metric_ids']) == set(metric_meta.id) + + print("> Checking .uns['metric_values']", flush=True) + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if 
metric_value lies within the expected range
+        output_uns = pd.DataFrame({
+          'id': output.uns['metric_ids'],
+          'value': output.uns['metric_values']
+        })
+
+        scores = metric_meta.merge(output_uns, on="id")
+
+        assert all(scores.value >= scores['min'])
+        assert all(scores.value <= scores['max'])
+
+        print("> Test succeeded!", flush=True)
diff --git a/src/joint_embedding/api/task_info.yaml b/src/joint_embedding/api/task_info.yaml
new file mode 100644
index 0000000000..79267651ce
--- /dev/null
+++ b/src/joint_embedding/api/task_info.yaml
@@ -0,0 +1,14 @@
+task_id: joint_embedding
+task_name: Joint Embedding
+summary: Learning an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation).
+description: |
+  The functioning of organs, tissues, and whole organisms is determined by the interplay of cells.
+  Cells are characterised into broad types, which in turn can take on different states. Here, a cell
+  state is made up of the sum of all processes that are occurring within the cell. We can gain insight
+  into the state of a cell by different types of measurements: e.g., RNA expression, protein abundance,
+  or chromatin conformation. Combining this information to describe cellular heterogeneity requires the
+  formation of joint embeddings generated from this multimodal data. These embeddings must account for
+  and remove possible batch effects between different measurement batches. The reward for methods that
+  can achieve this is great: a highly resolved description of the underlying biological state of a cell
+  that determines its function, how it interacts with other cells, and thus the cell's role in the
+  functioning of the whole tissue.
\ No newline at end of file
diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
new file mode 100644
index 0000000000..15b0c745e9
--- /dev/null
+++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
@@ -0,0 +1,25 @@
+__merge__: ../../api/comp_control_method.yaml
+functionality:
+  name: random_embed
+  namespace: joint_embedding/control_methods
+  description: Generate a random embedding from a normal distribution.
+  info:
+    type: negative_control
+    method_name: Normal Dist.
+  arguments:
+    - name: "--n_dims"
+      type: "integer"
+      default: 100
+      description: Number of dimensions to output.
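+  # The __merge__ key at the top pulls in the shared component api
+  # (the --input_mod1/--output arguments and the generic test) from
+  # comp_control_method.yaml, so only the component-specific --n_dims
+  # argument is declared here.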
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, numpy , scipy] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/control_methods/random_embed/script.py b/src/joint_embedding/control_methods/random_embed/script.py new file mode 100644 index 0000000000..199ad08176 --- /dev/null +++ b/src/joint_embedding/control_methods/random_embed/script.py @@ -0,0 +1,26 @@ +import anndata +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "output": "output/output_prediction.h5ad", + "n_dims": 100, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.random.randn(adata_mod1.shape[0], par["n_dims"]) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obsm= {"X_emb": sparse.csr_matrix(X)}, + obs=adata_mod1.obs[["batch"]], + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "random_embed"}, +) +del adata_out.X + +adata_out.write_h5ad(par["output"], compression="gzip") diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml new file mode 100644 index 0000000000..2cd0a9b71f --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: zeros_embed + namespace: joint_embedding/control_methods + description: Generate an embedding containing only zero values. + info: + type: negative_control + method_name: zeros_embed + arguments: + - name: "--n_dims" + type: "integer" + default: 1 + description: Number of dimensions to output. 
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata, numpy, scipy ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/control_methods/zeros_embed/script.py b/src/joint_embedding/control_methods/zeros_embed/script.py new file mode 100644 index 0000000000..f54ef3ce18 --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/script.py @@ -0,0 +1,27 @@ +import anndata +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "output": "tmp/output_prediction.h5ad", + "n_dims": 1, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.zeros((adata_mod1.shape[0], par["n_dims"])) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obs=adata_mod1.obs, + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "zeros_embed"}, + obsm={"X_emb": sparse.csr_matrix(X) } +) + +del adata_out.X + +adata_out.write_h5ad(par["output"], compression="gzip") diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml new file mode 100644 index 0000000000..872df1d9fa --- /dev/null +++ b/src/joint_embedding/mask_dataset/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../api/comp_mask_dataset.yaml +functionality: + name: mask_dataset + namespace: joint_embedding + description: | + A component for censoring joint embedding datasets to be given + to competition participants for the 'joint embedding' task. + arguments: + - name: "--train_only" + type: "boolean_true" + description: Whether or not to only omit the train cells. + resources: + - type: r_script + path: script.R + test_resources: + - type: r_script + path: test.R + - path: ../../../resources_test +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse , testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ midmem, lowcpu ] diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R new file mode 100644 index 0000000000..d417775314 --- /dev/null +++ b/src/joint_embedding/mask_dataset/script.R @@ -0,0 +1,97 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(anndata, warn.conflicts = FALSE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
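+# Everything between VIASH START and VIASH END holds debugging defaults for
+# running the script directly; viash replaces this block with the actual
+# parameter values at runtime.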
+output_path <- "" + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_mod1 = paste0(output_path, "output_mod1.h5ad"), + output_mod2 = paste0(output_path, "output_mod2.h5ad"), + output_solution = paste0(output_path, "solution.h5ad"), + train_only = TRUE +) +## VIASH END + +cat("Reading mod1 data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE") +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- list(dataset_id = new_dataset_id, organism = "human") + +cat("Creating mod1 object\n") +out_mod1 <- anndata::AnnData( + X = input_mod1$X, + layers = list(counts = input_mod1$layers[["counts"]]), + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select(one_of("batch", "size_factors")), + uns = ad1_uns +) + +cat("Create solution object\n") +out_solution <- anndata::AnnData( + X = input_mod1$X, + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select( + one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score") + ), + uns = ad1_uns +) + +is_train <- input_mod1$obs$is_train + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod1 <- out_mod1[is_train, ] #$copy() + out_solution <- out_solution[is_train, ]# $copy() +} + +rm(input_mod1) +gc() + +cat("Reading mod2 data\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +ad2_obsm <- list() + +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Creating mod2 object\n") +out_mod2 <- anndata::AnnData( + X = input_mod2$X, + layers = list(counts = input_mod2$layers[["counts"]]), + var = input_mod2$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod2$obs %>% select(one_of("batch")), + obsm = ad2_obsm, + uns = ad2_uns +) +rm(input_mod2) +gc() + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod2 <- out_mod2[is_train, ] #$copy() +} + +cat("Saving output files as h5ad\n") +cat("output_mod1:") +print(out_mod1) +zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip") + +cat("output_mod2:") +print(out_mod2) +zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip") + +cat("output_solution:") +print(out_solution) +zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip") diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R new file mode 100644 index 0000000000..88930e00d2 --- /dev/null +++ b/src/joint_embedding/mask_dataset/test.R @@ -0,0 +1,55 @@ +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) + +par <- list( + input_mod1 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad", + input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad", + output_mod1 = "output_mod1.h5ad", + output_mod2 = "output_mod2.h5ad", + output_solution = "solution.h5ad" +) + +cat("> Running mask component\n") +out <- processx::run( + command = paste0("./", meta["functionality_name"]), + args = c( + 
"--input_mod1", par$input_mod1, + "--input_mod2", par$input_mod2, + "--output_mod1", par$output_mod1, + "--output_mod2", par$output_mod2, + "--output_solution", par$output_solution + ), + stderr_to_stdout = TRUE +) + +cat("> Checking whether output files were created\n") +expect_true(file.exists(par$output_mod1)) +expect_true(file.exists(par$output_mod2)) +expect_true(file.exists(par$output_solution)) + +cat("> Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +input_mod2 <- anndata::read_h5ad(par$input_mod2) +output_mod1 <- anndata::read_h5ad(par$output_mod1) +output_mod2 <- anndata::read_h5ad(par$output_mod2) +output_solution <- anndata::read_h5ad(par$output_solution) + +cat("> Checking contents of h5ad files\n") +expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_mod1$n_obs, input_mod1$n_obs) +expect_equal(output_mod2$n_obs, input_mod2$n_obs) +expect_equal(output_mod1$n_vars, input_mod1$n_vars) +expect_equal(output_mod2$n_vars, input_mod2$n_vars) +expect_equal(output_mod1$var_names, input_mod1$var_names) +expect_equal(output_mod2$var_names, input_mod2$var_names) +expect_equal(output_mod1$obs_names, input_mod1$obs_names) +expect_equal(output_mod2$obs_names, input_mod2$obs_names) + +# TODO check contents of matrices, check rownames + +cat("> Test succeeded!\n") diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml new file mode 100644 index 0000000000..440172d748 --- /dev/null +++ b/src/joint_embedding/methods/lmds/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_method.yamllowmem +functionality: + name: lmds + namespace: joint_embedding/methods + description: Landmark MDS dimensionality reduction on the Spearman distance. + info: + type: method + method_name: "LMDS" + arguments: + - name: "--distance_method" + type: "string" + default: "spearman" + description: The distance method to use. Possible values are euclidean, pearson, spearman and others. + - name: "--n_dims" + type: integer + default: 10 + description: Number of dimensions to output. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/lmds/script.R b/src/joint_embedding/methods/lmds/script.R new file mode 100644 index 0000000000..7dda2a3c4c --- /dev/null +++ b/src/joint_embedding/methods/lmds/script.R @@ -0,0 +1,44 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(anndata, warn.conflicts = FALSE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter." +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output/lmds/output.h5ad", + n_dims = 10L, + distance_method = "spearman" +) +## VIASH END + +cat("Reading h5ad files\n") +ad1 <- anndata::read_h5ad(par$input_mod1) +ad2 <- anndata::read_h5ad(par$input_mod2) + +cat("Performing DR\n") +dr <- lmds::lmds( + cbind(ad1$X, ad2$X), + ndim = par$n_dims, + distance_method = par$distance_method +) + +rownames(dr) <- rownames(ad1) +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = ad1$uns[["dataset_id"]], + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml new file mode 100644 index 0000000000..30dc728fd9 --- /dev/null +++ b/src/joint_embedding/methods/mnn/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: mnn + namespace: joint_embedding/methods + description: Mutual nearest neighbors correction followed by PCA. + info: + type: method + method_name: "MNN" + arguments: + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] + - type: r + bioc: [ SingleCellExperiment, batchelor, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/mnn/script.R b/src/joint_embedding/methods/mnn/script.R new file mode 100644 index 0000000000..27a4454bff --- /dev/null +++ b/src/joint_embedding/methods/mnn/script.R @@ -0,0 +1,67 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("batchelor", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +Xt_mod1 <- t(input_mod1$X) + +# select hvg +if (!is.null(par$hvg_sel) && nrow(Xt_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::rowSds(Xt_mod1) + Xt_mod1 <- Xt_mod1[order(sd_mod1, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +rm(input_mod1) +gc() + +Xt_mod2 <- t(anndata::read_h5ad(par$input_mod2)$X) +if (!is.null(par$hvg_sel) && nrow(Xt_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::rowSds(Xt_mod2) + Xt_mod2 <- Xt_mod2[order(sd_mod2, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +cat("Running fastMNN\n") +mnn_out <- batchelor::fastMNN( + rbind(Xt_mod1, Xt_mod2), + batch = batch +) +dr <- SingleCellExperiment::reducedDim(mnn_out, "corrected") + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml new file mode 100644 index 0000000000..0939d7b6f7 --- /dev/null +++ b/src/joint_embedding/methods/newwave/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave + namespace: joint_embedding/methods + description: Concatenated NewWave. + info: + type: method + method_name: "NewWave" + doi: "10.1101/2021.08.02.453487" + arguments: + - name: "--maxiter" + type: "integer" + default: 100 + description: Maximum number of NewWave iterations. + - name: "--k" + type: "integer" + default: 10 + description: NewWave K parameter. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] + - type: r + bioc: [ SingleCellExperiment, NewWave, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/joint_embedding/methods/newwave/script.R b/src/joint_embedding/methods/newwave/script.R new file mode 100644 index 0000000000..f87d79cdd5 --- /dev/null +++ b/src/joint_embedding/methods/newwave/script.R @@ -0,0 +1,111 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + maxiter = 2L, + k = 3L, + hvg_sel = 1000 +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading mod1 h5ad\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] + +sd1 <- proxyC::colSds(input_mod1$X) +fil1 <- + if (!is.null(par$hvg_sel) && ncol(input_mod1) > par$hvg_sel) { + head(order(sd1, decreasing = TRUE), par$hvg_sel) + } else { + which(sd1 > 0) + } +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod1$layers[["counts"]][, fil1])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod1) +gc() + +cat("Running NewWave on mod1\n") +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = par$k, + maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data1)), + n_cell_par = min(300, ncol(data1)), + commondispersion = FALSE +) +rm(data1) + +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading mod2 anndata\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +sd2 <- proxyC::colSds(input_mod2$X) +fil2 <- + if (!is.null(par$hvg_sel) && ncol(input_mod2) > par$hvg_sel) { + head(order(sd2, decreasing = TRUE), par$hvg_sel) + } else { + which(sd2 > 0) + } +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod2$layers[["counts"]][, fil2])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod2) +gc() + +cat("Running NewWave on mod2\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = par$k, + maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data2)), + n_cell_par = min(300, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) +rm(data2) + +cat("Spline separate DRs\n") +dr <- do.call(cbind, lapply(seq_len(ncol(dr_x1)), function(i) { + cbind(dr_x1[, i], dr_x2[, i]) +})) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, 
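+  # Per the prediction spec, the embedding is also written below to
+  # obsm[["X_emb"]] as a sparse matrix.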
+ uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) + +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml new file mode 100644 index 0000000000..860a48c300 --- /dev/null +++ b/src/joint_embedding/methods/pca/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: pca + namespace: joint_embedding/methods + description: PCA dimensionality reduction. + info: + type: method + method_name: "PCA" + authors: + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to output. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor, irlba, proxyC] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/pca/script.R b/src/joint_embedding/methods/pca/script.R new file mode 100644 index 0000000000..d16eb027cf --- /dev/null +++ b/src/joint_embedding/methods/pca/script.R @@ -0,0 +1,63 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 4L, + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing DR\n") +dr <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = par$n_dims +)$x + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml b/src/joint_embedding/methods/totalvi/config.vsh.yaml new file mode 100644 index 0000000000..5b0969172a --- /dev/null +++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: totalvi + namespace: joint_embedding/methods + version: dev + description: "totalVI: joint probabilistic modeling with Total Variational Inference" + info: + type: method + method_name: totalVI + doi: 10.1038/s41592-020-01050-x + arguments: + - name: --hvg_number + type: integer + default: 4000 + description: Number of HVG to include in totalVI + - name: --max_epochs + type: integer + default: 400 + description: Number of max epochs to run totalVI + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, scanpy, scikit-misc, scipy, scikit-learn, scvi-tools] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] + + + diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py new file mode 100644 index 0000000000..9b40e1f3d1 --- /dev/null +++ b/src/joint_embedding/methods/totalvi/script.py @@ -0,0 +1,58 @@ +import anndata +import scanpy as sc +from scipy import sparse +from scvi.model import TOTALVI + +## VIASH START +par = { + "input_mod1": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod1.h5ad", + "input_mod2": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod2.h5ad", + "output": "tmp/output_prediction.h5ad", + "hvg_number": 4000, + "max_epochs": 20 +} + +meta = { + 'funcionality_name': "foo" +} +## VIASH END + +print("Load and prepare data", flush=True) +adata_mod1 = anndata.read_h5ad(par['input_mod1']) +adata_mod2 = anndata.read_h5ad(par['input_mod2']) 
+adata_mod1.obsm['protein_expression'] = adata_mod2.layers["counts"].toarray()
+
+print('Select highly variable genes', flush=True)
+sc.pp.highly_variable_genes(
+    adata_mod1,
+    n_top_genes=par['hvg_number'],
+    flavor="seurat_v3",
+    batch_key="batch",
+    subset=True
+)
+
+print("Set up model", flush=True)
+TOTALVI.setup_anndata(
+    adata_mod1,
+    batch_key="batch",
+    protein_expression_obsm_key="protein_expression"
+)
+
+print('Train totalVI with', par['max_epochs'], 'epochs', flush=True)
+vae = TOTALVI(adata_mod1, latent_distribution="normal")
+vae.train(max_epochs=par['max_epochs'])
+
+print("Postprocessing and saving output", flush=True)
+adata_out = anndata.AnnData(
+    X=vae.get_latent_representation(),
+    obs=adata_mod1.obs[['batch']],
+    uns={
+        "dataset_id": adata_mod1.uns["dataset_id"],
+        "method_id": meta["functionality_name"]
+    },
+    obsm={"X_emb": sparse.csr_matrix(vae.get_latent_representation())}
+)
+
+del adata_out.X
+
+adata_out.write_h5ad(par['output'], compression="gzip")
diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml
new file mode 100644
index 0000000000..5ef2d3d11e
--- /dev/null
+++ b/src/joint_embedding/methods/umap/config.vsh.yaml
@@ -0,0 +1,46 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: umap
+  namespace: joint_embedding/methods
+  version: dev
+  description: UMAP dimensionality reduction on the Euclidean distance.
+  info:
+    type: method
+    method_name: UMAP
+  arguments:
+    - name: "--n_dims"
+      type: "integer"
+      default: 10
+      description: Number of dimensions to output.
+    - name: "--metric"
+      type: "string"
+      default: "euclidean"
+      description: The metric to use. Possible values are euclidean, cosine and manhattan.
+    - name: "--n_neighbors"
+      type: "integer"
+      default: 15
+      description: Number of neighbors to use in the KNN graph.
+    - name: "--n_pcs"
+      type: "integer"
+      default: 50
+      description: Number of principal components to use in the PCA step.
+    - name: "--hvg_sel"
+      type: "integer"
+      default: 1000
+      description: Number of features per modality to use.
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, lmds, tidyverse, irlba, proxyC, uwot ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3 ]
+      - type: python
+        pip: [ anndata>=0.8 ]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/methods/umap/script.R b/src/joint_embedding/methods/umap/script.R
new file mode 100644
index 0000000000..2f654be245
--- /dev/null
+++ b/src/joint_embedding/methods/umap/script.R
@@ -0,0 +1,80 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+requireNamespace("anndata", quietly = TRUE)
+library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
+
+## VIASH START
+# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 10L, + n_neighbors = 15L, + metric = "euclidean", + n_pcs = 50L, + hvg_sel = 100L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing PCA\n") +X_pca <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = 100 +)$x + +cat("Performing UMap\n") +dr <- uwot::umap( + X_pca, + n_components = par$n_dims, + n_neighbors = par$n_neighbors, + metric = par$metric, + n_threads = n_cores, + nn_method = "annoy" +) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( + X_emb = as(dr, "CsparseMatrix") + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml new file mode 100644 index 0000000000..41ef532e5a --- /dev/null +++ b/src/joint_embedding/metrics/ari/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ari + namespace: joint_embedding/metrics + description: Adjusted rand index (ARI) + info: + metrics: + - metric_id: ari + metric_name: ari + metric_description: Adjusted rand index (ARI) + min: 0 + max: 1 + maximize: true + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib, scanpy] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/ari/script.py b/src/joint_embedding/metrics/ari/script.py new file mode 100644 index 0000000000..dc7c195f66 --- /dev/null +++ b/src/joint_embedding/metrics/ari/script.py @@ -0,0 +1,58 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ari.had", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = 
adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.ari(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=["ari"], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml new file mode 100644 index 0000000000..e89b30dc90 --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_batch + namespace: joint_embedding/metrics + description: Average silhouette width (ASW) of batches per label + info: + metrics: + - metric_id: asw_batch + metric_name: asw_batch + metric_description: Average silhouette width (ASW) of batches per label + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [ anndata>=0.8, scanpy, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_batch/script.py b/src/joint_embedding/metrics/asw_batch/script.py new file mode 100644 index 0000000000..88e89c21a8 --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/script.py @@ -0,0 +1,55 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette_batch( + adata, + batch_key='batch', + group_key='cell_type', + embed='X_emb', + verbose=False +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_batch'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml new file mode 100644 index 
0000000000..262b48d6f4 --- /dev/null +++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_label + namespace: joint_embedding/metrics + description: Average silhouette width (ASW) of labels + info: + metrics: + - metric_id: asw_label + metric_name: asw_label + metric_description: Average silhouette width (ASW) of labels + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_label/script.py b/src/joint_embedding/metrics/asw_label/script.py new file mode 100644 index 0000000000..4d04092004 --- /dev/null +++ b/src/joint_embedding/metrics/asw_label/script.py @@ -0,0 +1,49 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_batch.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette(adata, group_key='cell_type', embed='X_emb') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_label'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml new file mode 100644 index 0000000000..ba501a5e21 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: cc_cons + namespace: joint_embedding/metrics + description: Cell cycle conservation score + info: + metrics: + - metric_id: cc_cons + metric_name: cc_cons + metric_description: Cell cycle conservation score + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ vhighmem, midcpu ] diff --git a/src/joint_embedding/metrics/cc_cons/script.py b/src/joint_embedding/metrics/cc_cons/script.py new file mode 100644 index 0000000000..feecf45f12 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/script.py @@ -0,0 +1,56 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = 
dict( + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False +) +## VIASH END + + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) +organism = adata_solution.uns['organism'] + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +recompute_cc = 'S_score' not in adata_solution.obs_keys() or \ + 'G2M_score' not in adata_solution.obs_keys() + +print('Compute score') +adata.obsm['X_emb'] = adata.obsm['X_emb'].toarray() + +score = scib.me.cell_cycle( + adata_pre=adata_solution, + adata_post=adata, + batch_key='batch', + embed='X_emb', + recompute_cc=recompute_cc, + organism=organism +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns= { + "dataset_id":adata.uns['dataset_id'], + "method_id":adata.uns['method_id'], + "metric_ids":['cc_cons'], + "metric_values":[score], + } +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml new file mode 100644 index 0000000000..90294f01e6 --- /dev/null +++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml @@ -0,0 +1,38 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: check_format + namespace: joint_embedding/metrics + description: Checking whether the prediction of a method has the right format. 
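+ # Editorial note (assumption based on script.R below): the component emits two scores, 'finished' (1 whenever the component ran to completion) and 'correct_format' (1 if the prediction passed all format checks, 0 otherwise).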
+ info: + metrics: + - metric_id: finished + metric_name: finished + metric_description: Whether the metric component finished successfully + min: 0 + max: 1 + maximize: true + - metric_id: correct_format + metric_name: correct_format + metric_description: Whether the prediction has the correct format + min: 0 + max: 1 + maximize: true + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/check_format/script.R b/src/joint_embedding/metrics/check_format/script.R new file mode 100644 index 0000000000..afe8ed10fd --- /dev/null +++ b/src/joint_embedding/metrics/check_format/script.R @@ -0,0 +1,61 @@ +cat("Load dependencies\n") +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) + +## VIASH START +task <- "joint_embedding" +par <- list( + input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad"), + input_prediction = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"), + output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad") +) +## VIASH END + +cat("Read solution h5ad\n") +ad_sol <- read_h5ad(par$input_solution) + +cat("Checking prediction h5ad\n") +correct_format <- tryCatch({ + # read prediction + ad_pred <- read_h5ad(par$input_prediction) + + # check dataset id + dataset_id <- ad_pred$uns[["dataset_id"]] + assert_that(dataset_id == ad_sol$uns[["dataset_id"]]) + + # check method id + method_id <- ad_pred$uns[["method_id"]] + assert_that( + is.character(method_id), + method_id != "" + ) + + # check X + assert_that( + ad_pred$n_obs == ad_sol$n_obs, + ad_pred$n_vars >= 1, + ad_pred$n_vars <= 100, + !is.null(ad_pred$obs_names), + all(ad_pred$obs_names == ad_sol$obs_names) + ) + + 1 +}, error = function(e) { + cat("ERROR: ", e$message, "\n", sep = "") + 0 +}) + + +cat("Create output object\n") +out <- AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = c("finished", "correct_format"), + metric_values = c(1, correct_format) + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml new file mode 100644 index 0000000000..c79966893c --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: graph_connectivity + namespace: joint_embedding/metrics + description: Graph connectivity + info: + metrics: + - metric_id: graph_conn + metric_name: graph_conn + metric_description: Graph connectivity + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git 
a/src/joint_embedding/metrics/graph_connectivity/script.py b/src/joint_embedding/metrics/graph_connectivity/script.py new file mode 100644 index 0000000000..ab9089d74a --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/script.py @@ -0,0 +1,53 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.graph_conn.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute score') +score = scib.me.graph_connectivity(adata, label_key='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns = { + 'dataset_id':adata.uns['dataset_id'], + 'method_id':adata.uns['method_id'], + 'metric_ids':['graph_conn'], + 'metric_values':[score] + } +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml new file mode 100644 index 0000000000..bf989bfc20 --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: latent_mixing + namespace: joint_embedding/metrics + description: Calculate latent mixing metric for joint embedding task. + info: + metrics: + - metric_id: latent_mixing + metric_name: Latent mixing + metric_description: Calculate latent mixing + min: -1 + max: 0 + maximize: True + arguments: + - name: "--n_neighbors" + type: integer + default: 100 + description: Number of neighbors for the entropy_batch_mixing metric. 
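+ # Editorial note (assumption based on script.py below): the score is the negative KL divergence between the batch composition of each sampled cell's k-nearest-neighbour pool and the global batch frequencies, averaged over random pools; the script assumes exactly two batches.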
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scikit-learn, scipy] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/latent_mixing/script.py b/src/joint_embedding/metrics/latent_mixing/script.py new file mode 100644 index 0000000000..511e7f8d2a --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/script.py @@ -0,0 +1,92 @@ +import anndata as ad +import numpy as np +import scipy.sparse +from sklearn.neighbors import NearestNeighbors + +# VIASH START +par = { + "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + "output": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores_totalvi.h5ad", + "n_neighbors": 100 +} +# VIASH END + +print("Read input files") +predict_adata = ad.read_h5ad(par["input_prediction"]) +solution_adata = ad.read_h5ad(par["input_solution"]) + +print("Merge prediction with solution") +merged_adata = predict_adata.copy() + +batch_val = solution_adata.obs["batch"].astype(str) +batch_unique_values, batch_index = np.unique(batch_val, return_inverse=True) + +merged_adata.obs["batch"] = batch_index + +def entropy_batch_mixing( + latent_space, batches, n_neighbors=50, n_pools=50, n_samples_per_pool=100 +): + + def neg_kl(hist_data, global_freq): + n_batches = len(np.unique(hist_data)) + if n_batches > 2: + raise ValueError("Should be only two clusters for this metric") + frequency = np.mean(hist_data == 1) + if frequency == 0 or frequency == 1: + return 0 + return -( + frequency * np.log(frequency / global_freq) + + (1 - frequency) * np.log((1 - frequency) / (1 - global_freq)) + ) + + # cap the number of neighbours at the number of cells - 1 + n_neighbors = min(n_neighbors, latent_space.shape[0] - 1) + nne = NearestNeighbors(n_neighbors=1 + n_neighbors, n_jobs=8) + nne.fit(latent_space) + kmatrix = nne.kneighbors_graph(latent_space) - scipy.sparse.identity( + latent_space.shape[0] + ) + + global_freq = np.mean(batches) + print("Global batch frequency:", global_freq) + score = 0 + for t in range(n_pools): + indices = np.random.choice( + np.arange(latent_space.shape[0]), size=n_samples_per_pool + ) + score += np.mean( + [ + neg_kl( + batches[ # the batches of cell i's neighbors + kmatrix[indices].nonzero()[ + 1 + ][ # the neighbors of cell i (columns in row i) + kmatrix[indices].nonzero()[0] == i # the row of cell i + ] + ], + global_freq, + ) + for i in range(n_samples_per_pool) + ] + ) + return score / float(n_pools) + + +print("Calculate latent mixing metric") +latent_mixing = entropy_batch_mixing( + latent_space=merged_adata.obsm['X_emb'], + batches=merged_adata.obs["batch"].values, + n_neighbors=par["n_neighbors"] +) + +print("Write output") +adata_out = ad.AnnData( + uns = { + "dataset_id": predict_adata.uns["dataset_id"], + "method_id" : predict_adata.uns["method_id"], + "metric_ids" : ["latent_mixing"], + "metric_values" : [latent_mixing] + } +) + +adata_out.write_h5ad(par['output'], compression = "gzip") \ No newline at end of file diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml new file mode 100644 index 0000000000..e83772d65e --- /dev/null +++ 
b/src/joint_embedding/metrics/nmi/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: nmi + namespace: joint_embedding/metrics + description: Normalised mutual information (NMI) + info: + metrics: + - metric_id: nmi + metric_name: NMI + metric_description: Normalised mutual information (NMI) + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/nmi/script.py b/src/joint_embedding/metrics/nmi/script.py new file mode 100644 index 0000000000..73ef901bd7 --- /dev/null +++ b/src/joint_embedding/metrics/nmi/script.py @@ -0,0 +1,64 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.nmi.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.nmi(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['nmi'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml new file mode 100644 index 0000000000..099219de47 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml @@ -0,0 +1,50 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: rfoob + namespace: joint_embedding/metrics + description: Calculating basic metrics for the joint embedding task. 
+ info: + metrics: + - metric_id: rfoob_celltype_accuracy + metric_name: Rfoob Celltype Accuracy + metric_description: Cell type prediction accuracy (out-of-bag) + min: 0 + max: 1 + maximize: True + - metric_id: rfoob_pseudotimegex_rsq + metric_name: rfoob pseudotimegex rsq + metric_description: rfoob pseudotimegex rsq + min: -1 + max: 1 + maximize: true + - metric_id: rfoob_pseudotimeadt_rsq + metric_name: rfoob pseudotimeadt rsq + metric_description: rfoob pseudotimeadt rsq + min: -1 + max: 1 + maximize: True + - metric_id: rfoob_batch_error + metric_name: rfoob batch error + metric_description: rfoob batch error + min: 0 + max: 1 + maximize: True + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git ] + - type: python + pip: [ anndata>=0.8 ] + - type: r + cran: [ anndata, ranger, tidyverse, testthat] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/rfoob/script.R b/src/joint_embedding/metrics/rfoob/script.R new file mode 100644 index 0000000000..e237192fe3 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/script.R @@ -0,0 +1,68 @@ +cat("Load dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_solution = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + input_prediction = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + output = "scores.h5ad" +) +## VIASH END + +cat("Read solution h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution) + +cat("Read prediction h5ad\n") +expect_true( + grepl("\\.h5ad$", par$input_prediction), + info = "Prediction file should be an h5ad file" +) +ad_pred <- + tryCatch({ + anndata::read_h5ad(par$input_prediction) + }, error = function(e) { + stop(paste0("Can't open prediction h5ad file. 
Detailed error message:\n", e$message)) + }) +expect_true( + ad_sol$uns$dataset_id == ad_pred$uns$dataset_id +) + +cat("Calculating metrics\n") +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_CELL_TYPE = ad_sol$obs[["cell_type"]]) +rf1 <- ranger::ranger(SOLUTION_CELL_TYPE ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs$pseudotime_order_GEX) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf2 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +colname <- colnames(ad_sol$obs)[grepl("pseudotime_order_A.*", colnames(ad_sol$obs))] +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs[[colname]]) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf3 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_BATCH = ad_sol$obs$batch) +rf4 <- ranger::ranger(SOLUTION_BATCH ~ ., df) + +metric_values <- c( + rfoob_celltype_accuracy = 1 - rf1$prediction.error, + rfoob_pseudotimegex_rsq = rf2$r.squared, + rfoob_pseudotimeadt_rsq = rf3$r.squared, + rfoob_batch_error = rf4$prediction.error +) + +cat("Create output object\n") +out <- anndata::AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = names(metric_values), + metric_values = metric_values + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml new file mode 100644 index 0000000000..68525b337b --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons + namespace: joint_embedding/metrics + description: Trajectory inference conservation score + info: + metrics: + - metric_id: ti_cons_RNA + metric_name: ti cons RNA + metric_description: ti cons RNA + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_ADT_ATAC + metric_name: ti cons ADT ATAC + metric_description: ti cons ADT ATAC + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_mean + metric_name: ti cons mean + metric_description: ti cons mean + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons/script.py b/src/joint_embedding/metrics/ti_cons/script.py new file mode 100644 index 0000000000..1d04067024 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons/script.py @@ -0,0 +1,81 @@ + + +print('Importing libraries') +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons' + 
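+# Editorial note (assumption based on the calls below): the scores use scib's trajectory conservation, comparing the pseudotime ordering in the solution with one recomputed on the method's embedding, once for the RNA trajectory and once for the ADT/ATAC trajectory, and their mean is reported.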
+input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_RNA', 'ti_cons_ADT_ATAC', 'ti_cons_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml new file mode 100644 index 0000000000..7139b8aadc --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons_batch + namespace: joint_embedding/metrics + description: Trajectory inference conservation score per batch + info: + metrics: + - metric_id: ti_cons_batch_RNA + metric_name: ti cons batch RNA + metric_description: placeholder + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_batch_ADT_ATAC + metric_name: ti cons batch ADT ATAC + metric_description: placeholder + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_batch_mean + metric_name: ti cons batch mean + metric_description: placeholder + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons_batch/script.py b/src/joint_embedding/metrics/ti_cons_batch/script.py new file mode 100644 index 0000000000..5206b59610 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/script.py @@ -0,0 +1,87 @@ +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + 
output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons_batch' + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_batch_RNA', 'ti_cons_batch_ADT_ATAC', 'ti_cons_batch_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + +# # store score as tsv +# with open(output, 'w') as file: +# header = ['dataset', 'output_type', 'metric', 'value'] +# entry = [dataset_id, OUTPUT_TYPE, METRIC, score] +# file.write('\t'.join(header) + '\n') +# file.write('\t'.join([str(x) for x in entry])) diff --git a/src/joint_embedding/resources_scripts/mask_datasets.sh b/src/joint_embedding/resources_scripts/mask_datasets.sh new file mode 100644 index 0000000000..dfb1295fb7 --- /dev/null +++ b/src/joint_embedding/resources_scripts/mask_datasets.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +COMMON_DATASETS="resources/datasets/openproblems_v1" +OUTPUT_DIR="resources/joint_embedding/datasets/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! 
-f $params_file ]; then + python << HERE +import anndata as ad +import glob +import yaml + +h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad") + +# this task doesn't use normalizations +param_list = {} + +for h5ad_file in h5ad_files: + print(f"Checking {h5ad_file}") + adata = ad.read_h5ad(h5ad_file, backed=True) + if "counts" in adata.layers: + dataset_id = adata.uns["dataset_id"].replace("/", ".") + obj = { + 'id': dataset_id, + 'input': h5ad_file, + 'dataset_id': dataset_id, + } + param_list[dataset_id] = obj + +output = { + "param_list": list(param_list.values()), + "seed": 123, + "output_train": "\$id.train.h5ad", + "output_test": "\$id.test.h5ad" +} + +with open("$params_file", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . \ + -main-script target/nextflow/joint_embedding/mask_dataset/main.nf \ + -profile docker \ + -resume \ + -params-file $params_file \ + --publish_dir "$OUTPUT_DIR" + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_scripts/run_benchmarks.sh b/src/joint_embedding/resources_scripts/run_benchmarks.sh new file mode 100644 index 0000000000..01d6ef92a5 --- /dev/null +++ b/src/joint_embedding/resources_scripts/run_benchmarks.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources/joint_embedding/datasets/openproblems_v1" +OUTPUT_DIR="resources/joint_embedding/benchmarks/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! -f $params_file ]; then + python << HERE +import yaml +import os + +dataset_dir = "$DATASETS_DIR" +output_dir = "$OUTPUT_DIR" + +# read split datasets yaml +with open(dataset_dir + "/params.yaml", "r") as file: + split_list = yaml.safe_load(file) +datasets = split_list['param_list'] + +# figure out where train/test files were stored +param_list = [] + +for dataset in datasets: + id = dataset["id"] + input_train = dataset_dir + "/" + id + ".train.h5ad" + input_test = dataset_dir + "/" + id + ".test.h5ad" + + if os.path.exists(input_test): + obj = { + 'id': id, + 'dataset_id': dataset["dataset_id"], + 'input_train': input_train, + 'input_test': input_test + } + param_list.append(obj) + +# write as output file +output = { + "param_list": param_list, +} + +with open(output_dir + "/params.yaml", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . 
\ + -main-script src/joint_embedding/workflows/run/main.nf \ + -profile docker \ + -params-file "$params_file" \ + --publish_dir "$OUTPUT_DIR" \ + -with-tower + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_test_scripts/bmmc_cite.sh b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh new file mode 100644 index 0000000000..72967ef27b --- /dev/null +++ b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +#make sure the following command has been executed +#bin/viash_build -q 'joint_embedding|common' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad +MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad +DATASET_DIR=resources_test/joint_embedding/bmmc_cite + +if [ ! -f $MOD_1_DATA ]; then + echo "Error! Could not find raw data" + exit 1 +fi + +mkdir -p $DATASET_DIR + +# mask dataset +bin/viash run src/joint_embedding/mask_dataset/config.vsh.yaml -- \ + --input_mod1 $MOD_1_DATA \ + --input_mod2 $MOD_2_DATA \ + --output_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --output_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output_solution $DATASET_DIR/cite_solution.h5ad + +# run one method +bin/viash run src/joint_embedding/methods/pca/config.vsh.yaml -- \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output $DATASET_DIR/pca.h5ad + +# run one metric +bin/viash run src/joint_embedding/metrics/ari/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/pca.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output $DATASET_DIR/ari.h5ad + +# run benchmark +export NXF_VER=22.04.5 + +bin/nextflow \ + run . 
\ + -main-script src/joint_embedding/workflows/run/main.nf \ + -profile docker \ + -resume \ + --id bmmc_cite \ + --dataset_id bmmc_cite \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output scores.tsv \ + --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/config.vsh.yaml b/src/joint_embedding/workflows/run/config.vsh.yaml new file mode 100644 index 0000000000..5f3e7800cf --- /dev/null +++ b/src/joint_embedding/workflows/run/config.vsh.yaml @@ -0,0 +1,26 @@ +functionality: + name: "run_benchmark" + namespace: "joint_embedding/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: "string" + description: "The ID of the dataset" + required: true + - name: "--input_mod1" + type: "file" # todo: replace with includes + - name: "--input_mod2" + type: "file" # todo: replace with includes + - name: "--input_solution" + type: "file" # todo: replace with includes + - name: Outputs + arguments: + - name: "--output" + direction: "output" + type: file + resources: + - type: nextflow_script + path: main.nf +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/main.nf b/src/joint_embedding/workflows/run/main.nf new file mode 100644 index 0000000000..45ac7ad76d --- /dev/null +++ b/src/joint_embedding/workflows/run/main.nf @@ -0,0 +1,152 @@ +nextflow.enable.dsl=2 + +sourceDir = params.rootDir + "/src" +targetDir = params.rootDir + "/target/nextflow" + +// import control methods +include { random_embed } from "$targetDir/joint_embedding/control_methods/random_embed/main.nf" +include { zeros_embed } from "$targetDir/joint_embedding/control_methods/zeros_embed/main.nf" + +// import methods
include { lmds } from "$targetDir/joint_embedding/methods/lmds/main.nf" +include { mnn } from "$targetDir/joint_embedding/methods/mnn/main.nf" +include { newwave } from "$targetDir/joint_embedding/methods/newwave/main.nf" +include { pca } from "$targetDir/joint_embedding/methods/pca/main.nf" +include { totalvi } from "$targetDir/joint_embedding/methods/totalvi/main.nf" +include { umap } from "$targetDir/joint_embedding/methods/umap/main.nf" + +// import metrics +include { ari } from "$targetDir/joint_embedding/metrics/ari/main.nf" +include { asw_batch } from "$targetDir/joint_embedding/metrics/asw_batch/main.nf" +include { asw_label } from "$targetDir/joint_embedding/metrics/asw_label/main.nf" +include { cc_cons } from "$targetDir/joint_embedding/metrics/cc_cons/main.nf" +include { check_format } from "$targetDir/joint_embedding/metrics/check_format/main.nf" +include { graph_connectivity } from "$targetDir/joint_embedding/metrics/graph_connectivity/main.nf" +include { latent_mixing } from "$targetDir/joint_embedding/metrics/latent_mixing/main.nf" +include { nmi } from "$targetDir/joint_embedding/metrics/nmi/main.nf" +include { rfoob } from "$targetDir/joint_embedding/metrics/rfoob/main.nf" +include { ti_cons } from "$targetDir/joint_embedding/metrics/ti_cons/main.nf" +include { ti_cons_batch } from "$targetDir/joint_embedding/metrics/ti_cons_batch/main.nf" + +// tsv generation component +include { extract_scores } from "$targetDir/common/extract_scores/main.nf" + +// import helper functions +include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf" +include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + 
"/wf_utils/DataflowHelper.nf" + +config = readConfig("$projectDir/config.vsh.yaml") + +// construct a map of methods (id -> method_module) +methods = [ lmds, mnn, newwave, pca, totalvi, umap] + .collectEntries{method -> + [method.config.functionality.name, method] + } + +workflow { + helpMessage(config) + + viashChannel(params, config) + | run_wf +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // split params for downstream components + | setWorkflowArguments( + method: ["input_mod1", "input_mod2"], + metric: ["input_solution"], + output: ["output"] + ) + + // multiply events by the number of method + | add_methods + + // run methods + | getWorkflowArguments(key: "method") + | run_methods + + // construct tuples for metrics + | pmap{ id, file, passthrough -> + // derive unique ids from output filenames + def newId = file.getName().replaceAll(".output.*", "") + // combine prediction with solution + def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ] + [ newId, newData, passthrough ] + } + + // run metrics + | getWorkflowArguments(key: "metric") + | run_metrics + + // convert to tsv + | aggregate_results + + emit: + output_ch +} + +workflow add_methods { + take: input_ch + main: + output_ch = Channel.fromList(methods.keySet()) + | combine(input_ch) + + // generate combined id for method_id and dataset_id + | pmap{method_id, dataset_id, data -> + def new_id = dataset_id + "." + method_id + def new_data = data.clone() + [method_id: method_id] + new_data.remove("id") + [new_id, new_data] + } + emit: output_ch +} + +workflow run_methods { + take: input_ch + main: + // generate one channel per method + method_chs = methods.collect { method_id, method_module -> + input_ch + | filter{it[1].method_id == method_id} + | method_module + } + // mix all results + output_ch = method_chs[0].mix(*method_chs.drop(1)) + + emit: output_ch +} + +workflow run_metrics { + take: input_ch + main: + + output_ch = input_ch + | (ari & asw_batch & asw_label & cc_cons & check_format & graph_connectivity & latent_mixing & nmi & rfoob & ti_cons & ti_cons_batch) + | mix + + emit: output_ch +} + +workflow aggregate_results { + take: input_ch + main: + + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ it -> + [ "combined", it.collect{ it[1] } ] + it[0].drop(2) + } + | getWorkflowArguments(key: "output") + | extract_scores.run( + auto: [ publish: true ] + ) + + emit: output_ch +} \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/nextflow.config b/src/joint_embedding/workflows/run/nextflow.config new file mode 100644 index 0000000000..ea674ffa07 --- /dev/null +++ b/src/joint_embedding/workflows/run/nextflow.config @@ -0,0 +1,14 @@ +manifest { + name = 'joint_embedding/workflows/run' + mainScript = 'main.nf' + nextflowVersion = '!>=22.04.5' + description = 'Multi modality - joint embedding' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config") +includeConfig("${params.rootDir}/src/wf_utils/labels.config") \ No newline at end of file diff --git a/src/match_modality/README.md b/src/match_modality/README.md new file mode 100644 index 0000000000..3f1ff48cc1 --- /dev/null +++ b/src/match_modality/README.md @@ -0,0 +1,23 @@ +# Match modality + +Structure of this task: + + src/match_modality + ├── api Interface specifications for components and 
datasets in this task + ├── control_methods Baseline (random/ground truth) methods to compare methods against + ├── methods Methods to be benchmarked + ├── metrics Metrics used to quantify performance of methods + ├── README.md This file + ├── resources_scripts Scripts to process the datasets + ├── resources_test_scripts Scripts to process the test resources + ├── split_dataset Component to prepare common datasets + └── workflows Pipelines to run the full benchmark + +Relevant links: + +* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/) + +* [Experimental results](https://openproblems-experimental.netlify.app/results/match_modality/) + + +* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md) diff --git a/src/match_modality/api/anndata_dataset.yaml b/src/match_modality/api/anndata_dataset.yaml new file mode 100644 index 0000000000..3382dba2ff --- /dev/null +++ b/src/match_modality/api/anndata_dataset.yaml @@ -0,0 +1,52 @@ +type: file +description: An input h5ad dataset. +example: "dataset.h5ad" +info: + label: "Dataset" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors created by the normalisation method, if any. + required: false + - type: boolean + name: is_train + description: Whether the sample is part of the training data + required: true + var: + - type: string + name: gene_ids + description: Feature identifiers, if available + required: false + - type: string + name: feature_types + description: The modality of each feature (e.g. GEX, ATAC or ADT) + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: "The organism the data was derived from" + required: false + - type: string + name: gene_activity_var_names + description: Names of the variables of the gene activity matrix + required: true + obsm: + - type: double + name: gene_activity + description: Gene activity scores (e.g. derived from the ATAC data) + required: true + diff --git a/src/match_modality/api/anndata_masked_mod.yaml b/src/match_modality/api/anndata_masked_mod.yaml new file mode 100644 index 0000000000..31ef25d06e --- /dev/null +++ b/src/match_modality/api/anndata_masked_mod.yaml @@ -0,0 +1,46 @@ +type: file +description: "The masked data" +example: "masked.h5ad" +info: + short_description: "masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors created by the normalisation method, if any. + required: false + var: + - type: string + name: feature_types + description: The modality of each feature (e.g. GEX, ATAC or ADT) + required: true + - type: string + name: gene_ids + description: Feature identifiers, if available + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: The organism the data was derived from + required: true + - type: string + name: gene_activity_var_names + description: Names of the variables of the gene activity matrix + required: true + obsm: + - type: double + name: gene_activity + description: Gene activity scores (e.g. derived from the ATAC data) + required: true \ No newline at end of file diff --git a/src/match_modality/api/anndata_prediction.yaml b/src/match_modality/api/anndata_prediction.yaml new file mode 100644 index 0000000000..ca214d4963 --- /dev/null +++ b/src/match_modality/api/anndata_prediction.yaml @@ -0,0 +1,15 @@ +type: file +description: "The predicted pairing of test mod1 & mod2 profiles." 
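+# Editorial note (based on the generic tests in comp_method.yaml and comp_control_method.yaml): the pairing is stored in .X as a sparse matrix of pairing probabilities between mod1 and mod2 profiles, with non-negative values and rows that sum to 1.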
+example: "prediction.h5ad" +info: + short_description: "Prediction" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/match_modality/api/anndata_score.yaml b/src/match_modality/api/anndata_score.yaml new file mode 100644 index 0000000000..bfe79f07cc --- /dev/null +++ b/src/match_modality/api/anndata_score.yaml @@ -0,0 +1,25 @@ +type: file +description: "Metric score file" +example: "output.h5ad" +info: + short_description: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/match_modality/api/anndata_solution.yaml b/src/match_modality/api/anndata_solution.yaml new file mode 100644 index 0000000000..e486646e2f --- /dev/null +++ b/src/match_modality/api/anndata_solution.yaml @@ -0,0 +1,20 @@ +type: file +description: "The masked solution data" +example: "masked.h5ad" +info: + short_description: "masked solution data" + slots: + obs: + - type: string + name: batch + description: Batch information + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: double + name: pairing_ix + description: number of samples + required: true diff --git a/src/match_modality/api/authors.yaml b/src/match_modality/api/authors.yaml new file mode 100644 index 0000000000..bd2cade4d0 --- /dev/null +++ b/src/match_modality/api/authors.yaml @@ -0,0 +1,28 @@ +functionality: + authors: + - name: Robrecht Cannoodt + roles: [ author, contributor ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + - name: Kai Waldrant + roles: [ contributor ] + props: { github: KaiWaldrant } + - name: Michaela Mueller + email: mumichae@in.tum.de + roles: [ author, maintainer ] + props: { github: mumichae, orcid: "0000-0002-1401-1785" } + - name: Louise Deconinck + email: louise.deconinck@gmail.com + roles: [ author, maintainer ] + props: { github: LouiseDck, orcid: "" } + - name: Alejandro Granados + email: alejandro.granados@czbiohub.org + roles: [ author, maintainer ] + props: { github: agranado } + - name: Alex Tong + email: alexandertongdev@gmail.com + roles: [ author, maintainer ] + props: { github: atong01 } + - name: Daniel Burkhardt + email: daniel.b.burkhardt@gmail.com + roles: [ author, maintainer ] + props: { github: dburkhardt } \ No newline at end of file diff --git a/src/match_modality/api/comp_control_method.yaml b/src/match_modality/api/comp_control_method.yaml new file mode 100644 index 0000000000..8dff03a367 --- /dev/null +++ b/src/match_modality/api/comp_control_method.yaml @@ -0,0 +1,68 @@ +functionality: + arguments: + - name: "--input_test_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_sol" + __merge__: anndata_solution.yaml + - name: "--output" + direction: "output" + __merge__: anndata_prediction.yaml + test_resources: + - path: ../../../../output + 
- type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import numpy as np + from scipy.sparse import issparse + + # define some filenames + testpar = { + 'input_test_mod1': 'output/output_test_mod1.h5ad', + 'input_test_mod2': 'output/output_test_mod2.h5ad', + 'input_test_sol': 'output/output_test_sol.h5ad', + 'output': 'output/output_prediction.h5ad', + } + + print('> Running method', flush=True) + out = subprocess.run([ + meta['executable'], + '--input_test_mod1', testpar['input_test_mod1'], + '--input_test_mod2', testpar['input_test_mod2'], + '--input_test_sol', testpar['input_test_sol'], + '--output', testpar['output'] + ], + check=True, + capture_output=True, + text=True + ) + + # for troubleshooting: remove 'check=True' in subprocess.run above and uncomment the print below + # print(out.stderr, flush=True) + + print('> Checking whether output files were created', flush=True) + assert path.exists(testpar['output']) + + print('> Reading h5ad files', flush=True) + ad_sol = ad.read_h5ad(testpar['input_test_sol']) + ad_pred = ad.read_h5ad(testpar['output']) + + print('> Checking dataset id', flush=True) + assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id'] + + print('> Checking method id', flush=True) + assert ad_pred.uns['method_id'] == meta['functionality_name'] + + print('> Checking X', flush=True) + assert issparse(ad_pred.X) + assert np.all(ad_pred.X.data >= 0), 'Values must be non-negative.' + assert ad_pred.X.nonzero()[0].size <= 1000 * ad_sol.n_obs + assert ad_pred.n_obs == ad_sol.n_obs + assert ad_pred.n_vars == ad_sol.n_vars + assert np.isclose(ad_pred.X.sum(axis=1), 1, atol=1e-10).all(), 'All rows should sum to 1.' + + print('> Test succeeded!', flush=True) diff --git a/src/match_modality/api/comp_mask_dataset.yaml b/src/match_modality/api/comp_mask_dataset.yaml new file mode 100644 index 0000000000..3876ef397e --- /dev/null +++ b/src/match_modality/api/comp_mask_dataset.yaml @@ -0,0 +1,32 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_dataset.yaml + - name: "--input_mod2" + __merge__: anndata_dataset.yaml + - name: "--output_train_mod1" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_train_mod2" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_train_sol" + direction: "output" + __merge__: anndata_solution.yaml + - name: "--output_test_mod1" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_test_mod2" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_test_sol" + direction: "output" + __merge__: anndata_solution.yaml + - name: "--knn" + type: "integer" + default: 20 + description: The KNN cutoff for computing nearest neighbors on the mod1 and mod2 profiles. + - name: "--seed" + type: "integer" + default: 1 + description: The random seed. 
\ No newline at end of file diff --git a/src/match_modality/api/comp_method.yaml b/src/match_modality/api/comp_method.yaml new file mode 100644 index 0000000000..87e45f646b --- /dev/null +++ b/src/match_modality/api/comp_method.yaml @@ -0,0 +1,77 @@ +functionality: + arguments: + - name: "--input_train_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_train_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--input_train_sol" + __merge__: anndata_solution.yaml + - name: "--input_test_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--output" + direction: output + __merge__: anndata_prediction.yaml + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import numpy as np + from scipy.sparse import issparse + + # define some filenames + testpar = { + 'input_train_mod1': 'output/output_train_mod1.h5ad', + 'input_train_mod2': 'output/output_train_mod2.h5ad', + 'input_train_sol': 'output/output_train_sol.h5ad', + 'input_test_mod1': 'output/output_test_mod1.h5ad', + 'input_test_mod2': 'output/output_test_mod2.h5ad', + 'input_test_sol': 'output/output_test_sol.h5ad', + 'output': 'output/output_prediction.h5ad', + } + + print('> Running method', flush=True) + out = subprocess.run([ + meta['executable'], + '--input_train_mod1', testpar['input_train_mod1'], + '--input_train_mod2', testpar['input_train_mod2'], + '--input_train_sol', testpar['input_train_sol'], + '--input_test_mod1', testpar['input_test_mod1'], + '--input_test_mod2', testpar['input_test_mod2'], + '--output', testpar['output'] + ], + check=True, + capture_output=True, + text=True + ) + + # for troubleshooting: remove 'check=True' in subprocess.run above and uncomment the print below + # print(out.stderr, flush=True) + + print('> Checking whether output files were created', flush=True) + assert path.exists(testpar['output']) + + print('> Reading h5ad files', flush=True) + ad_sol = ad.read_h5ad(testpar['input_test_sol']) + ad_pred = ad.read_h5ad(testpar['output']) + + print('> Checking dataset id', flush=True) + assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id'] + + print('> Checking method id', flush=True) + assert ad_pred.uns['method_id'] == meta['functionality_name'] + + print('> Checking X', flush=True) + assert issparse(ad_pred.X) + assert np.all(ad_pred.X.data >= 0), 'Values must be non-negative.' + assert ad_pred.X.nonzero()[0].size <= 1000 * ad_sol.n_obs + assert ad_pred.n_obs == ad_sol.n_obs + assert ad_pred.n_vars == ad_sol.n_vars + assert np.isclose(ad_pred.X.sum(axis=1), 1, atol=1e-10).all(), 'All rows should sum to 1.' 
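+ # note: these X checks mirror the ones in comp_control_method.yaml's generic test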
+ + print('> Test succeeded!', flush=True) diff --git a/src/match_modality/api/comp_metric.yaml b/src/match_modality/api/comp_metric.yaml new file mode 100644 index 0000000000..ae94586882 --- /dev/null +++ b/src/match_modality/api/comp_metric.yaml @@ -0,0 +1,83 @@ +functionality: + arguments: + - name: --input_prediction + __merge__: anndata_prediction.yaml + - name: --input_solution + __merge__: anndata_solution.yaml + - name: --output + __merge__: anndata_score.yaml + direction: output + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + import yaml + + # define some filenames + input_prediction_path = "output/dr_knnr_cbf_prediction.h5ad" + input_solution_path = "output/output_test_sol.h5ad" + output_path = "output.h5ad" + + # read the component config + with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print("> Running method", flush=True) + out = subprocess.run(cmd, check=True) + + print("> Checking whether output files were created", flush=True) + assert path.exists(output_path) + + print("> Reading h5ad files", flush=True) + input_prediction = ad.read_h5ad(input_prediction_path) + input_solution = ad.read_h5ad(input_solution_path) + output = ad.read_h5ad(output_path) + + # Create DF from metric config info + metric_info = config['functionality']['info']['metrics'] + metric_meta = pd.DataFrame(metric_info) + metric_meta = metric_meta.astype({'metric_id': str, 'metric_name': str, 'metric_description': str, 'min': float, 'max': float, 'maximize': bool}) + print("> Checking contents of metric info", flush=True) + assert 'metric_id' in metric_meta + assert 'min' in metric_meta + assert 'max' in metric_meta + assert 'maximize' in metric_meta + + print("> Checking .uns['dataset_id']", flush=True) + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']", flush=True) + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']", flush=True) + assert 'metric_ids' in output.uns + assert set(output.uns['metric_ids']) == set(metric_meta.metric_id) + + print("> Checking .uns['metric_values']", flush=True) + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if metric_value lies within the expected range + output_uns = pd.DataFrame({ + 'metric_id': output.uns['metric_ids'], + 'value': output.uns['metric_values'] + }) + + scores = metric_meta.merge(output_uns, on="metric_id") + + assert all(scores.value >= scores['min']) + assert all(scores.value <= scores['max']) + + print("> Test succeeded!", flush=True) diff --git a/src/match_modality/api/task_info.yaml b/src/match_modality/api/task_info.yaml new file mode 100644 index 0000000000..50d32b98f9 --- /dev/null +++ b/src/match_modality/api/task_info.yaml @@ -0,0 +1,13 @@ +task_id: match_modality +task_name: Match modality +summary: "Predicting which profiles from one modality resemble profiles from another." +description: | + "While joint profiling of two modalities in the same single cell is now possible, + most single-cell datasets that exist measure only a single modality. 
These modalities + complement each other in their description of cellular state. Yet, it is challenging + to analyse uni-modal datasets together when they do not share observations (cells) or + a common feature space (genes, proteins, or open chromatin peaks). If we could map + observations to one another across modalities, it would be possible to treat separately + profiled datasets in the same manner as new multi-modal sequencing data. Mapping these + modalities to one another opens up the vast amount of uni-modal single-cell datasets + generated in the past years to multi-modal data analysis methods." \ No newline at end of file diff --git a/src/match_modality/control_methods/constant/config.vsh.yaml b/src/match_modality/control_methods/constant/config.vsh.yaml new file mode 100644 index 0000000000..c20acf234f --- /dev/null +++ b/src/match_modality/control_methods/constant/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: constant + namespace: match_modality/control_methods + description: Returns constant weights between all mod1 profiles and the first 1000 mod2 profiles. + info: + type: negative_control + method_name: Constant + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/constant/script.R b/src/match_modality/control_methods/constant/script.R new file mode 100644 index 0000000000..590d49f982 --- /dev/null +++ b/src/match_modality/control_methods/constant/script.R @@ -0,0 +1,50 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +par <- list( + input_test_mod1 = "output/output_test_mod1.h5ad", + input_test_mod2 = "output/output_test_mod2.h5ad", + output = "output/output_prediction.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + + +cat("Reading h5ad files\n") +# input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +# input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +# input_train_sol <- anndata::read_h5ad(par$input_train_sol) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1, backed = TRUE) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2, backed = TRUE) + +knn_df <- + expand.grid( + i = seq_len(nrow(input_test_mod1)), + j = seq_len(min(nrow(input_test_mod2), 1000)) + ) + +knn_mat <- + Matrix::sparseMatrix( + i = knn_df$i, + j = knn_df$j, + x = rep(1, nrow(knn_df)), + dims = list(nrow(input_test_mod1), nrow(input_test_mod2)) + ) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_test_mod1$uns[["dataset_id"]], + method_id = meta$functionality_name + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/control_methods/random_pairing/config.vsh.yaml b/src/match_modality/control_methods/random_pairing/config.vsh.yaml new file mode 100644 index 0000000000..fd9da8d595 --- /dev/null +++ b/src/match_modality/control_methods/random_pairing/config.vsh.yaml @@ 
-0,0 +1,20 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: random_pairing + namespace: match_modality/control_methods + description: Generates random pairings with weights drawn from a uniform distribution. + info: + type: negative_control + method_name: Random Pairing + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, numpy, scikit-learn ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/match_modality/control_methods/random_pairing/script.py b/src/match_modality/control_methods/random_pairing/script.py new file mode 100644 index 0000000000..868b6ef31c --- /dev/null +++ b/src/match_modality/control_methods/random_pairing/script.py @@ -0,0 +1,45 @@ +import anndata as ad +import numpy as np +import scipy.sparse +from sklearn.preprocessing import normalize + +# VIASH START +par = { + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", +} + +meta = { + "functionality_name": "foo" +} +# VIASH END + +print("Load datasets") +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# sample up to 1000 random entries per mod1 profile (this assumes mod1 and +# mod2 contain the same number of cells) +num_values = min(1000, input_test_mod1.n_obs) * input_test_mod1.n_obs +indices = np.random.randint(input_test_mod1.n_obs**2, size=num_values) + +mat_x = np.random.rand(num_values) +mat_i = indices % input_test_mod1.n_obs +mat_j = indices // input_test_mod1.n_obs +pairing_matrix = scipy.sparse.csr_matrix( + (mat_x, (mat_i, mat_j)), + shape=(input_test_mod1.n_obs, input_test_mod2.n_obs) +) + +# row normalise +prob_matrix = normalize(pairing_matrix, norm="l1") + +# Write out prediction +prediction = ad.AnnData( + X=prob_matrix, + uns={ + "method_id": meta["functionality_name"], + "dataset_id": input_test_mod1.uns["dataset_id"] + } +) +prediction.write_h5ad(par["output"]) diff --git a/src/match_modality/control_methods/semi_solution/config.vsh.yaml b/src/match_modality/control_methods/semi_solution/config.vsh.yaml new file mode 100644 index 0000000000..756716feb6 --- /dev/null +++ b/src/match_modality/control_methods/semi_solution/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: semi_solution + namespace: match_modality/control_methods + description: Returns a randomly perturbed version of the ground-truth pairing. 
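+ # Editor's note: despite the positive_control type, this is not the exact + # solution; script.R below perturbs the ground-truth matrix with uniform + # noise before row-normalising, so its scores should land between the random + # and perfect baselines.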
+ info: + type: positive_control + method_name: Semi-solution + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/semi_solution/script.R b/src/match_modality/control_methods/semi_solution/script.R new file mode 100644 index 0000000000..d36833b4b0 --- /dev/null +++ b/src/match_modality/control_methods/semi_solution/script.R @@ -0,0 +1,29 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_test_sol = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad", + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_test_sol <- anndata::read_h5ad(par$input_test_sol) + +# randomly fill in gold standard values +input_test_sol$X@x <- runif(length(input_test_sol$X@x)) + +# fill other values with random values as well +ix <- sample.int(nrow(input_test_sol) * ncol(input_test_sol), nrow(input_test_sol) * 10) +input_test_sol$X[ix] <- runif(length(ix)) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(input_test_sol$X) +input_test_sol$X@x <- input_test_sol$X@x / rs[input_test_sol$X@i + 1] + +input_test_sol$uns[["method_id"]] <- meta$functionality_name + +cat("Writing predictions to file\n") +zzz <- input_test_sol$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/control_methods/solution/config.vsh.yaml b/src/match_modality/control_methods/solution/config.vsh.yaml new file mode 100644 index 0000000000..b352fdaedf --- /dev/null +++ b/src/match_modality/control_methods/solution/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: solution + namespace: match_modality/control_methods + description: Returns the ground-truth pairing. 
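+ # This positive control passes the solution matrix through unchanged, so it + # should attain the best possible value for each metric.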
+ info: + type: positive_control + method_name: Solution + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/solution/script.R b/src/match_modality/control_methods/solution/script.R new file mode 100644 index 0000000000..88b5b746dd --- /dev/null +++ b/src/match_modality/control_methods/solution/script.R @@ -0,0 +1,18 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_test_sol = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad", + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_test_sol <- anndata::read_h5ad(par$input_test_sol) + +input_test_sol$uns[["method_id"]] <- meta$functionality_name + +cat("Writing predictions to file\n") +zzz <- input_test_sol$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/mask_dataset/config.vsh.yaml b/src/match_modality/mask_dataset/config.vsh.yaml new file mode 100644 index 0000000000..ac341f3b84 --- /dev/null +++ b/src/match_modality/mask_dataset/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../api/comp_mask_dataset.yaml +functionality: + name: mask_dataset + namespace: match_modality + description: | + Censor an existing dataset: obfuscate gene names, remove cell identities and + shuffle the cells of each modality, for distribution to competitors. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, testthat ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/mask_dataset/script.R b/src/match_modality/mask_dataset/script.R new file mode 100644 index 0000000000..f41de56600 --- /dev/null +++ b/src/match_modality/mask_dataset/script.R @@ -0,0 +1,179 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +# input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +# input_path <- "output/datasets/common/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.manual_formatting." +# input_path <- "output/datasets/common/openproblems_bmmc_cite_phase1/openproblems_bmmc_cite_phase1.manual_formatting." +input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +output_path <- "output/multiome" +# output_path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset." +# output_path <- "output/datasets/match_modality/openproblems_bmmc_multiome_iid/openproblems_bmmc_multiome_iid.censor_dataset." 
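+# Editor's sketch of the masking logic below (toy values for illustration): +# mod2 rows are shuffled with a permutation ix, so mod2 output row k holds the +# cell from mod1 row ix[k]; the solution matrix is then built as +# sparseMatrix(i = seq_along(ix), j = order(ix)), because order(ix)[i] is the +# mod2 row that holds mod1 cell i. E.g. ix <- c(3, 1, 2) gives +# order(ix) == c(2, 3, 1): mod1 cell 1 sits in mod2 row 2.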
+# dir.create(dirname(output_path), recursive = TRUE) + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_train_mod1 = paste0(output_path, "output_train_mod1.h5ad"), + output_train_mod2 = paste0(output_path, "output_train_mod2.h5ad"), + output_train_sol = paste0(output_path, "output_train_sol.h5ad"), + output_test_mod1 = paste0(output_path, "output_test_mod1.h5ad"), + output_test_mod2 = paste0(output_path, "output_test_mod2.h5ad"), + output_test_sol = paste0(output_path, "output_test_sol.h5ad"), + seed = 1L, + knn = 10L +) +## VIASH END + +set.seed(par$seed) + +subset_mats <- function(li, obs_filt, anonymize = FALSE) { + out <- list() + for (n in names(li)) { + mat <- li[[n]][obs_filt, , drop = FALSE] + if (anonymize) { + rownames(mat) <- paste0("cell_", seq_len(nrow(mat))) + } + out[[n]] <- mat + } + out +} + + +cat("Reading input data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_MM_", tolower(ad1_mod), "2", tolower(ad2_mod)) +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad1_obsm <- list() +ad2_obsm <- list() + +if (ad1_mod == "ATAC") { + ad1_uns$gene_activity_var_names <- input_mod1$uns$gene_activity_var_names + ad1_obsm$gene_activity <- as(input_mod1$obsm$gene_activity, "CsparseMatrix") +} +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Shuffle train cells\n") +train_ix <- which(input_mod1$obs$is_train) %>% sort +train_mod2_ix <- sample.int(length(train_ix)) + +cat("Shuffle test cells\n") +test_ix <- which(!input_mod1$obs$is_train) %>% sort +test_mod2_ix <- sample.int(length(test_ix)) + +is_categorical <- function(x) is.character(x) || is.factor(x) +# relevel <- function(x) factor(as.character(x)) +relevel <- function(x) as.character(x) + +cat("Creating train objects\n") +mod1_var <- input_mod1$var %>% select(one_of("gene_ids", "feature_types")) +mod2_var <- input_mod2$var %>% select(one_of("gene_ids", "feature_types")) +train_obs1 <- input_mod1$obs[train_ix, , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +train_obs2 <- input_mod2$obs[train_ix[train_mod2_ix], , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +rownames(train_obs2) <- NULL +if (ncol(train_obs2) == 0) train_obs2 <- NULL +assert_that("size_factors" %in% colnames(train_obs1) != "size_factors" %in% colnames(train_obs2)) +assert_that(all(train_obs1$batch == train_obs2$batch[order(train_mod2_ix)])) + +output_train_mod1 <- anndata::AnnData( + X = input_mod1$X[train_ix, , drop = FALSE], + layers = subset_mats(input_mod1$layers, train_ix), + obsm = subset_mats(ad1_obsm, train_ix), + obs = train_obs1, + var = mod1_var, + uns = ad1_uns +) +output_train_mod2 <- anndata::AnnData( + X = input_mod2$X[train_ix[train_mod2_ix], , drop = FALSE] %>% + magrittr::set_rownames(., paste0("cell_", seq_len(nrow(.)))), + layers = subset_mats(input_mod2$layers, train_ix[train_mod2_ix], anonymize = TRUE), + obsm = subset_mats(ad2_obsm, train_ix[train_mod2_ix], anonymize = TRUE), + obs = train_obs2, + var = 
mod2_var, + uns = ad2_uns +) +assert_that(all(output_train_mod1$obs$batch == output_train_mod2$obs$batch[order(train_mod2_ix)])) + +cat("Creating test objects\n") +test_obs1 <- input_mod1$obs[test_ix, , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +test_obs2 <- input_mod2$obs[test_ix[test_mod2_ix], , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +rownames(test_obs2) <- NULL +if (ncol(test_obs2) == 0) test_obs2 <- NULL +assert_that("size_factors" %in% colnames(test_obs1) != "size_factors" %in% colnames(test_obs2)) +assert_that(all(test_obs1$batch == test_obs2$batch[order(test_mod2_ix)])) + +output_test_mod1 <- anndata::AnnData( + X = input_mod1$X[test_ix, , drop = FALSE], + layers = subset_mats(input_mod1$layers, test_ix), + obsm = subset_mats(ad1_obsm, test_ix), + obs = test_obs1, + var = mod1_var, + uns = ad1_uns +) +output_test_mod2 <- anndata::AnnData( + X = input_mod2$X[test_ix[test_mod2_ix], , drop = FALSE] %>% + magrittr::set_rownames(., paste0("cell_", seq_len(nrow(.)))), + layers = subset_mats(input_mod2$layers, test_ix[test_mod2_ix], anonymize = TRUE), + obsm = subset_mats(ad2_obsm, test_ix[test_mod2_ix], anonymize = TRUE), + obs = test_obs2, + var = mod2_var, + uns = ad2_uns +) +assert_that(all(output_test_mod1$obs$batch == output_test_mod2$obs$batch[order(test_mod2_ix)])) + +cat("Creating solution objects\n") + +train_sol_mat <- Matrix::sparseMatrix( + i = seq_along(train_mod2_ix), + j = order(train_mod2_ix), + x = rep(1, length(train_mod2_ix)) +) +output_train_sol <- anndata::AnnData( + X = train_sol_mat, + obs = input_mod1$obs[train_ix, , drop = FALSE] %>% select(one_of(c("batch"))) %>% mutate_if(is_categorical, relevel), + uns = list(dataset_id = new_dataset_id, pairing_ix = train_mod2_ix - 1) +) + +test_sol_mat <- Matrix::sparseMatrix( + i = seq_along(test_mod2_ix), + j = order(test_mod2_ix), + x = rep(1, length(test_mod2_ix)) +) +output_test_sol <- anndata::AnnData( + X = test_sol_mat, + obs = input_mod1$obs[test_ix, , drop = FALSE] %>% select(one_of(c("batch"))) %>% mutate_if(is_categorical, relevel), + uns = list(dataset_id = new_dataset_id, pairing_ix = test_mod2_ix - 1) +) + +# checks +# mean(rowSums(train_solknn > 0)) +# mean(rowSums(test_solknn > 0)) +# sum(train_solknn * train_sol_mat) == nrow(train_sol_mat) +# sum(test_solknn * test_sol_mat) == nrow(test_sol_mat) + +cat("Saving output files as h5ad\n") +zzz <- output_train_mod1$write_h5ad(par$output_train_mod1, compression = "gzip") +zzz <- output_train_mod2$write_h5ad(par$output_train_mod2, compression = "gzip") +zzz <- output_train_sol$write_h5ad(par$output_train_sol, compression = "gzip") +zzz <- output_test_mod1$write_h5ad(par$output_test_mod1, compression = "gzip") +zzz <- output_test_mod2$write_h5ad(par$output_test_mod2, compression = "gzip") +zzz <- output_test_sol$write_h5ad(par$output_test_sol, compression = "gzip") diff --git a/src/match_modality/methods/babel_knn/config.vsh.yaml b/src/match_modality/methods/babel_knn/config.vsh.yaml new file mode 100644 index 0000000000..4a7cdc2e31 --- /dev/null +++ b/src/match_modality/methods/babel_knn/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_method.yaml +functionality: + status: disabled + name: babel_knn + namespace: match_modality/methods + description: Predict test expression with BABEL and match cells with KNN. 
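+ # BABEL (see paper_doi below) learns to translate between GEX and ATAC + # profiles; script.R uses its ATAC predictions for the test GEX cells and + # matches them to the observed ATAC profiles with a KNN in a shared LMDS + # embedding.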
+ info: + type: method + method_name: Babel+KNN + paper_doi: "10.1073/pnas.2023070118" + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to use for dimensionality reduction. + - name: "--n_neigh" + type: "integer" + default: 10 + description: Number of neighbors for KNN. Match probability will be 1/n_neigh. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, FNN ] + bioc: [ DropletUtils ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git] + - type: python + pip: [ anndata>=0.8 ] + - type: docker + run: + - wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh -O ~/miniconda.sh && /bin/bash ~/miniconda.sh -b -p /opt/conda && /opt/conda/bin/conda -V + - cd / && git clone --depth 1 https://github.com/rcannood/babel.git + - cd /babel && /opt/conda/bin/conda env create -f environment.yml + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/babel_knn/script.R b/src/match_modality/methods/babel_knn/script.R new file mode 100644 index 0000000000..42702b6150 --- /dev/null +++ b/src/match_modality/methods/babel_knn/script.R @@ -0,0 +1,195 @@ +cat(">> Loading dependencies\n") + +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +library(testthat, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("DropletUtils", quietly = TRUE) + +options(tidyverse.quiet = TRUE) +library(tidyverse) + +babel_location <- "/babel/bin/" +conda_bin <- "/opt/conda/bin/conda" + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_dims = 10, + n_neigh = 10 +) +conda_bin <- "conda" +babel_location <- "../babel/bin/" +## VIASH END + +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2, backed = TRUE) +if (input_train_mod2$var$feature_types[[1]] != "ATAC") { + cat("Error: babel only runs on GEX to ATAC datasets\n") + quit(save = "no", status = 42) +} + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat(">> Filling in missing gene ids\n") +if (is.null(input_train_mod1$var$gene_ids)) input_train_mod1$var$gene_ids <- colnames(input_train_mod1) +if (is.null(input_train_mod2$var$gene_ids)) input_train_mod2$var$gene_ids <- colnames(input_train_mod2) +if (is.null(input_test_mod1$var$gene_ids)) input_test_mod1$var$gene_ids <- colnames(input_test_mod1) +if (is.null(input_test_mod2$var$gene_ids)) input_test_mod2$var$gene_ids <- colnames(input_test_mod2) + +mod1 <- as.character(unique(input_train_mod1$var$feature_types)) +mod2 <- as.character(unique(input_train_mod2$var$feature_types)) + +# multiome_matrix for export to Babel's input format +multiome_matrix <- cbind(input_train_mod1$X, input_train_mod2$X) + 
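+# BABEL expects a single joint multiome matrix (cells x [GEX genes + ATAC +# peaks]) in 10x CellRanger H5 format, so the two training matrices are +# column-bound here and written out with DropletUtils::write10xCounts() below; +# feature_type_map translates the task's feature_types into the labels +# CellRanger uses ("Gene Expression" / "Peaks"). +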
+# generate multiome anndata object for training +ad_babel <- anndata::AnnData( + X = multiome_matrix, + var = bind_rows(input_train_mod1$var, input_train_mod2$var) +) + +# setting up babel dirs +tmpdir <- tempfile(pattern = "babel_temp", fileext = "/") +cat(">> Setting up directories for babel at ", tmpdir, "\n", sep = "") +dir.create(tmpdir) +on.exit(unlink(tmpdir, recursive = TRUE)) + +dir_data <- paste0(tmpdir, "data/") # location of input files +dir.create(dir_data) +dir_model <- paste0(tmpdir, "model/") # location of babel model +dir_pred <- paste0(tmpdir, "pred/") # location of predictions + +feature_type_map <- c( + "GEX" = "Gene Expression", + "ADT" = "Peaks", # try to make it run on ADT data as well + "ATAC" = "Peaks" +) + +cat(">> Writing train dataset as 10x-CellRanger H5 format\n") +DropletUtils::write10xCounts( + paste0(dir_data, "train_input.h5"), + t(ad_babel$X), + gene.id = ad_babel$var$gene_ids, + gene.symbol = colnames(ad_babel), + barcodes = rownames(ad_babel), + type = "HDF5", + version = "3", + genome = "GRCh38", + gene.type = unname(feature_type_map[ad_babel$var$feature_types]), + overwrite = TRUE +) + +cat(">> Writing test dataset as 10x-CellRanger H5 format\n") +DropletUtils::write10xCounts( + paste0(dir_data, "test_input.h5"), + t(input_test_mod1$X), + gene.id = input_test_mod1$var$gene_ids, + gene.symbol = colnames(input_test_mod1), + barcodes = rownames(input_test_mod1), + type = "HDF5", + version = "3", + genome = "GRCh38", + gene.type = unname(feature_type_map[input_test_mod1$var$feature_types]), + overwrite = TRUE +) + + +cat(">> Babel: train model\n") +babel_train_cmd <- paste0( + conda_bin, " run -n babel ", + "python ", babel_location, "train_model.py ", + "--data ", dir_data, "train_input.h5 ", + "--outdir ", dir_model, " ", + "--nofilter" +) +# stringent filtering causes babel to sometimes fail +# reason: https://github.com/wukevin/babel/blob/main/babel/sc_data_loaders.py#L168-L190 + +out1 <- system(babel_train_cmd) + +# check whether training succeeded +expect_equal(out1, 0, info = paste0("Model training failed with exit code ", out1)) + +cat(">> Babel: predict from model\n") +babel_pred_cmd <- paste0( + conda_bin, " run -n babel ", + "python ", babel_location, "predict_model.py ", + "--checkpoint ", dir_model, " ", + "--data ", dir_data, "test_input.h5 ", + "--outdir ", dir_pred, " ", + "--nofilter" +) +out2 <- system(babel_pred_cmd) + +# check whether prediction succeeded +expect_equal(out2, 0, info = paste0("Prediction failed with exit code ", out2)) + +cat(">> Read predictions\n") +pred <- anndata::read_h5ad(paste0(dir_pred, "/rna_atac_adata.h5ad")) + +####################################### +##### KNN + +# Only some features are present in Babel's output +input_test_mod2_filter <- input_test_mod2[, row.names(input_test_mod2$var) %in% row.names(pred$var)] + +# Dimensional reduction of both predicted and test profiles +pred_profiles <- pred[, row.names(input_test_mod2_filter$var)]$X + +cat("Performing DR on test values\n") +dr <- lmds::lmds( + rbind(pred_profiles, input_test_mod2_filter$X), + ndim = par$n_dims, + distance_method = "pearson" # par does not define distance_method; use the same distance as the other DR-based methods +) + +train_ix <- seq_len(nrow(pred_profiles)) +dr_preds <- dr[train_ix, , drop = FALSE] +dr_test_mod2 <- dr[-train_ix, , drop = FALSE] + + +cat("Performing KNN between test mod2 DR and predicted test mod2\n") +knn_out <- FNN::get.knnx( + dr_preds, + dr_test_mod2, + k = min(1000, nrow(dr_preds)) +) + +cat("Creating output data structures\n") +df <- tibble( + i = as.vector(row(knn_out$nn.index)), + j = 
as.vector(knn_out$nn.index), + x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist) +) +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$x, + dims = list(nrow(input_test_mod1), nrow(input_test_mod2)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_train_mod1$uns[["dataset_id"]], + method_id = "baseline_babel_knn" + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") \ No newline at end of file diff --git a/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml new file mode 100644 index 0000000000..37321fcad1 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml @@ -0,0 +1,29 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: dr_knnr_cbf + namespace: match_modality/methods + description: Perform LMDS+Pearson DR, predict embedding with KNN regression, and match cells with a consensus best fit algorithm. + info: + type: method + method_name: LMDS+KNNR+CBF + arguments: + - name: "--n_pop" + type: "integer" + default: 100 + description: Population size. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/match_modality/methods/dr_knnr_cbf/script.R b/src/match_modality/methods/dr_knnr_cbf/script.R new file mode 100644 index 0000000000..021e2d6483 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_cbf/script.R @@ -0,0 +1,144 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1_rna/openproblems_bmmc_multiome_phase1_rna.censor_dataset.output_" +path <- "output/datasets/match_modality/openproblems_bmmc_cite_phase1_rna/openproblems_bmmc_cite_phase1_rna.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_pop = 300L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +cat("Read train sol\n") +input_train_sol <- anndata::read_h5ad(par$input_train_sol) + +cat("Reading mod1 h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +cat("Running LMDS on mod1 input data\n") +# merge input matrices +mod1_X <- rbind(input_train_mod1$X, input_test_mod1$X) +rm(input_train_mod1, input_test_mod1) +gc() + +# perform DR +dr_x1 <- lmds::lmds(mod1_X, ndim = 10, distance_method = "pearson") +rm(mod1_X) +gc() + +# split input matrices +dr_x1_train <- dr_x1[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x1_test <- 
dr_x1[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Reading mod2 h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat("Running LMDS on mod2 input data\n") +# merge input matrices +match_train <- input_train_sol$uns$pairing_ix + 1 +mod2_X <- rbind(input_train_mod2$X[order(match_train), , drop = FALSE], input_test_mod2$X) +rm(input_train_mod2, input_test_mod2) +gc() + +# perform DR +dr_x2 <- lmds::lmds(mod2_X, ndim = 3, distance_method = "pearson") +rm(mod2_X) +gc() + +# split input matrices +dr_x2_train <- dr_x2[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x2_test <- dr_x2[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +pred_mod1 <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Predicting mod2 DR of test cells\n") +pred_mod2 <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Minimising distances between mod1 and mod2 pairs\n") +gen_vec <- function(z) { + int <- seq_len(nrow(pred_mod1)) + + i <- j <- c() + resti <- int + restj <- int + + while (length(resti) > 0) { + ixi <- sample.int(length(resti), 1) + newi <- resti[[ixi]] + d1 <- proxy::dist(pred_mod1[restj, , drop = FALSE], dr_x1_test[newi, , drop = FALSE], method = "euclidean") + d2 <- proxy::dist(pred_mod2[restj, , drop = FALSE], dr_x2_test[newi, , drop = FALSE], method = "euclidean") + d12 <- d1 + d2 + ixj <- which.min(d12[, 1]) + newj <- restj[[ixj]] + resti <- resti[-ixi] + restj <- restj[-ixj] + i <- c(i, newi) + j <- c(j, newj) + + # tibble(i, j); tibble(resti, restj) + } + + tibble::tibble(i, j) +} + +outs <- pbapply::pblapply(seq_len(par$n_pop), cl = n_cores, gen_vec) +# outs <- lapply(seq_len(par$n_pop), gen_vec) +df <- bind_rows(outs) %>% + group_by(i, j) %>% + summarise(n = n(), .groups = "drop") %>% + arrange(desc(n)) %>% + mutate(gold = i == j) + +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$n, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_train_sol$uns[["dataset_id"]], + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml new file mode 100644 index 0000000000..fd9387d6d4 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: dr_knnr_knn + namespace: match_modality/methods + description: Perform LMDS+Pearson DR, predict embedding with KNN regression, and match cells with KNN. 
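+ # Same LMDS + KNN-regression front-end as dr_knnr_cbf, but the final matching + # step is a single KNN lookup instead of the consensus best fit search.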
+ info: + type: method + method_name: LMDS+KNNR+KNN + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/dr_knnr_knn/script.R b/src/match_modality/methods/dr_knnr_knn/script.R new file mode 100644 index 0000000000..fa32cbdf29 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_knn/script.R @@ -0,0 +1,104 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1_rna/openproblems_bmmc_multiome_phase1_rna.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Read train sol\n") +input_train_sol <- anndata::read_h5ad(par$input_train_sol) + +cat("Reading mod1 h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +cat("Running LMDS on mod1 input data\n") +# merge input matrices +mod1_X <- rbind(input_train_mod1$X, input_test_mod1$X) +rm(input_train_mod1, input_test_mod1) + +# perform DR +dr_x1 <- lmds::lmds(mod1_X, ndim = 10, distance_method = "pearson") +rm(mod1_X) + +# split input matrices +dr_x1_train <- dr_x1[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x1_test <- dr_x1[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Reading mod2 h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat("Running LMDS on mod2 input data\n") +# merge input matrices +match_train <- input_train_sol$uns$pairing_ix + 1 +mod2_X <- rbind(input_train_mod2$X[order(match_train), , drop = FALSE], input_test_mod2$X) +rm(input_train_mod2, input_test_mod2) + +# perform DR +dr_x2 <- lmds::lmds(mod2_X, ndim = 3, distance_method = "pearson") +rm(mod2_X) + +# split input matrices +dr_x2_train <- dr_x2[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x2_test <- dr_x2[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Predicting for each column in modality 2\n") +preds <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + + +cat("Performing KNN between test mod2 DR and predicted test mod2\n") +knn_out <- FNN::get.knnx( + preds, + dr_x2_test, + k = min(1000, nrow(dr_x1_test)) +) + +cat("Creating output data structures\n") +df <- tibble( + i = as.vector(row(knn_out$nn.index)), + j = as.vector(knn_out$nn.index), + x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist) +) +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$x, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + 
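+# Editor's note on the idiom above: for a dgCMatrix, @x holds the stored +# non-zero values and @i their 0-based row indices, so dividing @x by +# rs[@i + 1] row-normalises without densifying the matrix. A tiny check with +# toy values: +# m <- Matrix::sparseMatrix(i = c(1, 1, 2), j = c(1, 2, 2), x = c(1, 3, 2)) +# m@x <- m@x / Matrix::rowSums(m)[m@i + 1] +# Matrix::rowSums(m) # both rows now sum to 1 +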
+cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_train_sol$uns[["dataset_id"]], + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/methods/linear_knn/config.vsh.yaml b/src/match_modality/methods/linear_knn/config.vsh.yaml new file mode 100644 index 0000000000..20fa30783b --- /dev/null +++ b/src/match_modality/methods/linear_knn/config.vsh.yaml @@ -0,0 +1,29 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: linear_knn + namespace: match_modality/methods + description: Perform DR with Linear Regression, and match cells with kNN + info: + type: method + method_name: Linear Regression and KNN + arguments: + - name: "--n_svd" + type: "integer" + default: 10 + description: Number of SVD components to compress to + - name: "--n_neighbors" + type: "integer" + default: 10 + description: Number of neighbors for matching modalities + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata, scipy, scikit-learn ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/linear_knn/script.py b/src/match_modality/methods/linear_knn/script.py new file mode 100644 index 0000000000..c776337a4a --- /dev/null +++ b/src/match_modality/methods/linear_knn/script.py @@ -0,0 +1,105 @@ +import logging +import anndata as ad +import scipy.spatial +import scipy.sparse +import numpy as np + +from sklearn.decomposition import TruncatedSVD +from sklearn.neighbors import NearestNeighbors +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import normalize + +## VIASH START +# Anything within this block will be removed by `viash` and will be +# replaced with the parameters as specified in your config.vsh.yaml. 
+par = { + "input_train_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad", + "input_train_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad", + "input_train_sol": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad", + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "n_svd": 100, + "n_neighbors": 10, +} + +meta = { + "functionality_name": "foo" +} +## VIASH END + +logging.basicConfig(level=logging.INFO) + +logging.info("Load datasets") +input_train_mod1 = ad.read_h5ad(par["input_train_mod1"]) +input_train_mod2 = ad.read_h5ad(par["input_train_mod2"]) +input_train_sol = ad.read_h5ad(par["input_train_sol"]) +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# This method runs PCA on each modality individually, then runs linear regression to predict mod2 +# from mod1 and finally performs kNN to match modalities + +# unscramble training cells +mod2_order = np.argsort(input_train_sol.uns['pairing_ix']) +input_train_mod2 = input_train_mod2[mod2_order, :] + +# concatenate train and test data +mod1 = ad.concat( + { + "train": input_train_mod1, + "test": input_test_mod1 + }, + index_unique="-", + label="group" +) +mod2 = ad.concat( + { + "train": input_train_mod2, + "test": input_test_mod2 + }, + index_unique="-", + label="group" +) + +# Create helper views to access the test data later +mod1tr = mod1[mod1.obs["group"] == "train", :] +mod2tr = mod2[mod2.obs["group"] == "train", :] + +mod1te = mod1[mod1.obs["group"] == "test", :] +mod2te = mod2[mod2.obs["group"] == "test", :] + +logging.info("Running PCA") +n_svd = min(par["n_svd"], mod1.n_obs, mod2.n_obs, mod1.n_vars, mod2.n_vars) + +# Use TruncatedSVD for fast decomposition of the data +mod1.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod1.X) +mod2.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod2.X) + +reg = LinearRegression() + +reg.fit(mod1tr.obsm["X_pca"], mod2tr.obsm["X_pca"]) +mod2te_pred = reg.predict(mod1te.obsm["X_pca"]) + +neighbors = NearestNeighbors(n_neighbors=np.min((mod2te.shape[0], par["n_neighbors"])), n_jobs=-1) +neighbors = neighbors.fit(mod2te_pred) + +distances, indices = neighbors.kneighbors(mod2te.obsm["X_pca"]) + +prediction = np.zeros((mod2te.shape[0], mod2te.shape[0])) +for i, neighbor_ix in enumerate(indices): + prediction[i, neighbor_ix] = 1 / distances[i] + +prediction = normalize(prediction, norm="l1") + +prediction = scipy.sparse.csr_matrix(prediction) + +logging.info("Write prediction output") +prediction = ad.AnnData( + X=prediction, + uns={ + "dataset_id": input_train_mod1.uns["dataset_id"], + "method_id": meta["functionality_name"] + } +) +prediction.write_h5ad(par["output"]) diff --git a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml new file mode 100644 index 0000000000..6fcc058b2a --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml @@ -0,0 +1,42 @@ +__merge__: ../../api/comp_method.yaml 
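+# NewWave fits a batch-aware negative binomial latent factor model to the raw +# counts; script.R uses its K = 10 latent dimensions as the DR embedding.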
+functionality: + name: newwave_knnr_cbf + namespace: match_modality/methods + description: Perform DR with NewWave, predict embedding with KNN regression, and match cells with a consensus best fit algorithm. + info: + type: method + method_name: NewWave+KNNR+CBF + arguments: + - name: "--n_pop" + type: "integer" + default: 300 + description: Population size. + - name: "--newwave_maxiter" + type: "integer" + default: 40 + description: Maximum number of NewWave iterations. + - name: "--newwave_ngene" + type: "integer" + default: 200 + description: Setting of the n_gene_par NewWave parameter. + - name: "--newwave_ncell" + type: "integer" + default: 200 + description: Setting of the n_cell_par NewWave parameter. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply, proxyC ] + bioc: [ SingleCellExperiment, NewWave ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_cbf/script.R b/src/match_modality/methods/newwave_knnr_cbf/script.R new file mode 100644 index 0000000000..ddc291a517 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_cbf/script.R @@ -0,0 +1,188 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("pbapply", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("FNN", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_pop = 300L, + newwave_maxiter = 10, + newwave_ngene = 200L, + newwave_ncell = 200L +) +meta <- list(functionality_name = "foo") + +# # read in solution data to check whether method is working +# input_test_sol <- anndata::read_h5ad(paste0(path, "test_sol.h5ad")) +# match_test <- input_test_sol$uns$pairing_ix + 1 +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +match_train <- input_train_sol$uns$pairing_ix + 1 + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +# fetch a few variables +train_ix <- seq_len(nrow(input_train_mod1)) +did <- input_train_mod1$uns[["dataset_id"]] +batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) + +cat("Running NewWave\n") +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod1$layers[["counts"]]), t(input_test_mod1$layers[["counts"]]))), + colData = data.frame(batch = factor(batch1)) +) +data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] +# option 1: filter by HVG +# data1 <- 
data1[order(proxyC::rowSds(SummarizedExperiment::assay(data1)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod1, input_test_mod1) +gc() + +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data1)), + n_cell_par = min(par$newwave_ncell, ncol(data1)), + commondispersion = FALSE +) +rm(data1) +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +# don't know batch ordering in input_test_mod2 +batch2 <- c(as.character(input_train_sol$obs$batch), rep("unknownbatch", nrow(input_test_mod2))) + +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod2$layers[["counts"]][order(match_train), , drop = FALSE]), t(input_test_mod2$layers[["counts"]]))), + colData = data.frame(batch = factor(batch2)) +) +data2 <- data2[Matrix::rowSums(SummarizedExperiment::assay(data2)) > 0, ] +# data2 <- data2[order(proxyC::rowSds(SummarizedExperiment::assay(data2)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod2, input_test_mod2) +gc() + +cat("Running NewWave\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data2)), + n_cell_par = min(par$newwave_ncell, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) + +colnames(dr_x1) <- paste0("comp_", seq_len(ncol(dr_x1))) +colnames(dr_x2) <- paste0("comp_", seq_len(ncol(dr_x2))) + +# split up DR matrices +dr_x1_train <- dr_x1[train_ix, , drop = FALSE] +dr_x1_test <- dr_x1[-train_ix, , drop = FALSE] +dr_x2_train <- dr_x2[train_ix, , drop = FALSE] +dr_x2_test <- dr_x2[-train_ix, , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +pred_mod1 <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Predicting mod2 DR of test cells\n") +pred_mod2 <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Minimising distances between mod1 and mod2 pairs\n") +gen_vec <- function(z) { + int <- seq_len(nrow(pred_mod1)) + + i <- j <- c() + resti <- int + restj <- int + + while (length(resti) > 0) { + ixi <- sample.int(length(resti), 1) + newi <- resti[[ixi]] + d1 <- proxy::dist(pred_mod1[restj, , drop = FALSE], dr_x1_test[newi, , drop = FALSE], method = "euclidean") + d2 <- proxy::dist(pred_mod2[restj, , drop = FALSE], dr_x2_test[newi, , drop = FALSE], method = "euclidean") + d12 <- d1 + d2 + ixj <- which.min(d12[, 1]) + newj <- restj[[ixj]] + resti <- resti[-ixi] + restj <- restj[-ixj] + i <- c(i, newi) + j <- c(j, newj) + + # tibble(i, j); tibble(resti, restj) + } + + tibble::tibble(i, j) +} +outs <- pbapply::pblapply(seq_len(par$n_pop), cl = n_cores, gen_vec) + +# outs <- lapply(seq_len(par$n_pop), gen_vec) +df <- bind_rows(outs) %>% + group_by(i, j) %>% + summarise(n = n(), .groups = "drop") %>% + arrange(desc(n)) %>% + mutate(gold = i == j) + +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$n, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- 
Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = did, + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml new file mode 100644 index 0000000000..4293005a23 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave_knnr_knn + namespace: match_modality/methods + description: Perform DR with NewWave, predict embedding with KNN regression, and match cells with KNN. + info: + type: method + method_name: NewWave+KNNR+KNN + arguments: + - name: "--newwave_maxiter" + type: "integer" + default: 40 + description: Maximum number of NewWave iterations. + - name: "--newwave_ngene" + type: "integer" + default: 200 + description: Setting of the n_gene_par NewWave parameter. + - name: "--newwave_ncell" + type: "integer" + default: 200 + description: Setting of the n_cell_par NewWave parameter. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN] + bioc: [ SingleCellExperiment, NewWave ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8, tensorflow ] + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_knn/script.R b/src/match_modality/methods/newwave_knnr_knn/script.R new file mode 100644 index 0000000000..2810af1ab2 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_knn/script.R @@ -0,0 +1,162 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("FNN", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +# path <- "output/public_datasets/match_modality/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +path <- "output/public_datasets/match_modality/dyngen_atac_1/dyngen_atac_1.censor_dataset.output_" +# path <- "debug/debug." 
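+# Note on the design below: after censoring, the batch labels of the shuffled +# test mod2 cells are unknown, so they are pooled into a single "unknownbatch" +# level when building the NewWave batch covariate.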
+par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + newwave_maxiter = 10L, + newwave_ngene = 200L, + newwave_ncell = 200L +) +meta <- list(functionality_name = "foo") + +# # read in solution data to check whether method is working +# input_test_sol <- anndata::read_h5ad(paste0(path, "test_sol.h5ad")) +# match_test <- input_test_sol$uns$pairing_ix + 1 +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +match_train <- input_train_sol$uns$pairing_ix + 1 + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +# fetch a few variables +train_ix <- seq_len(nrow(input_train_mod1)) +did <- input_train_mod1$uns[["dataset_id"]] +batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) + +cat("Running NewWave\n") +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod1$layers[["counts"]]), t(input_test_mod1$layers[["counts"]]))), + colData = data.frame(batch = factor(batch1)) +) +data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] +# option 1: filter by HVG +# data1 <- data1[order(proxyC::rowSds(SummarizedExperiment::assay(data1)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod1, input_test_mod1) +gc() + +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data1)), + n_cell_par = min(par$newwave_ncell, ncol(data1)), + commondispersion = FALSE +) +rm(data1) +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +# don't know batch ordering in input_test_mod2 +batch2 <- c(as.character(input_train_sol$obs$batch), rep("unknownbatch", nrow(input_test_mod2))) + +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod2$layers[["counts"]][order(match_train), , drop = FALSE]), t(input_test_mod2$layers[["counts"]]))), + colData = data.frame(batch = factor(batch2)) +) +data2 <- data2[Matrix::rowSums(SummarizedExperiment::assay(data2)) > 0, ] +# data2 <- data2[order(proxyC::rowSds(SummarizedExperiment::assay(data2)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod2, input_test_mod2) +gc() + +cat("Running NewWave\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data2)), + n_cell_par = min(par$newwave_ncell, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) + +colnames(dr_x1) <- paste0("comp_", seq_len(ncol(dr_x1))) +colnames(dr_x2) <- paste0("comp_", seq_len(ncol(dr_x2))) + +# split up DR matrices +dr_x1_train <- dr_x1[train_ix, , drop = FALSE] +dr_x1_test <- dr_x1[-train_ix, , drop = FALSE] +dr_x2_train <- dr_x2[train_ix, , drop = FALSE] +dr_x2_test <- dr_x2[-train_ix, 
, drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +preds <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + + +cat("Performing KNN between test mod1 DR and predicted test mod1\n") +# match each mod1 test cell against the predicted mod1 coordinates of the +# mod2 test cells (both live in the mod1 DR space) +knn_out <- FNN::get.knnx( + preds, + dr_x1_test, + k = min(1000, nrow(preds)) +) + +cat("Creating output data structures\n") +df <- tibble( + i = as.vector(row(knn_out$nn.index)), + j = as.vector(knn_out$nn.index), + x = 1000 - as.vector(col(knn_out$nn.index)) + 1 + # x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist) +) +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$x, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = did, + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/methods/procrustes_knn/config.vsh.yaml b/src/match_modality/methods/procrustes_knn/config.vsh.yaml new file mode 100644 index 0000000000..65d0327291 --- /dev/null +++ b/src/match_modality/methods/procrustes_knn/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: procrustes_knn + namespace: match_modality/methods + description: Perform DR with truncated SVD, align the two embeddings with Procrustes, and match cells with KNN. + info: + type: method + method_name: Procrustes+KNN + arguments: + - name: "--n_svd" + type: "integer" + default: 100 + description: Number of SVD components to compress to + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, scipy, scikit-learn ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/procrustes_knn/script.py b/src/match_modality/methods/procrustes_knn/script.py new file mode 100644 index 0000000000..3e16b5fbb7 --- /dev/null +++ b/src/match_modality/methods/procrustes_knn/script.py @@ -0,0 +1,112 @@ +import logging +import anndata as ad +import scipy.spatial +import scipy.sparse +import numpy as np + +from sklearn.preprocessing import normalize +from sklearn.decomposition import TruncatedSVD +from sklearn.neighbors import NearestNeighbors + +## VIASH START + +# Anything within this block will be removed by `viash` and will be +# replaced with the parameters as specified in your config.vsh.yaml. 
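+# scipy.spatial.procrustes(A, B) standardises both point sets and returns the +# two transformed copies plus a disparity value (the sum of squared pointwise +# distances); the smaller the disparity after alignment, the better the two +# SVD embeddings overlap and the more reliable the KNN matching below. +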
+ +par = { + "input_train_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad", + "input_train_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad", + "input_train_sol": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad", + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "n_svd": 100, +} + +meta = { + "functionality_name": "foo" +} +## VIASH END + +logging.basicConfig(level=logging.INFO) + +logging.info("Load datasets") +input_train_mod1 = ad.read_h5ad(par["input_train_mod1"]) +input_train_mod2 = ad.read_h5ad(par["input_train_mod2"]) +# input_train_sol = ad.read_h5ad(par["input_train_sol"]) +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# This method runs PCA on each modality individually, then uses the Procrustes method to identify +# a linear transform that best superimposes the points from modality 1 onto modality 2. + +# concatenate train and test data +mod1 = ad.concat( + { + "train": input_train_mod1, + "test": input_test_mod1 + }, + index_unique="-", + label="group" +) +mod2 = ad.concat( + { + "train": input_train_mod2, + "test": input_test_mod2 + }, + index_unique="-", + label="group" +) + +# Create helper views to access the test data later +mod1te = mod1[mod1.obs["group"] == "test", :] +mod2te = mod2[mod2.obs["group"] == "test", :] + +logging.info("Running PCA") +n_svd = min(par["n_svd"], mod1.n_obs, mod2.n_obs, mod1.n_vars, mod2.n_vars) + +# Use TruncatedSVD for fast decomposition of the data +mod1.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod1.X) +mod2.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod2.X) + +logging.info("Running Procrustes Alignment") +# This function takes in two matrices of points A and B, standardizes both, and applies a linear +# transformation to matrix B to minimize the disparity measured as the sum of the squares of the +# pointwise distances between the two input datasets +mod1.obsm["X_pro"], mod2.obsm["X_pro"], disparity = scipy.spatial.procrustes( + mod1.obsm["X_pca"], + mod2.obsm["X_pca"], +) +logging.info("> Disparity value is: %0.3f" % disparity) + +logging.info("Perform nearest neighbors") +# To get the matching matrix, for each point in mod1_test, we take the 1000 nearest neighbors of that +# point in the transformed mod2_test dataset +n_neighbors = min(1000, mod1te.n_obs, mod1te.n_vars, mod2te.n_obs, mod2te.n_vars) +# fit on the aligned mod2 points and query with mod1, so that row i of `indices` lists the +# candidate mod2 matches for mod1 cell i (matching ind_i below) +nn = NearestNeighbors(n_neighbors=n_neighbors).fit(mod2te.obsm["X_pro"]) +distances, indices = nn.kneighbors(X=mod1te.obsm["X_pro"]) + +logging.info("Create pairing matrix") +# Translate the neighborhood assignments to a pairing matrix that is (n_obs, n_obs) +# NOTE: `pairing_matrix` must have NO MORE than 1000*n_obs non-zero entries for fast metric computation +ind_i = np.tile(np.arange(mod1te.n_obs), (n_neighbors, 1)).T.flatten() +ind_j = indices.flatten() +ind_dist = distances.flatten() +ind_x = 2 * max(ind_dist) - ind_dist +pairing_matrix = scipy.sparse.csr_matrix( + (ind_x, (ind_i, ind_j)), + 
+    shape=(input_test_mod1.n_obs, input_test_mod2.n_obs)
+)
+
+# row normalise
+prob_matrix = normalize(pairing_matrix, norm="l1")
+
+logging.info("Write prediction output")
+prediction = ad.AnnData(
+    X=prob_matrix,
+    uns={
+        "dataset_id": input_train_mod1.uns["dataset_id"],
+        "method_id": meta["functionality_name"]
+    }
+)
+prediction.write_h5ad(par["output"])
diff --git a/src/match_modality/metrics/aupr/config.vsh.yaml b/src/match_modality/metrics/aupr/config.vsh.yaml
new file mode 100644
index 0000000000..205efb353e
--- /dev/null
+++ b/src/match_modality/metrics/aupr/config.vsh.yaml
@@ -0,0 +1,40 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: aupr
+  namespace: match_modality/metrics
+  description: Compute area-under-the-curve pairing metrics (AUROC and AUPR) for the match modality task.
+  info:
+    metrics:
+      - metric_id: pairing_auroc
+        metric_name: Pairing AUROC
+        metric_description: Area under the ROC curve of the pairing predictions
+        maximize: true
+        min: 0
+        max: 1
+      - metric_id: pairing_aupr
+        metric_name: Pairing AUPR
+        metric_description: Area under the precision-recall curve of the pairing predictions
+        maximize: true
+        min: 0
+        max: 1
+  resources:
+    - type: r_script
+      path: script.R
+  test_resources:
+    - type: r_script
+      path: test_custom.R
+    - path: ../../../../resources_test
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, pracma, tidyverse, testthat ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3 ]
+      - type: python
+        pip: [ anndata>=0.8, pyyaml ]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
+
diff --git a/src/match_modality/metrics/aupr/script.R b/src/match_modality/metrics/aupr/script.R
new file mode 100644
index 0000000000..1609d6f673
--- /dev/null
+++ b/src/match_modality/metrics/aupr/script.R
@@ -0,0 +1,136 @@
+cat("Load dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+library(testthat, quietly = TRUE, warn.conflicts = FALSE)
+library(Matrix, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+requireNamespace("pracma", quietly = TRUE)
+
+## VIASH START
+par <- list(
+  input_solution = "output/output_test_sol.h5ad",
+  input_prediction = "output/dr_knnr_cbf_prediction.h5ad",
+  output = "output/aupr_score.h5ad"
+)
+## VIASH END
+
+cat("Read solution h5ad\n")
+ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r")
+
+cat("Read prediction h5ad\n")
+expect_true(
+  grepl("\\.h5ad$", par$input_prediction),
+  info = "Prediction file should be an h5ad file"
+)
+ad_pred <-
+  tryCatch({
+    anndata::read_h5ad(par$input_prediction)
+  }, error = function(e) {
+    stop(paste0("Can't open prediction h5ad file. 
Detailed error message:\n", e$message)) + }) +expect_true( + ad_sol$uns$dataset_id == ad_pred$uns$dataset_id +) + +X_pred <- as(ad_pred$X, "CsparseMatrix")[, order(ad_sol$uns$pairing_ix)] +dimnames(X_pred) <- list(NULL, NULL) + +cat("Data wrangling\n") +pred_summ <- summary(X_pred) %>% + as_tibble() %>% + mutate(gold = i == j) %>% + arrange(desc(x)) + +# helper function +calculate_au <- function(values, are_true, num_positive_interactions, num_possible_interactions, extend_by = 10000) { + ord <- order(rank(values, ties.method = "random"), decreasing = TRUE) + values <- values[ord] + are_true <- are_true[ord] + + # calculate base statistics + num_selected <- seq_along(are_true) + tp <- cumsum(are_true) + fp <- num_selected - tp + length_ranking <- length(tp) + num_negative_interactions <- num_possible_interactions - num_positive_interactions + + # extend base statistics, if necessary + if (extend_by > 0 && length_ranking != num_possible_interactions) { + diff.predictions <- num_possible_interactions - length_ranking + diff.trues <- num_positive_interactions - tail(tp, 1) + diff.negs <- num_negative_interactions - tail(fp, 1) + + multiplier <- seq_len(extend_by) / extend_by + + extra_num_selected <- multiplier * diff.predictions + tail(num_selected, 1) + extra_tp <- multiplier * diff.trues + tail(tp, 1) + extra_fp <- multiplier * diff.negs + tail(fp, 1) + + num_selected <- c(num_selected, extra_num_selected) + are_true <- c(are_true, rep(NA, extend_by)) + values <- c(values, rep(NA, extend_by)) + tp <- c(tp, extra_tp) + fp <- c(fp, extra_fp) + } + + # calculate extended statistics + metrics <- tibble( + num_selected = c(0, num_selected), + value = c(NA, values), + are_true = c(NA, are_true), + tp = c(0, tp), + fp = c(0, fp), + fn = num_positive_interactions - tp, + tn = num_negative_interactions - fp, + acc = (tp + tn) / (num_positive_interactions + num_negative_interactions), + tpr = tp / num_positive_interactions, + spec = tn / num_negative_interactions, + prec = ifelse(num_selected == 0, 1, tp / (tp + fp)), + npv = tn / (tn + fn), + f1 = 2 * tp / (2 * tp + fp + fn), + mcc = ifelse(num_selected == 0, 0, (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))), + informedness = tpr + spec - 1, + markedness = prec + npv - 1 + ) + + # calculate area under the curves + area_under <- tibble( + auroc = pracma::trapz(1 - metrics$spec, metrics$tpr), + aupr = abs(pracma::trapz(metrics$tpr, metrics$prec)) + ) + + list(metrics = metrics, area_under = area_under) +} + + +cat("Calculate area under the curve\n") +au_out <- calculate_au( + values = pred_summ$x, + are_true = pred_summ$gold, + num_positive_interactions = nrow(X_pred), + num_possible_interactions = (nrow(X_pred) * 1.0) * nrow(X_pred) +) + +# GENIE3bis::plot_curves(au_out) + +colnames(au_out$area_under) <- paste0("pairing_", colnames(au_out$area_under)) + +cat("Create output object\n") +out_values <- c( + as.list(au_out$area_under) +) + +out <- anndata::AnnData( + X = NULL, + shape = dim(ad_sol), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = names(out_values), + metric_values = as.numeric(out_values), + genie3 = au_out + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") \ No newline at end of file diff --git a/src/match_modality/metrics/aupr/test_custom.R b/src/match_modality/metrics/aupr/test_custom.R new file mode 100644 index 0000000000..61e120f8d4 --- /dev/null +++ b/src/match_modality/metrics/aupr/test_custom.R 
@@ -0,0 +1,100 @@ +library(assertthat, quietly = TRUE) +library(Matrix, quietly = TRUE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +# This code block will be replaced by viash at runtime. +meta <- list(functionality_name = "foo") +## VIASH END + +# determine filenames and arguments +testpar <- list( + "input_solution" = "temp_sol.h5ad", + "input_prediction" = "temp_pred.h5ad", + "output" = "temp_out.h5ad" +) +command <- paste0("./", meta[["functionality_name"]]) +args <- unlist(rbind(paste0("--", names(testpar)), unname(testpar))) + +# uncomment this for manual testing +# command <- "viash" +# args <- c("run", "src/match_modality/metrics/aupr/config.vsh.yaml", "--", args) + +cat("Creating test files\n") +ad_sol <- anndata::AnnData( + X = as(Matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix"), + layers = list( + neighbors = as(Matrix(c(1, 0.5, 0, 0.5, 1, 0.25, 0, 0.25, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix") + ), + uns = list(dataset_id = "simple", pairing_ix = c(0, 1, 2)), + obs = data.frame( + cell_type = c("a", "a", "b") + ) +) +ad_pred <- anndata::AnnData( + X = as(Matrix(c(1, .1, .2, .3, .9, .4, .5, .6, .8), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix"), + uns = list(dataset_id = "simple", method_id = "simple") +) + +ad_sol$write_h5ad(testpar$input_solution, compression = "gzip") +ad_pred$write_h5ad(testpar$input_prediction, compression = "gzip") + +cat("> Running metric\n") +out <- processx::run( + command = command, + args = args, + stderr_to_stdout = TRUE +) + +cat("> Reading metric scores\n") +assert_that(file.exists(testpar$output)) +ad_out <- anndata::read_h5ad(testpar$output) + +scores1 <- ad_out$uns$metric_values +names(scores1) <- ad_out$uns$metric_ids +# assert_that( +# scores1[["pairing_aupr"]] >= scores1[["neighbor_aupr"]], +# scores1[["neighbor_aupr"]] >= scores1[["celltype_aupr"]], +# scores1[["pairing_auroc"]] >= scores1[["neighbor_auroc"]], +# scores1[["pairing_auroc"]] >= scores1[["celltype_auroc"]] +# ) + + +cat("Creating test files\n") +# pairing_ix <- c(2, 1, 3) +pairing_ix <- c(3, 1, 2) +ad_sol <- anndata::AnnData( + X = as(diag(length(pairing_ix)), "CsparseMatrix")[,pairing_ix], + layers = list( + neighbors = as(Matrix(c(1, 0.5, 0, 0.5, 1, 0.25, 0, 0.25, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix")[,pairing_ix] + ), + uns = list(dataset_id = "simple", pairing_ix = pairing_ix-1), + obs = data.frame( + cell_type = c("a", "a", "b") + ) +) +ad_pred <- anndata::AnnData( + X = as(Matrix(c(1, .1, .2, .3, .9, .4, .5, .6, .8), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix")[,pairing_ix], + uns = list(dataset_id = "simple", method_id = "simple") +) + +ad_sol$write_h5ad(testpar$input_solution, compression = "gzip") +ad_pred$write_h5ad(testpar$input_prediction, compression = "gzip") + +cat("> Running metric\n") +out <- processx::run( + command = command, + args = args, + stderr_to_stdout = TRUE +) + +cat("> Reading metric scores\n") +assert_that(file.exists(testpar$output)) +ad_out <- anndata::read_h5ad(testpar$output) + +scores2 <- ad_out$uns$metric_values +names(scores2) <- ad_out$uns$metric_ids + +assert_that(all(scores1 == scores2)) + +cat("> Test succeeded!\n") diff --git a/src/match_modality/metrics/check_format/config.vsh.yaml b/src/match_modality/metrics/check_format/config.vsh.yaml new file mode 100644 index 0000000000..94e833b471 --- /dev/null +++ b/src/match_modality/metrics/check_format/config.vsh.yaml @@ -0,0 +1,35 @@ +__merge__: 
../../api/comp_metric.yaml
+functionality:
+  name: check_format
+  namespace: match_modality/metrics
+  description: Checking whether the prediction of a method has the right format.
+  info:
+    metrics:
+      - metric_id: finished
+        metric_name: Finished
+        metric_description: Check if the method finished successfully
+        maximize: true
+        min: 0
+        max: 1
+      - metric_id: correct_format
+        metric_name: Correct Format
+        metric_description: Check if the prediction has the right format
+        maximize: true
+        min: 0
+        max: 1
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, bit64 ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3 ]
+      - type: python
+        pip: [ anndata>=0.8, pyyaml ]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
\ No newline at end of file
diff --git a/src/match_modality/metrics/check_format/script.R b/src/match_modality/metrics/check_format/script.R
new file mode 100644
index 0000000000..ade396c0bb
--- /dev/null
+++ b/src/match_modality/metrics/check_format/script.R
@@ -0,0 +1,67 @@
+cat("Load dependencies\n")
+library(assertthat, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+
+## VIASH START
+task <- "match_modality"
+par <- list(
+  input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad"),
+  input_prediction = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"),
+  output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad")
+)
+## VIASH END
+
+cat("Read solution h5ad\n")
+ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r")
+
+# fall back to the solution's metadata in case the prediction cannot be read
+dataset_id <- ad_sol$uns[["dataset_id"]]
+method_id <- NA_character_
+
+cat("Checking prediction h5ad\n")
+correct_format <- tryCatch({
+  # read prediction
+  ad_pred <- anndata::read_h5ad(par$input_prediction)
+
+  # check dataset id
+  dataset_id <- ad_pred$uns[["dataset_id"]]
+  assert_that(dataset_id == ad_sol$uns[["dataset_id"]])
+
+  # check method id
+  method_id <- ad_pred$uns[["method_id"]]
+  assert_that(
+    is.character(method_id),
+    method_id != ""
+  )
+
+  # check X
+  assert_that(
+    is(ad_pred$X, "sparseMatrix"),
+    ad_pred$n_obs == ad_sol$n_obs,
+    ad_pred$n_vars == ad_sol$n_vars,
+    length(ad_pred$X@x) <= 1000 * ad_sol$n_obs,
+    all(ad_pred$X@x >= 0),
+    isTRUE(all.equal(
+      Matrix::rowSums(ad_pred$X),
+      rep(1, ad_pred$n_obs),
+      check.attributes = FALSE,
+      tolerance = 1e-5
+    ))
+  )
+
+  1
+}, error = function(e) {
+  cat("ERROR: ", e$message, "\n", sep = "")
+  0
+})
+
+cat("Create output object\n")
+out <- anndata::AnnData(
+  shape = c(0, 0),
+  uns = list(
+    dataset_id = dataset_id,
+    method_id = method_id,
+    metric_ids = c("finished", "correct_format"),
+    metric_values = c(1, correct_format)
+  )
+)
+
+cat("Write output to h5ad file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")
diff --git a/src/match_modality/metrics/match_probability/config.vsh.yaml b/src/match_modality/metrics/match_probability/config.vsh.yaml
new file mode 100644
index 0000000000..c9c274f954
--- /dev/null
+++ b/src/match_modality/metrics/match_probability/config.vsh.yaml
@@ -0,0 +1,30 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: match_probability
+  namespace: match_modality/metrics
+  description: Compute the match probability metric for the match modality task.
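+  # Note (illustrative): after unscrambling the prediction's columns by the
+  # solution's pairing_ix, the true match of cell i lies on the diagonal, and
+  # script.R scores mean_i( X[i, i] / sum_j X[i, j] ). For example, with
+  # X = [[0.8, 0.2], [0.4, 0.6]] the score is mean(0.8, 0.6) = 0.7.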
+  info:
+    metrics:
+      - metric_id: match_probability
+        metric_name: Match Probability
+        metric_description: The mean probability mass that a prediction assigns to the correct cell pairing
+        min: 0
+        max: 1
+        maximize: true
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, tidyverse, testthat ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3 ]
+      - type: python
+        pip: [ anndata>=0.8, pyyaml ]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
+
diff --git a/src/match_modality/metrics/match_probability/script.R b/src/match_modality/metrics/match_probability/script.R
new file mode 100644
index 0000000000..b68051ad2b
--- /dev/null
+++ b/src/match_modality/metrics/match_probability/script.R
@@ -0,0 +1,56 @@
+cat("Load dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+library(testthat, quietly = TRUE, warn.conflicts = FALSE)
+library(Matrix, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+
+## VIASH START
+par <- list(
+  input_solution = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad",
+  input_prediction = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+  output = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad"
+)
+## VIASH END
+
+cat("Read solution h5ad\n")
+ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r")
+
+cat("Read prediction h5ad\n")
+ad_pred <- anndata::read_h5ad(par$input_prediction)
+
+cat("Unscrambling predictions\n")
+pairing_ix <- ad_sol$uns[["pairing_ix"]]
+X_pred <- as(ad_pred$X, "CsparseMatrix")[, order(pairing_ix)]
+dimnames(X_pred) <- list(NULL, NULL)
+
+# set negative values to 0
+X_pred@x <- ifelse(X_pred@x < 0, 0, X_pred@x)
+
+cat("Calculating normalisation factors\n")
+row_sums <- Matrix::rowSums(X_pred)
+
+cat("Computing the match probability score\n")
+# probability mass assigned to the true match of each cell
+match_probability_vec <- diag(X_pred) / row_sums
+
+match_probability <- mean(match_probability_vec)
+
+cat("Create output object\n")
+out <- anndata::AnnData(
+  shape = c(0, 0),
+  uns = list(
+    dataset_id = ad_pred$uns$dataset_id,
+    method_id = ad_pred$uns$method_id,
+    metric_ids = list("match_probability"),
+    metric_values = list(match_probability),
+    per_cell = list(
+      match_probability = match_probability_vec
+    )
+  )
+)
+
+cat("Write output to h5ad file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")
\ No newline at end of file
diff --git a/src/match_modality/resources_scripts/mask_datasets.sh b/src/match_modality/resources_scripts/mask_datasets.sh
new file mode 100644
index 0000000000..a70cc985d5
--- /dev/null
+++ b/src/match_modality/resources_scripts/mask_datasets.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+COMMON_DATASETS="resources/datasets/openproblems_v1"
+OUTPUT_DIR="resources/match_modality/datasets/openproblems_v1"
+
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+
+params_file="$OUTPUT_DIR/params.yaml"
+
+if [ ! -f $params_file ]; then
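+  # Illustrative sketch of the params.yaml generated below (hypothetical
+  # dataset id; the real entries depend on which h5ad files have a "counts"
+  # layer):
+  #
+  #   param_list:
+  #     - id: openproblems_v1.pancreas
+  #       input: resources/datasets/openproblems_v1/pancreas.h5ad
+  #       dataset_id: openproblems_v1.pancreas
+  #   seed: 123
+  #   output_train: $id.train.h5ad
+  #   output_test: $id.test.h5ad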
+  python << HERE
+import anndata as ad
+import glob
+import yaml
+
+h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad")
+
+# this task doesn't use normalizations
+param_list = {}
+
+for h5ad_file in h5ad_files:
+    print(f"Checking {h5ad_file}")
+    adata = ad.read_h5ad(h5ad_file, backed=True)
+    if "counts" in adata.layers:
+        dataset_id = adata.uns["dataset_id"].replace("/", ".")
+        obj = {
+            'id': dataset_id,
+            'input': h5ad_file,
+            'dataset_id': dataset_id,
+        }
+        param_list[dataset_id] = obj
+
+output = {
+    "param_list": list(param_list.values()),
+    "seed": 123,
+    "output_train": "\$id.train.h5ad",
+    "output_test": "\$id.test.h5ad"
+}
+
+with open("$params_file", "w") as file:
+    yaml.dump(output, file)
+HERE
+fi
+
+export NXF_VER=22.04.5
+nextflow \
+  run . \
+  -main-script target/nextflow/match_modality/mask_dataset/main.nf \
+  -profile docker \
+  -resume \
+  -params-file $params_file \
+  --publish_dir "$OUTPUT_DIR"
+
+bin/tools/docker/nextflow/process_log/process_log \
+  --output "$OUTPUT_DIR/nextflow_log.tsv"
\ No newline at end of file
diff --git a/src/match_modality/resources_scripts/run_benchmarks.sh b/src/match_modality/resources_scripts/run_benchmarks.sh
new file mode 100644
index 0000000000..3bf2418934
--- /dev/null
+++ b/src/match_modality/resources_scripts/run_benchmarks.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+set -e
+
+export TOWER_WORKSPACE_ID=53907369739130
+
+DATASETS_DIR="resources/match_modality/datasets/openproblems_v1"
+OUTPUT_DIR="resources/match_modality/benchmarks/openproblems_v1"
+
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+
+params_file="$OUTPUT_DIR/params.yaml"
+
+if [ ! -f $params_file ]; then
+  python << HERE
+import yaml
+import os
+
+dataset_dir = "$DATASETS_DIR"
+output_dir = "$OUTPUT_DIR"
+
+# read split datasets yaml
+with open(dataset_dir + "/params.yaml", "r") as file:
+    split_list = yaml.safe_load(file)
+datasets = split_list['param_list']
+
+# figure out where train/test files were stored
+param_list = []
+
+for dataset in datasets:
+    id = dataset["id"]
+    input_train = dataset_dir + "/" + id + ".train.h5ad"
+    input_test = dataset_dir + "/" + id + ".test.h5ad"
+
+    if os.path.exists(input_test):
+        obj = {
+            'id': id,
+            'dataset_id': dataset["dataset_id"],
+            'input_train': input_train,
+            'input_test': input_test
+        }
+        param_list.append(obj)
+
+# write as output file
+output = {
+    "param_list": param_list,
+}
+
+with open(output_dir + "/params.yaml", "w") as file:
+    yaml.dump(output, file)
+HERE
+fi
+
+export NXF_VER=22.04.5
+nextflow \
+  run . \
+  -main-script src/match_modality/workflows/run/main.nf \
+  -profile docker \
+  -params-file "$params_file" \
+  --publish_dir "$OUTPUT_DIR" \
+  -with-tower
+
+bin/tools/docker/nextflow/process_log/process_log \
+  --output "$OUTPUT_DIR/nextflow_log.tsv"
\ No newline at end of file
diff --git a/src/match_modality/resources_test_scripts/bmmc_cite.sh b/src/match_modality/resources_test_scripts/bmmc_cite.sh
new file mode 100644
index 0000000000..d3debaa836
--- /dev/null
+++ b/src/match_modality/resources_test_scripts/bmmc_cite.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#
+# make sure the following command has been executed:
+# viash ns build -q 'match_modality|common' --parallel --setup cb
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad
+MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad
+DATASET_DIR=resources_test/match_modality/bmmc_cite
+
+if [ ! -f $MOD_1_DATA ]; then
+  echo "Error! Could not find raw data"
+  exit 1
+fi
+
+mkdir -p $DATASET_DIR
+
+# mask dataset
+viash run src/match_modality/mask_dataset/config.vsh.yaml -- \
+  --input_mod1 $MOD_1_DATA \
+  --input_mod2 $MOD_2_DATA \
+  --output_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \
+  --output_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \
+  --output_train_sol $DATASET_DIR/cite_train_sol.h5ad \
+  --output_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \
+  --output_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \
+  --output_test_sol $DATASET_DIR/cite_test_sol.h5ad
+
+# run one method
+viash run src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml -- \
+  --input_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \
+  --input_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \
+  --input_train_sol $DATASET_DIR/cite_train_sol.h5ad \
+  --input_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \
+  --input_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \
+  --output $DATASET_DIR/dr_knnr_cbf.h5ad
+
+# run one metric
+viash run src/match_modality/metrics/aupr/config.vsh.yaml -- \
+  --input_prediction $DATASET_DIR/dr_knnr_cbf.h5ad \
+  --input_solution $DATASET_DIR/cite_test_sol.h5ad \
+  --output $DATASET_DIR/aupr.h5ad
+
+# run benchmark
+export NXF_VER=22.04.5
+
+nextflow \
+  run . \
+  -main-script src/match_modality/workflows/run/main.nf \
+  -profile docker \
+  --id bmmc_cite \
+  --dataset_id bmmc_cite \
+  --input_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \
+  --input_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \
+  --input_train_sol $DATASET_DIR/cite_train_sol.h5ad \
+  --input_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \
+  --input_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \
+  --input_solution $DATASET_DIR/cite_test_sol.h5ad \
+  --output scores.tsv \
+  --publish_dir $DATASET_DIR/
\ No newline at end of file
diff --git a/src/match_modality/workflows/run/config.vsh.yaml b/src/match_modality/workflows/run/config.vsh.yaml
new file mode 100644
index 0000000000..58db4d0b1e
--- /dev/null
+++ b/src/match_modality/workflows/run/config.vsh.yaml
@@ -0,0 +1,32 @@
+functionality:
+  name: "run_benchmark"
+  namespace: "match_modality/workflows"
+  argument_groups:
+    - name: Inputs
+      arguments:
+        - name: "--id"
+          type: "string"
+          description: "The ID of the dataset"
+          required: true
+        - name: "--input_train_mod1"
+          type: "file" # todo: replace with includes
+        - name: "--input_train_mod2"
+          type: "file"
+        - name: "--input_train_sol"
+          type: "file" # todo: replace with includes
+        - name: "--input_test_mod1"
+          type: "file" # todo: replace with includes
+        - name: "--input_test_mod2"
+          type: "file"
+        - name: "--input_solution"
+          type: "file" # todo: replace with includes
+    - name: Outputs
+      arguments:
+        - name: "--output"
+          direction: "output"
+          type: file
+  resources:
+    - type: nextflow_script
+      path: main.nf
+platforms:
+  - type: nextflow
\ No newline at end of file
diff --git a/src/match_modality/workflows/run/main.nf b/src/match_modality/workflows/run/main.nf
new file mode 100644
index 0000000000..fe48e3f63f
--- /dev/null
+++ b/src/match_modality/workflows/run/main.nf
@@ -0,0 +1,168 @@
+nextflow.enable.dsl=2
+
+sourceDir = params.rootDir + "/src"
+targetDir = params.rootDir + "/target/nextflow"
+
+// import control methods
+include { constant } from "$targetDir/match_modality/control_methods/constant/main.nf"
+include { random_pairing } from "$targetDir/match_modality/control_methods/random_pairing/main.nf"
+include { semi_solution } from "$targetDir/match_modality/control_methods/semi_solution/main.nf"
+include { solution } from "$targetDir/match_modality/control_methods/solution/main.nf"
+
+// import methods
+// include { babel_knn } from "$targetDir/match_modality/methods/babel_knn/main.nf"
+include { dr_knnr_cbf } from "$targetDir/match_modality/methods/dr_knnr_cbf/main.nf"
+include { dr_knnr_knn } from "$targetDir/match_modality/methods/dr_knnr_knn/main.nf"
+include { linear_knn } from "$targetDir/match_modality/methods/linear_knn/main.nf"
+include { newwave_knnr_cbf } from "$targetDir/match_modality/methods/newwave_knnr_cbf/main.nf"
+include { newwave_knnr_knn } from "$targetDir/match_modality/methods/newwave_knnr_knn/main.nf"
+include { procrustes_knn } from "$targetDir/match_modality/methods/procrustes_knn/main.nf"
+
+// import metrics
+include { aupr } from "$targetDir/match_modality/metrics/aupr/main.nf"
+include { check_format } from "$targetDir/match_modality/metrics/check_format/main.nf"
+include { match_probability } from "$targetDir/match_modality/metrics/match_probability/main.nf"
+
+// tsv generation component
+include { extract_scores } from "$targetDir/common/extract_scores/main.nf"
+
+// import helper functions
+include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf"
+include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + "/wf_utils/DataflowHelper.nf"
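+
+// Note (illustrative): events flowing through this workflow are tuples of
+// [ id, data, passthrough ]; setWorkflowArguments splits the params into the
+// "method", "metric" and "output" groups carried along in `passthrough`, and
+// getWorkflowArguments(key: ...) pulls one group back out for the next step.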
+
+config = readConfig("$projectDir/config.vsh.yaml")
+
+// construct a map of methods (id -> method_module)
+methods = [ dr_knnr_cbf, dr_knnr_knn, linear_knn, newwave_knnr_cbf, newwave_knnr_knn, procrustes_knn ]
+  .collectEntries{method ->
+    [method.config.functionality.name, method]
+  }
+
+workflow {
+  helpMessage(config)
+
+  viashChannel(params, config)
+    | run_wf
+}
+
+workflow run_wf {
+  take:
+  input_ch
+
+  main:
+  output_ch = input_ch
+
+    // split params for downstream components
+    | setWorkflowArguments(
+      method: ["input_train_mod1", "input_train_mod2", "input_train_sol", "input_test_mod1", "input_test_mod2"],
+      metric: ["input_solution"],
+      output: ["output"]
+    )
+
+    // multiply events by the number of methods
+    | add_methods
+
+    // add input_solution to data for the positive controls
+    | controls_can_cheat
+
+    // run methods
+    | getWorkflowArguments(key: "method")
+    | run_methods
+
+    // construct tuples for metrics
+    | pmap{ id, file, passthrough ->
+      // derive unique ids from output filenames
+      def newId = file.getName().replaceAll(".output.*", "")
+      // combine prediction with solution
+      def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ]
+      [ newId, newData, passthrough ]
+    }
+
+    // run metrics
+    | getWorkflowArguments(key: "metric")
+    | run_metrics
+
+    // convert to tsv
+    | aggregate_results
+
+  emit:
+  output_ch
+}
+
+workflow add_methods {
+  take: input_ch
+  main:
+    output_ch = Channel.fromList(methods.keySet())
+      | combine(input_ch)
+
+      // generate combined id for method_id and dataset_id
+      | pmap{method_id, dataset_id, data ->
+        def new_id = dataset_id + "." + method_id
+        def new_data = data.clone() + [method_id: method_id]
+        new_data.remove("id")
+        [new_id, new_data]
+      }
+  emit: output_ch
+}
+
+workflow controls_can_cheat {
+  take: input_ch
+  main:
+    output_ch = input_ch
+      | pmap{id, data, passthrough ->
+        def method = methods[data.method_id]
+        def method_type = method.config.functionality.info.method_type
+        def new_data = data.clone()
+        if (method_type != "method") {
+          new_data = new_data + [input_test_sol: passthrough.metric.input_solution]
+        }
+        [id, new_data, passthrough]
+      }
+  emit: output_ch
+}
+
+workflow run_methods {
+  take: input_ch
+  main:
+    // generate one channel per method
+    method_chs = methods.collect { method_id, method_module ->
+      input_ch
+        | filter{it[1].method_id == method_id}
+        | method_module
+    }
+    // mix all results
+    output_ch = method_chs[0].mix(*method_chs.drop(1))
+
+  emit: output_ch
+}
+
+workflow run_metrics {
+  take: input_ch
+  main:
+    output_ch = input_ch
+      | (aupr & check_format & match_probability)
+      | mix
+
+  emit: output_ch
+}
+
+workflow aggregate_results {
+  take: input_ch
+  main:
+    output_ch = input_ch
+      | toSortedList
+      | filter{ it.size() > 0 }
+      | map{ it ->
+        [ "combined", it.collect{ it[1] } ] + it[0].drop(2)
+      }
+      | getWorkflowArguments(key: "output")
+      | extract_scores.run(
+        auto: [ publish: true ]
+      )
+
+  emit: output_ch
}
\ No newline at end of file
diff --git a/src/match_modality/workflows/run/nextflow.config b/src/match_modality/workflows/run/nextflow.config
new file mode 100644
index 0000000000..6e33495d3a
--- /dev/null
+++ b/src/match_modality/workflows/run/nextflow.config
@@ -0,0 +1,14 @@
+manifest {
+  name = 'match_modality/workflows/run'
+  mainScript = 'main.nf'
+  nextflowVersion = '!>=22.04.5'
+  description = 'Multi modality - match modality'
+}
+
+params {
+  rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
+}
+
+// include common settings
+includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config")
+includeConfig("${params.rootDir}/src/wf_utils/labels.config")
\ No newline at end of file